Skip to content

Commit

Permalink
update scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
ndleah committed Jan 26, 2022
1 parent 4f6a203 commit 2d2373e
Show file tree
Hide file tree
Showing 9 changed files with 677 additions and 419 deletions.
99 changes: 60 additions & 39 deletions scripts/01-EDA-pt1.R
Original file line number Diff line number Diff line change
@@ -1,23 +1,32 @@
########################################################
# DATA UNDERSTANDING: EDA PART 1 - THE DATASET
########################################################
# ------------------------------------------------------
# LOAD THE LIBRARIES
# ------------------------------------------------------
library(here) # assess the file path
library(DataExplorer) # EDA visualizations
library(tidyverse) # data wrangling
library(kableExtra) # write table


# ------------------------------------------------------
# OVERVIEW OF THE DATASET
# ------------------------------------------------------
#################################################################
## MLAA - Assignment AT1a ##
## MLR model on Financial Data set ##
## Author: Leah Nguyen ##
#################################################################

#################################################################
## Task 1 - EDA (Pt.1) ##
#################################################################

##----------------------------------------------------------------
## Load the Libraries --
##----------------------------------------------------------------
library(here) # assess the file path
library(DataExplorer) # EDA visualizations
library(tidyverse) # data wrangling
library(kableExtra) # write table
library(bannerCommenter) # create comment banner


##----------------------------------------------------------------
## Dataset Overview --
##----------------------------------------------------------------
# Read the transactions CSV; here() resolves the path from the project root
df <- here("dataset/transactions.csv") %>%
  read_csv()


# Quick overview of the dataset then transpose for better table display:
# Quick overview of the data set, transposed for better table display.
# The overview includes:
# * index and column data types
# * non-null values
# * memory usage
Expand All @@ -26,14 +35,23 @@ df_overview <- introduce(df) %>% t()

# Render the transposed overview as a striped kableExtra table.
# FALSE is spelled out because the shorthand F is an ordinary variable that
# can be reassigned elsewhere in a session.
df_overview %>%
  kbl() %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)
# rows 94247
# columns 5
# discrete_columns 4
# continuous_columns 1
# all_missing_columns 0
# total_missing_values 0
# complete_rows 94247
# total_observations 471235
# memory_usage 4201744


# Inspect the class of every column.
# vapply() fixes the result to exactly one string per column, so the return
# type never depends on the data; a column carrying several classes is shown
# with its classes joined by "/".
vapply(df, function(column) paste(class(column), collapse = "/"), character(1))
# date customer_id industry location monthly_amount
# "Date" "character" "character" "character" "numeric"


# ------------------------------------------------------
# MISSING VALUE
# ------------------------------------------------------
# Check for missing value in each column by plotting
plot_missing(df)

Expand All @@ -47,24 +65,34 @@ missing_df <- data.frame(


# check whether missing values are encoded as placeholder categories ("NA", "N/A", "NULL", "")
## 1. date column
##::::::::::::::::::
## 1. date column
##::::::::::::::::::
# Report whether the date column contains a placeholder string that stands in
# for a missing value. The column values are inspected directly: indexing a
# data frame with `[1]` yields a one-column data frame, which `%in%` matches
# as a single list element rather than value by value.
# as.character() makes the comparison against the string placeholders explicit
# regardless of how the column was parsed.
sprintf(
  "Is there any missing value observation categories in date column (T/F)?: %s",
  any(as.character(df$date) %in% c("NA", "N/A", "NULL", ""))
)
# FALSE is expected when the column holds no placeholder strings


## 2. customer_id column
##::::::::::::::::::::::::::
## 2. customer_id column
##::::::::::::::::::::::::::
# Report whether the customer_id column contains a placeholder string that
# stands in for a missing value. The customer_id values themselves are
# inspected: indexing a data frame with `[1]` selects its first column as a
# one-column data frame, which `%in%` matches as a single list element rather
# than value by value, so it cannot answer the question for this column.
sprintf(
  "Is there any missing value observation categories in customer_id column (T/F)?: %s",
  any(as.character(df$customer_id) %in% c("NA", "N/A", "NULL", ""))
)
# FALSE is expected when the column holds no placeholder strings


# 3. Check for any transaction with zero values
# Count the rows whose monthly transaction amount is exactly zero and report
# the count inside the message.
# NOTE(review): this statement yields a count, so a recorded output of "FALSE"
# cannot come from it; re-run and record the actual number.
sprintf(
  "How many rows contained 0 value in monthly transaction amount?: %d",
  sum(df$monthly_amount == 0)
)

##---------------------------------------------------------------
## Data Distribution --
##---------------------------------------------------------------

# ------------------------------------------------------
# DATA DISTRIBUTION: DATA SKEWNESS/IMBALANCE
# ------------------------------------------------------
##:::::::::::::::::::::::::::::::
## 1. Data Skewness/Imbalance
##:::::::::::::::::::::::::::::::

# combine 2 plots into 1 plot
par(mfrow=c(1,2))
Expand All @@ -76,7 +104,6 @@ hist(df$industry,
xlim = c(0,10),
ylim=c(0,50000),
las=0)

## 2. MONTHLY_AMOUNT group by LOCATION
hist(df$location,
main = "Trans by Location",
Expand All @@ -85,9 +112,11 @@ hist(df$location,
ylim=c(0,50000),
las=0)

# ------------------------------------------------------
# DATA DISTRIBUTION: OUTLIERS
# ------------------------------------------------------

##:::::::::::::::::::::::::::::::
## 2. Data Outliers
##:::::::::::::::::::::::::::::::

# combine 2 plots into 1 plot
par(mfrow=c(1,2))
# plot boxplot to check for outliers
Expand All @@ -100,7 +129,6 @@ boxplot(monthly_amount~industry,
horizontal=TRUE) +
scale_fill_grey() +
theme_classic()

# 2. location
boxplot(monthly_amount~location,
data=df,
Expand All @@ -111,10 +139,9 @@ boxplot(monthly_amount~location,
scale_fill_grey() +
theme_classic()


# ------------------------------------------------------
# DATA TRANSFORMATION
# ------------------------------------------------------
##---------------------------------------------------------------
## Data Transformation --
##---------------------------------------------------------------
# Parse the date column as Date values using the day/month/year layout
df <- df %>%
  mutate(date = as.Date(date, "%d/%m/%Y"))

Expand All @@ -134,9 +161,3 @@ df$industry <- as.character(df$industry)
# Keep only the rows whose monthly transaction amount is non-zero
df <- df %>%
  filter(monthly_amount != 0)


# ------------------------------------------------------
# VARIABLE CORRELATION
# ------------------------------------------------------
plot_correlation(df)

Loading

0 comments on commit 2d2373e

Please sign in to comment.