update scripts

ndleah · Jan 26, 2022 · 2d2373e · 2d2373e
1 parent 4f6a203
commit 2d2373e
Show file tree

Hide file tree

Showing 9 changed files with 677 additions and 419 deletions.
diff --git a/scripts/01-EDA-pt1.R b/scripts/01-EDA-pt1.R
@@ -1,23 +1,32 @@
-########################################################
-# DATA UNDERSTANDING: EDA PART 1 - THE DATASET
-########################################################
-# ------------------------------------------------------
-# LOAD THE LIBRARIES
-# ------------------------------------------------------
-library(here)         # assess the file path
-library(DataExplorer) # EDA visualizations
-library(tidyverse)    # data wrangling
-library(kableExtra)   # write table
-
-
-# ------------------------------------------------------
-# OVERVIEW OF THE DATASET
-# ------------------------------------------------------
+#################################################################
+##                   MLAA - Assignment AT1a                    ##
+##               MLR model on Financial Data set               ##
+##                    Author: Leah Nguyen                      ##
+#################################################################
+
+#################################################################
+##                     Task 1 - EDA (Pt.1)                     ##
+#################################################################
+
+##----------------------------------------------------------------
+##  Load the Libraries                                          --
+##----------------------------------------------------------------
+library(here)            # assess the file path
+library(DataExplorer)    # EDA visualizations
+library(tidyverse)       # data wrangling
+library(kableExtra)      # write table
+library(bannerCommenter) # create comment banner
+
+
+##----------------------------------------------------------------
+##  Dataset Overview                                            --
+##----------------------------------------------------------------
 # Load Data File
 df <- read_csv(here("dataset/transactions.csv"))
 
 
-# Quick overview of the dataset then transpose for better table display:
+# Quick overview of the data set, then transpose for better table display.
+# The information includes:
 # * index and column data types
 # * non-null values
 # * memory usage
@@ -26,14 +35,23 @@ df_overview <- introduce(df) %>% t()
 
 # Render the transposed overview as a styled table (striped rows, not
 # stretched to full width). Use FALSE rather than the shorthand F, which is
 # an ordinary variable that can be reassigned.
 df_overview %>%
   kbl() %>%
   kable_styling(bootstrap_options = "striped", full_width = FALSE)
+# rows	94247
+# columns	5
+# discrete_columns	4
+# continuous_columns	1
+# all_missing_columns	0
+# total_missing_values	0
+# complete_rows	94247
+# total_observations	471235
+# memory_usage	4201744
 
 
 # inspect columns data type
 sapply(df, class) 
+# date    customer_id       industry       location monthly_amount 
+# "Date"    "character"    "character"    "character"      "numeric" 
+
 
-# ------------------------------------------------------
-# MISSING VALUE
-# ------------------------------------------------------
 # Check for missing value in each column by plotting
 plot_missing(df)
 
@@ -47,24 +65,34 @@ missing_df <- data.frame(
 
 
 # check if there are missing values assigned under a new category
-## 1. date column
+##::::::::::::::::::
+##  1. date column  
+##::::::::::::::::::
 sprintf(paste0("Is there any missing value observation categories in date column (T/F)?: ", 
                missing_df[1] %in% c("NA","N/A","NULL","")))
+# "FALSE"
 
 
-## 2. customer_id column
+##::::::::::::::::::::::::::
+##  2. customer_id column   
+##::::::::::::::::::::::::::
 # Report whether the customer_id column holds placeholder "missing" labels
 # ("NA", "N/A", "NULL" or an empty string).
 # NOTE(review): this indexes missing_df[1], the same column used by the date
 # check above -- confirm whether customer_id should use a different column
 # of missing_df (its definition is not visible here).
 sprintf(paste0("Is there any missing value observation categories in customer_id column (T/F)?: ", 
                missing_df[1] %in% c("NA","N/A","NULL","")))
+# "FALSE"
 
 
 # 3. Check for any transaction with zero values
 sprintf(paste0("How many rows contained 0 value in monthly transaction amount?: ", 
                sum(df$monthly_amount==0)))
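+# NOTE: this prints the number of zero-amount rows (an integer count), not "FALSE"; re-run to record the actual value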
 
+##---------------------------------------------------------------
+##  Data Distribution                                          --
+##---------------------------------------------------------------
 
-# ------------------------------------------------------
-# DATA DISTRIBUTION: DATA SKEWNESS/IMBALANCE
-# ------------------------------------------------------
+##:::::::::::::::::::::::::::::::
+##  1. Data Skewness/Imbalance   
+##:::::::::::::::::::::::::::::::
 
 # combine 2 plots into 1 plot
 par(mfrow=c(1,2))
@@ -76,7 +104,6 @@ hist(df$industry,
      xlim = c(0,10), 
      ylim=c(0,50000), 
      las=0)
-
 ## 2. MONTHLY_AMOUNT group by LOCATION
 hist(df$location,
      main = "Trans by Location", 
@@ -85,9 +112,11 @@ hist(df$location,
      ylim=c(0,50000), 
      las=0)
 
-# ------------------------------------------------------
-# DATA DISTRIBUTION: OUTLIERS
-# ------------------------------------------------------
+
+##:::::::::::::::::::::::::::::::
+##  2. Data Outliers   
+##:::::::::::::::::::::::::::::::
+
 # combine 2 plots into 1 plot
 par(mfrow=c(1,2)) 
 # plot boxplot to check for outliers
@@ -100,7 +129,6 @@ boxplot(monthly_amount~industry,
         horizontal=TRUE) + 
   scale_fill_grey() + 
   theme_classic()
-
 # 2. location
 boxplot(monthly_amount~location, 
         data=df, 
@@ -111,10 +139,9 @@ boxplot(monthly_amount~location,
   scale_fill_grey() + 
   theme_classic()
 
-
-# ------------------------------------------------------
-# DATA TRANSFORMATION
-# ------------------------------------------------------
+##---------------------------------------------------------------
+##  Data Transformation                                        --
+##---------------------------------------------------------------
 # convert date column into the date format
 df$date <- as.Date(df$date,"%d/%m/%Y")
 
@@ -134,9 +161,3 @@ df$industry <- as.character(df$industry)
 # filter out value with 0 transaction amount
 df<-filter(df, monthly_amount!=0)
 
-
-# ------------------------------------------------------
-# VARIABLE CORRELATION
-# ------------------------------------------------------
-plot_correlation(df)
-