From 6eb0c0fd1fdf469785da8174f0e7e4dd22a5ddce Mon Sep 17 00:00:00 2001
From: Leah Nguyen <nduongthucanh@gmail.com>
Date: Wed, 13 Apr 2022 13:54:47 +1000
Subject: [PATCH] remove R scripts

---
 scripts/01-EDA-pt1.R                | 163 --------------
 scripts/02-EDA-pt2.R                | 254 ---------------------
 scripts/03-feature-engineering.R    | 112 ----------
 scripts/04-basic-model-fitting.R    | 333 ----------------------------
 scripts/05-advanced-model-fitting.R | 121 ----------
 scripts/appendix.R                  |  15 --
 6 files changed, 998 deletions(-)
 delete mode 100644 scripts/01-EDA-pt1.R
 delete mode 100644 scripts/02-EDA-pt2.R
 delete mode 100644 scripts/03-feature-engineering.R
 delete mode 100644 scripts/04-basic-model-fitting.R
 delete mode 100644 scripts/05-advanced-model-fitting.R
 delete mode 100644 scripts/appendix.R

diff --git a/scripts/01-EDA-pt1.R b/scripts/01-EDA-pt1.R
deleted file mode 100644
index ee52db1..0000000
--- a/scripts/01-EDA-pt1.R
+++ /dev/null
@@ -1,163 +0,0 @@
-#################################################################
-##                   MLAA - Assignment AT1a                    ##
-##               MLR model on Financial Data set               ##
-##                    Author: Leah Nguyen                      ##
-#################################################################
-
-#################################################################
-##                     Task 1 - EDA (Pt.1)                     ##
-#################################################################
-
-##----------------------------------------------------------------
-##  Load the Libraries                                          --
-##----------------------------------------------------------------
-library(here)            # assess the file path
-library(DataExplorer)    # EDA visualizations
-library(tidyverse)       # data wrangling
-library(kableExtra)      # write table
-library(bannerCommenter) # create comment banner
-
-
-##----------------------------------------------------------------
-##  Dataset Overview                                            --
-##----------------------------------------------------------------
-# Load Data File
-df <- read_csv(here("dataset/transactions.csv"))
-
-
-# Quick overview of the data set then transpose for better table display. 
-# The information including:
-# * index and column data types
-# * non-null values
-# * memory usage
-df_overview <- introduce(df) %>% t()          
-                                             
-
-# turn the table into the Markdown format
-df_overview %>% kbl() %>% kable_styling(bootstrap_options = "striped", full_width = F)
-# rows	94247
-# columns	5
-# discrete_columns	4
-# continuous_columns	1
-# all_missing_columns	0
-# total_missing_values	0
-# complete_rows	94247
-# total_observations	471235
-# memory_usage	4201744
-
-
-# inspect columns data type
-sapply(df, class) 
-# date    customer_id       industry       location monthly_amount 
-# "Date"    "character"    "character"    "character"      "numeric" 
-
-
-# Check for missing value in each column by plotting
-plot_missing(df)
-
-
-# convert character values of columns to upper case for better missing value inspection
-missing_df <- data.frame(
-  lapply(df, function(v) {
-  if (is.character(v)) return(toupper(v))
-  else return(v)
-}))
-
-
-# check if there is there is missing values assigned under new category
-##::::::::::::::::::
-##  1. date column  
-##::::::::::::::::::
-sprintf(paste0("Is there any missing value observation categories in date column (T/F)?: ", 
-               missing_df[1] %in% c("NA","N/A","NULL","")))
-# "FALSE"
-
-
-##::::::::::::::::::::::::::
-##  2. customer_id column   
-##::::::::::::::::::::::::::
-sprintf(paste0("Is there any missing value observation categories in customer_id column (T/F)?: ", 
-               missing_df[1] %in% c("NA","N/A","NULL","")))
-# "FALSE"
-
-
-# 3. Check for any transaction with zero values
-sprintf(paste0("How many rows contained 0 value in monthly transaction amount?: ", 
-               sum(df$monthly_amount==0)))
-# "FALSE"
-
-##---------------------------------------------------------------
-##  Data Distribution                                          --
-##---------------------------------------------------------------
-
-##:::::::::::::::::::::::::::::::
-##  1. Data Skewness/Imbalance   
-##:::::::::::::::::::::::::::::::
-
-# combine 2 plots into 1 plot
-par(mfrow=c(1,2))
-# plot data distribution 
-# 1. MONTHLY_AMOUNT group by INDUSTRY
-hist(df$industry,
-     main = "Trans by Industry", 
-     xlab="Industry", 
-     xlim = c(0,10), 
-     ylim=c(0,50000), 
-     las=0)
-## 2. MONTHLY_AMOUNT group by LOCATION
-hist(df$location,
-     main = "Trans by Location", 
-     xlab="Location", 
-     xlim = c(0,10), 
-     ylim=c(0,50000), 
-     las=0)
-
-
-##:::::::::::::::::::::::::::::::
-##  2. Data Outliers   
-##:::::::::::::::::::::::::::::::
-
-# combine 2 plots into 1 plot
-par(mfrow=c(1,2)) 
-# plot boxplot to check for outliers
-# 1. industry
-boxplot(monthly_amount~industry, 
-        data=df, 
-        main='Transaction vs. Industry',
-        xlab='Industry', 
-        ylab='Transaction Amount', 
-        horizontal=TRUE) + 
-  scale_fill_grey() + 
-  theme_classic()
-# 2. location
-boxplot(monthly_amount~location, 
-        data=df, 
-        main='Transaction vs. Location',
-        xlab='Location', 
-        ylab='Transaction Amount', 
-        horizontal=TRUE) + 
-  scale_fill_grey() + 
-  theme_classic()
-
-##---------------------------------------------------------------
-##  Data Transformation                                        --
-##---------------------------------------------------------------
-# convert date column into the date format
-df$date <- as.Date(df$date,"%d/%m/%Y")
-
-
-# convert customer_id column into character format
-df$customer_id = as.character(df$customer_id, format = "")
-
-
-# convert location column into character format
-df$location <- as.character(df$location)
-
-
-# convert industry column into character format
-df$industry <- as.character(df$industry)
-
-
-# filter out value with 0 transaction amount
-df<-filter(df, monthly_amount!=0)
-
diff --git a/scripts/02-EDA-pt2.R b/scripts/02-EDA-pt2.R
deleted file mode 100644
index 882ca7b..0000000
--- a/scripts/02-EDA-pt2.R
+++ /dev/null
@@ -1,254 +0,0 @@
-#################################################################
-##                   MLAA - Assignment AT1a                    ##
-##               MLR model on Financial Data set               ##
-##                    Author: Leah Nguyen                      ##
-#################################################################
-
-#################################################################
-##                     Task 1 - EDA (Pt.2)                     ##
-#################################################################
-
-##----------------------------------------------------------------
-##  Load the Libraries                                          --
-##----------------------------------------------------------------
-library(ggplot2)      # data visualization
-library(forecast)     # times-series forecasting
-library(ggradar)      # plot seasonal trend
-library(sqldf)        # using SQL
-library(dplyr)        # data processing
-library(ggpubr)       # combine plots into single page
-theme_set(theme_pubr())
-library(reshape2)     # transpose table
-library(fmsb)         # create radar chart
-
-
-##----------------------------------------------------------------
-##  Data Visualization                                          --
-##----------------------------------------------------------------
-
-##:::::::::::::::::::::::::::::::::::::::::
-##  1. Transaction Amount & # Transaction  
-##:::::::::::::::::::::::::::::::::::::::::
-
-# create new df contain total transaction amount
-transaction_amount <- sqldf(
-  "SELECT
-  date,
-  'Transaction Amount' as type,             -- specify value type
-  SUM(monthly_amount) AS value              -- sum total transaction amount
-FROM df
-GROUP BY date                                -- filter by date
-ORDER BY date
-"
-)
-
-# create new df contain number of transaction
-transaction_count <- sqldf(
-  "SELECT
-  date,
-    'Transaction Count' as type,             -- specify value type
-  COUNT(*) as value                         -- count total number of transactions
-FROM df
-GROUP BY date                                -- filter by date
-ORDER BY date
-"
-)
-
-# merge 2 df into 1 new TRANSACTION df vertically 
-transaction_df <- rbind(transaction_amount, 
-                        transaction_count)
-
-# plot transaction amount over time
-monthly_amount_plot <- transaction_df %>%
-  # filter by transaction amount only
-  filter(type=="Transaction Amount") %>%   
-  # assign x and y-axis from the dataset
-  ggplot(aes(x = date, y = value/1e6)) +     
-  # add the line graph, color, and the size
-  geom_line(color = "indianred", size=1.6) + 
-  # the relationship graph between x and y
-  geom_smooth(formula = y~x,                    
-              method = 'loess') +
-  labs(x = "Year", 
-       y = "Total transaction amount (in millions)",
-       title = "Monthly Transaction Amount",
-       subtitle = "2013 to 2016") +
-  theme_minimal()
-
-
-# plot total transaction number over time
-monthly_count_plot <- transaction_df %>%
-  # filter by total transaction number count only
-  filter(type=="Transaction Count") %>% 
-  # assign x and y-axis from the dataset
-  ggplot(aes(x = date, y = value)) +    
-  # add the line graph, color, and the size
-  geom_line(color = "indianred", size=1.6) +   
-  # the relationship graph between x and y
-  geom_smooth(formula = y~x,                   
-              method = 'loess') +
-  labs(x = "Year", 
-       y = "Total transaction number",
-       title = "Total Transaction Number",
-       subtitle = "2013 to 2016") +
-  theme_minimal()
-
-## combine individual plots into a single page  
-ggarrange(monthly_amount_plot,
-          monthly_count_plot,
-          ncol = 2, nrow = 1)
-
-
-## Create new MONTHLY_TREND df to plot seasonal transaction trend
-new_ts_df <- sqldf(
-  "SELECT
-   strftime('%m', date) as month,   --extract month from date column                   
-   strftime('%Y',                   --extract year from date column
-   date * 3600 * 24,
-   'unixepoch') as year,
-  SUM(monthly_amount) AS transaction_amount
-FROM df
-GROUP BY
-  month,
-  year
-ORDER BY
-  month,
-  year
-"
-)
-
-## transpose the dataset to prepare for the data visualization
-monthly_trend <- recast(new_ts_df, 
-                        year + variable ~ month, 
-                        id.var = c("month", "year"))
-monthly_trend <- data.frame(monthly_trend[,-1],            # use the first column as the data index
-                            row.names = monthly_trend[,1]) # use the first row as the header
-monthly_trend <- subset(monthly_trend, select = -variable) # remove the unecessary column
-
-
-## create new vector specify month column names
-colnames(monthly_trend) <- c('January', 'February', 'March', 'April', 
-                             'May', 'June', 'July', 'August', 'September', 
-                             'October', 'November', 'December')
-
-
-## To use the fmsb package, I have to add 2 lines to the dataframe: 
-## the max and min of each variable to show on the plot!
-data <- rbind(rep(1400e6,12) , rep(0,12) , monthly_trend)
-
-
-## create color vector
-colr_1 <- rgb(0.2,0.5,0.5,0.9)
-colr_2 <- rgb(0.8,0.2,0.5,0.9)
-colr_3 <- rgb(0.7,0.5,0.1,0.9)
-colr_4 <- "#FC4E07"
-
-
-## set color theme for radar border
-colors_border=c(colr_1, 
-                colr_2, 
-                colr_3, 
-                colr_4)
-
-
-## plot with default options:
-seasonal_mul_plot <- radarchart(data, axistype=0,
-                                #custom polygon
-                                pcol=colors_border, plwd=2 , plty=1,
-                                #custom the grid
-                                cglcol="grey", cglty=1, axislabcol="grey", cglwd=0.8,
-                                #custom labels
-                                vlcex=0.8, title=paste("Transaction Seasonal Trend"), cex.main = 1
-  )
-
-## Add a legend
-legend(seasonal_mul_plot, 
-       x=1.5, y=0.5, 
-       legend = 
-         rownames(data[-c(1,2),]), 
-       bty = "n", pch=15 ,
-       col=colors_border, 
-       text.col = "black", 
-       cex=1.2, 
-       pt.cex=1, 
-       title = "Year")
-
-
-# Individual Years Bar Charts
-ggplot(new_ts_df) +
-  aes(x = month, fill = year, weight = transaction_amount/1e6) +
-  geom_bar() +
-  scale_fill_brewer(palette = "Dark2", 
-                    direction = -1) +
-  labs(y = "Total transaction amount (in millions)", 
-       title = "Monthly Seasonal Trend", 
-       subtitle = "Individual Year (from 2013 to 2016)") +
-  coord_flip() +
-  theme_minimal() +
-  theme(legend.position = "none", 
-        plot.title = element_text(size = 15L)) +
-  facet_wrap(vars(year), scales = "free", nrow = 1L)
-
-
-##::::::::::::::::::::::::::::::::::::::::::::::::
-##  3. Transaction Amount by Location & Industry  
-##::::::::::::::::::::::::::::::::::::::::::::::::
-
-# create new INDUSTRY df contain total transaction by industry over time
-industry <- sqldf(
-  "SELECT
-  date,
-  industry,
-  SUM(monthly_amount) AS transaction_amount, -- sum total transaction amount
-  COUNT(*) as transaction_count              -- count total number of transactions
-FROM df
-GROUP BY 
-  date,                                      -- filter by date
-  industry                                   -- filter by industry
-ORDER BY date
-"
-)
-
-# create new LOCATION df contain total transaction by location over time
-location <- sqldf(
-  "SELECT
-  date,
-  location,
-  SUM(monthly_amount) AS transaction_amount, -- sum total transaction amount
-  COUNT(*) as transaction_count              -- count total number of transactions
-FROM df
-GROUP BY 
-  date,                                      -- filter by date
-  location                                   -- filter by location
-ORDER BY date
-"
-)
-
-# plot transaction info by industry
-industry_amount_plot <- ggplot(industry) +
-  aes(x = date, y = transaction_amount, colour = industry) +
-  geom_line(size = 1) +
-  scale_color_hue(direction = 1) +
-  labs(x = "Year", 
-       y = "Total transaction amount (in millions)",
-       title = "Transaction Amount",
-       subtitle = "By industry (from 2013 to 2016)") +
-  theme_minimal()
-
-# plot transaction info by location
-location_amount_plot <- ggplot(location) +
-  aes(x = date, y = transaction_amount, colour = location) +
-  geom_line(size = 1) +
-  scale_color_hue(direction = 1) +
-  labs(x = "Year", 
-       y = "Total transaction amount (in millions)",
-       title = "Transaction Amount",
-       subtitle = "By location (from 2013 to 2016)") +
-  theme_minimal()
-
-# combine plots into a single page  
-ggarrange(industry_amount_plot, location_amount_plot,
-          ncol = 2, nrow = 1) 
-
-
-
diff --git a/scripts/03-feature-engineering.R b/scripts/03-feature-engineering.R
deleted file mode 100644
index 5996d90..0000000
--- a/scripts/03-feature-engineering.R
+++ /dev/null
@@ -1,112 +0,0 @@
-#################################################################
-##                   MLAA - Assignment AT1a                    ##
-##               MLR model on Financial Data set               ##
-##                    Author: Leah Nguyen                      ##
-#################################################################
-
-#################################################################
-##                Task 2A.i - Aggregated Data set              ##
-#################################################################
-
-##---------------------------------------------------------------
-##  Load the Libraries                                         --
-##---------------------------------------------------------------
-library(tidyverse)       # data wrangling
-library(kableExtra)      # write table
-library(dplyr)           # data processing
-library(ggplot2)         # data visualization
-
-
-# write a reusable function
-aggregate_transactions <- function(df) {
-  
-  # aggregate the data, grouping by date, industry and location, 
-  # and calculating the mean monthly_amount
-  output = df %>%
-    group_by(date, industry, location) %>%
-    summarize(monthly_amount = mean(monthly_amount, na.rm = TRUE))
-  
-  # create a column for the month number and another one for year number
-  output = output %>%
-    # create new column for month number
-    mutate(month_number = format(as.Date(date), "%m")) %>%
-    # create new column for month number
-    mutate(year_number = format(as.Date(date), "%Y"))
-  
-  # Make sure the new columns are of the correct type
-  output$month_number = as.integer(output$month_number)
-  output$year_number = as.integer(output$year_number)
-  
-  transform(output, month_number = as.integer(month_number), year_number = as.integer(year_number))
-  return(output)
-}
-
-# create a new variable that store new df with transformed features
-aggregated_transactions <- aggregate_transactions(df)
-# A tibble: 3,886 x 6
-# Groups:   date, industry [470]
-# date       industry location monthly_amount month_number year_number
-# <date>     <chr>    <chr>             <dbl> <chr>        <chr>      
-#   1 2013-01-01 1        1               136081. 01           2013       
-# 2 2013-01-01 1        10              188735. 01           2013       
-# 3 2013-01-01 1        2               177840. 01           2013       
-# 4 2013-01-01 1        3               141632. 01           2013       
-# 5 2013-01-01 1        4               221058. 01           2013       
-# 6 2013-01-01 1        5               178138. 01           2013       
-# 7 2013-01-01 1        6               133400. 01           2013       
-# 8 2013-01-01 1        7               231599. 01           2013       
-# 9 2013-01-01 1        8               143778. 01           2013       
-# 10 2013-01-01 1        9               157416. 01           2013       
-# ... with 3,876 more rows
-
-
-# turn the df into a Markdown table format 
-rmarkdown::paged_table(aggregated_transactions)
-
-#############################################################################
-##  Task 2A.ii - Visualization: Mean Monthly Transaction Amount (L1 & I1) ##
-#############################################################################
-
-aggregated_transactions %>%
-  filter(industry >= 1L & industry <= 1L) %>%
-  filter(location >= 1L & location <= 
-           1L) %>%
-  ggplot() +
-  aes(x = date, y = monthly_amount, colour = year_number) +
-  geom_line(size = 0.7) +
-  geom_point(size=1) +
-  scale_color_brewer(palette = "Dark2", direction = 1) +
-  labs(
-    x = "year",
-    y = "Transaction amount",
-    title = "Mean Monthly Transaction Amount",
-    subtitle = "Industry 1 & Location 1",
-    color = "Year"
-  ) +
-  ggthemes::theme_par() +
-  theme(
-    plot.title = element_text(size = 12L),
-    plot.subtitle = element_text(hjust = 0.5)
-  )
-
-
-# Plot Mean Transaction Amount Bar chart
-agg_1_1 %>%
-  ggplot() +
-  aes(x = month_number, fill = year_number, weight = monthly_amount) +
-  geom_bar() +
-  scale_fill_manual(values = c(`2013` = "#EBEBAE", `2014` = "#BBE395", `2015` = "#379E54", `2016` = "#004529"
-  )) +
-  labs(x = "Month", y = "Total transaction amount", title = "Transaction Amount by Month", subtitle = "Location 1 & Industry 1", 
-       fill = "year") +
-  theme_classic() +
-  theme(legend.position = "bottom", 
-        plot.title = element_text(size = 15L),
-        axis.title.x=element_blank(),
-        axis.text.x=element_blank(),
-        axis.ticks.x=element_blank()) +
-  facet_wrap(vars(year_number), nrow = 1L) +
-  coord_flip()
-
-
-
diff --git a/scripts/04-basic-model-fitting.R b/scripts/04-basic-model-fitting.R
deleted file mode 100644
index 76cb8c8..0000000
--- a/scripts/04-basic-model-fitting.R
+++ /dev/null
@@ -1,333 +0,0 @@
-#################################################################
-##                   MLAA - Assignment AT1a                    ##
-##               MLR model on Financial Data set               ##
-##                    Author: Leah Nguyen                      ##
-#################################################################
-
-#################################################################
-##                Task 2A.iii - Modelling                      ##
-#################################################################
-
-##---------------------------------------------------------------
-##  Load the Libraries                                         --
-##---------------------------------------------------------------
-library(modelr)       # computing regression model performance metrics
-library(caret)        # streamline the model training process
-library(xts)          # convert df to ts object
-
-
-##---------------------------------------------------------------
-##  1. Train-Test Split                                        --
-##---------------------------------------------------------------
-
-# create helper function with the creation of train-test split
-create_train_test <- function(df){
-  # Test set
-  # Use data for the year 2016 to test accuracy
-  test_df <- df[df$date >=	"2016-01-01",]   
-  # Training set
-  # Use data for the year 2016 for forecasting
-  train_df <- df[(df$date < "2016-01-01"),]
-  return (list(train_df, test_df))
-}
-
-# Create train and test set
-# create new variable and apply train-test split function
-split_list <- create_train_test(aggregated_transactions) 
-
-# 1. train set
-train_set <- split_list[[1]] 
-
-
-# 2. test set
-test_set <- split_list[[2]]
-
-
-# create new df filter by only Location 1 & Industry 1
-agg_1_1 <- aggregated_transactions %>% 
-  filter(industry == 1, location == 1)
-
-
-# # Create train and test
-# use the adjusted data set with new features
-split_list_1_1 <- create_train_test(agg_1_1) 
-
-# 1. train set: filter by only Location 1 & Industry 1
-train_set_1_1 <- split_list_1_1[[1]] 
-
-# 2. test set: filter by only Location 1 & Industry 1
-test_set_1_1 <- split_list_1_1[[2]]
-
-# create new df to plot our test-train set
-# add new column to specify the value type
-train_set_1_1$type <- "Train"
-test_set_1_1$type <- "Test"
-
-# create data frame to display year range of test and train data
-train_test <- sqldf(
-  "SELECT * FROM train_set_1_1
-  UNION ALL
-  SELECT * FROM test_set_1_1
-  ORDER BY type;
-  "
-)
-
-
-# plot year range of train set & test set (L1 & I1)
-ggplot(train_test) +
-  aes(x = date, y = monthly_amount, colour = type) +
-  geom_line(size = 0.5) +
-  scale_color_hue(direction = 1) +
-  labs(x = "Year", y = "Total monthly amount", 
-       title = "Train-Test Split", 
-       subtitle = "Location 1 & Industry 1") +
-  theme_classic() +
-  theme(legend.position = "bottom", plot.title = element_text(size = 15L))
-
-
-##---------------------------------------------------------------
-##  2. Modelling: Basic Model Fitting - L1 & I1                --
-##---------------------------------------------------------------
-
-##::::::::::::::::::::::::::::
-##  M1: date + month_number
-##::::::::::::::::::::::::::::
- 
-M1_1 <- lm(monthly_amount~date+month_number, data =train_set_1_1)
-summary(M1_1)
-# Call:
-#   lm(formula = monthly_amount ~ date + month_number, data = train_set_1_1)
-# 
-# Residuals:
-#   Min       1Q   Median       3Q      Max 
-# -12943.4  -1553.1    494.6   2858.6  10733.4 
-# 
-# Coefficients:
-#   Estimate Std. Error t value Pr(>|t|)    
-# (Intercept)    -1.758e+05  5.029e+04  -3.495 0.001954 ** 
-#   date            1.981e+01  3.123e+00   6.342 1.80e-06 ***
-#   month_number02  1.828e+04  4.561e+03   4.008 0.000551 ***
-#   month_number03  2.508e+04  4.563e+03   5.496 1.38e-05 ***
-#   month_number04  9.805e+03  4.568e+03   2.146 0.042623 *  
-#   month_number05  2.188e+04  4.575e+03   4.783 8.00e-05 ***
-#   month_number06  1.448e+04  4.584e+03   3.159 0.004389 ** 
-#   month_number07  1.903e+04  4.595e+03   4.142 0.000395 ***
-#   month_number08  2.004e+04  4.608e+03   4.349 0.000236 ***
-#   month_number09  1.453e+04  4.622e+03   3.144 0.004546 ** 
-#   month_number10  2.475e+04  4.639e+03   5.335 2.04e-05 ***
-#   month_number11  1.983e+04  4.658e+03   4.258 0.000296 ***
-#   month_number12  4.360e+03  4.678e+03   0.932 0.360955    
-# ---
-#   Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
-# 
-# Residual standard error: 5584 on 23 degrees of freedom
-# Multiple R-squared:  0.8329,	Adjusted R-squared:  0.7457 
-# F-statistic: 9.555 on 12 and 23 DF,  p-value: 2.627e-06
-
-
-##::::::::::::::::::::::::::::
-##  M2: month_number
-##::::::::::::::::::::::::::::
-
-M1_2 <-lm(monthly_amount~month_number, data =train_set_1_1)
-summary(M1_2)
-# Call:
-#   lm(formula = monthly_amount ~ month_number, data = train_set_1_1)
-# 
-# Residuals:
-#   Min       1Q   Median       3Q      Max 
-# -14715.9  -6444.4    625.6   6639.3  12664.7 
-# 
-# Coefficients:
-#   Estimate Std. Error t value Pr(>|t|)    
-# (Intercept)      142568       5233  27.244  < 2e-16 ***
-#   month_number02    18895       7401   2.553 0.017455 *  
-#   month_number03    26248       7401   3.547 0.001641 ** 
-#   month_number04    11588       7401   1.566 0.130484    
-# month_number05    24258       7401   3.278 0.003179 ** 
-#   month_number06    17471       7401   2.361 0.026703 *  
-#   month_number07    22616       7401   3.056 0.005431 ** 
-#   month_number08    24239       7401   3.275 0.003200 ** 
-#   month_number09    19347       7401   2.614 0.015209 *  
-#   month_number10    30155       7401   4.075 0.000436 ***
-#   month_number11    25854       7401   3.493 0.001873 ** 
-#   month_number12    10976       7401   1.483 0.151073    
-# ---
-#   Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
-# 
-# Residual standard error: 9064 on 24 degrees of freedom
-# Multiple R-squared:  0.5407,	Adjusted R-squared:  0.3302 
-# F-statistic: 2.569 on 11 and 24 DF,  p-value: 0.02583
-
-##::::::::::::::::::::::::::::
-##  M3: 1 VARIABLE - date
-##::::::::::::::::::::::::::::
-
-M1_3 <-lm(monthly_amount~date, data =train_set_1_1)
-summary(M1_3)
-# Call:
-#   lm(formula = monthly_amount ~ date, data = train_set_1_1)
-# 
-# Residuals:
-#   Min     1Q Median     3Q    Max 
-# -17585  -7236   1484   6569  17030 
-# 
-# Coefficients:
-#   Estimate Std. Error t value Pr(>|t|)    
-# (Intercept) -1.764e+05  7.681e+04  -2.297   0.0279 *  
-#   date         2.083e+01  4.729e+00   4.405   0.0001 ***
-#   ---
-#   Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
-# 
-# Residual standard error: 8966 on 34 degrees of freedom
-# Multiple R-squared:  0.3633,	Adjusted R-squared:  0.3446 
-# F-statistic:  19.4 on 1 and 34 DF,  p-value: 1e-04
-
-# ----------------------------------------------------------------------------
-# CONCLUSION: 
-# we end up choosing the M1_1 model since its best fitting results
-
-
-##################################################################
-##        Task 2A.iv - Making the Prediction on Dec 2016        ##
-##################################################################
-
-##----------------------------------------------------------------
-##  Dec 2016 Prediction                                         --
-##----------------------------------------------------------------
-
-# create new vector that specify column names
-x <- c("date", "industry", "location", "monthly_amount",
-       "month_number","year_number","type")
-
-
-# create new df for fitted df
-fit_df <- data.frame(train_set_1_1[-7]) %>% 
-  # assign value type for the the dataset
-  mutate(type="Fitted")
-
-
-# Rename the fitted dataset column names
-colnames(fit_df) <- x
-
-
-# create new df for prediction df
-pred_df <- data.frame(test_set_1_1[-7]) %>% 
-  # assign value type for the the dataset
-  mutate(type="Predicted")
-
-
-# Rename the predicted dataset column names
-colnames(pred_df) <- x
-
-
-# create new df for prediction row
-dec2016<-data.frame(date = "2016/12/01",
-                    industry="1",
-                    location="1",
-                    monthly_amount = 0,
-                    month_number="12",
-                    year_number="2016",
-                    type="Predicted"
-)
-
-
-# convert the date column to the right date format to merge with predicted df later
-dec2016$date <- as.Date(dec2016$date,format = "%Y/%m/%d")
-
-
-## Use predict function + model we just built and apply to dec_2016 data frame
-fit_df$monthly_amount <- predict(M1_1, train_set_1_1)
-pred_df$monthly_amount <- predict(M1_1, test_set_1_1)
-pred <- predict(M1_1, dec2016)
-dec2016$monthly_amount <- pred
-
-# combine the predicted df with the new predicted row
-pred_df <- sqldf(
-  "SELECT * FROM pred_df 
-  UNION ALL
-  SELECT * FROM dec2016"
-)
-
-# create new column to specify value type for the dataset
-agg_1_1$type <- "Real"
-
-
-# make sure our variables have the right format for plotting
-fit_df <- as.data.frame(fit_df)
-pred_df <- as.data.frame(pred_df)
-
-
-# merge all necessary variables into 1 single df variable with rbind
-newData <- rbind(fit_df, pred_df,agg_1_1)
-
-
-# Plot goodness of the fir from our MLR
-newData %>%
-  filter(!(year_number %in% "2016")) %>%
-  filter(!(type %in% "Predicted")) %>%
-  ggplot() +
-  aes(x = date, y = monthly_amount, colour = type) +
-  geom_line(size = 0.5) +
-  scale_color_hue(direction = 1) +
-  labs(
-    x = "Year",
-    y = "Mean Transaction Amount",
-    title = "Fit from MLR"
-  ) +
-  theme_light()
-
-
-# Plot the comparison between our MLR prediction on test set with the actual values
-ggplot(newData) +
-  aes(x = date, y = monthly_amount, colour = type) +
-  geom_line(size = 0.7) +
-  scale_color_hue(direction = 1) +
-  labs(x = "Year", y = "Mean transaction amount", 
-       title = "December 2016 Prediction") +
-  theme_light() +
-  theme(plot.title = element_text(face = "bold"))
-
-#################################################################
-##                Task 2B - Describe the model                 ##
-#################################################################
-
-##----------------------------------------------------------------
-##  Evaluation Metrics                                          --
-##----------------------------------------------------------------
-
-# Predictions vs Test data
-evaluation_metrics_test_1_1 <- data.frame(
-  X = "Predictions vs Test data",
-  R2 = rsquare(M1_1, data = test_set_1_1),
-  RMSE = rmse(M1_1, data = test_set_1_1),
-  MAE = mae(M1_1, data = test_set_1_1)
-)
-# table formatting
-rmarkdown::paged_table(evaluation_metrics_test_1_1)
-#                          X        R2     RMSE      MAE
-# 1 Predictions vs Test data 0.5264429 11293.81 10457.18
-
-
-# Fitted value vs Train data
-evaluation_metrics_train_1_1 <- data.frame(
-  X = "Fitted value vs Train data",
-  R2 = rsquare(M1_1, data = train_set_1_1),
-  RMSE = rmse(M1_1, data = train_set_1_1),
-  MAE = mae(M1_1, data = train_set_1_1)
-)
-# table formatting
-rmarkdown::paged_table(evaluation_metrics_train_1_1)
-#                            X        R2     RMSE      MAE
-# 1 Fitted value vs Train data 0.8329209 4463.717 3172.824
-
-
-# Model test error
-test_error <- evaluation_metrics_test_1_1$RMSE/mean(test_set_1_1$monthly_amount) 
-# view the result
-test_error # [1] 0.06164264
-
-
-# plot the model and combine it into 1 single frame
-par(mfrow=c(2,2)) 
-plot(M1_1)
diff --git a/scripts/05-advanced-model-fitting.R b/scripts/05-advanced-model-fitting.R
deleted file mode 100644
index db8d58b..0000000
--- a/scripts/05-advanced-model-fitting.R
+++ /dev/null
@@ -1,121 +0,0 @@
-#################################################################
-##                   MLAA - Assignment AT1a                    ##
-##               MLR model on Financial Data set               ##
-##                    Author: Leah Nguyen                      ##
-#################################################################
-
-#################################################################
-##    Task 3A - Modelling - ALL LOCATION & INDUSTRIES          ##
-#################################################################
-
-##---------------------------------------------------------------
-##  Load the Libraries                                         --
-##---------------------------------------------------------------
-library(modelr)       # computing regression model performance metrics
-library(caret)        # streamline the model training process
-library(xts)          # convert df to ts object
-library(tidyverse)    # data wrangling
-library(dplyr)        # data wrangling
-library(hydroGOF)     # Goodness-of-Fit Functions for Comparison
-
-# convert variables to factor
-aggregated_transactions$industry <- as.numeric(aggregated_transactions$industry)
-aggregated_transactions$location <- as.numeric(aggregated_transactions$location)
-aggregated_transactions$month_number <- as.numeric(aggregated_transactions$month_number)
-aggregated_transactions$year_number <- as.numeric(aggregated_transactions$year_number)
-
-# Using the aggregated dataset from part 2 (aggregated_transactions)
-df <- aggregated_transactions %>% mutate(time_number=c(1:nrow(arrange(aggregated_transactions, date))))
-str(df)
-
-calculate_predictions <- function(df, industries, locations) {
-  
-  output = data.frame()
-  
-  for (ind in industries) {
-    for (loc in locations) {
-      
-      temp = df[df$industry == ind & df$location == loc, ]}}
-      if (length(unique(temp$date)) >= 36) {
-        
-        # arrange dataset by date
-        arrange(temp, date)
-        
-        # Add a number to represent date order
-        temp$time_number = c(1:nrow(temp))
-        
-        # Split test train
-        # Assign observations to training and testing sets
-        trainset_all <- train_set
-        testset_all <- test_set
-        
-        # create new vector that specify column names
-        x <- c("date", "industry", "location", "monthly_amount",
-               "month_number","year_number")
-        
-        
-        # create new df for fitted and predict df
-        trainset_all <- data.frame(trainset_all)
-        testset_all <- data.frame(testset_all)
-        
-        
-        # Rename the fitted dataset column names
-        colnames(trainset_all) <- x
-        colnames(testset_all) <- x
-        
-        
-        # Now to run the earlier linear model with monthly_amount as the target variable
-        model_all <- lm(monthly_amount ~ time_number, data=trainset_all)
-        summary(model_all)
-        
-        
-        # Calculate the mean standard error
-        training.mse <- mean(residuals(model_all)^2)
-        
-        # Calculate root mean squared error
-        training.rmse <- sqrt(training.mse)
-        
-        
-        # create new df for prediction row
-        december_2016 = data.frame(date = "2016/12/01",
-                                   industry=ind,
-                                   location=loc,
-                                   monthly_amount = 0,
-                                   month_number=12,
-                                   year_number=2016,
-                                   time_number=(nrow(temp)+1)
-        )
-        
-        
-        # convert the date column to the right date format to merge with predicted df later
-        december_2016$date <- as.Date(december_2016$date,format = "%Y/%m/%d")
-        
-        # Ensure temp is of type data frame
-        temp = as.data.frame(temp)
-        
-        ## Use predict function + model we just built and apply to dec_2016 data frame
-        # Add the December 2016 row
-        temp <- rbind(temp, december_2016)
-        temp$prediction <- predict(model_all, temp)
-        testset_all$prediction = predict(model_all, testset_all)
-        
-        # Get the last prediction value (which is the Dec 2016 value).
-        train_dec_2016_prediction = tail(temp$prediction, 1)
-        
-        
-        # Output a prediction based on all rows and add it to the temp data frame
-        temp$prediction = predict(model_all, temp)
-        
-        # Get the last prediction value (which is the Dec 2016 value).
-        train_dec_2016_prediction = tail(temp$prediction, 1)
-        
-        tempOutput = c(ind, loc, training.mse, training.rmse, train_dec_2016_prediction)
-        
-      } else {
-        
-        tempOutput = c(ind, loc, NA, NA, NA)
-      }
-      
-      output = rbind(output, tempOutput)
-    }}}
-
diff --git a/scripts/appendix.R b/scripts/appendix.R
deleted file mode 100644
index 810e0bf..0000000
--- a/scripts/appendix.R
+++ /dev/null
@@ -1,15 +0,0 @@
-##########################################################
-# APPENDIX
-##########################################################
-# --------------------------------------------------------
-# APPENDIX 1
-# --------------------------------------------------------
-# load data description csv file
-dd <- read_csv(here("dataset/data_description.csv"))
-
-
-# display the information under table format
-dd %>%
-  kbl() %>%
-  kable_styling(full_width = F)
-