diff --git a/transaction.Rmd b/transaction.Rmd index 709ff3e..acced90 100644 --- a/transaction.Rmd +++ b/transaction.Rmd @@ -28,6 +28,8 @@ library(sqldf) # using SQL library(car) # calculate the VIF library(dplyr) # data processing library(gganimate) # create animated plots +library(ggpubr) # combine plots into a single page +theme_set(theme_pubr()) ``` @@ -171,10 +173,15 @@ After conducting an inspection of retail transaction data, it is known that ther ```{r fig.align="center", message=FALSE, warning=FALSE} # convert date column into the date format df$date <- as.Date(df$date,"%d/%m/%Y") +# convert customer_id column into character format +df$customer_id = as.character(df$customer_id, format = "") # convert location column into character format df$location <- as.character(df$location) # convert industry column into character format df$industry <- as.character(df$industry) +# format transaction amount number for the sake of displaying information +df$monthly_amount <- df$monthly_amount/1e6 + ####################################################### # TABLE TRANSFORMATION @@ -233,61 +240,74 @@ monthly_amount_plot <- ggplot(time_series_df) + aes(x = date, y = transaction_amount) + geom_line(colour = "#B22222") + geom_point(size = 2, show.legend = FALSE) + - labs(x = "Year", y = "Total transaction amount") + + geom_text(aes(date, transaction_amount, label = transaction_amount), hjust=-0.1) + + labs(x = "Year", y = "Total transaction amount in millions") + theme_minimal() + transition_reveal(date) # gganimate specific bits +monthly_amount_plot + # plot number of transaction count over time transaction_count_plot <- ggplot(time_series_df) + aes(x = date, y = transaction_count) + geom_line(size = 0.7, colour = "#B22222") + geom_point() + + geom_text(aes(date, transaction_count, label = transaction_count), hjust=-0.1) + labs(x = "Year", y = "Number of transaction") + theme_minimal() + transition_reveal(date) # gganimate specific bits +transaction_count_plot + # plot number 
of transaction count over time -transaction_count_plot <- ggplot(time_series_df) + - aes(x = date, y = transaction_count) + - geom_point(shape = "circle", size = 2, - colour = "#FF058F") + - geom_smooth(span = 0.1) + - labs(y = "Number of transactions") + - theme_minimal() + - theme(plot.title = element_text(size = 12L, hjust = 0.5), axis.title.y = element_text(size = 12L)) +# transaction_count_plot <- ggplot(time_series_df) + +# aes(x = date, y = transaction_count) + +# geom_point(shape = "circle", size = 2, +# colour = "#FF058F") + +# geom_smooth(span = 0.1) + +# geom_text(aes(date, transaction_count, label = transaction_count), hjust=-0.1) + +# labs(y = "Number of transactions") + +# theme_minimal() + +# theme(plot.title = element_text(size = 12L, hjust = 0.5), axis.title.y = element_text(size = 12L)) # plot transaction info by industry -# industry_plot <- ggplot(industry) + -# aes(x = date, y = transaction_amount, colour = industry, group = industry) + -# geom_segment(aes(xend = as.Date("1/1/2013","%d/%m/%Y"), -# yend = as.Date("1/11/2016","%d/%m/%Y")), -# linetype = 2, -# colour = 'grey', -# show.legend = FALSE) + -# geom_line(size = 0.5) + -# geom_text(aes(x = 2019.1, label = industry, color = "#000000"), hjust = 0, show.legend = FALSE) + -# scale_color_hue(direction = 1) + -# labs(x = "Year", y = "Total transaction amount", title = "Transaction amount by industry") + -# theme_minimal() + -# theme(plot.title = element_text(size = 15L, face = "bold")) -# industry_plot - -# plot transaction info by location -ggplot(industry) + - aes(x = date, y = transaction_count, colour = industry, group = industry) + - geom_line(size = 0.5) + - scale_color_hue(direction = 1) + - labs(x = "Year", y = "Number of transaction", title = "Transaction amount by industry") + - theme_minimal() + - theme(plot.title = element_text(size = 15L, face = "bold")) + - transition_reveal(date) # gganimate specific bits +# animacion <- industry %>% +# ggplot() + +# geom_col(aes(ranking, 
transaction_amount, fill = industry)) + +# geom_text(aes(ranking, transaction_amount, label = transaction_amount), hjust=-0.1) + +# geom_text(aes(ranking, y=0 , label = industry), hjust=1.1) + +# geom_text(aes(x=15, y=max(transaction_amount) , label = as.factor(date)), vjust = 0.2, alpha = 0.5, col = "gray", size = 20) + +# coord_flip(clip = "off", expand = FALSE) + scale_x_reverse() + +# theme_minimal() + theme( +# panel.grid = element_blank(), +# legend.position = "none", +# axis.ticks.y = element_blank(), +# axis.title.y = element_blank(), +# axis.text.y = element_blank(), +# plot.margin = margin(1, 4, 1, 3, "cm") +# ) + +# transition_states(date, state_length = 0, transition_length = 2) + +# enter_fade() + +# exit_fade() + +# ease_aes('quadratic-in-out') +# +# animate(animacion, width = 700, height = 432, fps = 25, duration = 15, rewind = FALSE) +# plot transaction info by location +# figure <- ggarrange(monthly_amount_plot, transaction_count_plot, +# labels = c("A", "B"), +# ncol = 2) +# figure # combine multiple plots into 1 single page ``` +Here the dark squares represent a strong correlation (close to 1), while the lighter ones represent a weaker correlation (close to 0). That’s why all the diagonal cells are dark blue: a variable is fully correlated with itself. +Now, the thing worth noticing here is that the correlation between newspaper and radio is 0.35. This indicates a fair relationship between the newspaper and radio budgets. Hence, it can be inferred that when the radio budget is increased for a product, there’s a tendency to spend more on newspapers as well. +This is called collinearity: a situation in which two or more input variables are linearly related. +Hence, even though the Multiple Regression model shows no impact of newspaper spending on sales, the Simple Regression model still does, due to this multicollinearity and the absence of the other input variables. # Modeling