Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
Petter Olsson committed Nov 14, 2024
1 parent 20b6da1 commit 985086a
Show file tree
Hide file tree
Showing 9 changed files with 114 additions and 80 deletions.
1 change: 1 addition & 0 deletions dominostats.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"R2":[0.108],"MSE":[0.15]}
Binary file modified models/R_linear_model.Rda
Binary file not shown.
Binary file modified models/sklearn_gbm.pkl
Binary file not shown.
16 changes: 8 additions & 8 deletions scripts/.ipynb_checkpoints/R_model_train-checkpoint.R
Original file line number Diff line number Diff line change
@@ -1,37 +1,37 @@
library(mlflow)
print("Reading in data")
project_name <- Sys.getenv('DOMINO_PROJECT_NAME')
path <- paste('/mnt/data/',project_name,'/WineQualityData.csv')
path <- paste('/mnt/data/',project_name,'/credit_card_default.csv')
path <- gsub(" ", "", path, fixed = TRUE)
data <- read.csv(file=path)
head(data)

#mlflow_set_experiment(experiment_name=paste(Sys.getenv('DOMINO_PROJECT_NAME'), Sys.getenv('DOMINO_STARTING_USERNAME')))
mlflow_set_experiment(experiment_name = paste0(Sys.getenv('DOMINO_PROJECT_NAME'), " ", Sys.getenv('DOMINO_STARTING_USERNAME'), " ", Sys.getenv('MLFLOW_NAME')))

data$is_red <- as.integer(data$type != 'white')
#data$is_red <- as.integer(data$type != 'white')

data <-na.omit(data)
dim(data)[1]-sum(complete.cases(data))

train <-data[sample(nrow(data), round(dim(data)[1]*0.75)),]
# test <- data[(round(dim(data)[1]*0.75)+1):dim(data)[1], 2:dim(data)[2]]
test <- data[(data$id %in% train$id)==FALSE,]
train <- subset(train, select = -c(id) )
test <- subset(test, select = -c(id) )
train <- subset(train, select = -c(DEFAULT) )
test <- subset(test, select = -c(DEFAULT) )

train_matrix <- as.matrix(train)
test_matrix <- as.matrix(test)
label_matrix <- as.matrix(train$quality)
test_lab_matrix <- as.matrix(test$quality)
label_matrix <- as.matrix(train$DEFAULT)
test_lab_matrix <- as.matrix(test$DEFAULT)

dim(train)+dim(test)

with(mlflow_start_run(), {
mlflow_set_tag("Model_Type", "R")
print("Training Model")

lm_model <- lm(formula = quality ~., data = train)
lm_model <- lm(formula = DEFAULT ~., data = train)
lm_model


Expand All @@ -41,7 +41,7 @@ with(mlflow_start_run(), {

preds_lm <- predict(lm_model, newdata = test)

rsquared_lm <-round(RSQUARE(preds_lm, test$quality),3)
rsquared_lm <-round(RSQUARE(preds_lm, test$DEFAULT),3)
print(rsquared_lm[1])

#mse
Expand Down
14 changes: 9 additions & 5 deletions scripts/.ipynb_checkpoints/h2o_model_train-checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,13 @@

#read in data then split into train and test

path = str('/mnt/data/{}/WineQualityData.csv'.format(os.environ.get('DOMINO_PROJECT_NAME')))
path = str('/mnt/data/{}/credit_card_default.csv'.format(os.environ.get('DOMINO_PROJECT_NAME')))
data = pd.read_csv(path)
print('Read in {} rows of data'.format(data.shape[0]))

#Find all Pearson correlations of numerical variables with DEFAULT
corr_values = data.corr(numeric_only=True).sort_values(by = 'quality')['quality'].drop('quality',axis=0)
#corr_values = data.corr(numeric_only=True).sort_values(by = 'quality')['quality'].drop('quality',axis=0)
corr_values = data.corr(numeric_only=True).sort_values(by = 'DEFAULT')['DEFAULT'].drop('DEFAULT',axis=0)

#Keep all variables with above a 8% pearson correlation
important_feats=corr_values[abs(corr_values)>0.08]
Expand All @@ -34,7 +35,8 @@
#Drop NA rows
data = data.dropna(how='any',axis=0)
#Split df into inputs and target
data = data[list(important_feats.keys())+['quality']]
#data = data[list(important_feats.keys())+['quality']]
data = data[list(important_feats.keys())+['DEFAULT']]

train = data[0:round(len(data)*n/100)]
test = data[train.shape[0]:]
Expand All @@ -54,7 +56,8 @@

# Identify predictors and response
x = hTrain.columns
y = "quality"
#y = "quality"
y = "DEFAULT"
x.remove(y)

# Isolate target variable
Expand Down Expand Up @@ -90,7 +93,8 @@
"MSE": mse}))

#Write results to dataframe for viz
results = pd.DataFrame({'Actuals':test.quality.reset_index()['quality'], 'Predictions': preds.as_data_frame()['predict']})
#results = pd.DataFrame({'Actuals':test.quality.reset_index()['quality'], 'Predictions': preds.as_data_frame()['predict']})
#Pair up actuals and predictions; the credit-card data's target column is DEFAULT
#(test.quality would raise AttributeError -- there is no 'quality' column anymore)
results = pd.DataFrame({'Actuals':test['DEFAULT'].reset_index()['DEFAULT'], 'Predictions': preds.as_data_frame()['predict']})

print('Creating visualizations...')
#Scatterplot
Expand Down
13 changes: 8 additions & 5 deletions scripts/.ipynb_checkpoints/sklearn_model_train-checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
scipy.linalg.pinv2 = np.linalg.pinv

#Read in data
path = str('/mnt/data/{}/WineQualityData.csv'.format(os.environ.get('DOMINO_PROJECT_NAME')))
path = str('/mnt/data/{}/credit_card_default.csv'.format(os.environ.get('DOMINO_PROJECT_NAME')))
df = pd.read_csv(path)
print('Read in {} rows of data'.format(df.shape[0]))

Expand All @@ -29,10 +29,12 @@
df.rename({col: col.replace(' ', '_')}, axis =1, inplace = True)

#Create is_red variable to store red/white variety as int
df['is_red'] = df.type.apply(lambda x : int(x=='red'))
#commented out for CC use case
#df['is_red'] = df.type.apply(lambda x : int(x=='red'))

#Find all Pearson correlations of numerical variables with DEFAULT
corr_values = df.corr(numeric_only=True).sort_values(by = 'quality')['quality'].drop('quality',axis=0)
#corr_values = df.corr(numeric_only=True).sort_values(by = 'quality')['quality'].drop('quality',axis=0)
#Correlate every numeric column with DEFAULT, then drop DEFAULT's self-correlation (1.0)
#so the target can never pass the importance filter and leak into the feature matrix
corr_values = df.corr(numeric_only=True).sort_values(by='DEFAULT')['DEFAULT'].drop('DEFAULT', axis=0)

#Keep all variables with above a 8% pearson correlation
important_feats=corr_values[abs(corr_values)>0.08]
Expand All @@ -43,7 +45,8 @@
df = df.dropna(how='any',axis=0)
#Split df into inputs and target
X = df[important_feats.keys()]
y = df['quality'].astype('float64')
#y = df['quality'].astype('float64')
y = df['DEFAULT'].astype('float64')

# create a new MLflow experiment
#mlflow.set_experiment(experiment_name=os.environ.get('DOMINO_PROJECT_NAME') + " " + os.environ.get('DOMINO_STARTING_USERNAME'))
Expand Down Expand Up @@ -97,7 +100,7 @@

fig2, ax2 = plt.subplots(figsize=(10,6))
plt.title('Sklearn Actuals vs Predictions Histogram')
plt.xlabel('Quality')
plt.xlabel('Default')
sns.histplot(results, bins=6, multiple = 'dodge', palette = 'coolwarm')
plt.savefig('/mnt/artifacts/actual_v_pred_hist.png')
mlflow.log_figure(fig2, 'actual_v_pred_hist.png')
Expand Down
123 changes: 71 additions & 52 deletions scripts/R_model_train.R
Original file line number Diff line number Diff line change
@@ -1,67 +1,86 @@
library(mlflow)
library(jsonlite)

print("Reading in data")
project_name <- Sys.getenv('DOMINO_PROJECT_NAME')
path <- paste('/mnt/data/',project_name,'/WineQualityData.csv')
path <- paste('/mnt/data/', project_name, '/credit_card_default.csv')
path <- gsub(" ", "", path, fixed = TRUE)
data <- read.csv(file=path)
data <- read.csv(file = path)
head(data)

#mlflow_set_experiment(experiment_name=paste(Sys.getenv('DOMINO_PROJECT_NAME'), Sys.getenv('DOMINO_STARTING_USERNAME')))
mlflow_set_experiment(experiment_name = paste0(Sys.getenv('DOMINO_PROJECT_NAME'), " ", Sys.getenv('DOMINO_STARTING_USERNAME'), " ", Sys.getenv('MLFLOW_NAME')))

data$is_red <- as.integer(data$type != 'white')

data <-na.omit(data)
dim(data)[1]-sum(complete.cases(data))
# Rename the target column to "DEFAULT"
if ("default_payment_next_month" %in% colnames(data)) {
colnames(data)[colnames(data) == "default_payment_next_month"] <- "DEFAULT"
} else {
stop("Column 'default_payment_next_month' not found in the data.")
}

train <-data[sample(nrow(data), round(dim(data)[1]*0.75)),]
# test <- data[(round(dim(data)[1]*0.75)+1):dim(data)[1], 2:dim(data)[2]]
test <- data[(data$id %in% train$id)==FALSE,]
train <- subset(train, select = -c(id) )
test <- subset(test, select = -c(id) )
# Verify the renaming
print("Columns in data after renaming:")
print(colnames(data))

train_matrix <- as.matrix(train)
test_matrix <- as.matrix(test)
label_matrix <- as.matrix(train$quality)
test_lab_matrix <- as.matrix(test$quality)

dim(train)+dim(test)

with(mlflow_start_run(), {
mlflow_set_tag("Model_Type", "R")
print("Training Model")

lm_model <- lm(formula = quality ~., data = train)
lm_model


RSQUARE = function(y_actual,y_predict){
cor(y_actual,y_predict)^2
}
# Define MLflow experiment
mlflow_set_experiment(experiment_name = paste0(Sys.getenv('DOMINO_PROJECT_NAME'), " ", Sys.getenv('DOMINO_STARTING_USERNAME'), " ", Sys.getenv('MLFLOW_NAME')))

preds_lm <- predict(lm_model, newdata = test)
# Remove missing values
data <- na.omit(data)
print(paste("Number of rows with missing values removed:", dim(data)[1] - sum(complete.cases(data))))

rsquared_lm <-round(RSQUARE(preds_lm, test$quality),3)
print(rsquared_lm[1])
# Split data into training and testing sets
set.seed(123) # Set seed for reproducibility
train <- data[sample(nrow(data), round(dim(data)[1] * 0.75)), ]
test <- data[!(rownames(data) %in% rownames(train)), ]

#mse
mse_lm<- round(mean((test_lab_matrix - preds_lm)^2),3)
print(mse_lm)
# Verify that the train and test sets include the "DEFAULT" column
if (!("DEFAULT" %in% colnames(train))) {
stop("Column 'DEFAULT' is not present in the training set.")
}

mlflow_log_metric("R2", rsquared_lm[1])
mlflow_log_metric("MSE", mse_lm)
# Define target and feature columns
target_variable <- "DEFAULT"
features <- setdiff(names(data), target_variable)

diagnostics = list("R2" = rsquared_lm[1],
"MSE"=mse_lm)
library(jsonlite)
fileConn<-file("dominostats.json")
writeLines(toJSON(diagnostics), fileConn)
close(fileConn)
train_matrix <- as.matrix(train[, features])
test_matrix <- as.matrix(test[, features])
label_matrix <- as.matrix(train[[target_variable]])
test_lab_matrix <- as.matrix(test[[target_variable]])

save(lm_model, file="/mnt/code/models/R_linear_model.Rda")
})
dim(train) + dim(test)

# install.packages("SHAPforxgboost")
# install.packages("SHAPforxgboost")
# library("SHAPforxgboost")
# shap_values <- shap.values(xgb_model = mod, X_train = dataX)
# Start MLflow run
with(mlflow_start_run(), {
mlflow_set_tag("Model_Type", "R")
print("Training Model")

# Train the model (update formula for new dataset)
lm_model <- lm(formula = as.formula(paste(target_variable, "~ .")), data = train)
print(lm_model)

# Define RSQUARE function
RSQUARE <- function(y_actual, y_predict) {
cor(y_actual, y_predict)^2
}

# Predict and calculate metrics
preds_lm <- predict(lm_model, newdata = test)

rsquared_lm <- round(RSQUARE(test[[target_variable]], preds_lm), 3)
print(rsquared_lm)

# Mean Squared Error
mse_lm <- round(mean((test_lab_matrix - preds_lm)^2), 3)
print(mse_lm)

# Log metrics to MLflow
mlflow_log_metric("R2", rsquared_lm)
mlflow_log_metric("MSE", mse_lm)

# Save diagnostics to JSON
diagnostics <- list("R2" = rsquared_lm, "MSE" = mse_lm)
fileConn <- file("dominostats.json")
writeLines(toJSON(diagnostics), fileConn)
close(fileConn)

# Save model
save(lm_model, file = "/mnt/code/models/R_linear_model.Rda")
})
14 changes: 9 additions & 5 deletions scripts/h2o_model_train.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,13 @@

#read in data then split into train and test

path = str('/mnt/data/{}/WineQualityData.csv'.format(os.environ.get('DOMINO_PROJECT_NAME')))
path = str('/mnt/data/{}/credit_card_default.csv'.format(os.environ.get('DOMINO_PROJECT_NAME')))
data = pd.read_csv(path)
print('Read in {} rows of data'.format(data.shape[0]))

#Find all Pearson correlations of numerical variables with DEFAULT
corr_values = data.corr(numeric_only=True).sort_values(by = 'quality')['quality'].drop('quality',axis=0)
#corr_values = data.corr(numeric_only=True).sort_values(by = 'quality')['quality'].drop('quality',axis=0)
corr_values = data.corr(numeric_only=True).sort_values(by = 'DEFAULT')['DEFAULT'].drop('DEFAULT',axis=0)

#Keep all variables with above a 8% pearson correlation
important_feats=corr_values[abs(corr_values)>0.08]
Expand All @@ -34,7 +35,8 @@
#Drop NA rows
data = data.dropna(how='any',axis=0)
#Split df into inputs and target
data = data[list(important_feats.keys())+['quality']]
#data = data[list(important_feats.keys())+['quality']]
data = data[list(important_feats.keys())+['DEFAULT']]

train = data[0:round(len(data)*n/100)]
test = data[train.shape[0]:]
Expand All @@ -54,7 +56,8 @@

# Identify predictors and response
x = hTrain.columns
y = "quality"
#y = "quality"
y = "DEFAULT"
x.remove(y)

# Isolate target variable
Expand Down Expand Up @@ -90,7 +93,8 @@
"MSE": mse}))

#Write results to dataframe for viz
results = pd.DataFrame({'Actuals':test.quality.reset_index()['quality'], 'Predictions': preds.as_data_frame()['predict']})
#results = pd.DataFrame({'Actuals':test.quality.reset_index()['quality'], 'Predictions': preds.as_data_frame()['predict']})
#Pair up actuals and predictions; the credit-card data's target column is DEFAULT
#(test.quality would raise AttributeError -- there is no 'quality' column anymore)
results = pd.DataFrame({'Actuals':test['DEFAULT'].reset_index()['DEFAULT'], 'Predictions': preds.as_data_frame()['predict']})

print('Creating visualizations...')
#Scatterplot
Expand Down
13 changes: 8 additions & 5 deletions scripts/sklearn_model_train.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
scipy.linalg.pinv2 = np.linalg.pinv

#Read in data
path = str('/mnt/data/{}/WineQualityData.csv'.format(os.environ.get('DOMINO_PROJECT_NAME')))
path = str('/mnt/data/{}/credit_card_default.csv'.format(os.environ.get('DOMINO_PROJECT_NAME')))
df = pd.read_csv(path)
print('Read in {} rows of data'.format(df.shape[0]))

Expand All @@ -29,10 +29,12 @@
df.rename({col: col.replace(' ', '_')}, axis =1, inplace = True)

#Create is_red variable to store red/white variety as int
df['is_red'] = df.type.apply(lambda x : int(x=='red'))
#commented out for CC use case
#df['is_red'] = df.type.apply(lambda x : int(x=='red'))

#Find all Pearson correlations of numerical variables with DEFAULT
corr_values = df.corr(numeric_only=True).sort_values(by = 'quality')['quality'].drop('quality',axis=0)
#corr_values = df.corr(numeric_only=True).sort_values(by = 'quality')['quality'].drop('quality',axis=0)
#Correlate every numeric column with DEFAULT, then drop DEFAULT's self-correlation (1.0)
#so the target can never pass the importance filter and leak into the feature matrix
corr_values = df.corr(numeric_only=True).sort_values(by='DEFAULT')['DEFAULT'].drop('DEFAULT', axis=0)

#Keep all variables with above a 8% pearson correlation
important_feats=corr_values[abs(corr_values)>0.08]
Expand All @@ -43,7 +45,8 @@
df = df.dropna(how='any',axis=0)
#Split df into inputs and target
X = df[important_feats.keys()]
y = df['quality'].astype('float64')
#y = df['quality'].astype('float64')
y = df['DEFAULT'].astype('float64')

# create a new MLflow experiment
#mlflow.set_experiment(experiment_name=os.environ.get('DOMINO_PROJECT_NAME') + " " + os.environ.get('DOMINO_STARTING_USERNAME'))
Expand Down Expand Up @@ -97,7 +100,7 @@

fig2, ax2 = plt.subplots(figsize=(10,6))
plt.title('Sklearn Actuals vs Predictions Histogram')
plt.xlabel('Quality')
plt.xlabel('Default')
sns.histplot(results, bins=6, multiple = 'dodge', palette = 'coolwarm')
plt.savefig('/mnt/artifacts/actual_v_pred_hist.png')
mlflow.log_figure(fig2, 'actual_v_pred_hist.png')
Expand Down

0 comments on commit 985086a

Please sign in to comment.