diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..c8a17ba
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Gil Ramot, Tel Aviv University
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..b07254c
--- /dev/null
+++ b/README.md
@@ -0,0 +1,37 @@
+# modelcomp: comparison tool for machine learning models
+
+[![PyPI](https://img.shields.io/pypi/v/modelcomp)](https://pypi.org/project/modelcomp/)
+![License](https://img.shields.io/github/license/gilramot/modelcomp)
+![Downloads](https://img.shields.io/pypi/dm/modelcomp)
+[![PyPI pyversions](https://img.shields.io/pypi/pyversions/modelcomp)](https://pypi.org/pypi/modelcomp/)
+
+modelcomp is a Python package that helps you compare the performance of machine learning models on your dataset. It was originally developed for microbiome research at the [Borenstein Lab](http://borensteinlab.com/), Tel Aviv University.
+
+Thanks to the [Alpha program](https://www.madaney.net/en/alpha) for making this happen.
+
+
+## Dependencies
+
+modelcomp currently supports Python 3.6+.
+
+Check out the [requirements file](https://github.com/gilramot/modelcomp/blob/master/requirements.txt) for the full list of package requirements.
+
+## Installation
+
+The latest stable release (and required dependencies) can be installed from [PyPI](https://pypi.org/project/modelcomp/):
+
+    pip install modelcomp
+
+The current version (0.0.1a1) is the version used in the research; all pre-0.0.1 versions should be considered unstable and may be subject to backwards-incompatible changes.
+
+Anaconda support is coming soon!
+
+
+## Contributing
+
+Feel free to [report an issue](https://github.com/gilramot/modelcomp/issues/new) in the package repository.
+
+
+## Research
+
+An appendix with the dataset used, the results and the module is available at [modelcomp-appendix](https://github.com/gilramot/modelcomp-appendix).
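+
+
+## Usage
+
+A minimal sketch of a cross-validated comparison, assuming `X` and `y` are NumPy arrays holding your features and 0/1 labels and `feature_names` is the list of feature column names (all three are placeholders for your own data):
+
+    import modelcomp as mcp
+
+    models = mcp.get_models(seed_all=42)  # random forest, XGBoost, logistic regression, SVM and k-NN
+    k_fold = mcp.get_k_fold(seed=42)  # repeated k-fold cross-validator (5 splits, 5 repeats by default)
+    mcp.cross_val_models(models, k_fold, X, y, positive_label='1', feature_names=feature_names)
+
+`positive_label` is only used to name the output folders. Results (ROC/PR curves, AUCs, feature importances and SHAP values) are written under the `export/` directory, and per-model plots are generated alongside the raw data.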
diff --git a/modelcomp/__init__.py b/modelcomp/__init__.py new file mode 100644 index 0000000..4ae9302 --- /dev/null +++ b/modelcomp/__init__.py @@ -0,0 +1,36 @@ +import modelcomp.models +from modelcomp.analysis import ( + cross_val_models, + std_validation_models, + get_fprtpr, + get_pr +) +from modelcomp.plotter import ( + individual_plots, + general_plots +) +from modelcomp.read import read_data +from modelcomp.utilities import ( + model_names, + model_names_short, + model_names_dict, + remove_falsy_columns, + split_data, + split_array, + get_feature_importance, + merge_dfs, + relative_abundance, + get_label_indexes, + get_k_fold, + get_models, + filter_data, + remove_rare_species, + remove_string_columns, + make_dir, + join_save, + data_to_filename, + filename_to_data, +) +from modelcomp.write import write_data, write_plot + +modelcomp.model_names_dict = dict(zip(modelcomp.utilities.model_names, modelcomp.utilities.model_names_short)) diff --git a/modelcomp/analysis.py b/modelcomp/analysis.py new file mode 100644 index 0000000..f29006a --- /dev/null +++ b/modelcomp/analysis.py @@ -0,0 +1,190 @@ +import numpy as np +import pandas as pd +import shap +from sklearn import metrics +from sklearn.metrics import roc_curve, precision_recall_curve +from sklearn.neighbors import KNeighborsClassifier +from sklearn.preprocessing import StandardScaler +from sklearn.svm import SVC + +import modelcomp as mcp + + +def get_fprtpr(model, X, y, pos_num): + """ + Calculates the fpr & tpr of a model based on the train & test data + :param model: The trained model + :param X: Testing data + :param y: Binary labels + :param pos_num: Positive class label + :return: fpr & tpr values + """ + fpr, tpr, _ = roc_curve(y, model.predict_proba(X)[:, 1], pos_label=pos_num, drop_intermediate=False) + return fpr, tpr + # returning fpr, tpr of roc_curve + + +def get_pr(model, X, y, pos_num): + """ + Calculates the precision & recall of a model based on the train & test data + :param model: The trained model + :param X: Testing data + :param y: Binary labels + :param pos_num: Positive class label + :return: precision & recall values + """ + precision, recall, _ = precision_recall_curve(y, model.predict_proba(X)[:, 1], pos_label=pos_num) + return precision, recall + # returning precision, recall of rp_curve + + +def std_validation_models(models, X_train, X_test, y_train, y_test, tested_on, trained_on, feature_names, validate=True, + explain=True, plot=True): + """ + Standard validation of models (used when the positive class label differs between the training and testing data) + :param models: A list of models to evaluate + :param X_train: Training data + :param X_test: Testing data + :param y_train: Training data labels + :param y_test: Testing data labels + :param tested_on: Tested label + :param trained_on: Trained label + :param feature_names: List of feature names + :param validate: Validate the models (default: True) + :param explain: Explain the models (default: True) + :param plot: Plot the results (default: True) + :return: Model results, exported to the filesystem (default: "export") + """ + X_train, X_test, y_train, y_test = X_train.to_numpy(), X_test.to_numpy(), y_train.to_numpy().ravel(), y_test.to_numpy().ravel() + mean_fpr = np.linspace(0, 1, 100) + # init + for model_index, model in enumerate(models): + X_train_temp, X_test_temp = X_train, X_test + interp_tpr, interp_recall, aucs, pr_aucs = None, None, None, None + feature_importances, shap_values = None, None + save_to_unjoined = 
mcp.data_to_filename(tested_on, mcp.model_names[model_index], + trained_on=trained_on) + # in-loop init + if validate: + aucs = [] + pr_aucs = [] + if type(model) is SVC: + sc = StandardScaler() + X_train = sc.fit_transform(X_train) + X_test = sc.transform(X_test) + # scaling if svm + model.fit(X_train, y_train) + # fitting model + fpr, tpr = get_fprtpr(model, X_test, y_test, 1) + interp_tpr = np.interp(mean_fpr, fpr, tpr) + interp_tpr[0] = 0.0 + aucs.append(metrics.auc(fpr, tpr)) + # roc curve variables + precision, recall = get_pr(model, X_test, y_test.ravel(), 1) + interp_recall = np.interp(mean_fpr, recall[::-1].ravel(), precision[::-1].ravel()) + interp_recall[0] = 1.0 + pr_aucs.append(metrics.auc(recall, precision)) + # pr curve variables + if explain: + feature_importances = pd.DataFrame(mcp.get_feature_importance(model).T, index=feature_names, + columns=['Importance']) if type( + model) is not KNeighborsClassifier else None + # feature importance + shap_values = (shap.explainers.Permutation(model.predict, X_test, max_evals=1000).shap_values(X_test)) + # shap values + mcp.write_data(save_to_unjoined, feature_names, interp_tpr, interp_recall, aucs, pr_aucs, + feature_importances=feature_importances, shap_values=shap_values) + # exporting data + X_train, X_test = X_train_temp, X_test_temp + if plot: + mcp.individual_plots(save_to_unjoined) + # plotting data + + +def cross_val_models(models, validation_model, X, y, positive_label, feature_names, validate=True, + explain=True, plot=True): + """ + Cross validation of multiple models + :param models: List of models to evaluate + :param validation_model: Validation model to evaluate with + :param X: Features + :param y: Labels + :param positive_label: Positive label + :param feature_names: List of feature names + :param validate: Validate the models (default: True) + :param explain: Explain the models (default: True) + :param plot: Plot the results (default: True) + :return: Model results, exported to the filesystem (default: "export") + """ + mean_fpr = np.linspace(0, 1, 100) + # init + for model_index, model in enumerate(models): + feature_importances, shap_values = None, None + if validate: + feature_importances_per_fold = [] + interp_tpr_per_fold = [] + aucs = [] + interp_recall_per_fold = [] + pr_aucs = [] + fprs = [] + tprs = [] + precisions = [] + recalls = [] + shap_values = None + for split_index, (train, test) in enumerate(validation_model.split(X, y)): + X_train_temp, X_test_temp = X[train], X[test] + if type(model) is SVC: + sc = StandardScaler() + X[train] = sc.fit_transform(X[train]) + X[test] = sc.transform(X[test]) + # scaling if svm + model.fit(X[train], y[train]) + # fitting model + fpr, tpr = get_fprtpr(model, X[test], y[test], 1) + fprs.append(fpr) + tprs.append(tpr) + interp_tpr = np.interp(mean_fpr, fpr, tpr) + interp_tpr[0] = 0.0 + interp_tpr_per_fold.append(interp_tpr) + aucs.append(metrics.auc(fpr, tpr)) + # roc curve variables + precision, recall = get_pr(model, X[test], y[test].ravel(), 1) + precisions.append(precision) + recalls.append(recall) + interp_recall = np.interp(mean_fpr, recall[::-1].ravel(), precision[::-1].ravel()) + interp_recall[0] = 1.0 + interp_recall_per_fold.append(interp_recall) + pr_aucs.append(metrics.auc(recall, precision)) + # pr curve variables + if explain: + feature_importances_per_fold.append( + pd.DataFrame(mcp.get_feature_importance(model).T, index=feature_names, + columns=['Importance']) if type(model) is not KNeighborsClassifier else None) + if feature_importances_per_fold[0] is 
None:
+                        feature_importances = None
+                    else:
+                        feature_importances = feature_importances_per_fold[0]
+                        for feature_importances_in_fold in feature_importances_per_fold[1:]:
+                            feature_importances = feature_importances.add(feature_importances_in_fold, fill_value=0)
+                        feature_importances['Importance'] = feature_importances['Importance'].map(
+                            lambda old_value: old_value / len(feature_importances_per_fold))
+                    # feature importance (summed across folds, then divided by the number of folds to get the mean)
+                    shap_values_temp = shap.explainers.Permutation(model.predict, X[test], max_evals=1000).shap_values(
+                        X[test])
+                    if shap_values is None:
+                        shap_values = shap_values_temp
+                    else:
+                        shap_values = np.append(shap_values, shap_values_temp, axis=0)
+                    # shap values
+                X[train], X[test] = X_train_temp, X_test_temp
+        mcp.write_data(
+            mcp.data_to_filename(positive_label, mcp.model_names[model_index]),
+            feature_names,
+            interp_tpr_per_fold, interp_recall_per_fold, aucs, pr_aucs, fprs=fprs, tprs=tprs,
+            precisions=precisions, recalls=recalls, feature_importances=feature_importances,
+            shap_values=shap_values)
+        # exporting data
+        if plot:
+            mcp.individual_plots(
+                mcp.data_to_filename(positive_label, mcp.model_names[model_index]))
+            # plotting data
diff --git a/modelcomp/models/__init__.py b/modelcomp/models/__init__.py
new file mode 100644
index 0000000..1c02c82
--- /dev/null
+++ b/modelcomp/models/__init__.py
@@ -0,0 +1,5 @@
+from modelcomp.models import knn
+from modelcomp.models import logistic_regression
+from modelcomp.models import random_forest
+from modelcomp.models import svm
+from modelcomp.models import xgboost
diff --git a/modelcomp/models/knn.py b/modelcomp/models/knn.py
new file mode 100644
index 0000000..5bace7c
--- /dev/null
+++ b/modelcomp/models/knn.py
@@ -0,0 +1,9 @@
+from sklearn.neighbors import KNeighborsClassifier
+
+
+def get():
+    """
+    Load a k-NN model from the scikit-learn library
+    :return: a k-NN model
+    """
+    return KNeighborsClassifier()
diff --git a/modelcomp/models/logistic_regression.py b/modelcomp/models/logistic_regression.py
new file mode 100644
index 0000000..49d9c80
--- /dev/null
+++ b/modelcomp/models/logistic_regression.py
@@ -0,0 +1,10 @@
+from sklearn.linear_model import LogisticRegression
+
+
+def get(seed=None):
+    """
+    Load a logistic regression model from the scikit-learn library
+    :param seed: set seed for model randomness (default: None)
+    :return: a logistic regression model
+    """
+    return LogisticRegression(random_state=seed, max_iter=10000)
diff --git a/modelcomp/models/random_forest.py b/modelcomp/models/random_forest.py
new file mode 100644
index 0000000..3f3f99b
--- /dev/null
+++ b/modelcomp/models/random_forest.py
@@ -0,0 +1,10 @@
+from sklearn.ensemble import RandomForestClassifier
+
+
+def get(seed=None):
+    """
+    Load a random forest from the scikit-learn library
+    :param seed: set seed for model randomness (default: None)
+    :return: a random forest model
+    """
+    return RandomForestClassifier(random_state=seed)
diff --git a/modelcomp/models/svm.py b/modelcomp/models/svm.py
new file mode 100644
index 0000000..bda184f
--- /dev/null
+++ b/modelcomp/models/svm.py
@@ -0,0 +1,10 @@
+from sklearn.svm import SVC
+
+
+def get(seed=None):
+    """
+    Load an SVM model from the scikit-learn library
+    :param seed: set seed for model randomness (default: None)
+    :return: an SVM model
+    """
+    return SVC(random_state=seed, kernel='linear', probability=True)
diff --git a/modelcomp/models/xgboost.py b/modelcomp/models/xgboost.py
new file mode 100644
index 0000000..850c5d6
--- /dev/null
+++ b/modelcomp/models/xgboost.py
@@ -0,0 +1,10 @@
+import xgboost as xgb
+
+
+def get(seed=None):
+    """
+    Load an XGBoost model from the xgboost library
+    :param seed: set seed for model randomness (default: None)
+    :return: an xgboost model
+    """
+    return xgb.XGBClassifier(random_state=seed)
diff --git a/modelcomp/plotter.py b/modelcomp/plotter.py
new file mode 100644
index 0000000..649f609
--- /dev/null
+++ b/modelcomp/plotter.py
@@ -0,0 +1,267 @@
+import matplotlib.pyplot as plt
+import numpy as np
+import os
+import pandas as pd
+import seaborn as sns
+
+import modelcomp as mc
+
+
+def individual_plots(save_to_unjoined):
+    """
+    Plot the performance of a single model
+    :param save_to_unjoined: Location to save the plots to
+    :return: Plots showing the performance of a single model, exported to the filesystem (default: "export/<tested on>/<trained on>/<model name>/plots")
+    """
+    save_to = mc.join_save(os.path.join(save_to_unjoined, 'plots'))
+    model_name, trained_on, tested_on = mc.filename_to_data(save_to)
+    trained_on = 'all diseases' if trained_on == 'a' else trained_on
+    tested_on = 'all diseases' if tested_on == 'a' else tested_on
+    title_start = f'{model_name} (trained on {trained_on}, tested on {tested_on}) - '
+    interp_tpr, fprs, tprs, interp_recall, precisions, recalls, aucs, pr_aucs, feature_importances, shap_values = mc.read_data(
+        save_to_unjoined)
+    mean_fpr = np.linspace(0, 1, 100)
+    mean_tpr = np.mean(interp_tpr, axis=0)
+    mean_tpr[-1] = 1.0
+    mean_auc = np.mean(aucs)
+    std_auc = np.std(aucs)
+    plt.plot(
+        mean_fpr,
+        mean_tpr,
+        color='b',
+        label=r'AUC = %0.2f ' % (round(mean_auc, 2)) + (
+            r'$\pm$ %0.2f' % (round(std_auc, 2)) if trained_on == tested_on else ''),
+        alpha=0.8,
+    )
+    # roc curve
+
+    std_tpr = np.std(interp_tpr, axis=0)
+    if max(std_tpr) > 0:
+        tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
+        tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
+        plt.fill_between(
+            mean_fpr,
+            tprs_lower,
+            tprs_upper,
+            color='grey',
+            alpha=0.3,
+            label=r'$\pm$ 1 std. dev.'
+        )
+    # roc standard deviation
+
+    plt.xlim([-0.05, 1.05])
+    plt.ylim([-0.05, 1.05])
+    plt.xlabel('False Positive Rate')
+    plt.ylabel('True Positive Rate')
+    plt.title(f'{title_start} ROC curve')
+    if fprs is not None:
+        for rate_index in range(len(fprs)):
+            plt.plot(fprs[rate_index], tprs[rate_index], alpha=0.15)
+    # individual splits' roc
+
+    plt.plot([0, 1], [0, 1], linestyle='--')
+    # randomness line
+
+    plt.legend(loc='lower right')
+    # legend location
+
+    mc.write_plot(save_to, 'roc')
+    # saving plot
+
+    mean_recall = np.mean(interp_recall, axis=0)
+    mean_auc = np.mean(pr_aucs)
+    std_auc = np.std(pr_aucs)
+    plt.plot(
+        mean_fpr,
+        mean_recall,
+        color='b',
+        label=r'PR AUC = %0.2f ' % (round(mean_auc, 2)) + (
+            r'$\pm$ %0.2f' % (round(std_auc, 2)) if trained_on == tested_on else ''),
+        alpha=0.8
+    )
+    # pr curve
+    std_recall = np.std(interp_recall, axis=0)
+    if max(std_recall) > 0:
+        recalls_upper = np.minimum(mean_recall + std_recall, 1)
+        recalls_lower = np.maximum(mean_recall - std_recall, 0)
+        plt.fill_between(
+            mean_fpr,
+            recalls_lower,
+            recalls_upper,
+            color='grey',
+            alpha=0.3,
+            label=r'$\pm$ 1 std. dev.'
+        )
+    # pr standard deviation
+
+    plt.xlim([-0.05, 1.05])
+    plt.ylim([-0.05, 1.05])
+    plt.xlabel('Recall')
+    plt.ylabel('Precision')
+    plt.title(f'{title_start} PR Curve')
+
+    if precisions is not None:
+        for rate_index in range(len(precisions)):
+            plt.plot(recalls[rate_index], precisions[rate_index], alpha=0.15)
+
+    plt.legend(loc='lower left')
+    # legend location
+
+    mc.write_plot(save_to, 'precision-recall')
+    # saving plot
+
+    if feature_importances is not None:
+        feature_importances.sort_values(by='Importance', ascending=False, inplace=True)
+        important_features = feature_importances[:20]
+        plt.barh(important_features.index.values, important_features['Importance'])
+        plt.gca().invert_yaxis()
+        plt.xlabel('Importance')
+        plt.ylabel('Feature')
+        plt.title(f'{title_start} Feature Importance')
+        mc.write_plot(save_to, 'feature_importance')
+    # plotting feature importance
+    if shap_values is not None:
+        important_features = shap_values[:20]
+        plt.barh(important_features.index.values, important_features['avg_shap_value'])
+        plt.gca().invert_yaxis()
+        plt.xlabel('Impact on model output')
+        plt.ylabel('Feature')
+        plt.title(f'{title_start} SHAP Values')
+        mc.write_plot(save_to, 'shap_values')
+    # plotting shap values
+
+
+def general_plots(positive_uniques):
+    """
+    Plots comparing the models' performances
+    :param positive_uniques: The positive diseases' IDs in the input data
+    :return: Plots exported to the filesystem (default: "export/GENERAL PLOTS")
+    """
+    save_to = mc.join_save('GENERAL PLOTS')
+    mc.make_dir(save_to)
+    model_aucs = []
+    for model_name in mc.model_names:
+        mean_auc = np.mean(
+            np.genfromtxt(
+                mc.join_save(os.path.join('A', 'A', model_name, 'data', 'aucs.csv'))))
+        model_aucs.append([model_name, mean_auc, 'A', 'A', True])
+    for tested_on in positive_uniques:
+        for trained_on in positive_uniques:
+            for model_name in mc.model_names:
+                mean_auc = np.mean(np.genfromtxt(
+                    mc.join_save(os.path.join(tested_on, trained_on, model_name, 'data', 'aucs.csv'))))
+                model_aucs.append([model_name, mean_auc, trained_on, tested_on, trained_on == tested_on])
+    model_aucs_df = pd.DataFrame(model_aucs, columns=['Model', 'AUC', 'Train Data', 'Test Data', 'Train/Test equality'])
+    average_model_aucs = list()
+    for model_name in mc.model_names:
+        average_model_aucs.append(np.mean(model_aucs_df['AUC'].loc[model_aucs_df['Model'] == model_name]))
+    order_by = [mc.model_names[i] for i in np.argsort(average_model_aucs)[::-1]]
+    sns.catplot(data=model_aucs_df, x='Model', y='AUC', kind='swarm', order=order_by, hue='Train/Test equality')
+    sns.boxplot(data=model_aucs_df, x='Model', y='AUC', order=order_by, showcaps=False, boxprops={'facecolor': 'None'},
+                whiskerprops={'linewidth': 0})
+    plt.xticks(fontsize=8)
+    plt.title('Model comparison (all validations)')
+    mc.write_plot(save_to, 'model_comparison_auc')
+    np.savetxt(os.path.join(save_to, 'average_model_aucs.csv'),
+               np.array([mc.model_names, average_model_aucs]),
+               delimiter=',', fmt='%s')
+
+    max_auc_annot = model_aucs_df[['Train Data', 'Test Data', 'AUC']].copy().fillna(0)
+    max_auc_annot = max_auc_annot.loc[max_auc_annot['Train Data'] != 'A'].loc[max_auc_annot['Test Data'] != 'A']
+    max_auc_annot = max_auc_annot.loc[max_auc_annot.groupby(['Train Data', 'Test Data'])['AUC'].idxmax()]
+    max_auc_annot = max_auc_annot.pivot(index='Train Data', columns='Test Data', values='AUC')
+
+    max_auc = model_aucs_df[['Train Data', 'Test Data', 'AUC']].copy().dropna()
+    max_auc = max_auc.loc[max_auc['Train Data'] != 'A'].loc[max_auc['Test Data'] != 'A']
max_auc = max_auc.loc[max_auc.groupby(['Train Data', 'Test Data'])['AUC'].idxmax()] + max_auc = max_auc.pivot(index='Train Data', columns='Test Data', values='AUC') + + sns.heatmap(max_auc, cmap='viridis', vmin=0.4, vmax=1) + for i, col in enumerate(max_auc.columns): + for j, row in enumerate(max_auc.columns): + plt.annotate(str(round(max_auc_annot[row][col], 2) if max_auc_annot[row][col] != 0 else 'NaN'), + xy=(j + 0.5, i + 0.5), + ha='center', va='center', color='white' if max_auc_annot[row][col] != 0 else 'black') + plt.title('AUCs of most accurate model for each train/test data') + mc.write_plot(save_to, 'correlation_heatmap_auc') + + max_auc = model_aucs_df[['Train Data', 'Test Data', 'Model', 'AUC']].copy().dropna() + max_auc = max_auc.loc[max_auc['Train Data'] != 'A'].loc[max_auc['Test Data'] != 'A'] + max_auc = max_auc.loc[max_auc.groupby(['Train Data', 'Test Data'])['AUC'].idxmax()] + max_auc = max_auc.drop('AUC', axis=1).pivot(index='Train Data', columns='Test Data', values='Model') + sns.heatmap(max_auc.replace(order_by[::-1], list(range(1, 6))), cbar=False, cmap='tab10') + for i, col in enumerate(max_auc.columns): + for j, row in enumerate(max_auc.columns): + plt.annotate( + (mc.model_names_dict[max_auc[row][col]] if not str( + max_auc[row][col]) == 'nan' else 'N/A'), + xy=(j + 0.5, i + 0.5), + ha='center', va='center', color='white' if not str(max_auc[row][col]) == 'nan' else 'black') + plt.title('Most accurate model for each train/test data (AUC)') + mc.write_plot(save_to, 'correlation_heatmap_models_auc') + + model_pr_aucs = [] + for model_name in mc.model_names: + mean_pr_auc = np.mean( + np.genfromtxt( + mc.join_save(os.path.join('A', 'A', model_name, 'data', 'pr_aucs.csv')))) + model_pr_aucs.append([model_name, mean_pr_auc, 'A', 'A', True]) + for tested_on in positive_uniques: + for trained_on in positive_uniques: + for model_name in mc.model_names: + mean_pr_auc = np.mean(np.genfromtxt( + mc.join_save( + os.path.join(tested_on, trained_on, model_name, 'data', 'pr_aucs.csv')))) + model_pr_aucs.append([model_name, mean_pr_auc, trained_on, tested_on, trained_on == tested_on]) + model_pr_aucs_df = pd.DataFrame(model_pr_aucs, + columns=['Model', 'PR AUC', 'Train Data', 'Test Data', 'Train/Test equality']) + average_model_pr_aucs = list() + for model_name in mc.model_names: + average_model_pr_aucs.append(np.mean(model_pr_aucs_df['PR AUC'].loc[model_pr_aucs_df['Model'] == model_name])) + pr_order_by = [mc.model_names[i] for i in np.argsort(average_model_pr_aucs)[::-1]] + sns.catplot(data=model_pr_aucs_df, x='Model', y='PR AUC', kind='swarm', order=pr_order_by, + hue='Train/Test equality') + sns.boxplot(data=model_pr_aucs_df, x='Model', y='PR AUC', order=pr_order_by, showcaps=False, + boxprops={'facecolor': 'None'}, + whiskerprops={'linewidth': 0}) + plt.xticks(fontsize=8) + plt.title('Model comparison (all validations)') + mc.write_plot(save_to, 'model_comparison_pr_auc') + np.savetxt(os.path.join(save_to, 'average_model_pr_aucs.csv'), + np.array([mc.model_names, average_model_pr_aucs]), + delimiter=',', fmt='%s') + + max_pr_auc_annot = model_pr_aucs_df[['Train Data', 'Test Data', 'PR AUC']].copy().fillna(0) + max_pr_auc_annot = max_pr_auc_annot.loc[max_pr_auc_annot['Train Data'] != 'A'].loc[ + max_pr_auc_annot['Test Data'] != 'A'] + max_pr_auc_annot = max_pr_auc_annot.loc[max_pr_auc_annot.groupby(['Train Data', 'Test Data'])['PR AUC'].idxmax()] + max_pr_auc_annot = max_pr_auc_annot.pivot(index='Train Data', columns='Test Data', values='PR AUC') + + max_pr_auc = 
model_pr_aucs_df[['Train Data', 'Test Data', 'PR AUC']].copy().dropna() + max_pr_auc = max_pr_auc.loc[max_pr_auc['Train Data'] != 'A'].loc[max_pr_auc['Test Data'] != 'A'] + max_pr_auc = max_pr_auc.loc[max_pr_auc.groupby(['Train Data', 'Test Data'])['PR AUC'].idxmax()] + max_pr_auc = max_pr_auc.pivot(index='Train Data', columns='Test Data', values='PR AUC') + + sns.heatmap(max_pr_auc, cmap='viridis', vmin=0.4, vmax=1) + for i, col in enumerate(max_pr_auc.columns): + for j, row in enumerate(max_pr_auc.columns): + plt.annotate(str(round(max_pr_auc_annot[row][col], 2) if max_pr_auc_annot[row][col] != 0 else 'NaN'), + xy=(j + 0.5, i + 0.5), + ha='center', va='center', color='white' if max_pr_auc_annot[row][col] != 0 else 'black') + plt.title('PR AUCs of most accurate model for each train/test data') + mc.write_plot(save_to, 'correlation_heatmap_pr_auc') + + max_pr_auc = model_pr_aucs_df[['Train Data', 'Test Data', 'Model', 'PR AUC']].copy().dropna() + max_pr_auc = max_pr_auc.loc[max_pr_auc['Train Data'] != 'A'].loc[max_pr_auc['Test Data'] != 'A'] + max_pr_auc = max_pr_auc.loc[max_pr_auc.groupby(['Train Data', 'Test Data'])['PR AUC'].idxmax()] + max_pr_auc = max_pr_auc.drop('PR AUC', axis=1).pivot(index='Train Data', columns='Test Data', values='Model') + sns.heatmap(max_pr_auc.replace(order_by[::-1], list(range(1, 6))), cbar=False, cmap='tab10') + for i, col in enumerate(max_pr_auc.columns): + for j, row in enumerate(max_pr_auc.columns): + plt.annotate( + (mc.model_names_dict[max_pr_auc[row][col]] if not str( + max_pr_auc[row][col]) == 'nan' else 'N/A'), + xy=(j + 0.5, i + 0.5), + ha='center', va='center', color='white' if not str(max_pr_auc[row][col]) == 'nan' else 'black') + plt.title('Most accurate model for each train/test data (PR AUC)') + mc.write_plot(save_to, 'correlation_heatmap_models_pr_auc') diff --git a/modelcomp/read.py b/modelcomp/read.py new file mode 100644 index 0000000..32e5025 --- /dev/null +++ b/modelcomp/read.py @@ -0,0 +1,62 @@ +import numpy as np +import os +import pandas as pd + +import modelcomp as mcp + + +def read_data(read_from_unjoined): + """ + Imports data from the filesystem + :param read_from_unjoined: + :return: interpolation of tprs, fprs, tprs, interpolation of recalls, precisions, recalls, aucs, pr aucs, feature importances and shap values) + """ + save_to = mcp.join_save(os.path.join(read_from_unjoined, 'data')) + interp_tpr = [] + interp_recall = [] + fprs = [] + tprs = [] + precisions = [] + recalls = [] + count = 0 + while os.path.exists(os.path.join(save_to, 'interp_tpr' + str(count) + '.csv')): + interp_tpr.append(np.genfromtxt(os.path.join(save_to, 'interp_tpr' + str(count) + '.csv'), delimiter=',')) + count += 1 + count = 0 + while os.path.exists(os.path.join(save_to, 'interp_recall' + str(count) + '.csv')): + interp_recall.append(np.genfromtxt(os.path.join(save_to, 'interp_recall' + str(count) + '.csv'), delimiter=',')) + count += 1 + count = 0 + aucs = np.genfromtxt(os.path.join(save_to, 'aucs.csv'), delimiter=',') + pr_aucs = np.genfromtxt(os.path.join(save_to, 'pr_aucs.csv'), delimiter=',') + if os.path.exists(os.path.join(save_to, 'feature_importance.csv')): + feature_importances = pd.read_csv(os.path.join(save_to, 'feature_importance.csv'), index_col=0) + else: + feature_importances = None + if os.path.exists(os.path.join(save_to, 'shap_values.csv')): + shap_values = pd.read_csv(os.path.join(save_to, 'shap_values.csv'), index_col=0) + else: + shap_values = None + while os.path.exists(os.path.join(save_to, 'fpr' + str(count) + '.csv')): + 
fprs.append(np.genfromtxt(os.path.join(save_to, 'fpr' + str(count) + '.csv'), delimiter=',')) + count += 1 + count = 0 + while os.path.exists(os.path.join(save_to, 'tpr' + str(count) + '.csv')): + tprs.append(np.genfromtxt(os.path.join(save_to, 'tpr' + str(count) + '.csv'), delimiter=',')) + count += 1 + count = 0 + while os.path.exists(os.path.join(save_to, 'precision' + str(count) + '.csv')): + precisions.append(np.genfromtxt(os.path.join(save_to, 'precision' + str(count) + '.csv'), delimiter=',')) + count += 1 + count = 0 + while os.path.exists(os.path.join(save_to, 'recall' + str(count) + '.csv')): + recalls.append(np.genfromtxt(os.path.join(save_to, 'recall' + str(count) + '.csv'), delimiter=',')) + count += 1 + for list_data in [interp_tpr, interp_recall, fprs, tprs, precisions, recalls]: + if len(list_data) == 0: + list_data = None + elif len(list_data) == 1: + list_data = list_data[0] + for array_data in [aucs, pr_aucs]: + if len(array_data.shape) == 0: array_data = array_data.item() + return interp_tpr, fprs, tprs, interp_recall, precisions, recalls, aucs, pr_aucs, feature_importances, shap_values diff --git a/modelcomp/utilities.py b/modelcomp/utilities.py new file mode 100644 index 0000000..7fa6bb5 --- /dev/null +++ b/modelcomp/utilities.py @@ -0,0 +1,215 @@ +import os +import pandas as pd +import random +import xgboost +from sklearn import svm +from sklearn.ensemble import RandomForestClassifier +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import RepeatedKFold, train_test_split + +import modelcomp.models + +model_names = ['Random Forest', 'XGBoost', 'Logistic Regression', 'SVM', 'k-NN'] +model_names_short = ['RF', 'XGB', 'LR', 'SVM', 'k-NN'] +model_names_dict = None + + +def remove_falsy_columns(data): + """ + Remove columns that contain all false values from df + :param data: a dataframe to remove columns from + :return: the dataframe after filtering + """ + return data.loc[:, data.any()] + + +def split_data(df, drop): + """ + Splits the data to a train and test set + :param df: the dataframe to split + :param drop: the column name to drop + :return: train set, train label, test set, test label + """ + rf_target = pd.DataFrame(df[drop]) + X = df.drop('SampleID', axis=1, errors='ignore').drop(drop, axis=1, errors='ignore') + # drops id column and default name from df + X_train, X_test, y_train, y_test = train_test_split(X, rf_target.values.ravel(), test_size=0.2) + # splitting the df to train and test values + return X_train, X_test, y_train, y_test + + +def split_array(arr, test_size=0.2): + """ + Splits an array to a train and test list + :param arr: the array to split + :param test_size: the test size to split into (default: 0.2) + :return: the split array (train data, test data) + """ + random.shuffle(arr) + return arr[:int(len(arr) * (1 - test_size))], arr[int(0 - len(arr) * test_size):] + # returns a split of a df + + +def get_feature_importance(model): + """ + Gets the feature importance of a trained model + :param model: the trained model (to get a return value needs to be random forest, xgboost, lr or svm) + :return: + """ + if type(model) in [RandomForestClassifier, xgboost.XGBClassifier]: + return model.feature_importances_ + if type(model) in [LogisticRegression, svm.SVC]: + return model.coef_ + + +def merge_dfs(df1, df2, merged_column='SampleID'): + """ + Merges 2 dataframes on 1 column + :param df1: first dataframe + :param df2: second dataframe + :param merged_column: the column to merge on (default: 'SampleID') + :return: 
the 2 dataframes merged + """ + return df1.merge(df2, on=merged_column) + + +def relative_abundance(data): + """ + Calculates the relative abundance of values in a dataframe + :param data: the dataframe to calculate the relative abundance on + :return: the filtered dataframe + """ + relative_data = data + columns_list = relative_data.columns.values.tolist() + columns_list.remove('SampleID') + # list of data column names (except sample ids) + columns_sums = relative_data[columns_list].sum() + # list of column value sums + for row in range(len(columns_list)): + relative_data[columns_list[row]] /= columns_sums[row] + # dividing each element in all columns by the sum of the column + + return relative_data + + +def get_label_indexes(df, label, column_name='PatientGroup'): + """ + Returns the indexes that have the specified label + :param df: the dataframe to check + :param label: the label to return indexes of + :param column_name: the column name that contains the labels (default: 'PatientGroup') + :return: the filtered dataframe + """ + return list(df[df[column_name] == int(label)].index.values) + + +def get_k_fold(seed=None, splits=5, repeats=5): + """ + Returns a repeated k-fold with the specified seed + :param seed: set seed for model randomness (default: None) + :param splits: number of splits for the validation (default: 5) + :param repeats: number of repeats for the validation (default: 5) + :return: a RepeatedKFold object + """ + return RepeatedKFold(random_state=seed, n_splits=splits, n_repeats=repeats) if seed is not None else RepeatedKFold( + n_splits=splits, n_repeats=repeats) + + +def get_models(seed_all=None, seed_dict=None): + return [modelcomp.models.random_forest.get(seed=( + seed_all if seed_all != None else seed_dict[model_names_short[0]] if seed_dict is not None and + model_names_short[ + 0] in seed_dict else None)), + modelcomp.models.xgboost.get(seed=( + seed_all if seed_all != None else seed_dict[model_names_short[1]] if seed_dict is not None and + model_names_short[ + 1] in seed_dict else None)), + modelcomp.models.logistic_regression.get(seed=( + seed_all if seed_all != None else seed_dict[model_names_short[2]] if seed_dict is not None and + model_names_short[ + 2] in seed_dict else None)), + modelcomp.models.svm.get(seed=( + seed_all if seed_all != None else seed_dict[model_names_short[3]] if seed_dict is not None and + model_names_short[ + 3] in seed_dict else None)), + modelcomp.models.knn.get() + ] + + +def filter_data(abundance, meta, control, disease_list=list(), disease1='', disease2=''): + df_filtered_meta = meta.loc[meta['PatientGroup'].isin(disease_list + [control] + [disease1] + [disease2])] + # filtering meta by parameters + df_filtered_abundance = abundance.loc[abundance['SampleID'].isin(df_filtered_meta['SampleID'])] + # filtering abundance by columns that are in filtered meta (SampleID) + df_new_filtered_meta = df_filtered_meta.copy() + if len(disease_list) != 0: + df_new_filtered_meta['PatientGroup'] = df_new_filtered_meta['PatientGroup'].apply( + lambda x: 0 if x == control else 1) + # mapping falsy values to 0 and truthy values to 1 + else: + df_new_filtered_meta['PatientGroup'] = df_new_filtered_meta['PatientGroup'].apply( + lambda x: 0 if x == control else 1 if x == disease1 else 2) + # mapping values to 0/1/2 based on diseases + df_new_filtered_meta['PatientGroup'] = df_new_filtered_meta['PatientGroup'].astype(int) + # ensure the column is of integer type + return df_filtered_abundance, df_new_filtered_meta + + +def remove_rare_species(df, 
prevalence_cutoff=0.1, avg_abundance_cutoff=0.005): + filt_df = df.drop('SampleID', axis=1) if 'SampleID' in df.columns else df + n_samples = df.shape[0] + n_taxa_before_filter = df.shape[1] + + # Prevalence calculations (number of non-zero values per feature) + frequencies = (filt_df > 0).sum(axis=0) / n_samples + filt_df = filt_df.loc[:, frequencies > prevalence_cutoff] + + # Average abundance calculations + avg_abundances = filt_df.sum(axis=0) / n_samples + filt_df = filt_df.loc[:, avg_abundances > avg_abundance_cutoff] + + filt_df['SampleID'] = df['SampleID'] + return filt_df + + +def remove_string_columns(df): + return df.drop('SampleID', axis=1, errors='ignore') + + +def make_dir(directory): + if not os.path.exists(directory): + os.makedirs(directory) + + +def join_save(add=None): + save_to = 'export' + return os.path.join(save_to, add) if add is not None else save_to + + +def data_to_filename(test_class, model_name, trained_on=None): + test_class = 'A' if type(test_class) is list else test_class + trained_on = 'A' if type(trained_on) is list else trained_on + # mapping test and train classes to all disease if they are more than 1 disease + return os.path.join(test_class, + trained_on if trained_on is not None else test_class, + model_name) + + +def filename_to_data(filename): + cwd = os.getcwd() + os.chdir(filename) + os.chdir('..') + + model_name = os.path.basename(os.getcwd()) + # getting the model name by directory + os.chdir('..') + + trained_on = os.path.basename(os.getcwd()) + # getting the train class by directory + os.chdir('..') + + tested_on = os.path.basename(os.getcwd()) + # getting the test class by directory + + os.chdir(cwd) + return model_name, str(trained_on).lower(), str(tested_on).lower() diff --git a/modelcomp/write.py b/modelcomp/write.py new file mode 100644 index 0000000..cded83e --- /dev/null +++ b/modelcomp/write.py @@ -0,0 +1,75 @@ +import matplotlib.pyplot as plt +import numpy as np +import os +import pandas as pd + +import modelcomp as mcp + + +def write_data(save_to_unjoined, label=None, interp_tpr=None, interp_recall=None, aucs=None, pr_aucs=None, fprs=None, + tprs=None, + precisions=None, recalls=None, feature_importances=None, shap_values=None): + """ + Writes data to the filesystem + :param save_to_unjoined: location to save to + :param label: label names + :param interp_tpr: interpolation of tprs + :param interp_recall: interpolation of recalls + :param aucs: aucs + :param pr_aucs: pr aucs + :param fprs: fprs + :param tprs: tprs + :param precisions: precisions + :param recalls: recalls + :param feature_importances: feature importances + :param shap_values: shap values + :return: data exported to the filesystem (default: "export////data") + """ + save_to = os.path.join('export', save_to_unjoined, 'data') + mcp.make_dir(save_to) + if interp_tpr is not None: + for index, value in enumerate([interp_tpr] if type(interp_tpr) is np.ndarray else interp_tpr): + np.savetxt(os.path.join(save_to, 'interp_tpr' + str(index) + '.csv'), value, delimiter=',') + if interp_recall is not None: + for index, value in enumerate([interp_recall] if type(interp_recall) is np.ndarray else interp_recall): + np.savetxt(os.path.join(save_to, 'interp_recall' + str(index) + '.csv'), value, delimiter=',') + if aucs is not None: + np.savetxt(os.path.join(save_to, 'aucs.csv'), aucs, delimiter=',') + if pr_aucs is not None: + np.savetxt(os.path.join(save_to, 'pr_aucs.csv'), pr_aucs, delimiter=',') + if feature_importances is not None: + feature_importances.to_csv(os.path.join(save_to, 
'feature_importance.csv')) + if shap_values is not None: + shap_columns = pd.DataFrame(shap_values, columns=label) + vals = np.abs(shap_columns.values).mean(0) + avg_shap = pd.DataFrame(list(zip(label, vals)), + columns=['col_name', 'avg_shap_value']) + avg_shap.sort_values(by=['avg_shap_value'], + ascending=False, inplace=True) + avg_shap.to_csv(os.path.join(save_to, 'shap_values.csv'), index=False) + if fprs is not None: + for index, value in enumerate(fprs): + np.savetxt(os.path.join(save_to, 'fpr' + str(index) + '.csv'), value, delimiter=',') + if tprs is not None: + for index, value in enumerate(tprs): + np.savetxt(os.path.join(save_to, 'tpr' + str(index) + '.csv'), value, delimiter=',') + if precisions is not None: + for index, value in enumerate(precisions): + np.savetxt(os.path.join(save_to, 'precision' + str(index) + '.csv'), value, delimiter=',') + if recalls is not None: + for index, value in enumerate(recalls): + np.savetxt(os.path.join(save_to, 'recall' + str(index) + '.csv'), value, delimiter=',') + + +def write_plot(save_to, img_name): + """ + Export the current plt state to the filesystem + :param save_to: location to save to + :param img_name: desired file name + :return: the current plt state exported to the filesystem + """ + dir = save_to + mcp.make_dir(dir) + plt.savefig(os.path.join(dir, img_name + '.png'), bbox_inches='tight') + # saving plot + plt.clf() diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..62d821d --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,44 @@ +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" + +[project] +name = "modelcomp" +version = "0.0.1a1" +authors = [ + { name = "Gil Ramot", email = "gilramot3@gmail.com" } +] +maintainers = [ + { name = "Gil Ramot", email = "gilramot3@gmail.com" } +] +description = "Compare between performances of machine learning models with ease." +readme = "README.md" +requires-python = ">=3.6" +license = {file = "LICENSE"} +classifiers = [ + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", +] +dependencies = [ + "matplotlib", + "numpy", + "pandas", + "scikit-learn", + "shap", + "xgboost", + "seaborn" +] + +[project.urls] +Repository = "https://github.com/gilramot/modelcomp" +Issues = "https://github.com/gilramot/modelcomp/issues" +Appendix = "https://github.com/gilramot/modelcomp-appendix" diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..f3f73e5 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,7 @@ +matplotlib +numpy +pandas +scikit-learn +shap +xgboost +seaborn \ No newline at end of file