From e991de176e12207ad85331ff83a2bbdffce162a8 Mon Sep 17 00:00:00 2001 From: Lu Zhu Date: Mon, 6 May 2024 23:15:46 -0400 Subject: [PATCH] remove unused code --- auroris/curation/_curator.py | 2 +- auroris/curation/actions/_discretize.py | 15 --- auroris/curation/actions/_distribution.py | 3 +- auroris/report/broadcaster/_html.py | 7 -- auroris/types.py | 1 - auroris/utils.py | 1 - auroris/visualization/_distribution.py | 140 +--------------------- 7 files changed, 5 insertions(+), 164 deletions(-) diff --git a/auroris/curation/_curator.py b/auroris/curation/_curator.py index c42b193..d0b16e3 100644 --- a/auroris/curation/_curator.py +++ b/auroris/curation/_curator.py @@ -46,7 +46,7 @@ def transform(self, dataset: pd.DataFrame) -> Tuple[pd.DataFrame, CurationReport dataset = dataset.copy(deep=True) for action in self.steps: logger.info(f"Performing step: {action.name}") - if action._dep_action and not action._dep_action in self.state: + if action._dep_action and action._dep_action not in self.state: raise RuntimeError(f"{action._dep_action} should be called before {action.name}.") with report.section(action.name): kwargs = {} diff --git a/auroris/curation/actions/_discretize.py b/auroris/curation/actions/_discretize.py index 7780fd7..d5be312 100644 --- a/auroris/curation/actions/_discretize.py +++ b/auroris/curation/actions/_discretize.py @@ -7,7 +7,6 @@ from auroris.curation.actions._base import BaseAction from auroris.report import CurationReport from auroris.types import VerbosityLevel -from auroris.visualization._distribution import detailed_distributions_plots def discretize( @@ -105,19 +104,5 @@ def transform( if report is not None: report.log_new_column(column_name) - # sections = [] - # low = -np.inf - # high = np.inf - - # for i, threshold in enumerate(self.thresholds + [high]): - # if self.label_order == "descending": - # i = len(self.thresholds) - i - # pct = 100 * sum(X == i) / len(X) - # sections.append( - # {"label": f"{column_name} = {i}: {pct:.1f} %", "start": low, "end": threshold, "pct": pct} - # ) - # low = threshold - # fig = detailed_distributions_plots(data=dataset[self.input_column], label_name=self.input_column, sections=sections) - # report.log_image(fig, title="Data class distribution") return dataset diff --git a/auroris/curation/actions/_distribution.py b/auroris/curation/actions/_distribution.py index c0e5310..e2cddd3 100644 --- a/auroris/curation/actions/_distribution.py +++ b/auroris/curation/actions/_distribution.py @@ -1,12 +1,11 @@ from typing import Dict, List, Optional import pandas as pd -from pydantic import Field, PrivateAttr +from pydantic import Field import numpy as np from auroris.curation.actions._base import BaseAction from auroris.report import CurationReport from auroris.types import VerbosityLevel -from auroris.curation.actions._discretize import Discretization from auroris.visualization import detailed_distributions_plots diff --git a/auroris/report/broadcaster/_html.py b/auroris/report/broadcaster/_html.py index be2346a..09beb99 100644 --- a/auroris/report/broadcaster/_html.py +++ b/auroris/report/broadcaster/_html.py @@ -1,10 +1,6 @@ -import io import fsspec import base64 from typing import Optional -from PIL import Image as PILImage -from PIL.Image import Image as ImageType -from IPython.core.display import Image as IPy_Image import datamol as dm from auroris.report import CurationReport, Section @@ -121,9 +117,6 @@ def on_report_start(self, report: CurationReport):

Version: {report.auroris_version}

""" ) - # self._file.write("

Curation Report

") - # self._file.write(f"

Time: {report.time_stamp.strftime('%Y-%m-%d %H:%M:%S')}

") - # self._file.write(f"

Version: {report.auroris_version}

") def on_section_start(self, section: Section): self._file.write(f"

{section.title}

") diff --git a/auroris/types.py b/auroris/types.py index 9879e68..013135c 100644 --- a/auroris/types.py +++ b/auroris/types.py @@ -1,5 +1,4 @@ from enum import IntEnum -from PIL.Image import Image class VerbosityLevel(IntEnum): diff --git a/auroris/utils.py b/auroris/utils.py index f56a421..c5ed03b 100644 --- a/auroris/utils.py +++ b/auroris/utils.py @@ -3,7 +3,6 @@ from PIL import Image from PIL.Image import Image as ImageType from sklearn.utils.multiclass import type_of_target -import matplotlib.pyplot as plt from matplotlib.backends.backend_agg import FigureCanvasAgg from io import BytesIO diff --git a/auroris/visualization/_distribution.py b/auroris/visualization/_distribution.py index 4282b00..d99d8a1 100644 --- a/auroris/visualization/_distribution.py +++ b/auroris/visualization/_distribution.py @@ -1,17 +1,14 @@ -from typing import Callable, Dict, List, Optional, Tuple +from typing import List, Optional import numpy as np import pandas as pd import seaborn as sns from loguru import logger from scipy import stats +import matplotlib.pyplot as plt from auroris.visualization.utils import create_figure -import seaborn as sns -import matplotlib.pyplot as plt -import numpy as np - def detailed_distributions_plots( data: pd.DataFrame, label_name: str, sections: Optional[List[dict]] = None, log_scale: bool = False @@ -33,7 +30,7 @@ def detailed_distributions_plots( logger.exception(e) if log_scale: logger.exception( - f"The current error is likely due to the `log_scale` was enabled. Please disable the `log_scale` and try again." + "The current error is likely due to the `log_scale` was enabled. Please disable the `log_scale` and try again." ) # Fill the sections under the KDE curve @@ -51,137 +48,6 @@ def detailed_distributions_plots( return fig.figure -# def detailed_distributions_plots( -# df: pd.DataFrame, -# thresholds: Optional[Dict[str, Tuple[int, Callable]]] = None, -# discretizer: Optional[callable] = None, -# label_names: List[str] = None, -# log_scale_mapping: Dict[str, bool] = None, -# positive_color: str = "#3db371", -# negative_color: str = "#a9a9a9", -# n_cols: int = 3, -# fig_base_size: float = 8, -# w_h_ratio: float = 0.5, -# legend_fontsize: int = 18, -# ticks_fontsize: int = 18, -# title_fontsize: int = 18, -# gridsize: int = 1000, -# dpi: int = 150, -# seaborn_theme: Optional[str] = "whitegrid", -# ): -# """Plot the detailed distribution of the columns in `df`. Also, color the part of the -# "positive" distribution using `thresholds`. - -# Args: -# df: A dataframe with binarized readouts only. NaN are allowed. -# thresholds: A dict mapping of the `df` column. Value is a tuple where the first -# element is the threshold value and the second element is a callable deciding wether -# a datapoint meets the criterai or not (something like `np.less` or np.greater`). -# label_names: Name of the labels (same order as the columns in `df`). If not set -# the name of the columns are used. -# log_scale_mapping: A dict mapping of the `df` column. If True, -# the plot for this readout will be log scaled. -# positive_color: Color for `True` or `1`. -# negative_color: Color for `False` or `0`. -# n_cols: Number of columns in the subplots. -# fig_base_size: Base size of the plots. -# w_h_ratio: Width/height ratio. -# legend_fontsize: Font size of the legend. -# ticks_fontsize: Font size of the x ticks and x label. -# title_fontsize: Font size of the title. -# gridsize: Gridsize for the kernel density estimate (KDE). -# dpi: DPI value of the figure. -# seaborn_theme: Seaborn theme. -# """ - -# # NOTE: the `thresholds` API is not super nice, consider an alternative. -# # NOTE: we could eventually add support for multiclass here if we need it. -# if thresholds is None: -# thresholds = {} - -# if log_scale_mapping is None: -# log_scale_mapping = {} - -# if label_names is None: -# label_names = df.columns.tolist() - -# # Check all columns are numeric -# numerics = df.apply(lambda x: x.dtype.kind in "biufc") -# if not numerics.all(): -# raise ValueError(f"Not all columns are numeric: {numerics[~numerics].to_dict()}") - -# n_plots = len(df.columns) - -# # Create the figure -# with create_figure( -# n_plots=n_plots, -# n_cols=n_cols, -# dpi=dpi, -# fig_base_size=fig_base_size, -# w_h_ratio=w_h_ratio, -# seaborn_theme=seaborn_theme, -# ) as (fig, axes): -# for ax, readout, label_name in zip(axes, df.columns, label_names): -# values = df[readout].dropna() - -# # Get threshold value and function -# threshold_value, threshold_fn = None, None -# threshold = thresholds.get(readout, None) -# if threshold is not None: -# threshold_value, threshold_fn = threshold - -# # Whether to log scale -# log_scale = log_scale_mapping.get(readout, False) - -# # Draw distribution and kde plot -# kde_kws = {} -# kde_kws["clip"] = values.min(), values.max() -# kde_kws["gridsize"] = gridsize -# kplot = sns.histplot( -# values, -# kde=True, -# ax=ax, -# color=negative_color, -# kde_kws=kde_kws, -# log_scale=log_scale, -# ) - -# # Label -# ax.set_title(label_name, fontsize=title_fontsize) -# ax.set_xlabel(None) -# ax.set_ylabel("Count", fontsize=ticks_fontsize) - -# ax.xaxis.set_tick_params(labelsize=ticks_fontsize) -# ax.yaxis.set_tick_params(labelsize=ticks_fontsize) - -# if threshold_value is not None and threshold_fn is not None: -# # Fill between on active values -# x, y = kplot.get_lines()[0].get_data() -# ax.fill_between( -# x, -# y, -# where=threshold_fn(x, threshold_value), -# facecolor=positive_color, -# alpha=0.8, -# ) - -# # Active ratio text box -# positive_ratio = threshold_fn(values, threshold_value).sum() / len(values) * 100 -# ax.text( -# 0.85, -# 0.95, -# f"{positive_ratio:.1f} %", -# transform=ax.transAxes, -# fontsize=legend_fontsize, -# verticalalignment="top", -# bbox=dict(boxstyle="round", facecolor="wheat", alpha=0.5), -# ) -# else: -# logger.warning(f"Threshold not available for readout '{readout}'") - -# return fig - - def visualize_distribution_with_outliers( values: np.ndarray, is_outlier: Optional[List[bool]] = None,