diff --git a/proc_dash/app.py b/proc_dash/app.py index b758052..69e5d6a 100644 --- a/proc_dash/app.py +++ b/proc_dash/app.py @@ -8,28 +8,31 @@ from dash.dependencies import Input, Output, State from dash.exceptions import PreventUpdate +import proc_dash.plotting as plot import proc_dash.utility as util from dash import Dash, ctx, dash_table, dcc, html -app = Dash( - __name__, - external_stylesheets=["https://codepen.io/chriddyp/pen/bWLwgP.css"], -) +EMPTY_FIGURE_PROPS = {"data": [], "layout": {}, "frames": []} + +app = Dash(__name__, external_stylesheets=[dbc.themes.FLATLY]) app.layout = html.Div( children=[ html.H2(children="Neuroimaging Derivatives Status Dashboard"), + dcc.Store(id="memory"), dcc.Upload( id="upload-data", - children=html.Button("Drag and Drop or Select .csv File"), + children=dbc.Button( + "Drag and Drop or Select .csv File", color="secondary" + ), # TODO: Constrain click responsive area of button style={"margin-top": "10px", "margin-bottom": "10px"}, multiple=False, ), html.Div( id="output-data-upload", children=[ - html.H6(id="input-filename"), + html.H4(id="input-filename"), html.Div( children=[ html.Div(id="total-participants"), @@ -49,78 +52,165 @@ page_size=50, fixed_rows={"headers": True}, style_table={"height": "300px", "overflowY": "auto"}, - ), # TODO: Treat all columns as strings to standardize filtering syntax? 
+ style_cell={ + "fontSize": 13 # accounts for font size inflation by dbc theme + }, + ), + # NOTE: Could cast columns to strings for the datatable to standardize filtering syntax, + # but this results in undesirable effects (e.g., if there is session 1 and session 11, + # a query for "1" would return both) ], style={"margin-top": "10px", "margin-bottom": "10px"}, ), - dbc.Card( + dbc.Row( [ - # TODO: Put label and dropdown in same row - html.Div( - [ - dbc.Label("Filter by multiple sessions:"), - dcc.Dropdown( - id="session-dropdown", - options=[], - multi=True, - placeholder="Select one or more available sessions to filter by", - # TODO: Can set `disabled=True` here to prevent any user interaction before file is uploaded - ), - ] + dbc.Col( + dbc.Form( + [ + # TODO: Put label and dropdown in same row + html.Div( + [ + dbc.Label( + "Filter by multiple sessions:", + html_for="session-dropdown", + className="mb-0", + ), + dcc.Dropdown( + id="session-dropdown", + options=[], + multi=True, + placeholder="Select one or more available sessions to filter by", + # TODO: Can set `disabled=True` here to prevent any user interaction before file is uploaded + ), + ], + className="mb-2", # Add margin to keep dropdowns spaced apart + ), + html.Div( + [ + dbc.Label( + "Selection operator:", + html_for="select-operator", + className="mb-0", + ), + dcc.Dropdown( + id="select-operator", + options=[ + { + "label": "AND", + "value": "AND", + "title": "Show only participants with all selected sessions.", + }, + { + "label": "OR", + "value": "OR", + "title": "Show participants with any of the selected sessions.", + }, + ], + value="AND", + clearable=False, + # TODO: Can set `disabled=True` here to prevent any user interaction before file is uploaded + ), + ], + className="mb-2", + ), + ], + ) ), - html.Div( - [ - dbc.Label("Selection operator:"), - dcc.Dropdown( - id="select-operator", - options=[ - { - "label": "AND", - "value": "AND", - "title": "Show only participants with all 
selected sessions.", - }, - { - "label": "OR", - "value": "OR", - "title": "Show participants with any of the selected sessions.", - }, - ], - value="AND", - clearable=False, - # TODO: Can set `disabled=True` here to prevent any user interaction before file is uploaded + dbc.Col( + dbc.Card( + dbc.CardBody( + [ + html.H5( + "Legend: Processing status", + className="card-title", + ), + html.P( + children=util.construct_legend_str( + util.PIPE_COMPLETE_STATUS_SHORT_DESC + ), + style={ + "whiteSpace": "pre" # preserve newlines + }, + className="card-text", + ), + ] ), - ] + ) ), ] ), - ] + dbc.Row( + [ + # NOTE: Legend displayed for both graphs so that user can toggle visibility of status data + dbc.Col( + dcc.Graph( + id="fig-pipeline-status", style={"display": "none"} + ) + ), + dbc.Col( + dcc.Graph( + id="fig-pipeline-status-all-ses", + style={"display": "none"}, + ) + ), + ], + ), + ], + style={"padding": "10px 10px 10px 10px"}, ) @app.callback( [ - Output("interactive-datatable", "columns"), - Output("interactive-datatable", "data"), + Output("memory", "data"), Output("total-participants", "children"), Output("session-dropdown", "options"), ], [ Input("upload-data", "contents"), State("upload-data", "filename"), - Input("session-dropdown", "value"), - Input("select-operator", "value"), ], ) -def update_outputs(contents, filename, session_values, operator_value): +def process_bagel(contents, filename): + """ + From the contents of a correctly-formatted uploaded .csv file, parse and store the pipeline overview + data as a dataframe and update the session dropdown options and displayed total participants count. + Returns any errors encountered during input file processing as a user-friendly message. 
+ """ if contents is None: - return None, None, "Upload a CSV file to begin.", [] - - data, total_subjects, sessions, upload_error = util.parse_csv_contents( - contents=contents, filename=filename - ) + return None, "Upload a CSV file to begin.", [] + try: + data, total_subjects, sessions, upload_error = util.parse_csv_contents( + contents=contents, filename=filename + ) + except Exception: + upload_error = "Something went wrong while processing this file." if upload_error is not None: - return None, None, f"Error: {upload_error} Please try again.", [] + return None, f"Error: {upload_error} Please try again.", [] + + report_total_subjects = f"Total number of participants: {total_subjects}" + session_opts = [{"label": ses, "value": ses} for ses in sessions] + + return data.to_dict("records"), report_total_subjects, session_opts + + +@app.callback( + [ + Output("interactive-datatable", "columns"), + Output("interactive-datatable", "data"), + ], + [ + Input("memory", "data"), + Input("session-dropdown", "value"), + Input("select-operator", "value"), + ], +) +def update_outputs(parsed_data, session_values, operator_value): + if parsed_data is None: + return None, None + + data = pd.DataFrame.from_dict(parsed_data) if session_values: data = util.filter_by_sessions( @@ -128,13 +218,10 @@ def update_outputs(contents, filename, session_values, operator_value): session_values=session_values, operator_value=operator_value, ) - tbl_columns = [{"name": i, "id": i} for i in data.columns] tbl_data = data.to_dict("records") - tbl_total_subjects = f"Total number of participants: {total_subjects}" - session_opts = [{"label": ses, "value": ses} for ses in sessions] - return tbl_columns, tbl_data, tbl_total_subjects, session_opts + return tbl_columns, tbl_data @app.callback( @@ -171,13 +258,63 @@ def update_matching_participants(columns, virtual_data): State("upload-data", "filename"), prevent_initial_call=True, ) -def reset_table(contents, filename): - """If file contents change 
(i.e., new CSV uploaded), reset file name and filter selection values.""" +def reset_selections(contents, filename): +    """ +    If file contents change (i.e., selected new CSV for upload), reset displayed file name and dropdown filter +    selection values. Reset will occur regardless of whether there is an issue processing the selected file. +    """ if ctx.triggered_id == "upload-data": return f"Input file: {filename}", "", "" raise PreventUpdate +@app.callback( + [ + Output("fig-pipeline-status-all-ses", "figure"), + Output("fig-pipeline-status-all-ses", "style"), + ], + Input("memory", "data"), + prevent_initial_call=True, +) +def generate_overview_status_fig_for_participants(parsed_data): + """ + If new dataset uploaded, generate stacked bar plot of pipeline_complete statuses per session, + grouped by pipeline. Provides overview of the number of participants with each status in a given session, + per processing pipeline. + """ + if parsed_data is None: + return EMPTY_FIGURE_PROPS, {"display": "none"} + + return plot.plot_pipeline_status_by_participants( + pd.DataFrame.from_dict(parsed_data) + ), {"display": "block"} + + +@app.callback( + [ + Output("fig-pipeline-status", "figure"), + Output("fig-pipeline-status", "style"), + ], + Input( + "interactive-datatable", "data" + ), # Input not triggered by datatable frontend filtering + prevent_initial_call=True, +) +def update_overview_status_fig_for_records(data): + """ + When visible data in the overview datatable is updated (excluding built-in frontend datatable filtering + but including component filtering for multiple sessions), generate stacked bar plot of pipeline_complete + statuses aggregated by pipeline. Counts of statuses in plot thus correspond to unique records (unique + participant-session combinations). 
+ """ + if data is not None: + return plot.plot_pipeline_status_by_records( + pd.DataFrame.from_dict(data) + ), {"display": "block"} + + return EMPTY_FIGURE_PROPS, {"display": "none"} + + if __name__ == "__main__": app.run_server(debug=True) diff --git a/proc_dash/plotting.py b/proc_dash/plotting.py new file mode 100644 index 0000000..7c716f7 --- /dev/null +++ b/proc_dash/plotting.py @@ -0,0 +1,90 @@ +import pandas as pd +import plotly.express as px + +STATUS_CMAP = px.colors.qualitative.Bold +STATUS_COLORS = { + "SUCCESS": STATUS_CMAP[5], + "FAIL": STATUS_CMAP[9], + "INCOMPLETE": STATUS_CMAP[3], + "UNAVAILABLE": STATUS_CMAP[10], +} +PIPELINE_STATUS_ORDER = ["SUCCESS", "FAIL", "INCOMPLETE", "UNAVAILABLE"] +LAYOUTS = { + "margin": {"l": 30, "r": 30, "t": 60, "b": 30}, # margins of chart + "title": { # figure title position properties + "yref": "container", + "y": 1, + "yanchor": "top", + "pad": {"t": 20}, + }, +} + + +def transform_data_to_long(data: pd.DataFrame) -> pd.DataFrame: + return pd.melt( + data, + id_vars=["participant_id", "session"], + var_name="pipeline_name", + value_name="pipeline_complete", + ) + + +def plot_pipeline_status_by_participants(data: pd.DataFrame): + status_counts = ( + transform_data_to_long(data) + .groupby(["pipeline_name", "pipeline_complete", "session"]) + .size() + .reset_index(name="participants") + ) + + fig = px.bar( + status_counts, + x="session", + y="participants", + color="pipeline_complete", + text_auto=True, + facet_col="pipeline_name", + category_orders={"pipeline_complete": PIPELINE_STATUS_ORDER}, + color_discrete_map=STATUS_COLORS, + labels={ + "pipeline_name": "Pipeline", + "participants": "Participants (n)", + "pipeline_complete": "Processing status", + "session": "Session", + }, + title="Overview: Participant pipeline statuses by session", + ) + # Treat session labels as categorical in plot to avoid a continuous x-axis + fig.update_xaxes(type="category") + fig.update_layout(margin=LAYOUTS["margin"], 
title=LAYOUTS["title"]) + + return fig + + +def plot_pipeline_status_by_records(data: pd.DataFrame): + status_counts = ( + transform_data_to_long(data) + .groupby(["pipeline_name", "pipeline_complete"]) + .size() + .reset_index(name="records") + ) + + fig = px.bar( + status_counts, + x="pipeline_name", + y="records", + color="pipeline_complete", + text_auto=True, + category_orders={"pipeline_complete": PIPELINE_STATUS_ORDER}, + color_discrete_map=STATUS_COLORS, + labels={ + "pipeline_name": "Pipeline", + "records": "Records (n)", + "pipeline_complete": "Processing status", + }, + title="Selected sessions: Pipeline statuses of matching records (default: all)" + # alternative title: "Pipeline statuses of unique records for selected sessions (default: all)" + ) + fig.update_layout(margin=LAYOUTS["margin"], title=LAYOUTS["title"]) + + return fig diff --git a/proc_dash/utility.py b/proc_dash/utility.py index f0aad52..e9b75c4 100644 --- a/proc_dash/utility.py +++ b/proc_dash/utility.py @@ -7,6 +7,19 @@ import pandas as pd SCHEMAS_PATH = Path(__file__).absolute().parents[1] / "schemas" +PIPE_COMPLETE_STATUS_SHORT_DESC = { + "SUCCESS": "All stages of pipeline finished successfully (all expected output files present).", + "FAIL": "At least one stage of the pipeline failed.", + "INCOMPLETE": "Pipeline has not yet been run or at least one stage is unfinished/still running.", + "UNAVAILABLE": "Relevant data modality for pipeline not available.", +} + + +def construct_legend_str(status_desc: dict) -> str: + """From a dictionary, constructs a legend-style string with multiple lines in the format of key: value.""" + return "\n".join( + [f"{status}: {desc}" for status, desc in status_desc.items()] + ) def get_required_bagel_columns() -> list: @@ -25,17 +38,14 @@ def get_required_bagel_columns() -> list: # TODO: When possible values per column have been finalized (waiting on mr_proc), # validate that each column only has acceptable values -def check_required_columns(bagel: 
pd.DataFrame): - """Returns error if required columns in bagel schema are missing.""" +def get_missing_required_columns(bagel: pd.DataFrame) -> set: + """Identifies any missing required columns in bagel schema.""" missing_req_columns = set(get_required_bagel_columns()).difference( bagel.columns ) # TODO: Check if there are any missing values in the `participant_id` column - if len(missing_req_columns) > 0: - raise LookupError( - f"The selected .csv is missing the following required metadata columns: {missing_req_columns}." - ) + return missing_req_columns def extract_pipelines(bagel: pd.DataFrame) -> dict: @@ -55,8 +65,8 @@ def extract_pipelines(bagel: pd.DataFrame) -> dict: return pipelines_dict -def check_num_subjects(bagel: pd.DataFrame): - """Returns error if subjects and sessions are different across pipelines in the input.""" +def are_subjects_same_across_pipelines(bagel: pd.DataFrame) -> bool: + """Checks if subjects and sessions are the same across pipelines in the input.""" pipelines_dict = extract_pipelines(bagel) pipeline_subject_sessions = [ @@ -64,13 +74,10 @@ def check_num_subjects(bagel: pd.DataFrame): for df in pipelines_dict.values() ] - if not all( + return all( pipeline.equals(pipeline_subject_sessions[0]) for pipeline in pipeline_subject_sessions - ): - raise LookupError( - "The pipelines in bagel.csv do not have the same number of subjects and sessions." - ) + ) def count_unique_subjects(data: pd.DataFrame) -> int: @@ -86,9 +93,6 @@ def get_pipelines_overview(bagel: pd.DataFrame) -> pd.DataFrame: Constructs a dataframe containing global statuses of pipelines in bagel.csv (based on "pipeline_complete" column) for each participant and session. 
""" - check_required_columns(bagel) - check_num_subjects(bagel) - pipeline_complete_df = bagel.pivot( index=["participant_id", "session"], columns=["pipeline_name", "pipeline_version"], @@ -99,6 +103,9 @@ def get_pipelines_overview(bagel: pd.DataFrame) -> pd.DataFrame: "-".join(tup) for tup in pipeline_complete_df.columns.to_flat_index() ] + pipeline_complete_df = pipeline_complete_df.reindex( + sorted(pipeline_complete_df.columns), axis=1 + ) pipeline_complete_df.reset_index(inplace=True) return pipeline_complete_df @@ -125,19 +132,18 @@ def parse_csv_contents( decoded = base64.b64decode(content_string) error_msg = None - try: - if ".csv" in filename: - bagel = pd.read_csv(io.StringIO(decoded.decode("utf-8"))) + if ".csv" in filename: + bagel = pd.read_csv(io.StringIO(decoded.decode("utf-8"))) + if len(missing_req_cols := get_missing_required_columns(bagel)) > 0: + error_msg = f"The selected .csv is missing the following required metadata columns: {missing_req_cols}." + elif not are_subjects_same_across_pipelines(bagel): + error_msg = "The pipelines in bagel.csv do not have the same number of subjects and sessions." + else: overview_df = get_pipelines_overview(bagel=bagel) total_subjects = count_unique_subjects(overview_df) sessions = overview_df["session"].sort_values().unique().tolist() - else: - error_msg = "Input file is not a .csv file." - except LookupError as err: - error_msg = str(err) - except Exception as exc: - print(exc) - error_msg = "Something went wrong while processing this file." + else: + error_msg = "Input file is not a .csv file." if error_msg is not None: return None, None, None, error_msg