From 11f8825d321579a6de26e1ed4285b12cd85a918a Mon Sep 17 00:00:00 2001 From: daimm2000 Date: Wed, 10 Apr 2024 16:11:14 -0400 Subject: [PATCH 1/2] Adding codes to help filter data before registration date and after study period/data ever recorded. --- .../data_volume_summaries_template.ipynb | 513 ++++++++++++------ 1 file changed, 343 insertions(+), 170 deletions(-) diff --git a/code/forest_mano/data_volume_summaries_template.ipynb b/code/forest_mano/data_volume_summaries_template.ipynb index 713fc80..6a6fc62 100644 --- a/code/forest_mano/data_volume_summaries_template.ipynb +++ b/code/forest_mano/data_volume_summaries_template.ipynb @@ -1,173 +1,346 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "dca7c2c5", - "metadata": {}, - "outputs": [], - "source": [ - "### This notebook is intended to let a new lab member quickly create the checkerboard plots that we use in the lab. \n", - "\n", - "## Once you enter the correct Beiwe ID, you should be able to create a somewhat decent looking pdf output by running the following in shell:\n", - "\n", - "# jupyter nbconvert --to pdf --TemplateExporter.exclude_input=True \"data_volume_summaries_template.ipynb\"\n", - "\n", - "## make sure you replace \"data_volume_summaries_template.ipynb\" with the name of the noteobook if you changed the notebook's nanme. \n", - "\n", - "## (the notebook name is the title of the notebook, so the report looks nicer if you change the name to something better)" - ] + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "### This notebook is intended to let a new lab member quickly create the checkerboard plots that we use in the lab. \n", + "\n", + "## Once you enter the correct Beiwe ID, you should be able to create a somewhat decent looking pdf output by running the following in shell:\n", + "\n", + "# jupyter nbconvert --to pdf --TemplateExporter.exclude_input=True \"data_volume_summaries_template.ipynb\"\n", + "\n", + "## make sure you replace \"data_volume_summaries_template.ipynb\" with the name of the noteobook if you changed the notebook's nanme. 
\n", + "\n", + "## (the notebook name is the title of the notebook, so the report looks nicer if you change the name to something better)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xmQzhlEszGUX" + }, + "outputs": [], + "source": [ + "import sys\n", + "#Use this cell if you've moved this notebook somewhere else\n", + "#sys.path.insert(0, \"/path/to/repo/beiwe/code\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import data_summaries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XsXqqFS_zGUY" + }, + "outputs": [], + "source": [ + "kr = data_summaries.read_keyring(\"keyring_studies.py\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "bVjoT7i2zGUZ" + }, + "outputs": [], + "source": [ + "data_summaries_file_path = \"data_volume.csv\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rQXZheRNzGUZ" + }, + "outputs": [], + "source": [ + "import os\n", + "study_id = \"\" ## put the study ID here\n", + "data_summaries.get_data_summaries(study_id,\n", + " output_file_path = data_summaries_file_path,\n", + " keyring = kr)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "enxXmD3hzGUZ" + }, + "outputs": [], + "source": [ + "download_folder = \"raw_data\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "DytbKoclzGUZ" + }, + "outputs": [], + "source": [ + "from helper_functions import download_data\n", + "download_data(kr, study_id, download_folder, data_streams = [\"identifiers\"]) #1st time" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oNCeesBfzGUZ" + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import glob\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")\n", + "\n", + "all_files = glob.glob(download_folder + \"/**/*.csv\", recursive=True)\n", + "\n", + "lst = []\n", + "\n", + "for filename in all_files:\n", + " df = pd.read_csv(filename,index_col=False)\n", + " lst.append(df)\n", + "\n", + "# concatenate all identifier files\n", + "df = pd.concat(lst, axis=0, ignore_index=True).sort_values(by=\"timestamp\")\n", + "# keep only the first registration record\n", + "df_wo_dup = df.drop_duplicates(subset=['patient_id'], keep = \"first\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "61ScRhOJzGUa" + }, + "outputs": [], + "source": [ + "# turn into dictionary (\"id\":\"registration date\")\n", + "df_wo_dup['UTC time'] = [item.split('T')[0] for item in df_wo_dup['UTC time']]\n", + "registration_date_dict = dict(zip(df_wo_dup['patient_id'], df_wo_dup['UTC time']))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-vPkhQ6gzGUa" + }, + "outputs": [], + "source": [ + "data_summary = pd.read_csv(data_summaries_file_path)\n", + "data_summary['date'] = pd.to_datetime(data_summary['date'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "l3u34A-3zGUa" + }, + "outputs": [], + "source": [ + "## this will remove all the data before registration date\n", + "for key in registration_date_dict:\n", + " registration_date_dict[key] = pd.to_datetime(registration_date_dict[key])\n", + "\n", + "filtered_df = pd.DataFrame()\n", + "for p_id, earliest_date in registration_date_dict.items():\n", + " temp_df = 
data_summary[(data_summary['participant_id'] == p_id) & (data_summary['date'] >= earliest_date)]\n", + " filtered_df = pd.concat([filtered_df, temp_df])\n", + "\n", + "filtered_df = filtered_df.reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gHpsiy3VzGUa" + }, + "outputs": [], + "source": [ + "def filter_data(df, end_date = True, study_period_days = None):\n", + "\n", + " '''\n", + " df: using the results from \"filtered_df\" from above that already truncated off the data before registration date\n", + " end_date: True or False based on the need to specify \"study_period_days\" or not\n", + " study_period_days: set \"end_date\" to False before specifying this, this is required in the form of integer days\n", + " '''\n", + "\n", + " filtered_df_metrics = df.dropna(subset=df.columns[2:69], how=\"all\")\n", + "\n", + " last_dates_with_data = filtered_df_metrics.groupby('participant_id')['date'].max().reset_index()\n", + " last_dates_with_data.columns = ['participant_id', 'last_dates_with_data']\n", + "\n", + " df_with_last_date = df.merge(last_dates_with_data, on='participant_id', how='left')\n", + "\n", + " if end_date:\n", + " filtered_df_new = df_with_last_date[df_with_last_date['date'] <= df_with_last_date['last_dates_with_data']]\n", + " filtered_df_new = filtered_df_new.drop(columns=['last_dates_with_data'])\n", + " filtered_df_new = filtered_df_new.reset_index(drop=True)\n", + "\n", + " elif study_period_days != None:\n", + " filtered_df_new = pd.DataFrame()\n", + " for p_id, earliest_date in registration_date_dict.items():\n", + " end_date = df_with_last_date[(df_with_last_date['participant_id'] == p_id)].last_dates_with_data.unique()[0]\n", + "\n", + " desired_end_date = earliest_date + pd.Timedelta(days=study_period_days - 1)\n", + "\n", + " # consider if there are participants not finishing study collection yet\n", + " end_date_to_use = min(desired_end_date, end_date)\n", + "\n", + " temp_df = df[(df['participant_id'] == p_id) & (df['date'] >= earliest_date) &\n", + " (df['date'] <= end_date_to_use)]\n", + " filtered_df_new = pd.concat([filtered_df_new, temp_df])\n", + "\n", + " else:\n", + " print(\"No modification occurred.\")\n", + " return df\n", + "\n", + " return filtered_df_new.reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "UM6hVE_lzGUa" + }, + "outputs": [], + "source": [ + "# test case\n", + "filtered_df_new = filter_data(filtered_df, end_date=False, study_period_days=2)\n", + "## or filtered_df_new = filter_data(filtered_df) if you just need the end date of data ever recorded" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XLJzi3K1zGUa" + }, + "outputs": [], + "source": [ + "# saving file\n", + "filtered_df_new.to_csv(\"data_volume_truncted.csv\", index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data Volume Summaries for Study" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Here are the number of users with at least one day of data by data stream" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(data_summaries.get_num_users(summaries_path = data_summaries_file_path).set_index(\"Data Type\", drop = True))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Summary Plots: X axis is time since study entry" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data_plots_path = \"data_volume_plots\"\n", + "data_summaries.data_volume_plots(\n", + " data_summaries_path = data_summaries_file_path, \n", + " output_dir = data_plots_path,\n", + " display_plots = True, #this needs to be true for the notebook to run\n", + " binary_heatmap = True, #if this is False, a continuous data volume measurement will be put on the heatmaps\n", + " plot_study_time = True, #whether to use study time instead of calendar time\n", + " overlay_surveys = True, #whether to overlay survey submissions on top of data\n", + " include_y_labels = False # if you have a ton of users, don't include y labels so you can fit it on one page. \n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Summary Plots: X axis is calendar time" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data_plots_path = \"data_volume_plots\"\n", + "data_summaries.data_volume_plots(\n", + " data_summaries_path = data_summaries_file_path, \n", + " output_dir = data_plots_path,\n", + " display_plots = True, \n", + " binary_heatmap = True, \n", + " plot_study_time = False,\n", + " overlay_surveys = True,\n", + " include_y_labels = False\n", + ")" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "forest_test2", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.0" + } }, - { - "cell_type": "code", - "execution_count": null, - "id": "da992417", - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "#Use this cell if you've moved this notebook somewhere else\n", - "#sys.path.insert(0, \"/path/to/repo/beiwe/code\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d720133d", - "metadata": {}, - "outputs": [], - "source": [ - "import data_summaries" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3416a2b8", - "metadata": {}, - "outputs": [], - "source": [ - "kr = data_summaries.read_keyring(\"/Volumes/SanDisk/onnela_lab/keyring_files/keyring_studies.py\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5d38313e", - "metadata": {}, - "outputs": [], - "source": [ - "data_summaries_file_path = \"data_volume.csv\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "45ecaaf1", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "study_id = \"58de9c9246b9fc61b2007034\"\n", - "data_summaries.get_data_summaries(study_id,\n", - " output_file_path = data_summaries_file_path,\n", - " keyring = kr)" - ] - }, - { - "cell_type": "markdown", - "id": "1e1ba32a", - "metadata": {}, - "source": [ - "# Data Volume Summaries for Study\n", - "\n", - "Here are the number of users with at least one day of data by data stream" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "39aad277", - "metadata": {}, - "outputs": [], - "source": [ - "print(data_summaries.get_num_users(summaries_path = data_summaries_file_path).set_index(\"Data Type\", drop = True))" - ] - }, - { - "cell_type": "markdown", - "id": "72575028", - "metadata": {}, - "source": [ - "## Summary Plots: X axis is time since study entry" - ] - }, - { - 
"cell_type": "code", - "execution_count": null, - "id": "52e84f13", - "metadata": {}, - "outputs": [], - "source": [ - "data_plots_path = \"data_volume_plots\"\n", - "data_summaries.data_volume_plots(\n", - " data_summaries_path = data_summaries_file_path, \n", - " output_dir = data_plots_path,\n", - " display_plots = True, #this needs to be true for the notebook to run\n", - " binary_heatmap = True, #if this is False, a continuous data volume measurement will be put on the heatmaps\n", - " plot_study_time = True, #whether to use study time instead of calendar time\n", - " overlay_surveys = True, #whether to overlay survey submissions on top of data\n", - " include_y_labels = False # if you have a ton of users, don't include y labels so you can fit it on one page. \n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "592ef085", - "metadata": {}, - "source": [ - "## Summary Plots: X axis is calendar time" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c70e7bd1", - "metadata": {}, - "outputs": [], - "source": [ - "data_plots_path = \"data_volume_plots\"\n", - "data_summaries.data_volume_plots(\n", - " data_summaries_path = data_summaries_file_path, \n", - " output_dir = data_plots_path,\n", - " display_plots = True, \n", - " binary_heatmap = True, \n", - " plot_study_time = False,\n", - " overlay_surveys = True,\n", - " include_y_labels = False\n", - ")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "nbformat": 4, + "nbformat_minor": 0 } From 0714c6635f88378c489aca3fca043123eb2e6259 Mon Sep 17 00:00:00 2001 From: daimm2000 Date: Sun, 14 Apr 2024 00:39:38 -0400 Subject: [PATCH 2/2] Changed on if else statement condition --- .../data_volume_summaries_template.ipynb | 23 ++++++++----------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/code/forest_mano/data_volume_summaries_template.ipynb b/code/forest_mano/data_volume_summaries_template.ipynb index 6a6fc62..c45183e 100644 --- a/code/forest_mano/data_volume_summaries_template.ipynb +++ b/code/forest_mano/data_volume_summaries_template.ipynb @@ -179,12 +179,11 @@ }, "outputs": [], "source": [ - "def filter_data(df, end_date = True, study_period_days = None):\n", + "def filter_data(df: pd.DataFrame, study_period_days: int | None = None) -> pd.DataFrame:\n", "\n", " '''\n", " df: using the results from \"filtered_df\" from above that already truncated off the data before registration date\n", - " end_date: True or False based on the need to specify \"study_period_days\" or not\n", - " study_period_days: set \"end_date\" to False before specifying this, this is required in the form of integer days\n", + " study_period_days: this is required in the form of integer days, None will result in date ever recorded\n", " '''\n", "\n", " filtered_df_metrics = df.dropna(subset=df.columns[2:69], how=\"all\")\n", @@ -194,12 +193,7 @@ "\n", " df_with_last_date = df.merge(last_dates_with_data, on='participant_id', how='left')\n", "\n", - " if end_date:\n", - " filtered_df_new = df_with_last_date[df_with_last_date['date'] <= df_with_last_date['last_dates_with_data']]\n", - " filtered_df_new = 
filtered_df_new.drop(columns=['last_dates_with_data'])\n", - " filtered_df_new = filtered_df_new.reset_index(drop=True)\n", - "\n", - " elif study_period_days != None:\n", + " if study_period_days:\n", " filtered_df_new = pd.DataFrame()\n", " for p_id, earliest_date in registration_date_dict.items():\n", " end_date = df_with_last_date[(df_with_last_date['participant_id'] == p_id)].last_dates_with_data.unique()[0]\n", @@ -213,9 +207,10 @@ " (df['date'] <= end_date_to_use)]\n", " filtered_df_new = pd.concat([filtered_df_new, temp_df])\n", "\n", - " else:\n", - " print(\"No modification occurred.\")\n", - " return df\n", + " else: \n", + " filtered_df_new = df_with_last_date[df_with_last_date['date'] <= df_with_last_date['last_dates_with_data']]\n", + " filtered_df_new = filtered_df_new.drop(columns=['last_dates_with_data'])\n", + " filtered_df_new = filtered_df_new.reset_index(drop=True)\n", "\n", " return filtered_df_new.reset_index(drop=True)" ] @@ -228,8 +223,8 @@ }, "outputs": [], "source": [ - "# test case\n", - "filtered_df_new = filter_data(filtered_df, end_date=False, study_period_days=2)\n", + "# example case\n", + "filtered_df_new = filter_data(filtered_df, study_period_days=2)\n", "## or filtered_df_new = filter_data(filtered_df) if you just need the end date of data ever recorded" ] },
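
For readers who want to experiment with the truncation logic this patch introduces without downloading any Beiwe data, here is a minimal, self-contained Python sketch using synthetic data. The `participant_id` and `date` column names follow the patch; the single `gps_bytes` column is a stand-in for the many data-stream columns of the real data-volume CSV, and the registration dates are made up (in the notebook they are parsed from the downloaded identifier files).

```python
# Standalone sketch of the per-participant truncation added in this patch.
# Column names follow the patch; the toy frame and registration dates are illustrative only.
import pandas as pd

# toy data-volume summary: one row per participant per day
data_summary = pd.DataFrame({
    "participant_id": ["abc"] * 6 + ["xyz"] * 6,
    "date": pd.to_datetime(["2024-01-01", "2024-01-02", "2024-01-03",
                            "2024-01-04", "2024-01-05", "2024-01-06"] * 2),
    "gps_bytes": [None, 10, 12, None, None, None, 5, 7, 9, 11, None, None],
})

# hypothetical registration dates (taken from identifier files in the notebook)
registration_date_dict = {"abc": pd.Timestamp("2024-01-02"),
                          "xyz": pd.Timestamp("2024-01-01")}

study_period_days = 3  # None keeps everything up to the last day with any data

frames = []
for p_id, start in registration_date_dict.items():
    # drop rows recorded before this participant's registration date
    rows = data_summary[(data_summary["participant_id"] == p_id)
                        & (data_summary["date"] >= start)]
    # last calendar day on which this participant actually has data
    has_data = rows.dropna(subset=["gps_bytes"], how="all")
    last_with_data = has_data["date"].max()
    if study_period_days is not None:
        # truncate at the end of the study period, or earlier if data stops sooner
        end = min(start + pd.Timedelta(days=study_period_days - 1), last_with_data)
    else:
        end = last_with_data
    frames.append(rows[rows["date"] <= end])

filtered = pd.concat(frames, ignore_index=True)
print(filtered)
```

Note the explicit `is not None` check in the sketch; for positive day counts the revised `filter_data` above reaches the same branch via the truthiness of `study_period_days`.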