From 11f8825d321579a6de26e1ed4285b12cd85a918a Mon Sep 17 00:00:00 2001 From: daimm2000 Date: Wed, 10 Apr 2024 16:11:14 -0400 Subject: [PATCH 1/2] Adding codes to help filter data before registration date and after study period/data ever recorded. --- .../data_volume_summaries_template.ipynb | 513 ++++++++++++------ 1 file changed, 343 insertions(+), 170 deletions(-) diff --git a/code/forest_mano/data_volume_summaries_template.ipynb b/code/forest_mano/data_volume_summaries_template.ipynb index 713fc80..6a6fc62 100644 --- a/code/forest_mano/data_volume_summaries_template.ipynb +++ b/code/forest_mano/data_volume_summaries_template.ipynb @@ -1,173 +1,346 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "dca7c2c5", - "metadata": {}, - "outputs": [], - "source": [ - "### This notebook is intended to let a new lab member quickly create the checkerboard plots that we use in the lab. \n", - "\n", - "## Once you enter the correct Beiwe ID, you should be able to create a somewhat decent looking pdf output by running the following in shell:\n", - "\n", - "# jupyter nbconvert --to pdf --TemplateExporter.exclude_input=True \"data_volume_summaries_template.ipynb\"\n", - "\n", - "## make sure you replace \"data_volume_summaries_template.ipynb\" with the name of the noteobook if you changed the notebook's nanme. \n", - "\n", - "## (the notebook name is the title of the notebook, so the report looks nicer if you change the name to something better)" - ] + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "### This notebook is intended to let a new lab member quickly create the checkerboard plots that we use in the lab. \n", + "\n", + "## Once you enter the correct Beiwe ID, you should be able to create a somewhat decent looking pdf output by running the following in shell:\n", + "\n", + "# jupyter nbconvert --to pdf --TemplateExporter.exclude_input=True \"data_volume_summaries_template.ipynb\"\n", + "\n", + "## make sure you replace \"data_volume_summaries_template.ipynb\" with the name of the noteobook if you changed the notebook's nanme. 
\n", + "\n", + "## (the notebook name is the title of the notebook, so the report looks nicer if you change the name to something better)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xmQzhlEszGUX" + }, + "outputs": [], + "source": [ + "import sys\n", + "#Use this cell if you've moved this notebook somewhere else\n", + "#sys.path.insert(0, \"/path/to/repo/beiwe/code\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import data_summaries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XsXqqFS_zGUY" + }, + "outputs": [], + "source": [ + "kr = data_summaries.read_keyring(\"keyring_studies.py\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "bVjoT7i2zGUZ" + }, + "outputs": [], + "source": [ + "data_summaries_file_path = \"data_volume.csv\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rQXZheRNzGUZ" + }, + "outputs": [], + "source": [ + "import os\n", + "study_id = \"\" ## put the study ID here\n", + "data_summaries.get_data_summaries(study_id,\n", + " output_file_path = data_summaries_file_path,\n", + " keyring = kr)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "enxXmD3hzGUZ" + }, + "outputs": [], + "source": [ + "download_folder = \"raw_data\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "DytbKoclzGUZ" + }, + "outputs": [], + "source": [ + "from helper_functions import download_data\n", + "download_data(kr, study_id, download_folder, data_streams = [\"identifiers\"]) #1st time" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oNCeesBfzGUZ" + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import glob\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")\n", + "\n", + "all_files = glob.glob(download_folder + \"/**/*.csv\", recursive=True)\n", + "\n", + "lst = []\n", + "\n", + "for filename in all_files:\n", + " df = pd.read_csv(filename,index_col=False)\n", + " lst.append(df)\n", + "\n", + "# concatenate all identifier files\n", + "df = pd.concat(lst, axis=0, ignore_index=True).sort_values(by=\"timestamp\")\n", + "# keep only the first registration record\n", + "df_wo_dup = df.drop_duplicates(subset=['patient_id'], keep = \"first\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "61ScRhOJzGUa" + }, + "outputs": [], + "source": [ + "# turn into dictionary (\"id\":\"registration date\")\n", + "df_wo_dup['UTC time'] = [item.split('T')[0] for item in df_wo_dup['UTC time']]\n", + "registration_date_dict = dict(zip(df_wo_dup['patient_id'], df_wo_dup['UTC time']))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-vPkhQ6gzGUa" + }, + "outputs": [], + "source": [ + "data_summary = pd.read_csv(data_summaries_file_path)\n", + "data_summary['date'] = pd.to_datetime(data_summary['date'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "l3u34A-3zGUa" + }, + "outputs": [], + "source": [ + "## this will remove all the data before registration date\n", + "for key in registration_date_dict:\n", + " registration_date_dict[key] = pd.to_datetime(registration_date_dict[key])\n", + "\n", + "filtered_df = pd.DataFrame()\n", + "for p_id, earliest_date in registration_date_dict.items():\n", + " temp_df = 
data_summary[(data_summary['participant_id'] == p_id) & (data_summary['date'] >= earliest_date)]\n", + " filtered_df = pd.concat([filtered_df, temp_df])\n", + "\n", + "filtered_df = filtered_df.reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gHpsiy3VzGUa" + }, + "outputs": [], + "source": [ + "def filter_data(df, end_date = True, study_period_days = None):\n", + "\n", + " '''\n", + " df: using the results from \"filtered_df\" from above that already truncated off the data before registration date\n", + " end_date: True or False based on the need to specify \"study_period_days\" or not\n", + " study_period_days: set \"end_date\" to False before specifying this, this is required in the form of integer days\n", + " '''\n", + "\n", + " filtered_df_metrics = df.dropna(subset=df.columns[2:69], how=\"all\")\n", + "\n", + " last_dates_with_data = filtered_df_metrics.groupby('participant_id')['date'].max().reset_index()\n", + " last_dates_with_data.columns = ['participant_id', 'last_dates_with_data']\n", + "\n", + " df_with_last_date = df.merge(last_dates_with_data, on='participant_id', how='left')\n", + "\n", + " if end_date:\n", + " filtered_df_new = df_with_last_date[df_with_last_date['date'] <= df_with_last_date['last_dates_with_data']]\n", + " filtered_df_new = filtered_df_new.drop(columns=['last_dates_with_data'])\n", + " filtered_df_new = filtered_df_new.reset_index(drop=True)\n", + "\n", + " elif study_period_days != None:\n", + " filtered_df_new = pd.DataFrame()\n", + " for p_id, earliest_date in registration_date_dict.items():\n", + " end_date = df_with_last_date[(df_with_last_date['participant_id'] == p_id)].last_dates_with_data.unique()[0]\n", + "\n", + " desired_end_date = earliest_date + pd.Timedelta(days=study_period_days - 1)\n", + "\n", + " # consider if there are participants not finishing study collection yet\n", + " end_date_to_use = min(desired_end_date, end_date)\n", + "\n", + " temp_df = df[(df['participant_id'] == p_id) & (df['date'] >= earliest_date) &\n", + " (df['date'] <= end_date_to_use)]\n", + " filtered_df_new = pd.concat([filtered_df_new, temp_df])\n", + "\n", + " else:\n", + " print(\"No modification occurred.\")\n", + " return df\n", + "\n", + " return filtered_df_new.reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "UM6hVE_lzGUa" + }, + "outputs": [], + "source": [ + "# test case\n", + "filtered_df_new = filter_data(filtered_df, end_date=False, study_period_days=2)\n", + "## or filtered_df_new = filter_data(filtered_df) if you just need the end date of data ever recorded" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XLJzi3K1zGUa" + }, + "outputs": [], + "source": [ + "# saving file\n", + "filtered_df_new.to_csv(\"data_volume_truncted.csv\", index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data Volume Summaries for Study" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Here are the number of users with at least one day of data by data stream" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(data_summaries.get_num_users(summaries_path = data_summaries_file_path).set_index(\"Data Type\", drop = True))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Summary Plots: X axis is time since study entry" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data_plots_path = \"data_volume_plots\"\n", + "data_summaries.data_volume_plots(\n", + " data_summaries_path = data_summaries_file_path, \n", + " output_dir = data_plots_path,\n", + " display_plots = True, #this needs to be true for the notebook to run\n", + " binary_heatmap = True, #if this is False, a continuous data volume measurement will be put on the heatmaps\n", + " plot_study_time = True, #whether to use study time instead of calendar time\n", + " overlay_surveys = True, #whether to overlay survey submissions on top of data\n", + " include_y_labels = False # if you have a ton of users, don't include y labels so you can fit it on one page. \n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Summary Plots: X axis is calendar time" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data_plots_path = \"data_volume_plots\"\n", + "data_summaries.data_volume_plots(\n", + " data_summaries_path = data_summaries_file_path, \n", + " output_dir = data_plots_path,\n", + " display_plots = True, \n", + " binary_heatmap = True, \n", + " plot_study_time = False,\n", + " overlay_surveys = True,\n", + " include_y_labels = False\n", + ")" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "forest_test2", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.0" + } }, - { - "cell_type": "code", - "execution_count": null, - "id": "da992417", - "metadata": {}, - "outputs": [], - "source": [ - "import sys\n", - "#Use this cell if you've moved this notebook somewhere else\n", - "#sys.path.insert(0, \"/path/to/repo/beiwe/code\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d720133d", - "metadata": {}, - "outputs": [], - "source": [ - "import data_summaries" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3416a2b8", - "metadata": {}, - "outputs": [], - "source": [ - "kr = data_summaries.read_keyring(\"/Volumes/SanDisk/onnela_lab/keyring_files/keyring_studies.py\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5d38313e", - "metadata": {}, - "outputs": [], - "source": [ - "data_summaries_file_path = \"data_volume.csv\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "45ecaaf1", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "study_id = \"58de9c9246b9fc61b2007034\"\n", - "data_summaries.get_data_summaries(study_id,\n", - " output_file_path = data_summaries_file_path,\n", - " keyring = kr)" - ] - }, - { - "cell_type": "markdown", - "id": "1e1ba32a", - "metadata": {}, - "source": [ - "# Data Volume Summaries for Study\n", - "\n", - "Here are the number of users with at least one day of data by data stream" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "39aad277", - "metadata": {}, - "outputs": [], - "source": [ - "print(data_summaries.get_num_users(summaries_path = data_summaries_file_path).set_index(\"Data Type\", drop = True))" - ] - }, - { - "cell_type": "markdown", - "id": "72575028", - "metadata": {}, - "source": [ - "## Summary Plots: X axis is time since study entry" - ] - }, - { - 
"cell_type": "code", - "execution_count": null, - "id": "52e84f13", - "metadata": {}, - "outputs": [], - "source": [ - "data_plots_path = \"data_volume_plots\"\n", - "data_summaries.data_volume_plots(\n", - " data_summaries_path = data_summaries_file_path, \n", - " output_dir = data_plots_path,\n", - " display_plots = True, #this needs to be true for the notebook to run\n", - " binary_heatmap = True, #if this is False, a continuous data volume measurement will be put on the heatmaps\n", - " plot_study_time = True, #whether to use study time instead of calendar time\n", - " overlay_surveys = True, #whether to overlay survey submissions on top of data\n", - " include_y_labels = False # if you have a ton of users, don't include y labels so you can fit it on one page. \n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "592ef085", - "metadata": {}, - "source": [ - "## Summary Plots: X axis is calendar time" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c70e7bd1", - "metadata": {}, - "outputs": [], - "source": [ - "data_plots_path = \"data_volume_plots\"\n", - "data_summaries.data_volume_plots(\n", - " data_summaries_path = data_summaries_file_path, \n", - " output_dir = data_plots_path,\n", - " display_plots = True, \n", - " binary_heatmap = True, \n", - " plot_study_time = False,\n", - " overlay_surveys = True,\n", - " include_y_labels = False\n", - ")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - } - }, - "nbformat": 4, - "nbformat_minor": 5 + "nbformat": 4, + "nbformat_minor": 0 } From 0714c6635f88378c489aca3fca043123eb2e6259 Mon Sep 17 00:00:00 2001 From: daimm2000 Date: Sun, 14 Apr 2024 00:39:38 -0400 Subject: [PATCH 2/2] Changed on if else statement condition --- .../data_volume_summaries_template.ipynb | 23 ++++++++----------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/code/forest_mano/data_volume_summaries_template.ipynb b/code/forest_mano/data_volume_summaries_template.ipynb index 6a6fc62..c45183e 100644 --- a/code/forest_mano/data_volume_summaries_template.ipynb +++ b/code/forest_mano/data_volume_summaries_template.ipynb @@ -179,12 +179,11 @@ }, "outputs": [], "source": [ - "def filter_data(df, end_date = True, study_period_days = None):\n", + "def filter_data(df: pd.DataFrame, study_period_days: int | None = None) -> pd.DataFrame:\n", "\n", " '''\n", " df: using the results from \"filtered_df\" from above that already truncated off the data before registration date\n", - " end_date: True or False based on the need to specify \"study_period_days\" or not\n", - " study_period_days: set \"end_date\" to False before specifying this, this is required in the form of integer days\n", + " study_period_days: this is required in the form of integer days, None will result in date ever recorded\n", " '''\n", "\n", " filtered_df_metrics = df.dropna(subset=df.columns[2:69], how=\"all\")\n", @@ -194,12 +193,7 @@ "\n", " df_with_last_date = df.merge(last_dates_with_data, on='participant_id', how='left')\n", "\n", - " if end_date:\n", - " filtered_df_new = df_with_last_date[df_with_last_date['date'] <= df_with_last_date['last_dates_with_data']]\n", - " filtered_df_new = 
filtered_df_new.drop(columns=['last_dates_with_data'])\n", - " filtered_df_new = filtered_df_new.reset_index(drop=True)\n", - "\n", - " elif study_period_days != None:\n", + " if study_period_days:\n", " filtered_df_new = pd.DataFrame()\n", " for p_id, earliest_date in registration_date_dict.items():\n", " end_date = df_with_last_date[(df_with_last_date['participant_id'] == p_id)].last_dates_with_data.unique()[0]\n", @@ -213,9 +207,10 @@ " (df['date'] <= end_date_to_use)]\n", " filtered_df_new = pd.concat([filtered_df_new, temp_df])\n", "\n", - " else:\n", - " print(\"No modification occurred.\")\n", - " return df\n", + " else: \n", + " filtered_df_new = df_with_last_date[df_with_last_date['date'] <= df_with_last_date['last_dates_with_data']]\n", + " filtered_df_new = filtered_df_new.drop(columns=['last_dates_with_data'])\n", + " filtered_df_new = filtered_df_new.reset_index(drop=True)\n", "\n", " return filtered_df_new.reset_index(drop=True)" ] @@ -228,8 +223,8 @@ }, "outputs": [], "source": [ - "# test case\n", - "filtered_df_new = filter_data(filtered_df, end_date=False, study_period_days=2)\n", + "# example case\n", + "filtered_df_new = filter_data(filtered_df, study_period_days=2)\n", "## or filtered_df_new = filter_data(filtered_df) if you just need the end date of data ever recorded" ] },
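
For readers who want to experiment with the truncation logic this patch introduces without downloading any Beiwe data, here is a minimal, self-contained Python sketch using synthetic data. The `participant_id` and `date` column names follow the patch; the single `gps_bytes` column is a stand-in for the many data-stream columns of the real data-volume CSV, and the registration dates are made up (in the notebook they are parsed from the downloaded identifier files).

```python
# Standalone sketch of the per-participant truncation added in this patch.
# Column names follow the patch; the toy frame and registration dates are illustrative only.
import pandas as pd

# toy data-volume summary: one row per participant per day
data_summary = pd.DataFrame({
    "participant_id": ["abc"] * 6 + ["xyz"] * 6,
    "date": pd.to_datetime(["2024-01-01", "2024-01-02", "2024-01-03",
                            "2024-01-04", "2024-01-05", "2024-01-06"] * 2),
    "gps_bytes": [None, 10, 12, None, None, None, 5, 7, 9, 11, None, None],
})

# hypothetical registration dates (taken from identifier files in the notebook)
registration_date_dict = {"abc": pd.Timestamp("2024-01-02"),
                          "xyz": pd.Timestamp("2024-01-01")}

study_period_days = 3  # None keeps everything up to the last day with any data

frames = []
for p_id, start in registration_date_dict.items():
    # drop rows recorded before this participant's registration date
    rows = data_summary[(data_summary["participant_id"] == p_id)
                        & (data_summary["date"] >= start)]
    # last calendar day on which this participant actually has data
    has_data = rows.dropna(subset=["gps_bytes"], how="all")
    last_with_data = has_data["date"].max()
    if study_period_days is not None:
        # truncate at the end of the study period, or earlier if data stops sooner
        end = min(start + pd.Timedelta(days=study_period_days - 1), last_with_data)
    else:
        end = last_with_data
    frames.append(rows[rows["date"] <= end])

filtered = pd.concat(frames, ignore_index=True)
print(filtered)
```

Note the explicit `is not None` check in the sketch; for positive day counts the revised `filter_data` above reaches the same branch via the truthiness of `study_period_days`.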