top analysis and plotting updates

rkansal47 · Jul 22, 2024 · 085d6bb · 085d6bb
1 parent 962d127
commit 085d6bb
Show file tree

Hide file tree

Showing 2 changed files with 189 additions and 57 deletions.
diff --git a/src/HHbbVV/postprocessing/TopAnalysis.ipynb b/src/HHbbVV/postprocessing/TopAnalysis.ipynb
@@ -11,9 +11,10 @@
     "import numpy as np\n",
     "import warnings\n",
     "import pandas as pd\n",
+    "from pathlib import Path\n",
     "\n",
     "from pandas.errors import SettingWithCopyWarning\n",
-    "from hh_vars import data_key\n",
+    "from HHbbVV.hh_vars import data_key\n",
     "import postprocessing\n",
     "\n",
     "# ignore these because they don't seem to apply\n",
@@ -50,11 +51,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "plot_dir = \"../../../plots/ttsfs/24Mar1_update_lps\"\n",
-    "\n",
-    "import os\n",
-    "\n",
-    "_ = os.system(f\"mkdir -p {plot_dir}\")"
+    "plot_dir = Path(\"../../../plots/ttsfs/24Jul22UpdateLP\")\n",
+    "plot_dir.mkdir(parents=True, exist_ok=True)"
    ]
   },
   {
@@ -63,31 +61,48 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "samples = {\n",
+    "bg_samples = {\n",
     "    \"QCD\": \"QCD\",\n",
-    "    # \"Single Top\": \"ST\",\n",
-    "    \"Top\": [\"TTToSemiLeptonic\", \"ST\"],\n",
+    "    # \"Top\": [\"TTToSemiLeptonic\", \"ST\"],\n",
     "    \"TTbar\": [\"TTTo2L2Nu\", \"TTToHadronic\"],\n",
     "    \"W+Jets\": \"WJets\",\n",
     "    \"Diboson\": [\"WW\", \"WZ\", \"ZZ\"],\n",
     "    \"Data\": \"SingleMuon\",\n",
     "}\n",
     "\n",
+    "sig_samples = {\"Top\": [\"TTToSemiLeptonic\", \"ST\"]}\n",
+    "\n",
+    "samples = {**bg_samples, **sig_samples}\n",
+    "\n",
     "top_matched_key = \"TT Top Matched\"\n",
     "\n",
-    "data_dir = \"../../../../data/ttsfs/24Feb28_update_lp/\"\n",
+    "# data_dir = \"../../../../data/ttsfs/24Feb28_update_lp/\"\n",
+    "data_dir = \"/ceph/cms/store/user/rkansal/bbVV/ttsfs/24Feb28_update_lp/\"\n",
+    "signal_data_dir = \"/ceph/cms/store/user/rkansal/bbVV/ttsfs/24Jul22UpdateLP/\"\n",
     "year = \"2018\"\n",
     "\n",
     "# filters = [(\"('ak8FatJetPt', '0')\", \">=\", 500)]\n",
     "filters = None\n",
     "\n",
-    "events_dict = postprocessing.load_samples(data_dir, samples, year, hem_cleaning=False)\n",
+    "events_dict = postprocessing.load_samples(data_dir, bg_samples, year, hem_cleaning=False)\n",
+    "events_dict |= postprocessing.load_samples(signal_data_dir, sig_samples, year, hem_cleaning=False)\n",
     "\n",
     "cutflow = pd.DataFrame(index=list(samples.keys()))\n",
     "utils.add_to_cutflow(events_dict, \"Selection\", \"weight\", cutflow)\n",
     "cutflow"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "utils.get_pickles(f\"{signal_data_dir}/{year}/TTToSemiLeptonic/pickles\", year, f\"TTToSemiLeptonic\")[\n",
+    "    \"lp_hist\"\n",
+    "][2, ...]"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -154,6 +169,18 @@
     "cutflow"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for key in events_dict:\n",
+    "    events_dict[key] = events_dict[key][events_dict[key][\"ak8FatJetPt\"][0] >= 500]\n",
+    "    events_dict[key] = events_dict[key][events_dict[key][\"ak8FatJetMsd\"][0] >= 125]\n",
+    "    events_dict[key] = events_dict[key][events_dict[key][\"ak8FatJetMsd\"][0] <= 225]"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -169,6 +196,44 @@
     "# del events_dict[\"TTbar\"]"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "CLIP = 5.0\n",
+    "\n",
+    "events = events_dict[top_matched_key]\n",
+    "\n",
+    "up_prong_rc = (\n",
+    "    (events[\"lp_sf_outside_boundary_quarks\"][0] > 0)\n",
+    "    | (events[\"lp_sf_double_matched_event\"][0] > 0)\n",
+    "    | (events[\"lp_sf_unmatched_quarks\"][0] > 0)\n",
+    ").to_numpy()\n",
+    "\n",
+    "down_prong_rc = (\n",
+    "    (events[\"lp_sf_inside_boundary_quarks\"][0] > 0)\n",
+    "    | (events[\"lp_sf_double_matched_event\"][0] > 0)\n",
+    "    | (events[\"lp_sf_unmatched_quarks\"][0] > 0)\n",
+    ").to_numpy()\n",
+    "\n",
+    "# TODO: update\n",
+    "rc_unmatched = np.zeros(len(events), dtype=bool)\n",
+    "\n",
+    "for shift, prong_rc in [(\"up\", up_prong_rc), (\"down\", down_prong_rc)]:\n",
+    "    np_sfs = events[\"lp_sf_lnN\"][0].to_numpy()\n",
+    "    np_sfs[prong_rc] = events[f\"lp_sf_np_{shift}\"][0][prong_rc]\n",
+    "    np_sfs = np.nan_to_num(np.clip(np_sfs, 1.0 / CLIP, CLIP))\n",
+    "    events.loc[:, (f\"lp_sf_np_{shift}\", 0)] = np_sfs / np.mean(np_sfs, axis=0)\n",
+    "\n",
+    "for shift in [\"up\", \"down\"]:\n",
+    "    np_sfs = events[\"lp_sf_lnN\"][0].to_numpy()\n",
+    "    np_sfs[rc_unmatched] = CLIP if shift == \"up\" else 1.0 / CLIP\n",
+    "    np_sfs = np.nan_to_num(np.clip(np_sfs, 1.0 / CLIP, CLIP))\n",
+    "    events[f\"lp_sf_unmatched_{shift}\"] = np_sfs / np.mean(np_sfs, axis=0)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -184,10 +249,11 @@
     "    \"lp_sf_pt_extrap_vars\",\n",
     "    \"lp_sfs_bl_ratio\",\n",
     "]:\n",
-    "    # cut off at 10\n",
+    "    # cut off at 5\n",
     "    events_dict[top_matched_key].loc[:, key] = np.clip(\n",
-    "        events_dict[top_matched_key].loc[:, key].values, 0.1, 10\n",
+    "        events_dict[top_matched_key].loc[:, key].values, 1.0 / 5.0, 5.0\n",
     "    )\n",
+    "\n",
     "    if key == \"lp_sfs_bl_ratio\":\n",
     "        mean_lp_sfs = np.mean(\n",
     "            np.nan_to_num(\n",
@@ -229,37 +295,17 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "for key in events_dict:\n",
-    "    events_dict[key] = events_dict[key][events_dict[key][\"ak8FatJetPt\"][0] >= 500]\n",
-    "    events_dict[key] = events_dict[key][events_dict[key][\"ak8FatJetMsd\"][0] >= 125]\n",
-    "    events_dict[key] = events_dict[key][events_dict[key][\"ak8FatJetMsd\"][0] <= 225]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "events = events_dict[top_matched_key]\n",
-    "sj_matching_unc = (\n",
-    "    (np.sum(events[\"lp_sf_unmatched_quarks\"]) / (len(events) * 3))\n",
-    "    # OR of double matched and boundary quarks\n",
-    "    # >0.1 to avoid floating point errors\n",
-    "    + (\n",
-    "        np.sum((events[\"lp_sf_double_matched_event\"] + events[\"lp_sf_boundary_quarks\"]) > 0.1)\n",
-    "        / (len(events))\n",
-    "    )\n",
-    ").values[0]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "sj_matching_unc"
+    "# events = events_dict[top_matched_key]\n",
+    "# sj_matching_unc = (\n",
+    "#     (np.sum(events[\"lp_sf_unmatched_quarks\"]) / (len(events) * 3))\n",
+    "#     # OR of double matched and boundary quarks\n",
+    "#     # >0.1 to avoid floating point errors\n",
+    "#     + (\n",
+    "#         np.sum((events[\"lp_sf_double_matched_event\"] + events[\"lp_sf_boundary_quarks\"]) > 0.1)\n",
+    "#         / (len(events))\n",
+    "#     )\n",
+    "# ).values[0]\n",
+    "# sj_matching_unc"
    ]
   },
   {
@@ -305,7 +351,7 @@
    "source": [
     "# {var: (bins, label)}\n",
     "plot_vars = {\n",
-    "    \"ak8FatJetMsd\": ([20, 125, 225], r\"$m_{SD}$ (GeV)\"),\n",
+    "    # \"ak8FatJetMsd\": ([20, 125, 225], r\"$m_{SD}$ (GeV)\"),\n",
     "    # \"ak8FatJetParticleNetMass\": ([30, 50, 200], r\"$m_{reg}$ (GeV)\"),\n",
     "    # \"ak8FatJetPt\": ([30, 0, 1200], r\"$p_T$ (GeV)\"),\n",
     "    # \"MET_pt\": ([30, 0, 200], r\"MET (GeV)\"),\n",
@@ -389,6 +435,26 @@
     "            )[0]\n",
     "        )\n",
     "\n",
+    "    np_up_down = []\n",
+    "    for key in [\"lp_sf_np_up\", \"lp_sf_np_down\"]:\n",
+    "        np_up_down.append(\n",
+    "            np.histogram(\n",
+    "                events[var][0].values.squeeze(),\n",
+    "                np.linspace(*bins[1:], bins[0] + 1),\n",
+    "                weights=events[\"weight\"][0].values * events[key][0].values,\n",
+    "            )[0]\n",
+    "        )\n",
+    "\n",
+    "    um_up_down = []\n",
+    "    for key in [\"lp_sf_unmatched_up\", \"lp_sf_unmatched_down\"]:\n",
+    "        um_up_down.append(\n",
+    "            np.histogram(\n",
+    "                events[var][0].values.squeeze(),\n",
+    "                np.linspace(*bins[1:], bins[0] + 1),\n",
+    "                weights=events[\"weight\"][0].values * events[key].values,\n",
+    "            )[0]\n",
+    "        )\n",
+    "\n",
     "    nom_vals = toy_hists[0]  # first column are nominal values\n",
     "\n",
     "    pt_toy_hists = []\n",
@@ -412,11 +478,30 @@
     "    uncs = {\n",
     "        \"stat_unc\": np.minimum(nom_vals, np.std(toy_hists[1:], axis=0)),  # cap at 100% unc\n",
     "        \"syst_rat_unc\": np.minimum(nom_vals, (np.abs(sys_up_down[0] - sys_up_down[1])) / 2),\n",
-    "        \"syst_sjm_unc\": nom_vals * sj_matching_unc,\n",
+    "        \"np_unc\": np.minimum(nom_vals, (np.abs(np_up_down[0] - np_up_down[1])) / 2),\n",
+    "        \"um_unc\": np.minimum(nom_vals, (np.abs(um_up_down[0] - um_up_down[1])) / 2),\n",
+    "        # \"syst_sjm_unc\": nom_vals * sj_matching_unc,\n",
     "        \"syst_sjpt_unc\": np.minimum(nom_vals, np.std(pt_toy_hists, axis=0)),\n",
     "        \"syst_b_unc\": np.abs(1 - (b_ratio_hist / nom_vals)) * nom_vals,\n",
     "    }\n",
     "\n",
+    "    # uncs = {}\n",
+    "\n",
+    "    # for i, shift in enumerate([\"up\", \"down\"]):\n",
+    "    #     uncs[shift] = {\n",
+    "    #         \"syst_rat_unc\": np.clip(sys_up_down[i], 0, 2 * nom_vals),\n",
+    "    #         \"np_unc\": np.clip(np_up_down[i], 0, 2 * nom_vals),\n",
+    "    #         \"um_unc\": np.clip(um_up_down[i], 0, 2 * nom_vals),\n",
+    "    #     }\n",
+    "\n",
+    "    #     uncs[shift]\n",
+    "\n",
+    "    #     for key, val in uncs_symm.items():\n",
+    "    #         if shift == \"up\":\n",
+    "    #             uncs[shift][key] = nom_vals + val\n",
+    "    #         else:\n",
+    "    #             uncs[shift][key] = nom_vals - val\n",
+    "\n",
     "    uncs_dict[var] = uncs\n",
     "\n",
     "    unc = np.linalg.norm(list(uncs.values()), axis=0)\n",
@@ -488,6 +573,26 @@
     "                )[0]\n",
     "            )\n",
     "\n",
+    "        np_up_down = []\n",
+    "        for key in [\"lp_sf_np_up\", \"lp_sf_np_down\"]:\n",
+    "            np_up_down.append(\n",
+    "                np.histogram(\n",
+    "                    events[var][0].values.squeeze(),\n",
+    "                    np.linspace(*bins[1:], bins[0] + 1),\n",
+    "                    weights=events[\"weight\"][0].values * np.nan_to_num(events[key][0].values),\n",
+    "                )[0]\n",
+    "            )\n",
+    "\n",
+    "        um_up_down = []\n",
+    "        for key in [\"lp_sf_unmatched_up\", \"lp_sf_unmatched_down\"]:\n",
+    "            um_up_down.append(\n",
+    "                np.histogram(\n",
+    "                    events[var][0].values.squeeze(),\n",
+    "                    np.linspace(*bins[1:], bins[0] + 1),\n",
+    "                    weights=events[\"weight\"][0].values * np.nan_to_num(events[key].values),\n",
+    "                )[0]\n",
+    "            )\n",
+    "\n",
     "        nom_vals = toy_hists[0]  # first column are nominal values\n",
     "\n",
     "        pt_toy_hists = []\n",
@@ -511,7 +616,9 @@
     "        uncs = {\n",
     "            \"stat_unc\": np.minimum(nom_vals, np.std(toy_hists[1:], axis=0)),  # cap at 100% unc\n",
     "            \"syst_rat_unc\": np.minimum(nom_vals, (np.abs(sys_up_down[0] - sys_up_down[1])) / 2),\n",
-    "            \"syst_sjm_unc\": nom_vals * sj_matching_unc,\n",
+    "            \"np_unc\": np.minimum(nom_vals, (np.abs(np_up_down[0] - np_up_down[1])) / 2),\n",
+    "            \"um_unc\": np.minimum(nom_vals, (np.abs(um_up_down[0] - um_up_down[1])) / 2),\n",
+    "            # \"syst_sjm_unc\": nom_vals * sj_matching_unc,\n",
     "            \"syst_sjpt_unc\": np.minimum(nom_vals, np.std(pt_toy_hists, axis=0)),\n",
     "            \"syst_b_unc\": np.abs(1 - (b_ratio_hist / nom_vals)) * nom_vals,\n",
     "        }\n",
@@ -884,7 +991,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.11"
+   "version": "3.9.15"
   },
   "orig_nbformat": 4,
   "vscode": {

diff --git a/src/HHbbVV/postprocessing/plotting.py b/src/HHbbVV/postprocessing/plotting.py
@@ -213,7 +213,7 @@ def _asimov_significance(s, b):
     return np.sqrt(2 * ((s + b) * np.log(1 + (s / b)) - s))
 
 
-def add_cms_label(ax, year, label=None):
+def add_cms_label(ax, year, label="Preliminary"):
     if year == "all":
         hep.cms.label(
             label,
@@ -621,6 +621,7 @@ def ratioLinePlot(
             alpha=0.2,
             hatch="//",
             linewidth=0,
+            label="Lund Plane Uncertainty",
         )
 
     # if sig_key in hists:
@@ -635,20 +636,31 @@ def ratioLinePlot(
     hep.histplot(
         hists[data_key, :], ax=ax, yerr=data_err, histtype="errorbar", label=data_key, color="black"
     )
-    ax.legend(ncol=2)
+
+    if bg_err is not None:
+        # Switch order so that uncertainty label comes at the end
+        handles, labels = ax.get_legend_handles_labels()
+        handles = handles[1:] + handles[:1]
+        labels = labels[1:] + labels[:1]
+        ax.legend(handles, labels, ncol=2)
+    else:
+        ax.legend(ncol=2)
+
     ax.set_ylim(0, ax.get_ylim()[1] * 1.5)
 
     data_vals = hists[data_key, :].values()
 
     if not pulls:
-        datamc_ratio = data_vals / (bg_tot + 1e-5)
+        # datamc_ratio = data_vals / (bg_tot + 1e-5)
 
-        if bg_err == "ratio":
-            yerr = ratio_uncertainty(data_vals, bg_tot, "poisson")
-        elif bg_err is None:
-            yerr = 0
-        else:
-            yerr = datamc_ratio * (bg_err / (bg_tot + 1e-8))
+        # if bg_err == "ratio":
+        #     yerr = ratio_uncertainty(data_vals, bg_tot, "poisson")
+        # elif bg_err is None:
+        #     yerr = 0
+        # else:
+        #     yerr = datamc_ratio * (bg_err / (bg_tot + 1e-8))
+
+        yerr = ratio_uncertainty(data_vals, bg_tot, "poisson")
 
         hep.histplot(
             hists[data_key, :] / (sum([hists[sample, :] for sample in bg_keys]).values() + 1e-5),
@@ -658,6 +670,19 @@ def ratioLinePlot(
             color="black",
             capsize=4,
         )
+
+        if bg_err is not None:
+            # (bkg + err) / bkg
+            rax.fill_between(
+                np.repeat(hists.axes[1].edges, 2)[1:-1],
+                np.repeat((bg_tot - bg_err) / bg_tot, 2),
+                np.repeat((bg_tot + bg_err) / bg_tot, 2),
+                color="black",
+                alpha=0.1,
+                hatch="//",
+                linewidth=0,
+            )
+
         rax.set_ylabel("Data/MC")
         rax.set_ylim(0.5, 1.5)
         rax.grid()