diff --git a/src/HHbbVV/postprocessing/TopAnalysis.ipynb b/src/HHbbVV/postprocessing/TopAnalysis.ipynb
index 18c2c9a9..4f4cbf16 100644
--- a/src/HHbbVV/postprocessing/TopAnalysis.ipynb
+++ b/src/HHbbVV/postprocessing/TopAnalysis.ipynb
@@ -11,9 +11,10 @@
     "import numpy as np\n",
     "import warnings\n",
     "import pandas as pd\n",
+    "from pathlib import Path\n",
     "\n",
     "from pandas.errors import SettingWithCopyWarning\n",
-    "from hh_vars import data_key\n",
+    "from HHbbVV.hh_vars import data_key\n",
     "import postprocessing\n",
     "\n",
     "# ignore these because they don't seem to apply\n",
@@ -50,11 +51,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "plot_dir = \"../../../plots/ttsfs/24Mar1_update_lps\"\n",
-    "\n",
-    "import os\n",
-    "\n",
-    "_ = os.system(f\"mkdir -p {plot_dir}\")"
+    "plot_dir = Path(\"../../../plots/ttsfs/24Jul22UpdateLP\")\n",
+    "plot_dir.mkdir(parents=True, exist_ok=True)"
    ]
   },
   {
@@ -63,31 +61,48 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "samples = {\n",
+    "bg_samples = {\n",
     "    \"QCD\": \"QCD\",\n",
-    "    # \"Single Top\": \"ST\",\n",
-    "    \"Top\": [\"TTToSemiLeptonic\", \"ST\"],\n",
+    "    # \"Top\": [\"TTToSemiLeptonic\", \"ST\"],\n",
     "    \"TTbar\": [\"TTTo2L2Nu\", \"TTToHadronic\"],\n",
     "    \"W+Jets\": \"WJets\",\n",
     "    \"Diboson\": [\"WW\", \"WZ\", \"ZZ\"],\n",
     "    \"Data\": \"SingleMuon\",\n",
     "}\n",
     "\n",
+    "sig_samples = {\"Top\": [\"TTToSemiLeptonic\", \"ST\"]}\n",
+    "\n",
+    "samples = {**bg_samples, **sig_samples}\n",
+    "\n",
     "top_matched_key = \"TT Top Matched\"\n",
     "\n",
-    "data_dir = \"../../../../data/ttsfs/24Feb28_update_lp/\"\n",
+    "# data_dir = \"../../../../data/ttsfs/24Feb28_update_lp/\"\n",
+    "data_dir = \"/ceph/cms/store/user/rkansal/bbVV/ttsfs/24Feb28_update_lp/\"\n",
+    "signal_data_dir = \"/ceph/cms/store/user/rkansal/bbVV/ttsfs/24Jul22UpdateLP/\"\n",
     "year = \"2018\"\n",
     "\n",
     "# filters = [(\"('ak8FatJetPt', '0')\", \">=\", 500)]\n",
     "filters = None\n",
     "\n",
-    "events_dict = postprocessing.load_samples(data_dir, samples, year, hem_cleaning=False)\n",
+    "events_dict = postprocessing.load_samples(data_dir, bg_samples, year, hem_cleaning=False)\n",
+    "events_dict |= postprocessing.load_samples(signal_data_dir, sig_samples, year, hem_cleaning=False)\n",
     "\n",
     "cutflow = pd.DataFrame(index=list(samples.keys()))\n",
     "utils.add_to_cutflow(events_dict, \"Selection\", \"weight\", cutflow)\n",
     "cutflow"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "utils.get_pickles(f\"{signal_data_dir}/{year}/TTToSemiLeptonic/pickles\", year, \"TTToSemiLeptonic\")[\n",
+    "    \"lp_hist\"\n",
+    "][2, ...]"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -154,6 +169,18 @@
     "cutflow"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for key in events_dict:\n",
+    "    events_dict[key] = events_dict[key][events_dict[key][\"ak8FatJetPt\"][0] >= 500]\n",
+    "    events_dict[key] = events_dict[key][events_dict[key][\"ak8FatJetMsd\"][0] >= 125]\n",
+    "    events_dict[key] = events_dict[key][events_dict[key][\"ak8FatJetMsd\"][0] <= 225]"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -169,6 +196,44 @@
     "# del events_dict[\"TTbar\"]"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "CLIP = 5.0\n",
+    "\n",
+    "events = events_dict[top_matched_key]\n",
+    "\n",
+    "up_prong_rc = (\n",
+    "    (events[\"lp_sf_outside_boundary_quarks\"][0] > 0)\n",
+    "    | (events[\"lp_sf_double_matched_event\"][0] > 0)\n",
+    "    | (events[\"lp_sf_unmatched_quarks\"][0] > 0)\n",
+    ").to_numpy()\n",
+    "\n",
+    "down_prong_rc = (\n",
+    "    (events[\"lp_sf_inside_boundary_quarks\"][0] > 0)\n",
+    "    | (events[\"lp_sf_double_matched_event\"][0] > 0)\n",
+    "    | (events[\"lp_sf_unmatched_quarks\"][0] > 0)\n",
+    ").to_numpy()\n",
+    "\n",
+    "# TODO: update\n",
+    "rc_unmatched = np.zeros(len(events), dtype=bool)\n",
+    "\n",
+    "for shift, prong_rc in [(\"up\", up_prong_rc), (\"down\", down_prong_rc)]:\n",
+    "    np_sfs = events[\"lp_sf_lnN\"][0].to_numpy()\n",
+    "    np_sfs[prong_rc] = events[f\"lp_sf_np_{shift}\"][0][prong_rc]\n",
+    "    np_sfs = np.nan_to_num(np.clip(np_sfs, 1.0 / CLIP, CLIP))\n",
+    "    events.loc[:, (f\"lp_sf_np_{shift}\", 0)] = np_sfs / np.mean(np_sfs, axis=0)\n",
+    "\n",
+    "for shift in [\"up\", \"down\"]:\n",
+    "    np_sfs = events[\"lp_sf_lnN\"][0].to_numpy()\n",
+    "    np_sfs[rc_unmatched] = CLIP if shift == \"up\" else 1.0 / CLIP\n",
+    "    np_sfs = np.nan_to_num(np.clip(np_sfs, 1.0 / CLIP, CLIP))\n",
+    "    events[f\"lp_sf_unmatched_{shift}\"] = np_sfs / np.mean(np_sfs, axis=0)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -184,10 +249,11 @@
     "    \"lp_sf_pt_extrap_vars\",\n",
     "    \"lp_sfs_bl_ratio\",\n",
     "]:\n",
-    "    # cut off at 10\n",
+    "    # cut off at 5\n",
     "    events_dict[top_matched_key].loc[:, key] = np.clip(\n",
-    "        events_dict[top_matched_key].loc[:, key].values, 0.1, 10\n",
+    "        events_dict[top_matched_key].loc[:, key].values, 1.0 / 5.0, 5.0\n",
     "    )\n",
+    "\n",
     "    if key == \"lp_sfs_bl_ratio\":\n",
     "        mean_lp_sfs = np.mean(\n",
     "            np.nan_to_num(\n",
@@ -229,37 +295,17 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "for key in events_dict:\n",
-    "    events_dict[key] = events_dict[key][events_dict[key][\"ak8FatJetPt\"][0] >= 500]\n",
-    "    events_dict[key] = events_dict[key][events_dict[key][\"ak8FatJetMsd\"][0] >= 125]\n",
-    "    events_dict[key] = events_dict[key][events_dict[key][\"ak8FatJetMsd\"][0] <= 225]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "events = events_dict[top_matched_key]\n",
-    "sj_matching_unc = (\n",
-    "    (np.sum(events[\"lp_sf_unmatched_quarks\"]) / (len(events) * 3))\n",
-    "    # OR of double matched and boundary quarks\n",
-    "    # >0.1 to avoid floating point errors\n",
-    "    + (\n",
-    "        np.sum((events[\"lp_sf_double_matched_event\"] + events[\"lp_sf_boundary_quarks\"]) > 0.1)\n",
-    "        / (len(events))\n",
-    "    )\n",
-    ").values[0]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "sj_matching_unc"
+    "# events = events_dict[top_matched_key]\n",
+    "# sj_matching_unc = (\n",
+    "#     (np.sum(events[\"lp_sf_unmatched_quarks\"]) / (len(events) * 3))\n",
+    "#     # OR of double matched and boundary quarks\n",
+    "#     # >0.1 to avoid floating point errors\n",
+    "#     + (\n",
+    "#         np.sum((events[\"lp_sf_double_matched_event\"] + events[\"lp_sf_boundary_quarks\"]) > 0.1)\n",
+    "#         / (len(events))\n",
+    "#     )\n",
+    "# ).values[0]\n",
+    "# sj_matching_unc"
    ]
   },
   {
@@ -305,7 +351,7 @@
    "source": [
     "# {var: (bins, label)}\n",
     "plot_vars = {\n",
-    "    \"ak8FatJetMsd\": ([20, 125, 225], r\"$m_{SD}$ (GeV)\"),\n",
+    "    # \"ak8FatJetMsd\": ([20, 125, 225], r\"$m_{SD}$ (GeV)\"),\n",
     "    # \"ak8FatJetParticleNetMass\": ([30, 50, 200], r\"$m_{reg}$ (GeV)\"),\n",
     "    # \"ak8FatJetPt\": ([30, 0, 1200], r\"$p_T$ (GeV)\"),\n",
     "    # \"MET_pt\": ([30, 0, 200], r\"MET (GeV)\"),\n",
@@ -389,6 +435,26 @@
     "        )[0]\n",
     "    )\n",
     "\n",
+    "    np_up_down = []\n",
+    "    for key in [\"lp_sf_np_up\", \"lp_sf_np_down\"]:\n",
+    "        np_up_down.append(\n",
+    "            np.histogram(\n",
+    "                events[var][0].values.squeeze(),\n",
+    "                np.linspace(*bins[1:], bins[0] + 1),\n",
+    "                weights=events[\"weight\"][0].values * events[key][0].values,\n",
+    "            )[0]\n",
+    "        )\n",
+    "\n",
+    "    um_up_down = []\n",
+    "    for key in [\"lp_sf_unmatched_up\", \"lp_sf_unmatched_down\"]:\n",
+    "        um_up_down.append(\n",
+    "            np.histogram(\n",
+    "                events[var][0].values.squeeze(),\n",
+    "                np.linspace(*bins[1:], bins[0] + 1),\n",
+    "                weights=events[\"weight\"][0].values * events[key].values,\n",
+    "            )[0]\n",
+    "        )\n",
+    "\n",
     "    nom_vals = toy_hists[0]  # first column are nominal values\n",
     "\n",
     "    pt_toy_hists = []\n",
@@ -412,11 +478,30 @@
     "    uncs = {\n",
     "        \"stat_unc\": np.minimum(nom_vals, np.std(toy_hists[1:], axis=0)),  # cap at 100% unc\n",
     "        \"syst_rat_unc\": np.minimum(nom_vals, (np.abs(sys_up_down[0] - sys_up_down[1])) / 2),\n",
-    "        \"syst_sjm_unc\": nom_vals * sj_matching_unc,\n",
+    "        \"np_unc\": np.minimum(nom_vals, (np.abs(np_up_down[0] - np_up_down[1])) / 2),\n",
+    "        \"um_unc\": np.minimum(nom_vals, (np.abs(um_up_down[0] - um_up_down[1])) / 2),\n",
+    "        # \"syst_sjm_unc\": nom_vals * sj_matching_unc,\n",
     "        \"syst_sjpt_unc\": np.minimum(nom_vals, np.std(pt_toy_hists, axis=0)),\n",
     "        \"syst_b_unc\": np.abs(1 - (b_ratio_hist / nom_vals)) * nom_vals,\n",
     "    }\n",
     "\n",
+    "    # uncs = {}\n",
+    "\n",
+    "    # for i, shift in enumerate([\"up\", \"down\"]):\n",
+    "    #     uncs[shift] = {\n",
+    "    #         \"syst_rat_unc\": np.clip(sys_up_down[i], 0, 2 * nom_vals),\n",
+    "    #         \"np_unc\": np.clip(np_up_down[i], 0, 2 * nom_vals),\n",
+    "    #         \"um_unc\": np.clip(um_up_down[i], 0, 2 * nom_vals),\n",
+    "    #     }\n",
+    "\n",
+    "    #     uncs[shift]\n",
+    "\n",
+    "    #     for key, val in uncs_symm.items():\n",
+    "    #         if shift == \"up\":\n",
+    "    #             uncs[shift][key] = nom_vals + val\n",
+    "    #         else:\n",
+    "    #             uncs[shift][key] = nom_vals - val\n",
+    "\n",
     "    uncs_dict[var] = uncs\n",
     "\n",
     "    unc = np.linalg.norm(list(uncs.values()), axis=0)\n",
@@ -488,6 +573,26 @@
     "        )[0]\n",
     "    )\n",
     "\n",
+    "    np_up_down = []\n",
+    "    for key in [\"lp_sf_np_up\", \"lp_sf_np_down\"]:\n",
+    "        np_up_down.append(\n",
+    "            np.histogram(\n",
+    "                events[var][0].values.squeeze(),\n",
+    "                np.linspace(*bins[1:], bins[0] + 1),\n",
+    "                weights=events[\"weight\"][0].values * np.nan_to_num(events[key][0].values),\n",
+    "            )[0]\n",
+    "        )\n",
+    "\n",
+    "    um_up_down = []\n",
+    "    for key in [\"lp_sf_unmatched_up\", \"lp_sf_unmatched_down\"]:\n",
+    "        um_up_down.append(\n",
+    "            np.histogram(\n",
+    "                events[var][0].values.squeeze(),\n",
+    "                np.linspace(*bins[1:], bins[0] + 1),\n",
+    "                weights=events[\"weight\"][0].values * np.nan_to_num(events[key].values),\n",
+    "            )[0]\n",
+    "        )\n",
+    "\n",
     "    nom_vals = toy_hists[0]  # first column are nominal values\n",
     "\n",
     "    pt_toy_hists = []\n",
@@ -511,7 +616,9 @@
     "    uncs = {\n",
     "        \"stat_unc\": np.minimum(nom_vals, np.std(toy_hists[1:], axis=0)),  # cap at 100% unc\n",
     "        \"syst_rat_unc\": np.minimum(nom_vals, (np.abs(sys_up_down[0] - sys_up_down[1])) / 2),\n",
-    "        \"syst_sjm_unc\": nom_vals * sj_matching_unc,\n",
+    "        \"np_unc\": np.minimum(nom_vals, (np.abs(np_up_down[0] - np_up_down[1])) / 2),\n",
+    "        \"um_unc\": np.minimum(nom_vals, (np.abs(um_up_down[0] - um_up_down[1])) / 2),\n",
+    "        # \"syst_sjm_unc\": nom_vals * sj_matching_unc,\n",
     "        \"syst_sjpt_unc\": np.minimum(nom_vals, np.std(pt_toy_hists, axis=0)),\n",
     "        \"syst_b_unc\": np.abs(1 - (b_ratio_hist / nom_vals)) * nom_vals,\n",
     "    }\n",
@@ -884,7 +991,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.11"
+   "version": "3.9.15"
   },
   "orig_nbformat": 4,
   "vscode": {
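Note on the notebook changes above: every new Lund plane SF variation follows the same pattern. Per-event scale factors are clipped to [1/CLIP, CLIP], NaNs are zeroed, and the result is divided by its mean so the variation changes only the template shape, not the overall yield; the up/down templates are then histogrammed with weight * SF, symmetrized per bin as |up - down| / 2, capped at 100% of the nominal, and combined in quadrature via np.linalg.norm. A minimal self-contained sketch of that pattern, where sfs, weights, and var are toy stand-ins rather than the actual notebook columns:

import numpy as np

CLIP = 5.0


def normalized_sfs(sfs, clip=CLIP):
    # Clip per-event SFs to [1/clip, clip] and renormalize to unit mean,
    # so the variation affects the shape but not the normalization.
    sfs = np.nan_to_num(np.clip(sfs, 1.0 / clip, clip))
    return sfs / np.mean(sfs)


rng = np.random.default_rng(42)
sfs = rng.lognormal(sigma=1.0, size=1000)  # toy per-event scale factors
weights = np.ones(1000)                    # toy event weights
var = rng.normal(175.0, 25.0, size=1000)   # toy observable, e.g. m_SD

bins = np.linspace(125, 225, 21)
nom, _ = np.histogram(var, bins, weights=weights)
up, _ = np.histogram(var, bins, weights=weights * normalized_sfs(sfs))
down, _ = np.histogram(var, bins, weights=weights * normalized_sfs(1.0 / sfs))

# Symmetrized per-bin uncertainty, capped at 100% of the nominal,
# mirroring the entries of the `uncs` dicts in the notebook.
unc = np.minimum(nom, np.abs(up - down) / 2)

# Quadrature sum of components (second component is a toy 10% uncertainty).
total_unc = np.linalg.norm([unc, 0.1 * nom], axis=0)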
diff --git a/src/HHbbVV/postprocessing/plotting.py b/src/HHbbVV/postprocessing/plotting.py
index 20f57791..887b12bf 100644
--- a/src/HHbbVV/postprocessing/plotting.py
+++ b/src/HHbbVV/postprocessing/plotting.py
@@ -213,7 +213,7 @@ def _asimov_significance(s, b):
     return np.sqrt(2 * ((s + b) * np.log(1 + (s / b)) - s))
 
 
-def add_cms_label(ax, year, label=None):
+def add_cms_label(ax, year, label="Preliminary"):
     if year == "all":
         hep.cms.label(
             label,
@@ -621,6 +621,7 @@ def ratioLinePlot(
             alpha=0.2,
             hatch="//",
             linewidth=0,
+            label="Lund Plane Uncertainty",
         )
 
     # if sig_key in hists:
@@ -635,20 +636,31 @@
     hep.histplot(
         hists[data_key, :], ax=ax, yerr=data_err, histtype="errorbar", label=data_key, color="black"
     )
-    ax.legend(ncol=2)
+
+    if bg_err is not None:
+        # Switch order so that uncertainty label comes at the end
+        handles, labels = ax.get_legend_handles_labels()
+        handles = handles[1:] + handles[:1]
+        labels = labels[1:] + labels[:1]
+        ax.legend(handles, labels, ncol=2)
+    else:
+        ax.legend(ncol=2)
+
     ax.set_ylim(0, ax.get_ylim()[1] * 1.5)
 
     data_vals = hists[data_key, :].values()
 
     if not pulls:
-        datamc_ratio = data_vals / (bg_tot + 1e-5)
+        # datamc_ratio = data_vals / (bg_tot + 1e-5)
 
-        if bg_err == "ratio":
-            yerr = ratio_uncertainty(data_vals, bg_tot, "poisson")
-        elif bg_err is None:
-            yerr = 0
-        else:
-            yerr = datamc_ratio * (bg_err / (bg_tot + 1e-8))
+        # if bg_err == "ratio":
+        #     yerr = ratio_uncertainty(data_vals, bg_tot, "poisson")
+        # elif bg_err is None:
+        #     yerr = 0
+        # else:
+        #     yerr = datamc_ratio * (bg_err / (bg_tot + 1e-8))
+
+        yerr = ratio_uncertainty(data_vals, bg_tot, "poisson")
 
         hep.histplot(
             hists[data_key, :] / (sum([hists[sample, :] for sample in bg_keys]).values() + 1e-5),
@@ -658,6 +670,19 @@
             color="black",
             capsize=4,
         )
+
+        if bg_err is not None:
+            # (bkg + err) / bkg
+            rax.fill_between(
+                np.repeat(hists.axes[1].edges, 2)[1:-1],
+                np.repeat((bg_tot - bg_err) / bg_tot, 2),
+                np.repeat((bg_tot + bg_err) / bg_tot, 2),
+                color="black",
+                alpha=0.1,
+                hatch="//",
+                linewidth=0,
+            )
+
     rax.set_ylabel("Data/MC")
     rax.set_ylim(0.5, 1.5)
     rax.grid()
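Note on the plotting.py change: the new rax.fill_between call draws a stepped (bkg ± err)/bkg band on the data/MC ratio panel by duplicating bin edges and per-bin values into step-function coordinates. A standalone sketch of just that idiom; edges, bg_tot, and bg_err below are toy placeholders, not the function's real inputs:

import matplotlib.pyplot as plt
import numpy as np

edges = np.linspace(125, 225, 21)  # n + 1 bin edges
bg_tot = np.full(20, 100.0)        # toy total background yield per bin
bg_err = 10.0 * np.ones(20)        # toy absolute uncertainty per bin

# np.repeat(edges, 2)[1:-1] visits every interior edge twice, giving
# 2n x-coordinates that pair with the np.repeat(vals, 2) y-coordinates.
x = np.repeat(edges, 2)[1:-1]
lo = np.repeat((bg_tot - bg_err) / bg_tot, 2)
hi = np.repeat((bg_tot + bg_err) / bg_tot, 2)

fig, rax = plt.subplots()
rax.fill_between(x, lo, hi, color="black", alpha=0.1, hatch="//", linewidth=0)
rax.axhline(1.0, color="gray", linestyle="--")
rax.set_ylim(0.5, 1.5)
rax.set_ylabel("Data/MC")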