Skip to content

Commit

Permalink
top analysis and plotting updates
Browse files Browse the repository at this point in the history
  • Loading branch information
rkansal47 committed Jul 22, 2024
1 parent 962d127 commit 085d6bb
Show file tree
Hide file tree
Showing 2 changed files with 189 additions and 57 deletions.
203 changes: 155 additions & 48 deletions src/HHbbVV/postprocessing/TopAnalysis.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,10 @@
"import numpy as np\n",
"import warnings\n",
"import pandas as pd\n",
"from pathlib import Path\n",
"\n",
"from pandas.errors import SettingWithCopyWarning\n",
"from hh_vars import data_key\n",
"from HHbbVV.hh_vars import data_key\n",
"import postprocessing\n",
"\n",
"# ignore these because they don't seem to apply\n",
Expand Down Expand Up @@ -50,11 +51,8 @@
"metadata": {},
"outputs": [],
"source": [
"plot_dir = \"../../../plots/ttsfs/24Mar1_update_lps\"\n",
"\n",
"import os\n",
"\n",
"_ = os.system(f\"mkdir -p {plot_dir}\")"
"plot_dir = Path(\"../../../plots/ttsfs/24Jul22UpdateLP\")\n",
"plot_dir.mkdir(parents=True, exist_ok=True)"
]
},
{
Expand All @@ -63,31 +61,48 @@
"metadata": {},
"outputs": [],
"source": [
"samples = {\n",
"bg_samples = {\n",
" \"QCD\": \"QCD\",\n",
" # \"Single Top\": \"ST\",\n",
" \"Top\": [\"TTToSemiLeptonic\", \"ST\"],\n",
" # \"Top\": [\"TTToSemiLeptonic\", \"ST\"],\n",
" \"TTbar\": [\"TTTo2L2Nu\", \"TTToHadronic\"],\n",
" \"W+Jets\": \"WJets\",\n",
" \"Diboson\": [\"WW\", \"WZ\", \"ZZ\"],\n",
" \"Data\": \"SingleMuon\",\n",
"}\n",
"\n",
"sig_samples = {\"Top\": [\"TTToSemiLeptonic\", \"ST\"]}\n",
"\n",
"samples = {**bg_samples, **sig_samples}\n",
"\n",
"top_matched_key = \"TT Top Matched\"\n",
"\n",
"data_dir = \"../../../../data/ttsfs/24Feb28_update_lp/\"\n",
"# data_dir = \"../../../../data/ttsfs/24Feb28_update_lp/\"\n",
"data_dir = \"/ceph/cms/store/user/rkansal/bbVV/ttsfs/24Feb28_update_lp/\"\n",
"signal_data_dir = \"/ceph/cms/store/user/rkansal/bbVV/ttsfs/24Jul22UpdateLP/\"\n",
"year = \"2018\"\n",
"\n",
"# filters = [(\"('ak8FatJetPt', '0')\", \">=\", 500)]\n",
"filters = None\n",
"\n",
"events_dict = postprocessing.load_samples(data_dir, samples, year, hem_cleaning=False)\n",
"events_dict = postprocessing.load_samples(data_dir, bg_samples, year, hem_cleaning=False)\n",
"events_dict |= postprocessing.load_samples(signal_data_dir, sig_samples, year, hem_cleaning=False)\n",
"\n",
"cutflow = pd.DataFrame(index=list(samples.keys()))\n",
"utils.add_to_cutflow(events_dict, \"Selection\", \"weight\", cutflow)\n",
"cutflow"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"utils.get_pickles(f\"{signal_data_dir}/{year}/TTToSemiLeptonic/pickles\", year, f\"TTToSemiLeptonic\")[\n",
" \"lp_hist\"\n",
"][2, ...]"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -154,6 +169,18 @@
"cutflow"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for key in events_dict:\n",
" events_dict[key] = events_dict[key][events_dict[key][\"ak8FatJetPt\"][0] >= 500]\n",
" events_dict[key] = events_dict[key][events_dict[key][\"ak8FatJetMsd\"][0] >= 125]\n",
" events_dict[key] = events_dict[key][events_dict[key][\"ak8FatJetMsd\"][0] <= 225]"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand All @@ -169,6 +196,44 @@
"# del events_dict[\"TTbar\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"CLIP = 5.0\n",
"\n",
"events = events_dict[top_matched_key]\n",
"\n",
"up_prong_rc = (\n",
" (events[\"lp_sf_outside_boundary_quarks\"][0] > 0)\n",
" | (events[\"lp_sf_double_matched_event\"][0] > 0)\n",
" | (events[\"lp_sf_unmatched_quarks\"][0] > 0)\n",
").to_numpy()\n",
"\n",
"down_prong_rc = (\n",
" (events[\"lp_sf_inside_boundary_quarks\"][0] > 0)\n",
" | (events[\"lp_sf_double_matched_event\"][0] > 0)\n",
" | (events[\"lp_sf_unmatched_quarks\"][0] > 0)\n",
").to_numpy()\n",
"\n",
"# TODO: update\n",
"rc_unmatched = np.zeros(len(events), dtype=bool)\n",
"\n",
"for shift, prong_rc in [(\"up\", up_prong_rc), (\"down\", down_prong_rc)]:\n",
" np_sfs = events[\"lp_sf_lnN\"][0].to_numpy()\n",
" np_sfs[prong_rc] = events[f\"lp_sf_np_{shift}\"][0][prong_rc]\n",
" np_sfs = np.nan_to_num(np.clip(np_sfs, 1.0 / CLIP, CLIP))\n",
" events.loc[:, (f\"lp_sf_np_{shift}\", 0)] = np_sfs / np.mean(np_sfs, axis=0)\n",
"\n",
"for shift in [\"up\", \"down\"]:\n",
" np_sfs = events[\"lp_sf_lnN\"][0].to_numpy()\n",
" np_sfs[rc_unmatched] = CLIP if shift == \"up\" else 1.0 / CLIP\n",
" np_sfs = np.nan_to_num(np.clip(np_sfs, 1.0 / CLIP, CLIP))\n",
" events[f\"lp_sf_unmatched_{shift}\"] = np_sfs / np.mean(np_sfs, axis=0)"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand All @@ -184,10 +249,11 @@
" \"lp_sf_pt_extrap_vars\",\n",
" \"lp_sfs_bl_ratio\",\n",
"]:\n",
" # cut off at 10\n",
" # cut off at 5\n",
" events_dict[top_matched_key].loc[:, key] = np.clip(\n",
" events_dict[top_matched_key].loc[:, key].values, 0.1, 10\n",
" events_dict[top_matched_key].loc[:, key].values, 1.0 / 5.0, 5.0\n",
" )\n",
"\n",
" if key == \"lp_sfs_bl_ratio\":\n",
" mean_lp_sfs = np.mean(\n",
" np.nan_to_num(\n",
Expand Down Expand Up @@ -229,37 +295,17 @@
"metadata": {},
"outputs": [],
"source": [
"for key in events_dict:\n",
" events_dict[key] = events_dict[key][events_dict[key][\"ak8FatJetPt\"][0] >= 500]\n",
" events_dict[key] = events_dict[key][events_dict[key][\"ak8FatJetMsd\"][0] >= 125]\n",
" events_dict[key] = events_dict[key][events_dict[key][\"ak8FatJetMsd\"][0] <= 225]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"events = events_dict[top_matched_key]\n",
"sj_matching_unc = (\n",
" (np.sum(events[\"lp_sf_unmatched_quarks\"]) / (len(events) * 3))\n",
" # OR of double matched and boundary quarks\n",
" # >0.1 to avoid floating point errors\n",
" + (\n",
" np.sum((events[\"lp_sf_double_matched_event\"] + events[\"lp_sf_boundary_quarks\"]) > 0.1)\n",
" / (len(events))\n",
" )\n",
").values[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sj_matching_unc"
"# events = events_dict[top_matched_key]\n",
"# sj_matching_unc = (\n",
"# (np.sum(events[\"lp_sf_unmatched_quarks\"]) / (len(events) * 3))\n",
"# # OR of double matched and boundary quarks\n",
"# # >0.1 to avoid floating point errors\n",
"# + (\n",
"# np.sum((events[\"lp_sf_double_matched_event\"] + events[\"lp_sf_boundary_quarks\"]) > 0.1)\n",
"# / (len(events))\n",
"# )\n",
"# ).values[0]\n",
"# sj_matching_unc"
]
},
{
Expand Down Expand Up @@ -305,7 +351,7 @@
"source": [
"# {var: (bins, label)}\n",
"plot_vars = {\n",
" \"ak8FatJetMsd\": ([20, 125, 225], r\"$m_{SD}$ (GeV)\"),\n",
" # \"ak8FatJetMsd\": ([20, 125, 225], r\"$m_{SD}$ (GeV)\"),\n",
" # \"ak8FatJetParticleNetMass\": ([30, 50, 200], r\"$m_{reg}$ (GeV)\"),\n",
" # \"ak8FatJetPt\": ([30, 0, 1200], r\"$p_T$ (GeV)\"),\n",
" # \"MET_pt\": ([30, 0, 200], r\"MET (GeV)\"),\n",
Expand Down Expand Up @@ -389,6 +435,26 @@
" )[0]\n",
" )\n",
"\n",
" np_up_down = []\n",
" for key in [\"lp_sf_np_up\", \"lp_sf_np_down\"]:\n",
" np_up_down.append(\n",
" np.histogram(\n",
" events[var][0].values.squeeze(),\n",
" np.linspace(*bins[1:], bins[0] + 1),\n",
" weights=events[\"weight\"][0].values * events[key][0].values,\n",
" )[0]\n",
" )\n",
"\n",
" um_up_down = []\n",
" for key in [\"lp_sf_unmatched_up\", \"lp_sf_unmatched_down\"]:\n",
" um_up_down.append(\n",
" np.histogram(\n",
" events[var][0].values.squeeze(),\n",
" np.linspace(*bins[1:], bins[0] + 1),\n",
" weights=events[\"weight\"][0].values * events[key].values,\n",
" )[0]\n",
" )\n",
"\n",
" nom_vals = toy_hists[0] # first column are nominal values\n",
"\n",
" pt_toy_hists = []\n",
Expand All @@ -412,11 +478,30 @@
" uncs = {\n",
" \"stat_unc\": np.minimum(nom_vals, np.std(toy_hists[1:], axis=0)), # cap at 100% unc\n",
" \"syst_rat_unc\": np.minimum(nom_vals, (np.abs(sys_up_down[0] - sys_up_down[1])) / 2),\n",
" \"syst_sjm_unc\": nom_vals * sj_matching_unc,\n",
" \"np_unc\": np.minimum(nom_vals, (np.abs(np_up_down[0] - np_up_down[1])) / 2),\n",
" \"um_unc\": np.minimum(nom_vals, (np.abs(um_up_down[0] - um_up_down[1])) / 2),\n",
" # \"syst_sjm_unc\": nom_vals * sj_matching_unc,\n",
" \"syst_sjpt_unc\": np.minimum(nom_vals, np.std(pt_toy_hists, axis=0)),\n",
" \"syst_b_unc\": np.abs(1 - (b_ratio_hist / nom_vals)) * nom_vals,\n",
" }\n",
"\n",
" # uncs = {}\n",
"\n",
" # for i, shift in enumerate([\"up\", \"down\"]):\n",
" # uncs[shift] = {\n",
" # \"syst_rat_unc\": np.clip(sys_up_down[i], 0, 2 * nom_vals),\n",
" # \"np_unc\": np.clip(np_up_down[i], 0, 2 * nom_vals),\n",
" # \"um_unc\": np.clip(um_up_down[i], 0, 2 * nom_vals),\n",
" # }\n",
"\n",
" # uncs[shift]\n",
"\n",
" # for key, val in uncs_symm.items():\n",
" # if shift == \"up\":\n",
" # uncs[shift][key] = nom_vals + val\n",
" # else:\n",
" # uncs[shift][key] = nom_vals - val\n",
"\n",
" uncs_dict[var] = uncs\n",
"\n",
" unc = np.linalg.norm(list(uncs.values()), axis=0)\n",
Expand Down Expand Up @@ -488,6 +573,26 @@
" )[0]\n",
" )\n",
"\n",
" np_up_down = []\n",
" for key in [\"lp_sf_np_up\", \"lp_sf_np_down\"]:\n",
" np_up_down.append(\n",
" np.histogram(\n",
" events[var][0].values.squeeze(),\n",
" np.linspace(*bins[1:], bins[0] + 1),\n",
" weights=events[\"weight\"][0].values * np.nan_to_num(events[key][0].values),\n",
" )[0]\n",
" )\n",
"\n",
" um_up_down = []\n",
" for key in [\"lp_sf_unmatched_up\", \"lp_sf_unmatched_down\"]:\n",
" um_up_down.append(\n",
" np.histogram(\n",
" events[var][0].values.squeeze(),\n",
" np.linspace(*bins[1:], bins[0] + 1),\n",
" weights=events[\"weight\"][0].values * np.nan_to_num(events[key].values),\n",
" )[0]\n",
" )\n",
"\n",
" nom_vals = toy_hists[0] # first column are nominal values\n",
"\n",
" pt_toy_hists = []\n",
Expand All @@ -511,7 +616,9 @@
" uncs = {\n",
" \"stat_unc\": np.minimum(nom_vals, np.std(toy_hists[1:], axis=0)), # cap at 100% unc\n",
" \"syst_rat_unc\": np.minimum(nom_vals, (np.abs(sys_up_down[0] - sys_up_down[1])) / 2),\n",
" \"syst_sjm_unc\": nom_vals * sj_matching_unc,\n",
" \"np_unc\": np.minimum(nom_vals, (np.abs(np_up_down[0] - np_up_down[1])) / 2),\n",
" \"um_unc\": np.minimum(nom_vals, (np.abs(um_up_down[0] - um_up_down[1])) / 2),\n",
" # \"syst_sjm_unc\": nom_vals * sj_matching_unc,\n",
" \"syst_sjpt_unc\": np.minimum(nom_vals, np.std(pt_toy_hists, axis=0)),\n",
" \"syst_b_unc\": np.abs(1 - (b_ratio_hist / nom_vals)) * nom_vals,\n",
" }\n",
Expand Down Expand Up @@ -884,7 +991,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
"version": "3.9.15"
},
"orig_nbformat": 4,
"vscode": {
Expand Down
43 changes: 34 additions & 9 deletions src/HHbbVV/postprocessing/plotting.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ def _asimov_significance(s, b):
return np.sqrt(2 * ((s + b) * np.log(1 + (s / b)) - s))


def add_cms_label(ax, year, label=None):
def add_cms_label(ax, year, label="Preliminary"):
if year == "all":
hep.cms.label(
label,
Expand Down Expand Up @@ -621,6 +621,7 @@ def ratioLinePlot(
alpha=0.2,
hatch="//",
linewidth=0,
label="Lund Plane Uncertainty",
)

# if sig_key in hists:
Expand All @@ -635,20 +636,31 @@ def ratioLinePlot(
hep.histplot(
hists[data_key, :], ax=ax, yerr=data_err, histtype="errorbar", label=data_key, color="black"
)
ax.legend(ncol=2)

if bg_err is not None:
# Switch order so that uncertainty label comes at the end
handles, labels = ax.get_legend_handles_labels()
handles = handles[1:] + handles[:1]
labels = labels[1:] + labels[:1]
ax.legend(handles, labels, ncol=2)
else:
ax.legend(ncol=2)

ax.set_ylim(0, ax.get_ylim()[1] * 1.5)

data_vals = hists[data_key, :].values()

if not pulls:
datamc_ratio = data_vals / (bg_tot + 1e-5)
# datamc_ratio = data_vals / (bg_tot + 1e-5)

if bg_err == "ratio":
yerr = ratio_uncertainty(data_vals, bg_tot, "poisson")
elif bg_err is None:
yerr = 0
else:
yerr = datamc_ratio * (bg_err / (bg_tot + 1e-8))
# if bg_err == "ratio":
# yerr = ratio_uncertainty(data_vals, bg_tot, "poisson")
# elif bg_err is None:
# yerr = 0
# else:
# yerr = datamc_ratio * (bg_err / (bg_tot + 1e-8))

yerr = ratio_uncertainty(data_vals, bg_tot, "poisson")

hep.histplot(
hists[data_key, :] / (sum([hists[sample, :] for sample in bg_keys]).values() + 1e-5),
Expand All @@ -658,6 +670,19 @@ def ratioLinePlot(
color="black",
capsize=4,
)

if bg_err is not None:
# (bkg + err) / bkg
rax.fill_between(
np.repeat(hists.axes[1].edges, 2)[1:-1],
np.repeat((bg_tot - bg_err) / bg_tot, 2),
np.repeat((bg_tot + bg_err) / bg_tot, 2),
color="black",
alpha=0.1,
hatch="//",
linewidth=0,
)

rax.set_ylabel("Data/MC")
rax.set_ylim(0.5, 1.5)
rax.grid()
Expand Down

0 comments on commit 085d6bb

Please sign in to comment.