Skip to content

Commit

Permalink
Merge pull request #200 from open2c/stats-fix
Browse files (browse the repository at this point in the history)
quick fix bugs in stats
  • Loading branch information
golobor authored Mar 9, 2024
2 parents d4062f5 + e3af225 commit f6574dd
Showing 1 changed file with 11 additions and 20 deletions.
31 changes: 11 additions & 20 deletions pairtools/lib/stats.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -176,17 +176,13 @@ def __getitem__(self, key, filter="no_filter"):
# there is only genomic distance range of the bin that's left:
(bin_range,) = k_fields
# extract left border of the bin "1000000+" or "1500-6000":
dist_bin_left = (
dist_bin_left = int(
bin_range.strip("+")
if bin_range.endswith("+")
else bin_range.split("-")[0]
)
# get the index of that bin:
bin_idx = (
np.searchsorted(self._dist_bins, int(dist_bin_left), "right") - 1
)
# store corresponding value:
return self._stat[filter]["dist_freq"][dirs][bin_idx]
return self._stat[filter]["dist_freq"][dirs][dist_bin_left]
else:
raise ValueError(
"{} is not a valid key: {} section implies 2 identifiers".format(
Expand Down Expand Up @@ -337,20 +333,13 @@ def from_file(cls, file_handle):
# there is only genomic distance range of the bin that's left:
(bin_range,) = key_fields
# extract left border of the bin "1000000+" or "1500-6000":
dist_bin_left = (
dist_bin_left = int(
bin_range.strip("+")
if bin_range.endswith("+")
else bin_range.split("-")[0]
)
# get the index of that bin:
bin_idx = (
np.searchsorted(
stat_from_file._dist_bins, int(dist_bin_left), "right"
)
- 1
)
# store corresponding value:
stat_from_file._stat[default_filter][key][dirs][bin_idx] = int(
stat_from_file._stat[default_filter][key][dirs][dist_bin_left] = int(
fields[1]
)
else:
Expand Down Expand Up @@ -446,10 +435,10 @@ def add_pair(
if chrom1 == chrom2:
self._stat[filter]["cis"] += 1
dist = np.abs(pos2 - pos1)
bin = self._dist_bins[
dist_bin = self._dist_bins[
np.searchsorted(self._dist_bins, dist, "right") - 1
]
self._stat[filter]["dist_freq"][strand1 + strand2][bin] += 1
self._stat[filter]["dist_freq"][strand1 + strand2][dist_bin] += 1
if dist >= 1000:
self._stat[filter]["cis_1kb+"] += 1
if dist >= 2000:
Expand Down Expand Up @@ -702,17 +691,19 @@ def flatten(self, filter="no_filter"):
if (k == "dist_freq") and v:
for i in range(len(self._dist_bins)):
for dirs, freqs in v.items():
dist = self._dist_bins[i]
# last bin is treated differently: "100000+" vs "1200-3000":
if i != len(self._dist_bins) - 1:
dist = self._dist_bins[i]
if i < len(self._dist_bins) - 1:
dist_next = self._dist_bins[i + 1]
formatted_key = self._KEY_SEP.join(
["{}", "{}-{}", "{}"]
).format(k, dist, dist_next, dirs)
else:
elif i == len(self._dist_bins) - 1:
formatted_key = self._KEY_SEP.join(
["{}", "{}+", "{}"]
).format(k, dist, dirs)
else:
raise ValueError("There is a mismatch between dist_freq bins in the instance")
# store key,value pair:
flat_stat[formatted_key] = freqs[dist]
elif (k in ["pair_types", "dedup", "chromsizes"]) and v:
Expand Down

0 comments on commit f6574dd

Please sign in to comment.