Skip to content

Commit

Permalink
add fqstat genebed
Browse files Browse the repository at this point in the history
  • Loading branch information
kdm9 committed Oct 10, 2024
1 parent c35e451 commit f6f2d38
Show file tree
Hide file tree
Showing 3 changed files with 109 additions and 0 deletions.
6 changes: 6 additions & 0 deletions blsl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,10 @@
from .gfftagsane import main as gfftagsane_main
cmds["gfftagsane"] = gfftagsane_main

from .genebed import main as genebed_main
cmds["genebed"] = genebed_main


from .liftoff_gff3 import liftoff_gff3_main
cmds["liftoff-gff3"] = liftoff_gff3_main

Expand All @@ -89,6 +93,8 @@
from .pairs import main as pairs_main
cmds["pairs"] = pairs_main

from .fqstat import main as fqstat_main
cmds["fqstat"] = fqstat_main


try:
Expand Down
73 changes: 73 additions & 0 deletions blsl/fqstat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#!/usr/bin/env python3
import gzip
import io
from collections import Counter
from dataclasses import dataclass
from pathlib import Path
from concurrent.futures import as_completed, ProcessPoolExecutor
import argparse
import multiprocessing


def head(raw, size=1_000_000):
return io.BytesIO(raw.read(size))

@dataclass
class FQStat:
path: Path
file_size: int
estimated_nreads: float
mean_read_len: float
mean_record_size: float
n_reads_sampled: int
bytes_per_record: float


def estimate_fq_stats(fq, head_bytes=1_000_000):
fq = Path(fq)
with open(fq, "rb") as fh:
buf = head(fh, size=head_bytes)
bytes_read = len(buf.getbuffer())
zfh = gzip.open(buf)
n = 0
recsizes = 0
readlens = 0
try:
for hdr, seq, qhdr, qual in zip(zfh, zfh, zfh, zfh):
n += 1
recsizes += len(hdr) + len(seq) + len(qhdr) + len(qual)
readlens += len(seq) -1
except EOFError:
pass
fsize = fq.stat().st_size
estim_reads = fsize / bytes_read * n
return FQStat(path=fq, file_size=fsize, estimated_nreads=round(estim_reads),
mean_read_len=readlens/n, mean_record_size=recsizes/n,
n_reads_sampled=n, bytes_per_record=bytes_read/n)



def main(argv=None):
ap = argparse.ArgumentParser()
ap.add_argument("--out", "-o", type=argparse.FileType("w"), default=stdout,
help="Output table")
ap.add_argument("--threads", "-j", type=int, default=multiprocessing.cpu_count(),
help="Parallel CPUs")
ap.add_argument("--head", "-b", type=int, default=1_000_000,
help="Inspect the first N bytes")
ap.add_argument("fastqs", nargs="+")

args = ap.parse_args(argv)

results = []
with ProcessPoolExecutor(args.threads) as exc:
jobs = set()
for fq in args.fastqs:
jobs.add(exc.submit(estimate_fq_stats, fq, head_bytes=args.head))
for res in tqdm(as_completed(jobs)):
results.append(res.result())

print("path", "file_size", "estimated_n_reads", "read_length", "record_size", "bytes_per_record", sep="\t", file=args.out)
for res in results:
print(res.path, res.file_size, res.estimated_nreads, res.mean_read_len, res.mean_record_size, res.bytes_per_record, sep="\t", file=args.out)

30 changes: 30 additions & 0 deletions blsl/genebed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/usr/bin/env python3
import argparse
from sys import stdout
from .gffparse import parseGFF3, write_line
from tqdm import tqdm

def main(argv=None):
"""Extract a BED file of genes from a GFF"""
ap = argparse.ArgumentParser("blsl genebed")
ap.add_argument("-o", "--output", type=argparse.FileType("wt"), default=stdout,
help="Output gff file")
#ap.add_argument("-f", "--fields", default="ID,Name,Parent,locus_tag",
# help="Attribute tags to keep (case sensitive, give multiple times like -f ID -f tag2 -f tag3).")
ap.add_argument("input")

args = ap.parse_args(argv)
#args.fields = args.fields.split(",")

def N(x):
if x is None:
return "."
return str(x)

for record in tqdm(parseGFF3(args.input, return_as=dict)):
if record["type"] != "gene":
continue
print(record["seqid"], record["start"], record["end"], record["attributes"]["ID"], N(record["score"]), N(record["strand"]), sep="\t", file=args.output)

if __name__ == "__main__":
main()

0 comments on commit f6f2d38

Please sign in to comment.