-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathpreprocess_dataset_guppy.py
54 lines (38 loc) · 1.65 KB
/
preprocess_dataset_guppy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import pandas as pd
import glob
import os
import subprocess
import argparse
# Run identifier that appears in the directory layout of every downloaded
# "<sample>_fast5" tree (see the glob pattern built in main()).
RUN_NAME = "190205_MN19358_FAK41381_A"
FAST5_SUFFIX = ".fast5"


def _parse_args(argv=None):
    """Parse the command line.

    Args:
        argv: list of argument strings; ``None`` means ``sys.argv[1:]``.

    Returns:
        The populated ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser()
    # The download root is concatenated into every path below, so a missing
    # value is reported as a usage error rather than a TypeError later on.
    parser.add_argument('--root_downloaded', type=str, required=True)
    parser.add_argument('--root_processed', type=str)
    parser.add_argument('--ref', type=str)
    parser.add_argument('--which', type=int, default=0)
    # Accepted for command-line compatibility; not read by this script.
    parser.add_argument('--output_dv', type=str)
    parser.add_argument('--csv', type=str)
    return parser.parse_args(argv)


def _first_match(pattern):
    """Return the lexicographically first path matching *pattern*.

    ``glob.glob`` returns matches in arbitrary order, so the result is sorted
    to make the choice reproducible from one machine to the next.

    Raises:
        FileNotFoundError: if nothing matches *pattern*.
    """
    matches = sorted(glob.glob(pattern))
    if not matches:
        raise FileNotFoundError(f"no path matches {pattern!r}")
    return matches[0]


def _fast5_for_index(sample_path, which):
    """Return the path of fast5 file number *which* next to *sample_path*.

    The trailing "<digits>.fast5" of *sample_path* is replaced by
    "<which>.fast5". All trailing digits are removed, so the result is
    correct even when the sample file carries a multi-digit index
    (e.g. "..._12.fast5").
    """
    if sample_path.endswith(FAST5_SUFFIX):
        prefix = sample_path[:-len(FAST5_SUFFIX)].rstrip("0123456789")
    else:
        # Unexpected name: keep the historical behaviour of dropping the
        # last seven characters instead of failing.
        prefix = sample_path[:-7]
    return prefix + f"{which}.fast5"


def main(argv=None):
    """Build the dataset description CSV for the downloaded samples.

    Reads "list_files.txt" and "list_percents.txt" from the download root,
    pairs them line by line, locates one fast5 file per sample, prints the
    glob pattern and the ``ln -s`` command for each sample (the command is
    only printed, never executed), and writes one row per sample to the
    path given by ``--csv``.

    Args:
        argv: list of argument strings; ``None`` means ``sys.argv[1:]``.

    Raises:
        FileNotFoundError: if a sample's fast5 directory tree is missing.
    """
    args = _parse_args(argv)
    root_downloaded = args.root_downloaded + "/"

    names = pd.read_csv(os.path.join(root_downloaded, "list_files.txt"), names=["f"])
    percents = pd.read_csv(os.path.join(root_downloaded, "list_percents.txt"), names=["p"])

    data = []
    # tolist() yields native Python values, so the "percents" lists are
    # written to the CSV as "[0.5]" and not as a NumPy scalar repr.
    for ifile, (name, p) in enumerate(zip(names.f.tolist(), percents.p.tolist())):
        # Drop the last six characters of the listed name (presumably the
        # ".fastq" extension -- confirm against list_files.txt).
        stem = name[:-6]
        start = root_downloaded + stem + "_fast5" + f"/env/ig/atelier/nanopore/cns/MN19358/RG_IN_PROCESS/{RUN_NAME}/fast5_file_management/workdir/*"
        print(start)
        workdir = _first_match(start)
        f5 = _fast5_for_index(_first_match(workdir + "/*"), args.which)
        fq = root_downloaded + stem + ".fastq"
        data.append({"key": "Brdu_%.2f" % p, "f5": f5, "fq": fq, "ref": args.ref,
                     "percents": [p], "mods": ["B"], "canonical": ["T"],
                     "long_name": ["Brdu"], "mix": True})
        # Dry run: the link command is shown so it can be run by hand.
        cmd = f"ln -s {f5} {args.root_processed}/B_{ifile}_{os.path.split(f5)[1]}"
        print(cmd)

    pd.DataFrame(data).to_csv(args.csv, index=False)


if __name__ == "__main__":
    main()