# get_annotations.py
# SVEN: https://github.com/gao-lab/SVEN
import argparse
import os
os.environ["HDF5_USE_FILE_LOCKING"] = 'FALSE'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import sys
sys.path.append("./")
import h5py
import numpy as np
import tensorflow as tf
from sven.predict import run_hol_prediction, run_sep_prediction
from sven.utils import replace_anno
parser = argparse.ArgumentParser(description='Get functional annotations.')
parser.add_argument('--input_file', type = str, default = "./work_dir/temp.h5", help = 'Input file in hdf5 format, default is ./work_dir/temp.h5.')
parser.add_argument('--type', type = str, default = "tss", help = 'Type of input file: tss, sv or snv. Default is tss.')
parser.add_argument('--work_dir', type = str, default = "./work_dir/", help = 'Work directory, default is ./work_dir/.')
parser.add_argument('--mode', type = str, default = "fast", help = 'Prediction mode, default is fast. Use fast or full.')
parser.add_argument('--seq_len', type = int, default = 131072, help = 'Input sequence length, default is 131_072. No need to change.')
parser.add_argument('--batch_size', type = int, default = 16, help = 'Batch size, default is 16.')
parser.add_argument('--gpu', type = str, default = "-1", help = 'GPU id, default is -1 (CPU)')
parser.add_argument('--model_path', type = str, default = "./model_params/", help = 'Model parameters path, default is ./model_params/')
parser.add_argument('--resume', type = str, default = "true", help = 'Continue prediction or start a new prediction, default is true.')
args = parser.parse_args()
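# Example invocations (illustrative only; the input file paths and GPU id below are
# assumptions for demonstration, not values prescribed by the script):
#   python get_annotations.py --input_file ./work_dir/temp.h5 --type tss --mode fast
#   python get_annotations.py --input_file ./work_dir/variants.h5 --type sv --mode full --gpu 0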
# select GPU or CPU
if args.gpu != "-1":
    # check if GPU is available
    gpus = tf.config.list_physical_devices('GPU')
    gpu_num = len(gpus)
    if gpu_num == 0:
        print("##### No GPU available. Use CPUs instead. #####")
    else:
        print("##### There are %s GPU(s) available in the system. #####" % gpu_num)
        # select GPU
        if int(args.gpu) < gpu_num:
            os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
            print("##### GPU %s is selected for prediction. #####" % args.gpu)
        else:
            os.environ["CUDA_VISIBLE_DEVICES"] = "0"
            print("##### GPU %s is not available. Use GPU 0 instead. #####" % args.gpu)
else:
    print("##### Use CPUs for prediction. #####")
    # force to use CPUs in a system with GPUs
    os.environ["CUDA_VISIBLE_DEVICES"] = ""
def run_prediction(data_input, out_dir, output_prefix):
    run_hol_prediction(data_input, "acc", out_dir, output_prefix, args.seq_len, args.model_path, args.batch_size, args.resume, args.gpu)
    run_hol_prediction(data_input, "tf", out_dir, output_prefix, args.seq_len, args.model_path, args.batch_size, args.resume, args.gpu)
    run_hol_prediction(data_input, "his", out_dir, output_prefix, args.seq_len, args.model_path, args.batch_size, args.resume, args.gpu)
    if args.mode == "full":
        run_sep_prediction(data_input, "tf", out_dir, output_prefix, args.seq_len, args.model_path, args.batch_size, args.resume, args.gpu)
        run_sep_prediction(data_input, "his", out_dir, output_prefix, args.seq_len, args.model_path, args.batch_size, args.resume, args.gpu)
        # replace annotations
        print("##### Replacing annotations... #####")
        replace_anno("tf", out_dir, output_prefix)
        replace_anno("his", out_dir, output_prefix)
if __name__ == "__main__":
    # get absolute paths
    args.model_path = os.path.abspath(args.model_path)
    args.work_dir = os.path.abspath(args.work_dir)
    # Create output directory
    out_dir = args.work_dir + "/annotations/"
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    # Load data
    print("##### Loading data... #####")
    if args.type == "tss":
        data_x = h5py.File(args.input_file, 'r')['seq'][:]
    elif args.type == "sv" or args.type == "snv":
        with h5py.File(args.input_file, 'r') as h5file:
            data_ref = h5file['ref_seq'][:]
            data_alt = h5file['alt_seq'][:]
    else:
        raise ValueError("Invalid type. Please use 'tss', 'sv' or 'snv'.")
    # Run prediction
    if args.mode == "fast":
        print("##### Predicting with fast mode... #####")
    elif args.mode == "full":
        print("##### Predicting with full mode... #####")
    else:
        raise ValueError("Invalid mode. Please use 'fast' or 'full'.")
    if args.type == "tss":
        run_prediction(data_x, out_dir, "tss")
    else:
        run_prediction(data_ref, out_dir, "ref")
        run_prediction(data_alt, out_dir, "alt")
    print("##### Success: get functional annotations. #####")