-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.cpp
303 lines (270 loc) · 9.37 KB
/
main.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
/*
* =====================================================================================
*
* Filename: main.cpp
*
* Description: the framework of NURD.
*
* Version: 1.0
* Created: 02/19/2014 10:21:42 PM
* Revision: none
* Compiler: g++
*
* Author: Xinyun Ma
* Organization: Tsinghua University
*
* =====================================================================================
*/
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <vector>
#include <dirent.h> // directory operation
#include <unistd.h> // parsing argument
#include <sys/stat.h> // mkdir and access
#include <boost/unordered_map.hpp>
#include "class.h"
#include "common.h"
#include "algorithm.h"
using namespace std;
using namespace boost;
void usage(ostream & out)
{
out << "NURD is a tool to estimate isoform expression with RNA-Seq data." << endl;
out << "Version: 1.1.1" << endl;
out << "========================================" << endl;
out << "Usage:\t" << "NURD [options] <-G annotation.gtf>|<-R annotation.refflat> <-S mapping_file.sam>" << endl;
out << "\t" << "-G: annotation.gtf: gene annotation in gtf format." << endl;
out << "\t" << "-R: annotation.refflat: gene annotation in refflat format." << endl;
out << "\t" << "-S: mapping_result.sam: reads mapping result in sam format." << endl;
out << "========================================" << endl;
out << "\t" << "Optional:" << endl;
out << "\t" << "-A: the weight of GBC when mixturing the GBC and LBC into one gene structure matrix. It's a float number between 0-1. Default: 0.5" << endl;
out << "\t" << "-O: output_dir: the directory to output the estimation result. Default: current directory" << endl;
out << "========================================" << endl;
}
// get the name of sam file.
// take both linux and windows into consider.
string get_file_name(const string& dir, int& flag){
flag = 0;
if(dir.size() > 0){
if( dir[dir.size() - 1] == '\\' || dir[dir.size() - 1] == '/' ){
flag = 1;
return "";
}
size_t length = 0;
int i = (int)dir.size() - 1;
for(; i >= 0 ; i--){
if( dir[i] == '\\' || dir[i] == '/' ){
break;
}
length++;
}
return dir.substr(i + 1 , length);
}
else{
flag = 2;
return "";
}
}
// create directory.
int create_dir(const string& dirName)
{
string dir = dirName + "/";
for( int i=1; i < dir.length(); i++)
{
if(dir[i] == '/')
{
dir[i] = 0;
if( access(dir.c_str(), F_OK) != 0 )
{
if(mkdir(dir.c_str(), 0755) == -1) // mode 755 means: rwxr-xr-x
{
return -1;
}
}
dir[i] = '/';
}
}
return 0;
}
int main(int argc, char**argv)
{
// output to stringstream ss
stringstream ss (stringstream::in | stringstream::out);
// parsing argument
unordered_map<string, string> argu_parse_result;
unordered_map<string, string>::iterator argu_parse_iterator;
ifstream in_anno; // annotation file.
ifstream in_rdmap; // reads mapping file.
ofstream out_expr_rpkm; // expression estimation file of rpkm data.
ofstream out_expr_rdcnt; // expression estimation file of read count data.
string in_anno_name;
string in_rdmap_name;
string out_nurd_name;
string out_expr_rpkm_name;
string out_expr_rdcnt_name;
unsigned int anno_choice = 2; // choice of annotation: 1: refflat, 2: GTF
double alpha = 0.5; // weight of GBC and LBC. 0.5 as default.
try
{
int ch;
opterr = 0;
while((ch = getopt(argc,argv,"O:R:G:S:"))!= -1)
{
switch(ch)
{
case 'O': argu_parse_result["O"] = optarg; break;
case 'R': argu_parse_result["R"] = optarg; break;
case 'G': argu_parse_result["G"] = optarg; break;
case 'S': argu_parse_result["S"] = optarg; break;
case 'A': argu_parse_result["A"] = optarg; break;
}
}
//post process of argument.
//situation 1: -O is not specified.
if( (argu_parse_iterator = argu_parse_result.find("O")) == argu_parse_result.end() )
{
argu_parse_result["O"] = "./";
}
else
{
argu_parse_result["O"] += "/";
if( create_dir(argu_parse_result["O"]) == -1 )
{
throw runtime_error("mkdir error!\t");
}
}
//situation 2: neithor of -G or -R is specified: error
if( (argu_parse_result.find("R") == argu_parse_result.end()) && (argu_parse_result.find("G") == argu_parse_result.end()) )
{
throw runtime_error("no annotation is specified.");
}
//situation 3: both of -G and -R are specified: error
if( (argu_parse_result.find("R") != argu_parse_result.end()) && (argu_parse_result.find("G") != argu_parse_result.end()) )
{
throw runtime_error("there are multiple annotation formats.");
}
//situation 4: -R is specified, but no refflat file.
if( argu_parse_result.find("R") != argu_parse_result.end() )
{
in_anno_name = argu_parse_result["R"];
in_anno.open(in_anno_name.c_str());
if(!in_anno.is_open()){
throw runtime_error("can not open refflat file! Please check whether there exists the refflat file you specified.");
}
anno_choice = 1;
}
//situation 5: -G is specified, but no gtf file.
if( argu_parse_result.find("G") != argu_parse_result.end() )
{
in_anno_name = argu_parse_result["G"];
in_anno.open(in_anno_name.c_str());
if(!in_anno.is_open()){
throw runtime_error("can not open gtf file! Please check whether there exists the gtf file you specified.");
}
anno_choice = 0;
}
//situation 6: -S is not specified: error
if( argu_parse_result.find("S") == argu_parse_result.end() )
{
throw runtime_error("no mapping file is specified.");
}
else
{
in_rdmap_name = argu_parse_result["S"];
in_rdmap.open(in_rdmap_name.c_str());
if(!in_rdmap.is_open()){
throw runtime_error("can not open sam file! Please check whether there exists the sam file you specified.");
}
}
// situation 7: -A is out of range.
if( argu_parse_result.find("A") == argu_parse_result.end() )
{
argu_parse_result["A"] = "0.5"; // default: 0.5
}
else{
alpha = atof(argu_parse_result["A"].c_str());
if(alpha > 1.0 || alpha < 0.0){
throw runtime_error("illegal alpha. (alpha should be in [0, 1])");
}
}
}
catch(runtime_error err)
{
ss << "Exception catched:" << "\t";
ss << err.what() << endl << endl;
output_with_time(cerr, ss.str());
ss.str("");
usage(cerr);
return 1;
}
output_with_time(cout, string("expression estimation start!\n"));
//get the file names.
int flag = 0;
string in_rdmap_name_no_dir = get_file_name(in_rdmap_name, flag);
if(flag != 0){
ss << "Invalid sam file name!" << endl;
output_with_time(cerr, ss.str());
ss.str("");
return 1;
}
out_nurd_name = argu_parse_result["O"] + in_rdmap_name_no_dir + ".nurd";
output_with_time(cout, "sam file: " + out_nurd_name + "\n");
out_expr_rpkm_name = out_nurd_name + ".rpkm";
out_expr_rdcnt_name = out_nurd_name + ".rdcnt";
output_with_time(cout, "expression result:\n");
output_with_time(cout, "1: rpkm: " + out_expr_rpkm_name + "\n");
output_with_time(cout, "2: isoform readcnt: " + out_expr_rdcnt_name + "\n");
clock_t global_start, global_end;
clock_t local_start, local_end;
global_start = local_start = clock();
unordered_map<string, gene_info> map_g_info;
get_anno_info(in_anno, anno_choice, map_g_info);
local_end = clock();
ss << "annotation parsing time: ";
ss << ((double)local_end - local_start)/CLOCKS_PER_SEC << " seconds." << endl;
output_with_time(cout, ss.str());
ss.str("");
local_start = local_end;
vector<double> GBC = vector<double>(GBC_BIN_NUM, 0.0);
size_t tot_valid_rd_cnt = 0;
get_exon_rd_cnt(map_g_info, in_rdmap, tot_valid_rd_cnt, GBC);
local_end = clock();
ss << "read counting time: ";
ss << ((double)local_end - local_start)/CLOCKS_PER_SEC << " seconds." << endl;
output_with_time(cout, ss.str());
ss.str("");
local_start = local_end;
out_expr_rpkm.open(out_expr_rpkm_name.c_str());
if(! out_expr_rpkm.is_open()){
ss << "Cannot open file: " << out_expr_rpkm_name << endl;
output_with_time(cerr, ss.str());
ss.str("");
return 1;
}
out_expr_rdcnt.open(out_expr_rdcnt_name.c_str());
if(! out_expr_rdcnt.is_open()){
ss << "Cannot open file: " << out_expr_rdcnt_name << endl;
output_with_time(cerr, ss.str());
ss.str("");
return 1;
}
express_estimate(map_g_info, tot_valid_rd_cnt, alpha, GBC, out_expr_rpkm, out_expr_rdcnt);
local_end = clock();
ss << "expression estimating time: ";
ss << ((double)local_end - local_start)/CLOCKS_PER_SEC << " seconds." << endl;
output_with_time(cout, ss.str());
ss.str("");
local_start = local_end;
ss<<"expression done!"<<endl;
output_with_time(cout, ss.str());
ss.str("");
global_end = clock();
ss << "total time: " << ((double)global_end - global_start)/CLOCKS_PER_SEC << " seconds." <<endl;
output_with_time(cout, ss.str());
ss.str("");
return 0;
}