-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnwpipe
executable file
·377 lines (346 loc) · 10.6 KB
/
nwpipe
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
#!/bin/bash
#TODO blacklist failed subjects?
#TODO option to symlink dependencies instead of copying
#######################################
# Print the nwpipe help text and terminate the script.
# Arguments: none
# Outputs:   the help text on stdout
# Returns:   never returns; always exits with status 1
#######################################
Usage() {
  # Quoted delimiter: the help text is emitted literally, so nothing in it
  # can ever be mistaken for a shell expansion.
  cat <<'EOF'
nwpipe - A tool for pipelining together multiple tasks in
processing subject data. A call to this script represents a single stage
within a larger pipeline.
Nate Wetter <[email protected]>
Brad Sutton <[email protected]>
Magnetic Resonance Functional Imaging Lab <mrfil.bioen.illinois.edu>
University of Illinois at Urbana-Champaign <illinois.edu>
Usage: nwpipe <cmd> <subject(s)> [options]
<cmd> : The command to run on each subject. It needs to be enclosed in
double quotes. The command will be run in the context of the output
directory, so any paths should be relative to there.
Known issue: wildcards within cmd may break "-numthreads auto".
<subject(s)> : One or more names of subject directories, separated by
spaces. Globbing is supported. Alternatively, this can be the name of a
text file that contains all the names. Making a file called
"nwpipe_exclude" in a subject directory will remove that subject from
analysis.
Options:
-numthreads <number | auto> : The number of separate processes to run
multiple subjects in parallel. If "auto" is specified, the number of
threads will be determined based on the memory requirements of the
command, which will be determined automatically. In this case, -memory
is required. Defaults to "1".
-memory <number> : The amount of memory to allow the pipeline to use when
-numthreads is set to "auto".
-maxthreads <number> : The maximum number of threads to use when
-numthreads is set to "auto".
-analysis <path> : A path relative to the subject directory where all
analysis output directories reside. Defaults to ".".
-outdir <path> : A subdirectory of the analysis directory where the
output of this pipeline stage will go. Defaults to ".".
-in <source> [destination] : Existence of each <source> will be verified
for each subject before running. If [destination] is specified, the
file will be copied there. <source> is relative to the analysis
directory, while [destination] is relative to the output directory.
Multiple -in parameters may be specified.
-out <file> : Relative to the output directory. Before running each
subject, existence will indicate that the subject was already run, and
it will not be repeated. After running a subject, existence will
indicate success. Multiple -out parameters may be specified.
-v : Verbose. Print more information.
-test : Instead of running <cmd>, make fake -out files. Good for making
sure all -in and -out files match up between successive nwpipe calls.
Known issue: If -out contains directory structure, that structure will need
to be created manually.
EOF
  exit 1
}
# A command and at least one further argument are mandatory: when the second
# positional parameter is missing or empty, show the help text (which exits).
if [ -z "$2" ]; then
  Usage
fi
###################################
# default values / initialization
###################################

# Scalar settings; each may be overridden by a command-line option.
numthreads=1    # -numthreads
memamount=""    # -memory
maxthreads=""   # -maxthreads
analysisdir="." # -analysis
outdir="."      # -outdir
verbose=0       # -v

# Lists collected from the command line.
declare -a sublist=() depsource=() depdest=() outfile=()

# Per-subject classification, filled in while the subjects are checked.
declare -a todo=() done=() tonotdo=() excluded=()
###################################
# read inputs
###################################
#TODO optional input files

# Report a command-line problem on stderr and abort the stage with status 1.
nwpipe_arg_error() {
  echo "nwpipe: $*" >&2
  echo "Run nwpipe with no arguments to see the usage text." >&2
  exit 1
}

#######################################
# Parse the nwpipe command line.
# Globals written: cmd, sublist, numthreads, memamount, maxthreads,
#   analysisdir, outdir, depsource, depdest, outfile, verbose
# Arguments: the script's positional parameters,
#   <cmd> <subject(s)> [options]
# Side effects: with -test, creates nwpipe_test_subject/<analysisdir>.
# Exits 1 on an unrecognized argument or an option that lacks its value
# (both used to make the parser loop forever).
#######################################
nwpipe_read_inputs() {
  local opt val listfile testmode=0

  cmd=$1
  shift

  # Subjects: every argument up to the first option (or empty argument).
  while [[ $# -gt 0 && -n "$1" && "$1" != -* ]]; do
    sublist+=("$1")
    shift
  done

  while [[ $# -gt 0 ]]; do
    opt=$1
    case "$opt" in
      -v)
        verbose=1
        shift
        continue
        ;;
      -test)
        testmode=1
        shift
        continue
        ;;
      -numthreads | -memory | -maxthreads | -analysis | -outdir | -in | -out)
        # "shift 2" does nothing when only one argument is left, so a
        # missing value has to be rejected before shifting.
        [[ $# -ge 2 ]] || nwpipe_arg_error "option $opt requires a value"
        ;;
      *)
        nwpipe_arg_error "unrecognized argument '$opt'"
        ;;
    esac

    val=$2
    shift 2
    case "$opt" in
      -numthreads) numthreads=$val ;;
      -memory) memamount=$val ;;
      -maxthreads) maxthreads=$val ;;
      -analysis) analysisdir=$val ;;
      -outdir) outdir=$val ;;
      -out) outfile+=("$val") ;;
      -in)
        depsource+=("$val")
        # An optional destination follows unless the next word is an option.
        if [[ $# -gt 0 && -n "$1" && "$1" != -* ]]; then
          depdest+=("$1")
          shift
        else
          depdest+=("-nocopy")
        fi
        ;;
    esac
  done

  if ((testmode)); then
    # Test mode runs against a single scratch subject. It is handled before
    # the list-file check so that a not-yet-existing scratch directory is
    # created instead of being read as a subject list.
    sublist=("nwpipe_test_subject")
    mkdir -p "${sublist[0]}/$analysisdir" ||
      nwpipe_arg_error "cannot create ${sublist[0]}/$analysisdir"
  elif [[ ${#sublist[@]} -gt 0 && ! -d "${sublist[0]}" ]]; then
    # The first "subject" is not a directory: treat it as a text file that
    # lists subject names.
    listfile=${sublist[0]}
    if [[ -r "$listfile" ]]; then
      # Unquoted on purpose: names may be separated by any whitespace and
      # wildcards in the file are honored, as on the command line.
      sublist=($(<"$listfile") "${sublist[@]:1}")
    else
      echo "nwpipe: '$listfile' is neither a subject directory nor a readable subject list; ignoring it" >&2
      sublist=("${sublist[@]:1}")
    fi
  fi
}

nwpipe_read_inputs "$@"
#TODO validate input
###################################
# print parameters
###################################

#######################################
# Print the stage banner and, in verbose mode, the parsed settings.
# Globals read: cmd, verbose, sublist, numthreads, memamount, maxthreads,
#   analysisdir, outdir, depsource, depdest, outfile
# Outputs: stdout only
#######################################
nwpipe_print_parameters() {
  local i
  local -a names

  echo "nwpipe - beginning stage"
  echo "command: $cmd"
  [[ "$verbose" == 1 ]] || return 0

  # Unquoted on purpose: a single element of sublist may hold several
  # whitespace-separated names, so count words rather than elements.
  names=(${sublist[@]})
  echo "subjects (${#names[@]}): ${names[*]} "
  echo "number of threads: $numthreads"
  if [[ -n "$memamount" ]]; then
    echo "memory to use: $memamount gb"
  fi
  if [[ -n "$maxthreads" ]]; then
    echo "max threads: $maxthreads"
  fi
  if [[ "$analysisdir" != "." ]]; then
    echo "analysis directory: $analysisdir"
  fi
  if [[ "$outdir" != "." ]]; then
    echo "output directory: $outdir"
  fi
  if ((${#depsource[@]} > 0)); then
    echo "dependencies:"
    # Walk the indices so each source is always shown with its own
    # destination, even if a name contains whitespace or a wildcard.
    for i in "${!depsource[@]}"; do
      echo " ${depsource[i]} --> ${depdest[i]}"
    done
  fi
  if ((${#outfile[@]} > 0)); then
    echo "outputs: ${outfile[*]}"
  fi
  echo ""
}

nwpipe_print_parameters
###################################
# check each subject
###################################

# Succeed when the given path, which may contain shell wildcards, names at
# least one existing file. Unlike "[ -e <glob> ]", this does not error out
# when a wildcard matches several files.
nwpipe_path_exists() {
  local candidate
  # Unquoted on purpose so that wildcards in -in/-out values expand.
  for candidate in $1; do
    if [[ -e "$candidate" ]]; then
      return 0
    fi
  done
  return 1
}

#######################################
# Sort every subject into excluded / done / todo / tonotdo.
# Globals read:    sublist, analysisdir, outdir, outfile, depsource
# Globals written: excluded, done, todo, tonotdo (appended to)
#######################################
nwpipe_classify_subjects() {
  local sub out file stagebase willdo cantdo

  # Unquoted on purpose: a single element of sublist may hold several
  # whitespace-separated subject names.
  for sub in ${sublist[@]}; do
    # is subject excluded?
    if [[ -e "$sub/nwpipe_exclude" ]]; then
      excluded+=("$sub")
      continue
    fi
    stagebase=$sub/$analysisdir

    # is subject already complete? Only when -out files were declared and
    # every one of them is present.
    willdo=true
    if ((${#outfile[@]} > 0)); then
      willdo=false
      for out in "${outfile[@]}"; do
        if ! nwpipe_path_exists "$stagebase/$outdir/$out"; then
          willdo=true
        fi
      done
    fi
    if [[ "$willdo" == false ]]; then
      done+=("$sub")
    fi

    # does subject have required dependencies?
    #TODO allow processing to proceed if depsource is not available but depdest is. be sure to not try copying it over.
    cantdo=false
    for file in "${depsource[@]}"; do
      if ! nwpipe_path_exists "$stagebase/$file"; then
        willdo=false
        cantdo=true
      fi
    done

    if [[ "$willdo" == true ]]; then
      todo+=("$sub")
    fi
    if [[ "$cantdo" == true ]]; then
      tonotdo+=("$sub")
    fi
  done
}

nwpipe_classify_subjects
# Summarize how the subjects were classified.
echo "subjects already completed (${#done[@]}): ${done[@]}"
echo "subjects excluded (${#excluded[@]}): ${excluded[@]}"
echo "subjects missing dependencies (${#tonotdo[@]}): ${tonotdo[@]}"
echo "subjects that will run (${#todo[@]}): ${todo[@]}"
echo ""

# With nothing left to run, stop here. The exit status is the number of
# subjects that are missing dependencies, capped at 255: exit statuses are
# 8 bits wide, so an uncapped count of 256 would be reported as success.
if [[ ${#todo[@]} -eq 0 ]]; then
  echo "Nothing to do! Exiting."
  echo ""
  exit $((${#tonotdo[@]} > 255 ? 255 : ${#tonotdo[@]}))
fi
###################################
# set up dependencies
###################################

#######################################
# Create each runnable subject's output directory and copy in the -in
# files that were given a destination.
# Globals read:    todo, analysisdir, outdir, depsource, depdest
# Globals written: sub (left holding the last subject processed)
#######################################
nwpipe_stage_inputs() {
  local i dest stagedir

  # NOTE: "sub" is deliberately not local; its final value has historically
  # been visible to the code that follows this section.
  for sub in ${todo[@]}; do
    stagedir=$sub/$analysisdir/$outdir
    # -p so that a nested -outdir (e.g. "stage1/results") can be created.
    if [[ ! -d "$stagedir" ]]; then
      mkdir -p "$stagedir"
    fi

    # Walk the indices so each source stays paired with its destination.
    for i in "${!depsource[@]}"; do
      dest=${depdest[i]}
      # "-nocopy" marks a dependency that is only checked, never copied;
      # an existing destination is never overwritten.
      if [[ "$dest" != "-nocopy" && ! -e "$stagedir/$dest" ]]; then
        # Source left unquoted on purpose so a wildcard in -in can expand.
        cp $sub/$analysisdir/${depsource[i]} "$stagedir/$dest"
      fi
    done
  done
}

echo "copying inputs"
nwpipe_stage_inputs
###################################
# if in testing mode, generate fake output instead of continuing
###################################

#######################################
# Create empty placeholder -out files for the scratch test subject.
# Only "nwpipe_test_subject" is ever written to, and only when it is
# scheduled to run, so real subject directories never receive fake outputs.
# Globals read: todo, analysisdir, outdir, outfile
#######################################
nwpipe_make_fake_outputs() {
  local subject out target

  for subject in "${todo[@]}"; do
    [[ "$subject" == "nwpipe_test_subject" ]] || continue
    for out in "${outfile[@]}"; do
      target=$subject/$analysisdir/$outdir/$out
      # Create any directory structure contained in the -out value first.
      mkdir -p "${target%/*}" && touch "$target"
    done
  done
}

# Test mode is recognized by the first subject being the scratch subject.
if [[ "${sublist[0]}" == "nwpipe_test_subject" ]]; then
  nwpipe_make_fake_outputs
  exit
fi
###################################
# determine memory requirements
###################################

#######################################
# Print the command text $1 with its path placeholders expanded:
#   %OUT -> %SUB/<analysisdir>/<outdir>
#   %AN  -> %SUB/<analysisdir>
# %SUB itself is left in place; it is filled in per subject when the
# command is run. The text is otherwise passed through verbatim: piping it
# through an unquoted "echo" would expand wildcards against the current
# directory and collapse runs of whitespace.
# Globals read: analysisdir, outdir
#######################################
nwpipe_expand_placeholders() {
  local text=$1
  local an_path="%SUB/$analysisdir"
  local out_path="$an_path/$outdir"

  # Pattern and replacement are quoted so both are taken literally.
  text=${text//"%OUT"/"$out_path"}
  text=${text//"%AN"/"$an_path"}
  printf '%s\n' "$text"
}

cmd=$(nwpipe_expand_placeholders "$cmd")
#######################################
# Print the memory figure recorded for exactly the command $1 in the CSV
# file $2 (lines of the form "<command>,<memory>[,...]").
# The command is compared as a fixed string, so wildcards, dots and commas
# inside it are not misread as a regular expression or a field separator.
# Returns: 0 when an entry was found, 1 otherwise.
#######################################
nwpipe_lookup_memusg() {
  local csv=$2
  [[ -r "$csv" ]] || return 1
  # The key travels through the environment: "awk -v" would interpret
  # backslash escapes inside the command text.
  NWPIPE_KEY="$1," awk '
    BEGIN { key = ENVIRON["NWPIPE_KEY"] }
    index($0, key) == 1 {
      value = substr($0, length(key) + 1)
      sub(/,.*$/, "", value)
      print value
      found = 1
      exit
    }
    END { exit(found ? 0 : 1) }
  ' "$csv"
}

#######################################
# Turn "-numthreads auto" into a number and apply the -maxthreads limit.
# In auto mode the command's memory use is looked up in memusg.csv; when no
# entry exists, the first runnable subject is run under the external
# "memusg" tool to record one, and is removed from todo.
# Globals read:    cmd, analysisdir, outdir, maxthreads
# Globals written: numthreads, memamount (scaled by 1048576), todo
# Exits 1 when -memory is missing/not a whole number or no usable memory
# figure could be obtained.
# NOTE(review): the scaling suggests -memory is in GB and memusg.csv holds
# KB; the CSV layout is inferred from how it is searched. Confirm against
# the memusg tool.
#######################################
nwpipe_resolve_numthreads() {
  local homedir firstsub cmdmem memusg

  if [[ "$numthreads" == "auto" ]]; then
    if ! [[ "$memamount" =~ ^[0-9]+$ ]]; then
      echo "nwpipe: -numthreads auto requires -memory <whole number>" >&2
      exit 1
    fi

    homedir=$(pwd)
    cmdmem="${cmd//"\""/\\\"}" # add escape to double quotes

    if ! nwpipe_lookup_memusg "$cmd" "$homedir/memusg.csv" >/dev/null; then
      firstsub=${todo[0]}
      unset 'todo[0]'
      echo "Running subject $firstsub to determine memory requirements"
      echo "cmd=$cmdmem"
      # Run in a subshell so the working directory is restored even if the
      # command fails. The output is no longer captured with $( ): that
      # executed whatever the command printed as a second command.
      (
        cd "$firstsub/$analysisdir/$outdir" || exit 1
        eval "memusg -out \"$homedir/memusg.csv\" \"$cmdmem\""
      )
    fi

    memamount=$((10#$memamount * 1048576))
    memusg=$(nwpipe_lookup_memusg "$cmd" "$homedir/memusg.csv")
    memusg=${memusg//[[:space:]]/}
    # Reject a missing, non-numeric or zero figure (zero would divide by 0).
    if ! [[ "$memusg" =~ ^[0-9]+$ ]] || ((10#$memusg == 0)); then
      echo "Memory usage determination failed. Exiting."
      exit 1
    fi
    echo "memusg $memusg"
    numthreads=$((memamount / 10#$memusg))
    echo "enough memory for $numthreads threads"
    if ((numthreads < 1)); then
      # A count of 0 must not reach "xargs -P", where 0 means "as many
      # processes as possible".
      echo "nwpipe: warning: -memory is below the measured requirement; running one subject at a time" >&2
      numthreads=1
    fi
  fi

  if [[ -n "$maxthreads" ]]; then
    if [ "$numthreads" -gt "$maxthreads" ]; then
      numthreads=$maxthreads
    fi
  fi
}

nwpipe_resolve_numthreads
###################################
# run command
###################################

#######################################
# Run the stage command once per runnable subject, up to numthreads at a
# time. For each subject, xargs substitutes the subject name for %SUB and
# the command runs with the subject's output directory as working directory.
# Globals read: todo, numthreads, analysisdir, outdir, cmd
#######################################
nwpipe_run_subjects() {
  # No eval: the command text reaches "sh -c" exactly as given, so single
  # quotes inside it survive and subject names are never re-evaluated.
  # -I already runs one invocation per input line, so -n 1 is not needed.
  printf '%s\n' "${todo[@]}" |
    xargs -P "$numthreads" -I %SUB \
      sh -c "cd %SUB/$analysisdir/$outdir; $cmd; cd - >/dev/null"
}

echo "Running ${#todo[@]} subjects with up to $numthreads threads"
if [[ ${#todo[@]} -gt 0 ]]; then
  nwpipe_run_subjects
else
  echo "Nothing to do!"
fi
###################################
# verify outputs
###################################

# Succeed when the given path, which may contain shell wildcards, names at
# least one existing file. Unlike "[ -e <glob> ]", this does not error out
# when a wildcard matches several files.
nwpipe_path_exists() {
  local candidate
  # Unquoted on purpose so that wildcards in -in/-out values expand.
  for candidate in $1; do
    if [[ -e "$candidate" ]]; then
      return 0
    fi
  done
  return 1
}

#######################################
# Split the subjects into those that have every -out file and those that
# lack at least one. Every entry of sublist is checked.
# Globals read:    sublist, analysisdir, outdir, outfile
# Globals written: failedsubs, completesubs (both reset first)
#######################################
nwpipe_verify_outputs() {
  local sub out failed

  failedsubs=()
  completesubs=()
  # Unquoted on purpose: a single element of sublist may hold several
  # whitespace-separated subject names.
  for sub in ${sublist[@]}; do
    failed=false
    for out in "${outfile[@]}"; do
      if ! nwpipe_path_exists "$sub/$analysisdir/$outdir/$out"; then
        failed=true
      fi
    done
    if [[ "$failed" == true ]]; then
      failedsubs+=("$sub")
    else
      completesubs+=("$sub")
    fi
  done
}

nwpipe_verify_outputs
# Final report. The exit status is the number of failed subjects, capped at
# 255: exit statuses are 8 bits wide, so an uncapped count of 256 would be
# reported to the caller as success.
echo ""
echo "failed subjects (${#failedsubs[@]}): ${failedsubs[@]}"
echo "complete subjects (${#completesubs[@]}): ${completesubs[@]}"
echo ""
exit $((${#failedsubs[@]} > 255 ? 255 : ${#failedsubs[@]}))