-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfunctions.sh
203 lines (168 loc) · 6.57 KB
/
functions.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
#################
# FUNCTIONS #
#################
function check_and_fill_parameters {
# Function to check and fill missing parameters
# Calculates or estimates parameters if missing
# Sets default values if necessary
if [[ -z "$number_of_genomes" ]]; then
# Calculate the number of genomes by counting headers in the input sample
if [[ -z "$input_sample" || ! -f "$input_sample" ]]; then
echo "Error: Missing or invalid input sample file."
return 1
fi
number_of_genomes=$(zgrep ">" "${input_sample}" | wc -l)
echo "Missing parameter for number of genomes. Calculated: ${number_of_genomes}"
fi
if [[ -z "$percent_identity" && multiple_chromosomes -ne 1 ]]; then
# Calculate percent identity using 'mash triangle'
max_divergence=$(mash triangle -E "${input_sample}" | awk '{print $3}' | sort -g | tail -n1)
percent_identity=$(awk "BEGIN { print 100 - $max_divergence * 100 }")
echo "Missing parameter for percent identity. Estimated: ${percent_identity}"
if (( $(echo "$percent_identity < 75" | bc -l) )); then
echo "Error: Percent identity is low. Continuing with 75"
percent_identity=75
fi
fi
if [[ -z "$poa_parameters" && multiple_chromosomes -ne 1 ]]; then
# Determine POA parameters based on percent identity
if (( $(echo "$percent_identity < 99" | bc -l) )); then
poa_parameters="asm5"
elif (( $(echo "$percent_identity < 90" | bc -l) )); then
poa_parameters="asm10"
else
poa_parameters="asm20"
echo "Missing parameter for poa parameters. Using ${poa_parameters} (based on percent identity)"
fi
fi
if [[ -z "$segment_length" ]]; then
segment_length=10000
echo "Missing parameter for segment length. Using default (10000bp)"
fi
}
function combine_fasta {
# Function to combine multiple FASTA files into one
# Parameters:
# $1: Directory containing FASTA files
local directory="$1"
local output_file="${directory}combined.fa"
# Skip combination of fasta files if the combined file already exists
if [[ -f "$output_file" ]]; then
echo "'$output_file' already exists, skipping file combination."
return
fi
# Throw an error if the directory does not exist.
if [[ ! -d "$directory" ]]; then
echo "Error: Directory '$directory' does not exist."
return 1
fi
extension=".f*"
fasta_files=($(find "$directory" -type f -name "*$extension"))
# Throw an error if no fasta files were found in the directory.
if [[ ${#fasta_files[@]} -eq 0 ]]; then
echo "Error: No .fasta files found in directory '$directory'."
return 1
fi
# Combine all the fasta files into one big file
for filepath in "${fasta_files[@]}"; do
cat "$filepath" # Previously adjusted for genome name, not anymore due to incompatibility with snpEff
done > "$output_file"
local exit_code=$?
if [ "$exit_code" -ne 0 ]; then
echo "Error: Failed to combine .fasta files into '$output_file'."
return $exit_code
fi
echo "Combined ${#fasta_files[@]} .fasta files into '$output_file'."
}
function run_seqpart {
local wd="$1"
mkdir $wd
mv ${input_sample} ${wd}combined.fa
bgzip -@ 4 ${wd}combined.fa
mash dist ${wd}combined.fa.gz ${wd}combined.fa.gz -i > ${wd}distances.tsv
python3 scripts/mash2net.py -m ${wd}distances.tsv
python3 scripts/net2communities.py \
-e ${wd}distances.tsv.edges.list.txt \
-w ${wd}distances.tsv.edges.weights.txt \
-n ${wd}distances.tsv.vertices.id2name.txt
}
function run_snakemake {
# Function to prepare everything for snakemake to run and then run snakemake
# Parameters:
# $8: Community index. If none is supplied, will assume there is no sequence partitioning.
local number_of_genomes="$1"
local percent_identity="$2"
local poa_parameters="$3"
local segment_length="$4"
local threads="$5"
local runid="$6"
local input_sample="$7"
local community="${8:-}"
# Validate input parameters (add more validation if needed)
if [[ -z "$number_of_genomes" || -z "$percent_identity" || -z "$poa_parameters" || -z "$segment_length" || -z "$threads" || -z "$runid" || -z "$input_sample" ]]; then
echo "Error: Missing or empty input parameter."
return 1
fi
# Define the snakemake config path using conditional assignment
# Community number is already included in the runid.
CONFIG_FILE="output/${runid}/snakemake_config.yaml"
# Create a YAML-formatted config file
cat << EOF > "$CONFIG_FILE"
pggb:
number_of_genomes: $number_of_genomes
percent_identity: $percent_identity
poa_parameters: "$poa_parameters"
segment_length: $segment_length
threads: $threads
runid: $runid
input_sample: "$input_sample"
EOF
# Run Snakemake with the defined configuration
snakemake -p --forcerun --cores "$threads" --configfile "$CONFIG_FILE"
local exit_code=$?
if [ "$exit_code" -ne 0 ]; then
echo "Error: Snakemake pipeline failed with exit code $exit_code."
return $exit_code
fi
}
function analyse_community {
# Function to analyse a community in case of sequence partitioning
# Parameters:
# $1: Community index
local i="$1"
if [[ -z "$i" ]]; then
echo "Error: Missing community index."
return 1
fi
echo "Analysing community: ${i}"
local input_sample="${seqpart_dir}community.${i}.fa.gz"
local new_runid="${runid}/community${i}"
echo "Initialising..."
number_of_genomes=$(zgrep ">" "${input_sample}" | wc -l)
if [[ -z "$number_of_genomes" ]]; then
echo "Error: Unable to determine the number of genomes."
return 1
fi
local max_divergence=$(mash triangle -E "${input_sample}" | awk '{print $3}' | sort -g | tail -n1)
local percent_identity=$(awk "BEGIN { print 100 - $max_divergence * 100 }")
# Determine POA parameters based on percent identity
if (( $(echo "$percent_identity > 99" | bc -l) )); then
local poa_parameters="asm5"
elif (( $(echo "$percent_identity > 90" | bc -l) )); then
local poa_parameters="asm10"
elif (( $(echo "$percent_identity > 75" | bc -l) )); then
local poa_parameters="asm20"
else # pctid under 75 only really occurs in communities of accessory chromosomes
local percent_identity=75 # Otherwise PGGB will not handle it
local poa_parameters="asm20"
fi
echo "Calculated percent_identity: ${percent_identity}."
echo "Using ${poa_parameters} (based on percent identity)"
echo "Running snakemake..."
run_snakemake "$number_of_genomes" "$percent_identity" "$poa_parameters" "$segment_length" "$threads" "$new_runid" "$input_sample" "$i"
local exit_code=$?
if [ "$exit_code" -ne 0 ]; then
echo "Error: Analysis for community ${i} failed with exit code $exit_code."
return $exit_code
fi
}