jobCreator_VarCallAnnot.dev

#!/bin/bash

# Ask for the sample ID. It prefixes every file this script generates and is
# written into the job files as ID=...
echo ">Sample ID: "
read -r ID

# Slurm job name: the sample ID cut to its first 11 characters.
# It is derived unconditionally: previously shortID was only assigned for IDs
# longer than 11 characters, so shorter IDs produced an empty "#SBATCH -J".
shortID=${ID:0:11}

#######################################
# Prompt for a positive integer and store it in a caller-named variable.
# Re-prompts until the reply is a positive integer. An empty reply (including
# end of input) resolves to the default, so the loop always terminates.
# Arguments:
#   $1 - name of the variable to assign (must not be "reply")
#   $2 - prompt text, printed without a trailing newline
#   $3 - default value used when the reply is empty
# Outputs: prompt, echoed default and retry messages on stdout
#######################################
ask_positive_int() {
  local reply
  while true; do
    echo -e "$2\c"
    read -r reply
    if [[ -z "$reply" ]]; then
      reply=$3
      echo "$reply"
    fi
    if [[ "$reply" =~ ^[1-9][0-9]*$ ]]; then
      printf -v "$1" '%s' "$reply"
      return 0
    fi
    echo "$reply is not a positive integer, please retry"
  done
}

# Reference genome build. Later steps treat anything other than "hg19" as
# hg38, so a typo would silently select the hg38 pipeline: only the two
# documented values are accepted here.
while true; do
  echo -e ">Select reference: [Type hg19/hg38] (default hg19): \c"
  read -r genomeref
  if [[ -z "$genomeref" ]]; then
    genomeref=hg19
    echo "$genomeref"
  fi
  case "$genomeref" in
    hg19 | hg38) break ;;
  esac
  echo "$genomeref is not a valid reference, type hg19 or hg38"
done

# Number of cores requested for the main job.
ask_positive_int CPU ">number of Cores (default 12): " 12

# Memory for the main job, as a bare number: the job header appends the "GB"
# suffix itself, so a reply such as "120Gb" must be rejected.
ask_positive_int RAM ">Amount of Memory (default 120Gb): " 120

# Queue name.
# NOTE(review): QUEUE is collected but no later line of this script reads it;
# the job headers hard-code "--partition bdw_usr_prod". Confirm the intent.
echo -e ">Name of the queue (default parallel): \c"
read -r QUEUE
if [[ -z "$QUEUE" ]]; then
  QUEUE=parallel
  echo "$QUEUE"
fi

# Create the main Slurm job script, "<ID>_Job", in the current directory and
# write its header: shebang, #SBATCH directives and the umask. Any previous
# file of the same name is overwritten.
#
# Values are expanded inside quotes and written with printf, so they are not
# word-split, glob-expanded or backslash-interpreted on their way to the file.
#
# NOTE(review): the partition is fixed to bdw_usr_prod; the queue name asked
# for earlier (QUEUE) is not used here. Confirm whether that is intended.
# NOTE(review): stderr and stdout both go to %j.%x.err, as in the original.

# ACCOUNT is not assigned anywhere in this script, so it is presumably expected
# from the caller's environment. Warn instead of silently writing "-A" bare.
if [[ -z "${ACCOUNT:-}" ]]; then
  echo "WARNING: ACCOUNT is empty, so the '#SBATCH -A' line of ${ID}_Job will carry no account name" >&2
fi

printf '%s\n' \
  '#!/bin/bash' \
  '' \
  "#SBATCH -N 1 -n ${CPU} --mem ${RAM}GB" \
  "#SBATCH -J ${shortID}" \
  "#SBATCH -A ${ACCOUNT}" \
  '#SBATCH --time 100:00:00' \
  '#SBATCH --error %j.%x.err' \
  '#SBATCH --output %j.%x.err' \
  '#SBATCH --partition bdw_usr_prod' \
  '' \
  'umask 0002' \
  '' > "${ID}_Job"

# Ask for the directory that holds the fastq files and record it in the job
# file as fastq_dir=...
echo ">Fastq Directory: "
read -r fastq_di
# A trailing slash is always appended so that file names can be concatenated
# directly onto fastq_dir later on. (An empty reply therefore becomes "/".)
fastq_dir=$fastq_di"/"
# The test must be quoted: unquoted, a path containing spaces made "[" fail
# with "too many arguments" and a missing directory was accepted.
if [[ ! -d "$fastq_dir" ]]; then
  echo "$fastq_dir" "does not exist!"
  exit 1
fi

# %q shell-quotes the value so the assignment survives in the job script even
# when the path contains spaces; ordinary paths are written unchanged.
printf 'fastq_dir=%q\n' "$fastq_dir" >> "${ID}_Job"

# Ask how many pairs ("couples") of fastq files make up the sample and, for
# each pair, the two file names and the lane ID. Every answer is appended to
# the job file; the file names are also kept in the fastqlane1/fastqlane2
# arrays (indexed from 1) for the later concatenation step.
echo ">How many fastq couples? "
read -r fq_couple
# Anything but a positive integer previously reached "seq" unchecked and the
# loop below was silently skipped or ran an unintended number of times.
if [[ ! "$fq_couple" =~ ^[1-9][0-9]*$ ]]; then
  echo "$fq_couple" "is not a valid number of fastq couples!"
  exit 1
fi

printf 'fq_couple=%q\n' "$fq_couple" >> "${ID}_Job"

for ((i = 1; i <= fq_couple; i++)); do
  echo ">First-pair fastqfile (e.g. _1.fq.gz) of couple $i"
  read -r fastqname1
  printf 'fastq_name_original_1[%s]=%q\n' "$i" "$fastqname1" >> "${ID}_Job"

  echo ">Second-pair fastqfile (e.g. _2.fq.gz) of couple $i"
  read -r fastqname2
  printf 'fastq_name_original_2[%s]=%q\n' "$i" "$fastqname2" >> "${ID}_Job"

  echo ">Lane ID for fastq couple $i"
  read -r laneid
  printf 'lane_id[%s]=%q\n' "$i" "$laneid" >> "${ID}_Job"

  fastqlane1[$i]=$fastqname1
  fastqlane2[$i]=$fastqname2

  # Both files of the pair must exist inside fastq_dir (which already ends
  # with "/"). Quoted so that empty or space-containing names are really tested.
  for fastqfile in "${fastqlane1[$i]}" "${fastqlane2[$i]}"; do
    if [[ ! -f "${fastq_dir}${fastqfile}" ]]; then
      echo "${fastq_dir}${fastqfile}" " does not exist!"
      exit 1
    fi
  done
done

# Ask for the target-regions directory, re-prompting until an existing
# directory is given.
echo ">Target regions path: "
read -r targetRegChipDir
# Quoted test: unquoted, an empty reply turned the test into "[ ! -d ]",
# which is false, so an empty path was accepted without any check.
while [[ ! -d "$targetRegChipDir" ]]; do
  echo "$targetRegChipDir" " does not exist!"
  echo ">Target regions path: "
  # Stop at end of input instead of re-prompting forever.
  if ! read -r targetRegChipDir && [[ -z "$targetRegChipDir" ]]; then
    exit 1
  fi
done

# Ask for the output directory and the temporary directory. Both get a
# trailing slash appended and both must already exist.
# The tests are quoted: unquoted, a path containing spaces made "[" fail with
# "too many arguments" and a missing directory was accepted.
echo ">Output directory: "
read -r outdi
outdir=$outdi"/"
if [[ ! -d "$outdir" ]]; then
  echo "$outdir" "does not exist!"
  exit 1
fi

echo ">Temporary directory: "
read -r tmpdi
tmpdir=$tmpdi"/"
if [[ ! -d "$tmpdir" ]]; then
  echo "$tmpdir" "does not exist!"
  exit 1
fi

# Ask whether phasing is needed. A "y" answer writes spidex=y to the job file
# straight away; otherwise the splice-site question is asked and spidex=y is
# written only when that answer is "y".
# The comparisons are quoted so that an empty reply is simply "not y" instead
# of a "[: ==: unary operator expected" error.
echo ">Do you need phasing (e.g. if this is a proband sample with parents)? [Type y/n]"
read -r phase
if [[ "$phase" == "y" ]]; then
  echo "spidex=$phase" >> "${ID}_Job"
else
  echo ">Do you want splice sites analysis? [Type y/n]"
  read -r spid
  if [[ "$spid" == "y" ]]; then
    echo "spidex=$spid" >> "${ID}_Job"
  fi
fi

# Ask whether a multisample VCF is needed. If so, collect the number of other
# family members and their IDs (written to the job file as fam_members=... and
# member_name[N]=...), then optionally the name of a pedigree file.
# All comparisons are quoted so an empty reply counts as "not y" instead of
# raising a "[" syntax error.
echo ">Do you need a multisample vcf file (e.g. if you have a proband with other family members)? [Type y/n]"
read -r family

if [[ "$family" == "y" ]]; then
  echo ">How many other family members?"
  read -r members
  # Previously a non-numeric reply reached "seq" unchecked and the member
  # loop was silently skipped.
  if [[ ! "$members" =~ ^(0|[1-9][0-9]*)$ ]]; then
    echo "$members" "is not a valid number of family members!"
    exit 1
  fi
  printf 'fam_members=%q\n' "$members" >> "${ID}_Job"

  for ((j = 1; j <= members; j++)); do
    echo ">ID (must be the same ID used for Job submissions, ran in the same dir) for member: $j"
    read -r membID
    printf 'member_name[%s]=%q\n' "$j" "$membID" >> "${ID}_Job"
  done

  echo ">Do you have a pedigree file (only trio, for phasing purpose)? [Type y/n]"
  read -r pedigree
  if [[ "$pedigree" == "y" ]]; then
    # The user gets two attempts at naming an existing file, then the script
    # gives up.
    for attempt in 1 2; do
      echo ">Pedigree file name: "
      read -r pedfile
      if [[ -f "$pedfile" ]]; then
        break
      fi
      echo "$pedfile" " does not exist!"
    done
    if [[ ! -f "$pedfile" ]]; then
      exit 1
    fi
  fi
fi

# Remaining one-letter questions. Each prompt is printed on its own line and
# the reply is stored, unvalidated, in the variable at the same position of
# option_vars. The replies are consumed by later sections of this script.
option_prompts=(
  ">Do you want also somatic variant calling? [Type y/n]"
  ">Do you want FastQC data pre-processing? [Type y/n]"
  ">Do you have Illumina (Q PHRED+64) or Sanger fastq (Q PHRED+33)? [Type i/s]"
  ">Do you want to calculate global coverage? [Type y/n]"
  ">Do you want to calculate HsMetrics? [Type y/n]"
  ">Do you want to perform VQSR? [Type y/n]"
)
option_vars=(somatic fqc qscore coverBedTools metrics vqsr)

for idx in "${!option_vars[@]}"; do
  echo "${option_prompts[idx]}"
  # Plain "read" (no -r) is kept on purpose so replies are parsed exactly as
  # before.
  read "${option_vars[idx]}"
done

# VQSR preparation (only when the user answered "y" to the VQSR question):
# ask for the file that lists the available VCFs, derive from it a list of
# gVCFs for this sample ("<outdir><ID>_gvcf.list") and check every entry.
if [[ "$vqsr" == "y" ]]; then
  default_vcfsfilelist=/pico/work/IscrC_FoRWArDS_1/NGS_tools/vcfFilelist
  gvcf_list="${outdir}${ID}_gvcf.list"

  # The user gets two attempts (with slightly different prompt wording) at
  # naming an existing file list; an empty reply selects the default.
  vcfs_prompts=(
    ">VCFs filelist name: (default ${default_vcfsfilelist}): \c"
    ">VCFS filelist file name: (default ${default_vcfsfilelist}): \c"
  )
  for vcfs_prompt in "${vcfs_prompts[@]}"; do
    echo -e "$vcfs_prompt"
    read -r vcfsfilelist
    if [[ -z "$vcfsfilelist" ]]; then
      vcfsfilelist=$default_vcfsfilelist
      echo "$vcfsfilelist"
    fi
    if [[ -f "$vcfsfilelist" ]]; then
      break
    fi
    echo "$vcfsfilelist" " does not exist!"
  done
  if [[ ! -f "$vcfsfilelist" ]]; then
    exit 1
  fi

  # Extract 50 random g.vcf files from the file list (excluding the oldest
  # ones, i.e. the name patterns filtered out below). 50 is the maximum number
  # we can afford with our current resources.
  # Each picked name is mapped from "_HapCall_genotype_filtered" to
  # "_HapCall", and only names ending in "_HapCall.g.vcf.gz" are kept (dots
  # escaped so they match literally). The suffix filter runs after the 50 are
  # drawn, so the final list may hold fewer than 50 entries.
  grep -v indels_HC_UG_ "$vcfsfilelist" \
    | grep -v ISS_VCF \
    | grep -v _prob_ \
    | shuf \
    | head -n 50 \
    | sed 's/_HapCall_genotype_filtered/_HapCall/' \
    | grep '_HapCall\.g\.vcf\.gz$' > "$gvcf_list"

  # Check that every gVCF in the list exists and has its tabix index next to
  # it. Previously only the ".tbi" file was tested, not the gVCF itself.
  while IFS= read -r gvcf; do
    if [[ ! -e "$gvcf" || ! -e "${gvcf}.tbi" ]]; then
      echo "$gvcf" "does not exist or it is not bgzipped and tabindexed"
      exit 1
    fi
  done < "$gvcf_list"
fi

# When FastQC pre-processing was requested, provide one pair of input files
# "<outdir><ID>_all_fq_reads_{1,2}.fq.gz": the concatenation of all couples
# when there are several, or symbolic links to the single couple otherwise.
#
# The original changed into fastq_dir (without checking that cd succeeded)
# before writing, which made a relative outdir resolve against fastq_dir and
# left a dangling link for a relative fastq_dir. Paths are now built
# explicitly and the working directory is never changed.
if [[ "$fqc" == "y" ]]; then
  if [[ "$fq_couple" -gt 1 ]]; then
    echo -e "\n  concatenating fastq files..."
    first_reads=()
    second_reads=()
    # fastq_dir already ends with "/", so names are appended directly.
    for lane in "${!fastqlane1[@]}"; do
      first_reads+=("${fastq_dir}${fastqlane1[lane]}")
      second_reads+=("${fastq_dir}${fastqlane2[lane]}")
    done
    cat -- "${first_reads[@]}" > "${outdir}${ID}_all_fq_reads_1.fq.gz" || exit 1
    cat -- "${second_reads[@]}" > "${outdir}${ID}_all_fq_reads_2.fq.gz" || exit 1
  else
    # A symlink target is resolved relative to the link's own directory, so
    # the target is made absolute.
    abs_fastq_dir=$(cd -- "$fastq_dir" > /dev/null && pwd) || exit 1
    # -f replaces a file left over from a previous run instead of keeping it.
    ln -sf -- "${abs_fastq_dir}/${fastqname1}" "${outdir}${ID}_all_fq_reads_1.fq.gz"
    ln -sf -- "${abs_fastq_dir}/${fastqname2}" "${outdir}${ID}_all_fq_reads_2.fq.gz"
  fi
fi

# Append the collected settings to the job file as NAME=value shell
# assignments. Note that two names differ between this script and the job
# file: phase -> phasing and vqsr -> VQSR.
#
# Values are written with printf %q, i.e. shell-quoted: ordinary values such
# as "y" or "/some/dir/" are written unchanged, while values containing
# spaces or other special characters stay a single valid assignment (an empty
# value is written as ''). The previous unquoted "echo -e" word-split and
# backslash-interpreted them.
{
  printf 'outdir=%q\n' "$outdir"
  printf 'phasing=%q\n' "$phase"
  printf 'ID=%q\n' "$ID"
  printf 'tmpdir=%q\n' "$tmpdir"
  printf 'somatic=%q\n' "$somatic"
  printf 'fqc=%q\n' "$fqc"
  printf 'qscore=%q\n' "$qscore"
  printf 'coverBedTools=%q\n' "$coverBedTools"
  printf 'metrics=%q\n' "$metrics"
  printf 'targetRegChipDir=%q\n' "$targetRegChipDir"
  printf 'VQSR=%q\n' "$vqsr"

  # pedigree is only asked for multisample runs; an unset or empty value
  # simply means "no pedigree file".
  if [[ "$pedigree" == "y" ]]; then
    printf 'pedfile=%q\n' "$pedfile"
  fi
} >> "${ID}_Job"

# Append the pipeline body for the chosen genome build to the job file, then
# move the finished job file into the output directory.
# As before, any reference other than "hg19" selects the hg38 template.
pipeline_dir=/pico/work/IscrC_FoRWArDS_1/NGS_tools/Pipeline/Tartagenia/v3.1
if [[ "$genomeref" == "hg19" ]]; then
  pipeline_template="${pipeline_dir}/hg19_pipeline_BWA-GATK_genotyp_HC.dev"
else
  pipeline_template="${pipeline_dir}/hg38_pipeline_BWA-GATK_genotyp_HC.dev"
fi

# Stop if the template cannot be appended: previously a failed "cat" was
# ignored and a job file holding only the settings was moved to outdir and
# advertised as ready for submission.
if ! cat -- "$pipeline_template" >> "${ID}_Job"; then
  echo "cannot append ${pipeline_template} to ${ID}_Job" >&2
  exit 1
fi

mv -f -- "${ID}_Job" "$outdir" || exit 1

# Create the annotation job script, "<ID>_annot_Job", in the current
# directory: Slurm header (fixed 6 tasks, 120GB, 12 hours), umask, and the
# input variables the annotation part needs. Any previous file of the same
# name is overwritten.
#
# Expansions are quoted and the assignments are written with printf %q, so
# values containing spaces or special characters stay valid shell
# assignments; ordinary values are written unchanged.
# NOTE(review): ACCOUNT is not assigned anywhere in this script; it is
# presumably taken from the environment.

# The annotation input is the phased gVCF when phasing was requested.
if [[ "$phase" == "y" ]]; then
  vcf_suffix=_raw_snps-indels_HapCall_genotype_filtered_phased.g.vcf.gz
else
  vcf_suffix=_raw_snps-indels_HapCall_genotype_filtered.g.vcf.gz
fi

{
  printf '%s\n' \
    '#!/bin/bash' \
    '' \
    '#SBATCH -N 1 -n 6 --mem 120GB' \
    "#SBATCH -J ${shortID}_ann" \
    "#SBATCH -A ${ACCOUNT}" \
    '#SBATCH --time 12:00:00' \
    '#SBATCH --error %j.%x.err' \
    '#SBATCH --output %j.%x.err' \
    '#SBATCH --partition bdw_usr_prod' \
    '' \
    '' \
    'umask 0002' \
    '' \
    '## definizione delle variabili di input'
  printf 'ID=%q\n' "$ID"
  printf 'outdir=%q\n' "$outdir"
  printf 'somatic=%q\n' "$somatic"
  printf 'phasing=%q\n' "$phase"
  printf 'vcfinput=%q\n' "${outdir}${ID}${vcf_suffix}"
} > "${ID}_annot_Job"

# Append the annotation body for the chosen genome build to the annotation
# job file, then move the finished file into the output directory.
# As before, any reference other than "hg19" selects the hg38 template.
annotation_dir=/pico/work/IscrC_FoRWArDS_1/NGS_tools/Pipeline/Tartagenia/v3.1
if [[ "$genomeref" == "hg19" ]]; then
  annotation_template="${annotation_dir}/hg19_g.custom_annotation_part.dev"
else
  annotation_template="${annotation_dir}/hg38_g.custom_annotation_part.dev"
fi

# Stop if the template cannot be appended: previously a failed "cat" was
# ignored and an annotation job holding only the header was moved to outdir.
if ! cat -- "$annotation_template" >> "${ID}_annot_Job"; then
  echo "cannot append ${annotation_template} to ${ID}_annot_Job" >&2
  exit 1
fi

mv -f -- "${ID}_annot_Job" "$outdir" || exit 1

# Final step: write "<outdir><ID>_custom_annotation.INFO" with the follow-up
# commands, and tell the user which file to submit.

# Name of the first fastq file of couple 1, read back from the job file that
# now lives in outdir. The value is everything after "=", taken from the first
# matching assignment (this replaces a fixed-column "cut -c 26-").
fq=$(sed -n 's/^fastq_name_original_1\[1\]=//p' "${outdir}${ID}_Job" | head -n 1)

# The two genome builds differ only in the helper script that is suggested;
# as before, any reference other than "hg19" is treated as hg38.
if [[ "$genomeref" == "hg19" ]]; then
  add_to_db=/pico/work/IscrC_FoRWArDS_1/NGS_tools/add_2_varDB_multisample.sh
else
  add_to_db=/pico/work/IscrC_FoRWArDS_1/NGS_tools/add_2_varDB_multisample_hg38.sh
fi

# User-supplied values are passed as printf arguments, never inside the
# format string.
printf '\nTo add variants to in-house DB, on pico.cineca.it run:\n\t%s %s_raw_snps-indels_HapCall_genotype_filtered.g.vcf.gz %s %s + output directory\n\nFor the annotation: sbatch %s_annot_Job\n\n' \
  "$add_to_db" "$ID" "$ID" "$fq" "${outdir}${ID}" > "${outdir}${ID}_custom_annotation.INFO"

printf '\n  sbatch this file: %s_Job\n\n  instructions for the annotation of the final vcf can be found in %s_custom_annotation.INFO\n\n' \
  "${outdir}${ID}" "${outdir}${ID}"