########################################## # Step 0. setup a list of sample names. # Assume that each of your gzipped # FASTQ files is named as follows: # sample1.1.fq.gz # sample1.2.fq.gz # sample2.1.fq.gz # sample2.2.fq.gz # ... # sampleN.1.fq.gz # sampleN.2.fq.gz ########################################## export SAMPLES="sample1 sample2 sample3" ########################################## # Step 1. Align the raw FASTQ with BWA ########################################## export EXHOME=/home/arq5x/cphg-home/projects/rs-exome export STEPNAME=bwa-aln export GENOME=/home/arq5x/cphg-home/shared/genomes/hg19/bwa/gatk/hg19_gatk.fa for sample in `echo $SAMPLES` do export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=4000m:ncpus=8 -N $STEPNAME" echo "cd $EXHOME; bwa aln -e 1 -t 8 $GENOME fastq/$sample.1.fq.gz > bam/$sample.1.sai" | $QSUB echo "cd $EXHOME; bwa aln -e 1 -t 8 $GENOME fastq/$sample.2.fq.gz > bam/$sample.2.sai" | $QSUB done ############################################################ # Step 2. Pair the alignments (assumes PE data). # Keep proper, on-target (i.e. +/- 500 bp of a probe) pairs # using bedtools. # Require mapping quality >= 20 # Add the sample ID as a RG ############################################################ export EXHOME=/home/arq5x/cphg-home/projects/rs-exome export STEPNAME=bwa-pair export GENOME=/home/arq5x/cphg-home/shared/genomes/hg19/bwa/gatk/hg19_gatk.fa for sample in `echo $SAMPLES` do # -m ea sends an email when job (e)nds or (a)borts export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=8000m:ncpus=3 -N $STEPNAME" echo "cd $EXHOME; bwa sampe -r '@RG\tID:$sample\tSM:$sample' $GENOME bam/$sample.1.sai bam/$sample.2.sai fastq/$sample.1.fq.gz fastq/$sample.2.fq.gz \ | samtools view -q 20 -f 2 -Su - \ | intersectBed -abam stdin -b bed/sureselect.liftedTohg19.slop500.bed \ > bam/$sample.conc.on.bam" | $QSUB done ############################################################ # Step 3. Sort the on-target concordants by genome position. ############################################################ export EXHOME=/home/arq5x/cphg-home/projects/rs-exome export STEPNAME=possrt for sample in `echo $SAMPLES` do # -m ea sends an email when job (e)nds or (a)borts export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=8000m:ncpus=1 -N $STEPNAME" echo "cd $EXHOME; samtools sort -m 4000000000 bam/$sample.conc.on.bam bam/$sample.conc.on.pos" | $QSUB done ############################################################ # Step 4. Index the position-sorted BAM files. ############################################################ export EXHOME=/home/arq5x/cphg-home/projects/rs-exome export STEPNAME=index for sample in `echo $SAMPLES` do export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=2000m:ncpus=1 -N $STEPNAME" echo "cd $EXHOME; samtools index bam/$sample.conc.on.pos.bam" | $QSUB done ############################################################ # Step 5. Mark duplicates ############################################################ export EXHOME=/home/arq5x/cphg-home/projects/rs-exome export STEPNAME=rs-ex-dupe export GENOME=/home/arq5x/cphg-home/shared/genomes/hg19/bwa/gatk/hg19_gatk.fa export BIN=/home/arq5x/cphg-home/shared/bin/picard-tools-1.60 for sample in `echo $SAMPLES`; do export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=8000m:ncpus=2 -N $STEPNAME"; echo "cd $EXHOME; java -Xmx2g -jar $BIN/MarkDuplicates.jar \ INPUT=bam/$sample.conc.on.pos.bam \ OUTPUT=bam/$sample.conc.on.pos.dedup.bam \ TMP_DIR=$EXHOME/bam \ VALIDATION_STRINGENCY=LENIENT \ ASSUME_SORTED=true \ METRICS_FILE=bam/$sample.markdup_metrics" | $QSUB; done ############################################################ # Step 6. Index the position-sorted, deduped BAM files. ############################################################ export EXHOME=/home/arq5x/cphg-home/projects/rs-exome export STEPNAME=dup-index for sample in `echo $SAMPLES` do export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=2000m:ncpus=1 -N $STEPNAME" echo "cd $EXHOME; samtools index bam/$sample.conc.on.pos.dedup.bam" | $QSUB done ############################################################ # Step 7. Merge BAM files from each sample into a _single_ bam # file. This makes variant calling more accurate and # facilitates visualization and variant calling. ############################################################ export EXHOME=/home/arq5x/cphg-home/projects/rs-exome export STEPNAME=merge export GENOME=/home/arq5x/cphg-home/shared/genomes/hg19/bwa/gatk/hg19_gatk.fa export BIN=/home/arq5x/cphg-home/shared/bin/picard-tools-1.60 export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=8000m:ncpus=4 -N $STEPNAME"; echo "cd $EXHOME; java -Xmx2g -jar $BIN/MergeSamFiles.jar \ O=bam/all.conc.on.pos.dedup.bam \ I=bam/sample1.conc.on.pos.dedup.bam \ I=bam/sample2.conc.on.pos.dedup.bam \ I=bam/sample3.conc.on.pos.dedup.bam \ TMP_DIR=$EXHOME/bam \ USE_THREADING=true" | $QSUB; ############################################################ # Step 8. Identify realignment targets. These are cases where # the alignments are suspicious owing to INDELs in the region #. This step identifies such regions and the next step # corrects them. Improves accuracy of variant detection. ############################################################ export EXHOME=/home/arq5x/cphg-home/projects/rs-exome export STEPNAME=realtgts export GENOME=/home/arq5x/cphg-home/shared/genomes/hg19/bwa/gatk/hg19_gatk.fa export BIN=/home/arq5x/cphg-home/shared/bin/GenomeAnalysisTK-1.4-9-g1f1233b export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=8000m:ncpus=3 -N $STEPNAME"; echo "cd $EXHOME; java -Xmx2g -jar $BIN/GenomeAnalysisTK.jar \ -T RealignerTargetCreator \ -R $GENOME \ -I bam/all.conc.on.pos.dedup.bam \ -o bam/all.conc.on.pos.dedup.bam.intervals" | $QSUB; ############################################################ # Step 9. Use GATK to realign the BAMs at the suspicious targets. ############################################################ export EXHOME=/home/arq5x/cphg-home/projects/rs-exome export STEPNAME=realtgts export GENOME=/home/arq5x/cphg-home/shared/genomes/hg19/bwa/gatk/hg19_gatk.fa export BIN=/home/arq5x/cphg-home/shared/bin/GenomeAnalysisTK-1.4-9-g1f1233b export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=16000m:ncpus=4 -N $STEPNAME"; echo "cd $RSHOME; java -Xmx2g -Djava.io.tmpdir=$EXHOME/bam -jar $BIN/GenomeAnalysisTK.jar \ -T IndelRealigner \ --num_threads 1 \ -R $GENOME \ -targetIntervals bam/all.conc.on.pos.dedup.bam.intervals \ -I bam/all.conc.on.pos.dedup.bam \ -o bam/all.conc.on.pos.dedup.realigned.bam" | $QSUB; ############################################################ # Step 10. Index the merged, deduped BAM file as well as the # merged, deduped, realigned file. ############################################################ export EXHOME=/home/arq5x/cphg-home/projects/rs-exome export STEPNAME=rsex-index export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=4000m:ncpus=4 -N $STEPNAME" echo "cd $EXHOME; samtools index bam/all.conc.on.pos.dedup.bam" | $QSUB echo "cd $EXHOME; samtools index bam/all.conc.on.pos.dedup.realigned.bam" | $QSUB ############################################################ # Step 11. Call SNPs and INDELs with GATK W/O using BAQ. ############################################################ export EXHOME=/home/arq5x/cphg-home/projects/rs-exome export STEPNAME=rsex-varbaq export GENOME=/home/arq5x/cphg-home/shared/genomes/hg19/bwa/gatk/hg19_gatk.fa export BIN=/home/arq5x/cphg-home/shared/bin/GenomeAnalysisTK-1.4-9-g1f1233b export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=16000m:ncpus=12 -N $STEPNAME"; echo "cd $EXHOME; java -Xmx2g -jar $BIN/GenomeAnalysisTK.jar \ -T UnifiedGenotyper \ -glm BOTH \ --num_threads 10 \ -baq OFF \ -R $GENOME \ -I bam/all.conc.on.pos.dedup.realigned.bam \ -o varCalling/2012-Feb-01/all.raw.nobaq.vcf" | $QSUB;