############# ## Download the 30X hg19-aligned bam from Google's public sequencing of HG002 ## and the respective BAI file. ############# wget https://storage.googleapis.com/brain-genomics-public/research/sequencing/grch37/bam/hiseqx/wgs_pcr_free/30x/HG002.hiseqx.pcr-free.30x.dedup.grch37.bam wget https://storage.googleapis.com/brain-genomics-public/research/sequencing/grch37/bam/hiseqx/wgs_pcr_free/30x/HG002.hiseqx.pcr-free.30x.dedup.grch37.bam.bai ############# ## Prepare the references so we can realign reads ############# ## Download the original hg19 / hsd37d5 reference ## and create and FAI index wget ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz gunzip hs37d5.fa.gz samtools faidx hs37d5.fa ## Download GRCh38 wget ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz gunzip GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz ## Make a .fai index using samtools faidx samtools faidx GCA_000001405.15_GRCh38_no_alt_analysis_set.fna ## Create the BWA indices bwa index GCA_000001405.15_GRCh38_no_alt_analysis_set.fna ## Download the Gold Standard indels from 1kg to use as your known-sites file. wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz ## Also grab the tabix index for the file wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi ############ ## Run the bam2fq tool to extract reads from the BAM file ## Adjust the --num-threads argument to reflect the number of cores on your system. ## With 8 GPUs and 64 vCPUs this should take ~45 minutes. ############ time pbrun bam2fq \ --ref hs37d5.fa \ --in-bam HG002.hiseqx.pcr-free.30x.dedup.grch37.bam \ --out-prefix HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq \ --num-threads 64 ############## ## Run the fq2bam tool to align reads to GRCh38 ############## time pbrun fq2bam \ --in-fq HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq_1.fastq.gz HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq_1.fastq.gz \ --ref Homo_sapiens_assembly38.fasta \ --knownSites Mills_and_1000G_gold_standard.indels.hg38.vcf.gz \ --out-bam HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.bam \ --out-recal-file HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.BQSR-REPORT.txt