# The Snakefile that loads raw data and genome reference locally GENOME_FA = "griffithlab_brain_vs_uhr/GRCh38_Ens87_chr22_ERCC/chr22_ERCC92.fa" GENOME_GTF = "griffithlab_brain_vs_uhr/GRCh38_Ens87_chr22_ERCC/genes_chr22_ERCC92.gtf" HISAT2_INDEX_PREFIX = "hisat2_index/chr22_ERCC92" SAMPLES, *_ = glob_wildcards('griffithlab_brain_vs_uhr/HBR_UHR_ERCC_ds_10pc/{sample}.read1.fastq.gz') from pathlib import Path rule extract_genome_splice_sites: input: GENOME_GTF output: "hisat2_index/chr22_ERCC92.ss" shell: "hisat2_extract_splice_sites.py {input} > {output}" rule extract_genome_exons: input: GENOME_GTF output: "hisat2_index/chr22_ERCC92.exon" shell: "hisat2_extract_exons.py {input} > {output}" rule build_hisat_index: input: genome_fa=GENOME_FA, splice_sites="hisat2_index/chr22_ERCC92.ss", exons="hisat2_index/chr22_ERCC92.exon", output: expand(f"{HISAT2_INDEX_PREFIX}.{{ix}}.ht2", ix=range(1, 9)) log: "hisat2_index/build.log" threads: 8 shell: "hisat2-build -p {threads} {input.genome_fa} " "--ss {input.splice_sites} --exon {input.exons} {HISAT2_INDEX_PREFIX} " "2>{log}" rule align_hisat: input: hisat2_index=expand(f"{HISAT2_INDEX_PREFIX}.{{ix}}.ht2", ix=range(1, 9)), fastq1="griffithlab_brain_vs_uhr/HBR_UHR_ERCC_ds_10pc/{sample}.read1.fastq.gz", fastq2="griffithlab_brain_vs_uhr/HBR_UHR_ERCC_ds_10pc/{sample}.read2.fastq.gz", output: "align_hisat2/{sample}.bam" log: "align_hisat2/{sample}.log" threads: 4 shell: "hisat2 -p {threads} --dta -x {HISAT2_INDEX_PREFIX} " "-1 {input.fastq1} -2 {input.fastq2} 2>{log} | " "samtools sort -@ {threads} -o {output}" rule align_all_samples: input: expand("align_hisat2/{sample}.bam", sample=SAMPLES) rule stringtie_assemble: input: genome_gtf=GENOME_GTF, bam="align_hisat2/{sample}.bam" output: "stringtie/assembled/{sample}.gtf" threads: 4 shell: "stringtie -p {threads} -G {input.genome_gtf} " "-o {output} -l {wildcards.sample} {input.bam}" rule stringtie_merge_list: input: expand("stringtie/assembled/{sample}.gtf", sample=SAMPLES) output: "stringtie/merged_list.txt" run: with open(output[0], 'w') as f: for gtf in input: print(Path(gtf).resolve(), file=f) rule stringtie_merge: input: genome_gtf=GENOME_GTF, merged_list="stringtie/merged_list.txt", sample_gtfs=expand("stringtie/assembled/{sample}.gtf", sample=SAMPLES) output: "stringtie/merged.gtf" threads: 4 shell: "stringtie --merge -p {threads} -G {input.genome_gtf} " "-o {output} {input.merged_list}" rule stringtie_quant: input: merged_gtf="stringtie/merged.gtf", sample_bam="align_hisat2/{sample}.bam" output: gtf="stringtie/quant/{sample}/{sample}.gtf", gene_abund_tab="stringtie/quant/{sample}/gene_abund.tab", ctabs=expand( "stringtie/quant/{{sample}}/{name}.ctab", name=['i2t', 'e2t', 'i_data', 'e_data', 't_data'] ) threads: 4 shell: "stringtie -e -B -p {threads} -G {input.merged_gtf} " "-A {output.gene_abund_tab} -o {output.gtf} {input.sample_bam}" rule quant_all_samples: input: expand("stringtie/quant/{sample}/{sample}.gtf", sample=SAMPLES) rule clean: shell: "rm -rf align_hisat2 hisat2_index stringtie"