Skip to content

Instantly share code, notes, and snippets.

@Dudemanguy
Created October 18, 2017 16:37
Show Gist options
  • Select an option

  • Save Dudemanguy/e4da2676accecad955341377119c6cfc to your computer and use it in GitHub Desktop.

Select an option

Save Dudemanguy/e4da2676accecad955341377119c6cfc to your computer and use it in GitHub Desktop.

Revisions

  1. Dudemanguy created this gist Oct 18, 2017.
    156 changes: 156 additions & 0 deletions diffgene_script.R
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,156 @@
    library(HTSFilter)
    library(edgeR)
    suppressPackageStartupMessages(library(DESeq2))
    options(max.width=9999999, width=10000)

    #define functions

    #user input
    print("Script for performing differential gene expression between two groups")
    count_raw <- readline("Enter the file containing the count matrix \n")
    count_matrix <- read.table(count_raw)
    sample_query <- readline("Press 1 to enter samples manually. Press 2 to read from a file (each entry on a separate line) \n")

    #inputting sample groups; provide manual option and reading from a file
    if (sample_query==1){
    columns_to_keep <- readline("Enter the desired samples as numbers separated by a comma prefaced by a lowercase h (e.g. h001,h002,h003,etc.) \n")
    sample_split <- strsplit(columns_to_keep, ",")
    count_subset <- subset(count_matrix, select=eval(parse(text=sample_split)))

    #ask for condition entry
    condition_query2 <- readline("Press 1 to enter the condition manually. Press 2 to read from a file (each entry on a separate line) \n")
    if (condition_query2==1){
    define_group <- readline("Enter the condition for each sample separated by commas (e.g. 1,1,2,2). Note that the order must be the same as the samples. \n")
    group_split <- strsplit(define_group, ",")
    group <- unlist(group_split)
    group <- factor(group, levels=unique(unlist(group_split)))
    }

    else if (condition_query2==2){
    define_group <- readline("Enter the file name containing the condition \n")
    group <- readLines(define_group)
    group <- factor(group, levels=unique(readLines(define_group)))
    }
    }

    #sample entry
    if (sample_query==2){
    sample_query2 <- readline("Press 1 if the file only contains the desired samples. Press 2 if the file also defines the condition (tab delimited) \n")
    if (sample_query2==1){
    print("File only contains sample information")
    sample_file <- readline("Enter the file name \n")
    columns_to_keep <- readLines(sample_file)
    count_subset <- subset(count_matrix, select = eval(parse(text=list(columns_to_keep))))
    condition_query <- readline("Press 1 to enter the condition manually. Press 2 to read from a file (each entry on a separate line) \n")

    #ask for condition entry
    if (condition_query==1){
    define_group <- readline("Enter the condition for each sample separated by commas (e.g. 1,1,2,2). Note that the order must be the same as the samples. \n")
    group_split <- strsplit(define_group, ",")
    group <- unlist(group_split)
    group <- factor(group, levels=unique(unlist(group_split)))
    }

    else if (condition_query==2){
    define_group <- readline("Enter the file name containing the condition \n")
    group <- read.table(define_group)
    group <- factor(group, levels=unique(read.table(define_group)))
    }
    }

    else if (sample_query==2){
    print("File contains sample and condition information")
    complete_file <- readline("Enter the file name \n")
    complete_table <- read.table(complete_file)
    columns_to_keep <- as.vector(complete_table[,1])
    count_subset <- subset(count_matrix, select = eval(parse(text=list(columns_to_keep))))
    group <- complete_table[,2]
    group <- factor(group, levels=unique(complete_table[,2]))
    }
    }


    #error handling

    if (exists("count_subset")==FALSE){
    print("Error. No count matrix found. Retry entry.")
    stop()
    }

    #get the number of samples in the first group
    group_table <- table(group)
    n <- group_table[[1]][1]

    #output directory for results

    dir_output <- readline("Enter the name of the output directory \n")
    dir.create(dir_output)

    #apply HTSFilter
    filter <- HTSFilter(count_subset, group, s.min=1, s.max=200, s.len=25)
    count_filter <- filter$filteredData

    #apply edgeR
    y <- DGEList(counts=count_filter, group=group)
    y <- calcNormFactors(y)
    y <- estimateDisp(y)
    et <- exactTest(y)

    #apply DESeq2
    group_dataframe <- data.frame(group)
    dds <- DESeqDataSetFromMatrix(countData=count_filter, colData=group_dataframe, design=~group)
    dds <- DESeq(dds)
    res <- results(dds)

    #create data frame for edgeR results
    et_results <- topTags(et, n=Inf, sort.by="none")
    write.table(et_results, file="et_output", sep="\t")
    et_table <- read.table("et_output")
    et_table <- et_table[-(2)]
    file.remove("et_output")

    #create data frame for DESeq2 results
    resOrdered <- res[order(res$padj),]
    write.table(resOrdered, file="DESeq_output", sep="\t")
    res_order <- read.table("DESeq_output")
    file.remove("DESeq_output")

    #grab counts, average them, add to data frame, and sort

    a <- count_filter[,1:n]
    b <- count_filter[,(n+1):ncol(count_filter)]
    c <- data.frame(rowMeans(a))
    d <- data.frame(rowMeans(b))
    et_table["Avg Ct A"] <- c
    et_table["Avg Ct B"] <- d
    et_table <- et_table[,c(4,5,1,2,3)]
    et_table <- et_table[order(et_table$FDR, decreasing=FALSE),]

    res_order["Avg Ct A"] <- c
    res_order["Avg Ct B"] <- d
    res_order <- res_order[,c(7,8,1,2,3,4,5,6)]

    #print results and save to file

    setwd(dir_output)

    dir.create("edgeR")
    setwd("edgeR")
    sink("dgelist_output")
    print(y)
    sink()
    write.table(et_table, file="full_results", sep="\t", quote=FALSE)
    et_sig <- et_table[which(et_table$FDR<0.05),]
    write.table(et_sig, file="significant_results", sep="\t", quote=FALSE)
    setwd("..")

    dir.create("DESeq2")
    setwd("DESeq2")
    sink("deseq2_summary")
    print(summary(res))
    sink()
    write.table(res_order, file="full_results", sep="\t", quote=FALSE)
    res_sig <- res_order[which(res_order$padj<0.05),]
    write.table(res_sig, file="significant_results", sep="\t", quote=FALSE)
    setwd("..")
    setwd("..")