Skip to content

Instantly share code, notes, and snippets.

@biomadeira
Created December 1, 2015 13:28
Show Gist options
  • Select an option

  • Save biomadeira/3a29dc2ce84c93b4bf57 to your computer and use it in GitHub Desktop.

Select an option

Save biomadeira/3a29dc2ce84c93b4bf57 to your computer and use it in GitHub Desktop.

Revisions

  1. biomadeira created this gist Dec 1, 2015.
    112 changes: 112 additions & 0 deletions ensembl_ids_from_chrom_pos
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,112 @@
    #!/usr/bin/env python
    # -*- coding: utf-8 -*-

    from __future__ import print_function
    import os
    import urllib

    __author__ = 'Fábio Madeira'
    __date__ = '28.05.2015'


    def request_ensembl_gene_ids_from_chrom_position(inputFile, outputFile=None, verbose=False):
    """
    Loads a list of chromosome positions and gets the respective Ensembl Gene ID.
    :param inputFile: path to input File
    :param outputFile: path to output File
    :param verbose: verbosity level
    :return: list of Ensembl IDs for each chromosome position (tab separated)
    """

    if verbose:
    print('Querying UCSC genome for gene mappings for chromosome positions.')

    if outputFile is not None:
    out = open(outputFile, 'w')

    # load lines from file
    with open(inputFile, 'r') as inlines:
    for line in inlines:
    line = line.rstrip()
    if line != '':
    chromosome = line.split(':')[0]
    start = line.split(':')[1]
    end = line.split(':')[1]
    entry = line

    # query ensembl ids from chromosome positions
    try:
    url_query = ("http://genome.ucsc.edu/cgi-bin/hgTables?",
    "hgsid=428621279_mpFTnAQzSk4ckmyTaKRA12GKdN88&",
    "jsh_pageVertPos=0&",
    "clade=mammal&",
    "org=Human&",
    "db=hg19&",
    "hgta_group=genes&",
    "hgta_track=ensGene&",
    "hgta_table=ensGene&",
    "hgta_regionType=range" ,
    "&position=chr%s:%s-%s&" % (chromosome, start, end),
    "hgta_outputType=primaryTable&",
    "boolshad.sendToGalaxy=0&",
    "boolshad.sendToGreat=0&",
    "boolshad.sendToGenomeSpace=0&",
    "hgta_outFileName=&",
    "hgta_compressType=none&",
    "hgta_doTopSubmit=get+output")
    response = urllib.urlopen("".join(url_query))
    data = response.read()
    # parse result
    lines = data.rstrip()
    lines = lines.split('\n')
    ids = []
    for line in lines[1:]:
    line = line.rstrip()
    ensembl_id = line.split('\t')[12]
    if ensembl_id not in ids:
    ids.append(ensembl_id)

    if outputFile is not None:
    out.write("%s\t%s\n" % (entry, ", ".join(ids)))
    else:
    print("%s\t%s" % (entry, ", ".join(ids)))
    if verbose:
    print("%s\t%s" % (entry, ", ".join(ids)))
    except:
    if outputFile is not None:
    out.write("%s\t%s\n" % (entry, "-"))
    else:
    print("%s\t%s" % (entry, "-"))
    if verbose:
    print("%s\t%s" % (entry, "-"))
    continue

    if outputFile is not None:
    out.close()

    if verbose:
    print('Done.')
    return

    if __name__ == '__main__':

    import argparse
    parser = argparse.ArgumentParser(description='Gets Ensembl gene ids from chromosome positions.')

    parser.add_argument('-i', type=str, required=True,
    dest='input', help='path of input File')
    parser.add_argument('-o', type=str, required=False,
    dest='output', help='path of output File. If not provided print to stdout.')
    parser.add_argument('-v', '-verbose', dest='verbose', default=False,
    help='turns verbosity on', action='store_true')

    args = parser.parse_args()
    if isinstance(args.input, str):
    if isinstance(args.output, str):
    request_ensembl_gene_ids_from_chrom_position(args.input, args.output, verbose=args.verbose)
    else:
    request_ensembl_gene_ids_from_chrom_position(args.input, None, verbose=args.verbose)
    else:
    print("...No input provided! Check the program\'s help with 'python %s -h'" \
    % os.path.basename(__file__))