Created
December 1, 2015 13:28
-
-
Save biomadeira/3a29dc2ce84c93b4bf57 to your computer and use it in GitHub Desktop.
Revisions
-
biomadeira created this gist
Dec 1, 2015 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,112 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- from __future__ import print_function import os import urllib __author__ = 'Fábio Madeira' __date__ = '28.05.2015' def request_ensembl_gene_ids_from_chrom_position(inputFile, outputFile=None, verbose=False): """ Loads a list of chromosome positions and gets the respective Ensembl Gene ID. :param inputFile: path to input File :param outputFile: path to output File :param verbose: verbosity level :return: list of Ensembl IDs for each chromosome position (tab separated) """ if verbose: print('Querying UCSC genome for gene mappings for chromosome positions.') if outputFile is not None: out = open(outputFile, 'w') # load lines from file with open(inputFile, 'r') as inlines: for line in inlines: line = line.rstrip() if line != '': chromosome = line.split(':')[0] start = line.split(':')[1] end = line.split(':')[1] entry = line # query ensembl ids from chromosome positions try: url_query = ("http://genome.ucsc.edu/cgi-bin/hgTables?", "hgsid=428621279_mpFTnAQzSk4ckmyTaKRA12GKdN88&", "jsh_pageVertPos=0&", "clade=mammal&", "org=Human&", "db=hg19&", "hgta_group=genes&", "hgta_track=ensGene&", "hgta_table=ensGene&", "hgta_regionType=range" , "&position=chr%s:%s-%s&" % (chromosome, start, end), "hgta_outputType=primaryTable&", "boolshad.sendToGalaxy=0&", "boolshad.sendToGreat=0&", "boolshad.sendToGenomeSpace=0&", "hgta_outFileName=&", "hgta_compressType=none&", "hgta_doTopSubmit=get+output") response = urllib.urlopen("".join(url_query)) data = response.read() # parse result lines = data.rstrip() lines = lines.split('\n') ids = [] for line in lines[1:]: line = line.rstrip() ensembl_id = line.split('\t')[12] if ensembl_id not in ids: ids.append(ensembl_id) if outputFile is not None: out.write("%s\t%s\n" % (entry, ", ".join(ids))) else: print("%s\t%s" % (entry, ", ".join(ids))) if verbose: print("%s\t%s" % (entry, ", ".join(ids))) except: if outputFile is not None: out.write("%s\t%s\n" % (entry, "-")) else: print("%s\t%s" % (entry, "-")) if verbose: print("%s\t%s" % (entry, "-")) continue if outputFile is not None: out.close() if verbose: print('Done.') return if __name__ == '__main__': import argparse parser = argparse.ArgumentParser(description='Gets Ensembl gene ids from chromosome positions.') parser.add_argument('-i', type=str, required=True, dest='input', help='path of input File') parser.add_argument('-o', type=str, required=False, dest='output', help='path of output File. If not provided print to stdout.') parser.add_argument('-v', '-verbose', dest='verbose', default=False, help='turns verbosity on', action='store_true') args = parser.parse_args() if isinstance(args.input, str): if isinstance(args.output, str): request_ensembl_gene_ids_from_chrom_position(args.input, args.output, verbose=args.verbose) else: request_ensembl_gene_ids_from_chrom_position(args.input, None, verbose=args.verbose) else: print("...No input provided! Check the program\'s help with 'python %s -h'" \ % os.path.basename(__file__))