Created
December 1, 2015 13:28
-
-
Save biomadeira/3a29dc2ce84c93b4bf57 to your computer and use it in GitHub Desktop.
Gets Ensembl gene ids from chromosome positions.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| # -*- coding: utf-8 -*- | |
| from __future__ import print_function | |
| import os | |
| import urllib | |
| __author__ = 'Fábio Madeira' | |
| __date__ = '28.05.2015' | |
| def request_ensembl_gene_ids_from_chrom_position(inputFile, outputFile=None, verbose=False): | |
| """ | |
| Loads a list of chromosome positions and gets the respective Ensembl Gene ID. | |
| :param inputFile: path to input File | |
| :param outputFile: path to output File | |
| :param verbose: verbosity level | |
| :return: list of Ensembl IDs for each chromosome position (tab separated) | |
| """ | |
| if verbose: | |
| print('Querying UCSC genome for gene mappings for chromosome positions.') | |
| if outputFile is not None: | |
| out = open(outputFile, 'w') | |
| # load lines from file | |
| with open(inputFile, 'r') as inlines: | |
| for line in inlines: | |
| line = line.rstrip() | |
| if line != '': | |
| chromosome = line.split(':')[0] | |
| start = line.split(':')[1] | |
| end = line.split(':')[1] | |
| entry = line | |
| # query ensembl ids from chromosome positions | |
| try: | |
| url_query = ("http://genome.ucsc.edu/cgi-bin/hgTables?", | |
| "hgsid=428621279_mpFTnAQzSk4ckmyTaKRA12GKdN88&", | |
| "jsh_pageVertPos=0&", | |
| "clade=mammal&", | |
| "org=Human&", | |
| "db=hg19&", | |
| "hgta_group=genes&", | |
| "hgta_track=ensGene&", | |
| "hgta_table=ensGene&", | |
| "hgta_regionType=range" , | |
| "&position=chr%s:%s-%s&" % (chromosome, start, end), | |
| "hgta_outputType=primaryTable&", | |
| "boolshad.sendToGalaxy=0&", | |
| "boolshad.sendToGreat=0&", | |
| "boolshad.sendToGenomeSpace=0&", | |
| "hgta_outFileName=&", | |
| "hgta_compressType=none&", | |
| "hgta_doTopSubmit=get+output") | |
| response = urllib.urlopen("".join(url_query)) | |
| data = response.read() | |
| # parse result | |
| lines = data.rstrip() | |
| lines = lines.split('\n') | |
| ids = [] | |
| for line in lines[1:]: | |
| line = line.rstrip() | |
| ensembl_id = line.split('\t')[12] | |
| if ensembl_id not in ids: | |
| ids.append(ensembl_id) | |
| if outputFile is not None: | |
| out.write("%s\t%s\n" % (entry, ", ".join(ids))) | |
| else: | |
| print("%s\t%s" % (entry, ", ".join(ids))) | |
| if verbose: | |
| print("%s\t%s" % (entry, ", ".join(ids))) | |
| except: | |
| if outputFile is not None: | |
| out.write("%s\t%s\n" % (entry, "-")) | |
| else: | |
| print("%s\t%s" % (entry, "-")) | |
| if verbose: | |
| print("%s\t%s" % (entry, "-")) | |
| continue | |
| if outputFile is not None: | |
| out.close() | |
| if verbose: | |
| print('Done.') | |
| return | |
| if __name__ == '__main__': | |
| import argparse | |
| parser = argparse.ArgumentParser(description='Gets Ensembl gene ids from chromosome positions.') | |
| parser.add_argument('-i', type=str, required=True, | |
| dest='input', help='path of input File') | |
| parser.add_argument('-o', type=str, required=False, | |
| dest='output', help='path of output File. If not provided print to stdout.') | |
| parser.add_argument('-v', '-verbose', dest='verbose', default=False, | |
| help='turns verbosity on', action='store_true') | |
| args = parser.parse_args() | |
| if isinstance(args.input, str): | |
| if isinstance(args.output, str): | |
| request_ensembl_gene_ids_from_chrom_position(args.input, args.output, verbose=args.verbose) | |
| else: | |
| request_ensembl_gene_ids_from_chrom_position(args.input, None, verbose=args.verbose) | |
| else: | |
| print("...No input provided! Check the program\'s help with 'python %s -h'" \ | |
| % os.path.basename(__file__)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment