Skip to content

Instantly share code, notes, and snippets.

@biomadeira
Created December 1, 2015 13:28
Show Gist options
  • Select an option

  • Save biomadeira/3a29dc2ce84c93b4bf57 to your computer and use it in GitHub Desktop.

Select an option

Save biomadeira/3a29dc2ce84c93b4bf57 to your computer and use it in GitHub Desktop.
Gets Ensembl gene ids from chromosome positions.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function
import os
import urllib
__author__ = 'Fábio Madeira'
__date__ = '28.05.2015'
def request_ensembl_gene_ids_from_chrom_position(inputFile, outputFile=None, verbose=False):
"""
Loads a list of chromosome positions and gets the respective Ensembl Gene ID.
:param inputFile: path to input File
:param outputFile: path to output File
:param verbose: verbosity level
:return: list of Ensembl IDs for each chromosome position (tab separated)
"""
if verbose:
print('Querying UCSC genome for gene mappings for chromosome positions.')
if outputFile is not None:
out = open(outputFile, 'w')
# load lines from file
with open(inputFile, 'r') as inlines:
for line in inlines:
line = line.rstrip()
if line != '':
chromosome = line.split(':')[0]
start = line.split(':')[1]
end = line.split(':')[1]
entry = line
# query ensembl ids from chromosome positions
try:
url_query = ("http://genome.ucsc.edu/cgi-bin/hgTables?",
"hgsid=428621279_mpFTnAQzSk4ckmyTaKRA12GKdN88&",
"jsh_pageVertPos=0&",
"clade=mammal&",
"org=Human&",
"db=hg19&",
"hgta_group=genes&",
"hgta_track=ensGene&",
"hgta_table=ensGene&",
"hgta_regionType=range" ,
"&position=chr%s:%s-%s&" % (chromosome, start, end),
"hgta_outputType=primaryTable&",
"boolshad.sendToGalaxy=0&",
"boolshad.sendToGreat=0&",
"boolshad.sendToGenomeSpace=0&",
"hgta_outFileName=&",
"hgta_compressType=none&",
"hgta_doTopSubmit=get+output")
response = urllib.urlopen("".join(url_query))
data = response.read()
# parse result
lines = data.rstrip()
lines = lines.split('\n')
ids = []
for line in lines[1:]:
line = line.rstrip()
ensembl_id = line.split('\t')[12]
if ensembl_id not in ids:
ids.append(ensembl_id)
if outputFile is not None:
out.write("%s\t%s\n" % (entry, ", ".join(ids)))
else:
print("%s\t%s" % (entry, ", ".join(ids)))
if verbose:
print("%s\t%s" % (entry, ", ".join(ids)))
except:
if outputFile is not None:
out.write("%s\t%s\n" % (entry, "-"))
else:
print("%s\t%s" % (entry, "-"))
if verbose:
print("%s\t%s" % (entry, "-"))
continue
if outputFile is not None:
out.close()
if verbose:
print('Done.')
return
if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser(description='Gets Ensembl gene ids from chromosome positions.')
parser.add_argument('-i', type=str, required=True,
dest='input', help='path of input File')
parser.add_argument('-o', type=str, required=False,
dest='output', help='path of output File. If not provided print to stdout.')
parser.add_argument('-v', '-verbose', dest='verbose', default=False,
help='turns verbosity on', action='store_true')
args = parser.parse_args()
if isinstance(args.input, str):
if isinstance(args.output, str):
request_ensembl_gene_ids_from_chrom_position(args.input, args.output, verbose=args.verbose)
else:
request_ensembl_gene_ids_from_chrom_position(args.input, None, verbose=args.verbose)
else:
print("...No input provided! Check the program\'s help with 'python %s -h'" \
% os.path.basename(__file__))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment