Source code for genepi.step1_downloadUCSCDB

# -*- coding: utf-8 -*-
"""
Created on Feb 2018

@author: Chester (Yu-Chuan Chang)
"""

""""""""""""""""""""""""""""""
# import libraries
""""""""""""""""""""""""""""""
import os
import pymysql

""""""""""""""""""""""""""""""
# main function
""""""""""""""""""""""""""""""
# database schema: http://hgdownload.cse.ucsc.edu/goldenpath/hg19/database/
# human genome build could be: hg18, hg19, hg38, etc.
[docs]def DownloadUCSCDB(str_outputFilePath = os.path.dirname(os.path.abspath(__file__)), str_hgbuild = "hg19"): """ To obtain the gene information such as official gene symbols and genomic coordinates, this function is for retrieving kgXref and knownGene data table from the UCSC human genome annotation database Args: str_outputFilePath (str): File path of output database str_hgbuild (str): Genome build (eg. "hg19") Returns: - Expected Success Response:: "step1: Down load UCSC Database. DONE!" """ ### create connection conv = {pymysql.constants.FIELD_TYPE.LONG: int} conn = pymysql.Connect(host = "genome-mysql.soe.ucsc.edu", user = "genome", passwd = "",db = str_hgbuild, local_infile = 1, conv = conv) ### execute sql command str_sqlCommand = "SELECT chr, CASE WHEN strand='+' THEN txStart-1000 ELSE txStart END AS txStart, CASE WHEN strand='-' THEN txEnd+1000 ELSE txEnd END AS txEnd, strand, geneSymbol FROM " str_sqlCommand = str_sqlCommand + "(SELECT REPLACE(chr, 'chr', '') AS chr, txStart, txEnd, strand, geneSymbol, MAX(ABS(txEnd-txStart)) FROM ( " str_sqlCommand = str_sqlCommand + "SELECT knownGene.chrom AS chr, knownGene.txStart AS txStart, knownGene.txEnd AS txEnd, knownGene.strand AS strand, kgXref.geneSymbol AS geneSymbol FROM kgXref INNER JOIN knownGene ON kgXref.kgID=knownGene.name WHERE LEFT(kgXref.mRNA, 2) IN ('NR', 'NM')) AS L1 " str_sqlCommand = str_sqlCommand + "GROUP BY geneSymbol) AS L2 WHERE LEFT(chr, 1) NOT IN ('X', 'Y', 'M', 'U') AND chr NOT LIKE '%\_%' ORDER BY CAST(chr AS UNSIGNED), txStart" cur = conn.cursor() cur.execute(str_sqlCommand) db_result = cur.fetchall() cur.close() conn.close() ### output database with open(str_outputFilePath + "/UCSCGenomeDatabase.txt", "w") as file_outputFile: for item in db_result: file_outputFile.writelines(",".join(item) + "\n") print("step1: Down load UCSC Database. DONE!")
""" list_output = [] with open("/Users/chester/Data/Parkinson/genepi/genepi/UCSCGenomeDatabase.txt", "r") as file_input: list_line_pre = file_input.readline().strip().split(",") list_output.append(",".join(list_line_pre)) for line in file_input.readlines(): list_line_this = line.strip().split(",") if list_line_pre[0] != list_line_this[0]: list_line_pre = list_line_this list_output.append(",".join(list_line_this)) continue if list_line_pre[2] <= list_line_this[1]: list_inter = [list_line_this[0], str(int(list_line_pre[2])+1), str(int(list_line_this[1])-1), "=", list_line_pre[4] + "=" + list_line_this[4]] list_line_pre = list_line_this list_output.append(",".join(list_inter)) list_output.append(",".join(list_line_this)) else: list_line_pre = list_line_this list_output.append(",".join(list_line_this)) with open("/Users/chester/Data/Parkinson/hg19_inter.txt", "w") as file_output: for line in list_output: file_output.writelines(line + "\n") """