# -*- coding: utf-8 -*-
"""
Created on Feb 2018
@author: Chester (Yu-Chuan Chang)
"""
""""""""""""""""""""""""""""""
# import libraries
""""""""""""""""""""""""""""""
import os
import numpy as np
""""""""""""""""""""""""""""""
# main function
""""""""""""""""""""""""""""""
[docs]def SplitMegaGene(list_snpsOnGene, int_window, int_step, str_outputFilePath, str_outputFileName):
"""
In order to extract genetic features for a gene, this function used the start and end positions of each gene from the local UCSC database to split the genetic features. Then, generate the .GEN files for each gene in the folder named snpSubsets.
Args:
list_snpsOnGene (list): A list contains SNPs on a gene
int_window (int): The size of the sliding window
int_step (int): The step of the sliding window
str_outputFilePath (str): File path of output file
str_outputFileName (str): File name of output file
Returns:
None
"""
### write to gen file if this gene is not mega gene
int_total_window = int((len(list_snpsOnGene)-int_window)/int_step)
if int_total_window <= 0:
with open(os.path.join(str_outputFilePath, str_outputFileName + ".gen"), "w") as file_outputFile:
for item in list_snpsOnGene:
file_outputFile.writelines(item)
### write gen file of each window on current gene (output file name: geneSymbol_numOfSNPOnGene@windowNum.gen)
else:
for idx_w in range(int_total_window):
with open(os.path.join(str_outputFilePath, str_outputFileName.split("_")[0] + "@" + str(idx_w) + "_" + str_outputFileName.split("_")[1] + ".gen"), "w") as file_outputFile:
for item in list_snpsOnGene[int_step*idx_w:int_step*idx_w+int_window]:
file_outputFile.writelines(item)
### write reminder SNPs to gen file
with open(os.path.join(str_outputFilePath, str_outputFileName.split("_")[0] + "@" + str(int_total_window) + "_" + str_outputFileName.split("_")[1] + ".gen"), "w") as file_outputFile:
for item in list_snpsOnGene[int_step*int_total_window:]:
file_outputFile.writelines(item)
return
[docs]def SplitByGene(str_inputFileName_genotype, str_inputFileName_UCSCDB = os.path.dirname(os.path.abspath(__file__)) + "/UCSCGenomeDatabase.txt", str_outputFilePath = ""):
"""
In order to extract genetic features for a gene, this function used the start and end positions of each gene from the local UCSC database to split the genetic features. Then, generate the .GEN files for each gene in the folder named snpSubsets.
Args:
str_inputFileName_genotype (str): File name of input genotype data
str_inputFileName_UCSCDB (str): File name of input genome regions
str_outputFilePath (str): File path of output file
Returns:
- Expected Success Response::
"step3: Split by gene. DONE!"
Warnings:
"Warning of step3: .gen file should be sorted by chromosome and position"
"""
print("Warning of step3: .gen file should be sorted by chromosome and position")
int_window = 1000
int_step = 200
### set default output path
if str_outputFilePath == "":
str_outputFilePath = os.path.join(os.path.dirname(str_inputFileName_genotype), "snpSubsets")
### if output folder doesn't exist then create it
if not os.path.exists(str_outputFilePath):
os.makedirs(str_outputFilePath)
### load UCSC Genome Database
list_UCSCGenomeDatabase = []
with open(str_inputFileName_UCSCDB, "r") as file_inputFile:
for line in file_inputFile:
list_UCSCGenomeDatabase.append(line.strip().split(","))
np_UCSCGenomeDatabase = np.array(list_UCSCGenomeDatabase)
### scan all snp
with open(str_inputFileName_genotype, "r") as file_inputFile:
idx_gene = 0
list_snpsOnGene = []
for line in file_inputFile:
### get information of each snp
list_thisSnp = line.strip().split(" ")
int_chromosome = int(list_thisSnp[0])
int_position = int(list_thisSnp[2])
### current gene is in next chromosome
if int_chromosome < int(np_UCSCGenomeDatabase[idx_gene, 0]):
continue
### current snp of genotype data is in next chromosome
elif int_chromosome > int(np_UCSCGenomeDatabase[idx_gene, 0]):
if len(list_snpsOnGene) != 0:
#### write gen file of current gene (output file name: geneSymbol_numOfSNPOnGene.gen)
#str_outputFileName = str(np_UCSCGenomeDatabase[idx_gene, 4]) + "_" + str(len(list_snpsOnGene)) + ".gen"
#with open(os.path.join(str_outputFilePath, str_outputFileName), "w") as file_outputFile:
# for item in list_snpsOnGene:
# file_outputFile.writelines(item)
str_outputFileName = str(np_UCSCGenomeDatabase[idx_gene, 4]) + "_" + str(len(list_snpsOnGene))
SplitMegaGene(list_snpsOnGene, int_window, int_step, str_outputFilePath, str_outputFileName)
list_snpsOnGene = []
while int_chromosome > int(np_UCSCGenomeDatabase[idx_gene, 0]):
### jump to next gene
idx_gene = idx_gene + 1
### if no next gene then break
if idx_gene == np_UCSCGenomeDatabase.shape[0]:
break
### current snp on next gene
if int(np_UCSCGenomeDatabase[idx_gene, 1]) <= int_position and int_position <= int(np_UCSCGenomeDatabase[idx_gene, 2]) and int_chromosome == int(np_UCSCGenomeDatabase[idx_gene, 0]):
list_snpsOnGene.append(line)
### chromosome numbers of current snp and gene are match
else:
### current snp on current gene
if int(np_UCSCGenomeDatabase[idx_gene, 1]) <= int_position and int_position <= int(np_UCSCGenomeDatabase[idx_gene, 2]):
list_snpsOnGene.append(line)
### snp position exceed this gene
elif int_position > int(np_UCSCGenomeDatabase[idx_gene, 2]):
if len(list_snpsOnGene) != 0:
#### write gen file of current gene (output file name: geneSymbol_numOfSNPOnGene.gen)
#str_outputFileName = str(np_UCSCGenomeDatabase[idx_gene, 4]) + "_" + str(len(list_snpsOnGene)) + ".gen"
#with open(os.path.join(str_outputFilePath, str_outputFileName), "w") as file_outputFile:
# for item in list_snpsOnGene:
# file_outputFile.writelines(item)
str_outputFileName = str(np_UCSCGenomeDatabase[idx_gene, 4]) + "_" + str(len(list_snpsOnGene))
SplitMegaGene(list_snpsOnGene, int_window, int_step, str_outputFilePath, str_outputFileName)
list_snpsOnGene = []
while int_position > int(np_UCSCGenomeDatabase[idx_gene, 2]) and int_chromosome == int(np_UCSCGenomeDatabase[idx_gene, 0]):
### jump to next gene
idx_gene = idx_gene + 1
### if no next gene then break
if idx_gene == np_UCSCGenomeDatabase.shape[0]:
break
### snp on next gene
if int(np_UCSCGenomeDatabase[idx_gene, 1]) <= int_position and int_position <= int(np_UCSCGenomeDatabase[idx_gene, 2]) and int_chromosome == int(np_UCSCGenomeDatabase[idx_gene, 0]):
list_snpsOnGene.append(line)
### if the index of gene out of the boundary of DB then break
if idx_gene >= np_UCSCGenomeDatabase.shape[0]:
break
print("step3: Split by gene. DONE!")