Source code for genepi.step3_splitByGene

# -*- coding: utf-8 -*-
"""
Created on Feb 2018

@author: Chester (Yu-Chuan Chang)
"""

""""""""""""""""""""""""""""""
# import libraries
""""""""""""""""""""""""""""""
import os
import numpy as np

""""""""""""""""""""""""""""""
# main function
""""""""""""""""""""""""""""""
[docs]def SplitMegaGene(list_snpsOnGene, int_window, int_step, str_outputFilePath, str_outputFileName):
    """

    In order to extract genetic features for a gene, this function used the start and end positions of each gene from the local UCSC database to split the genetic features. Then, generate the .GEN files for each gene in the folder named snpSubsets.

    Args:
        list_snpsOnGene (list): A list contains SNPs on a gene
        int_window (int): The size of the sliding window
        int_step (int): The step of the sliding window
        str_outputFilePath (str): File path of output file
        str_outputFileName (str): File name of output file

    Returns:
        None
    
    """
    
    ### write to gen file if this gene is not mega gene
    int_total_window = int((len(list_snpsOnGene)-int_window)/int_step)
    if int_total_window <= 0:
        with open(os.path.join(str_outputFilePath, str_outputFileName + ".gen"), "w") as file_outputFile:
            for item in list_snpsOnGene:
                file_outputFile.writelines(item)

    ### write gen file of each window on current gene (output file name: geneSymbol_numOfSNPOnGene@windowNum.gen)
    else: 
        for idx_w in range(int_total_window):
            with open(os.path.join(str_outputFilePath, str_outputFileName.split("_")[0] + "@" + str(idx_w) + "_" + str_outputFileName.split("_")[1] + ".gen"), "w") as file_outputFile:
                for item in list_snpsOnGene[int_step*idx_w:int_step*idx_w+int_window]:
                    file_outputFile.writelines(item)
        
        ### write reminder SNPs to gen file
        with open(os.path.join(str_outputFilePath, str_outputFileName.split("_")[0] + "@" + str(int_total_window) + "_" + str_outputFileName.split("_")[1] + ".gen"), "w") as file_outputFile:
            for item in list_snpsOnGene[int_step*int_total_window:]:
                file_outputFile.writelines(item)
    
    return

[docs]def SplitByGene(str_inputFileName_genotype, str_inputFileName_UCSCDB = os.path.dirname(os.path.abspath(__file__)) + "/UCSCGenomeDatabase.txt", str_outputFilePath = ""):
    """

    In order to extract genetic features for a gene, this function used the start and end positions of each gene from the local UCSC database to split the genetic features. Then, generate the .GEN files for each gene in the folder named snpSubsets.

    Args:
        str_inputFileName_genotype (str): File name of input genotype data
        str_inputFileName_UCSCDB (str): File name of input genome regions
        str_outputFilePath (str): File path of output file

    Returns:
        - Expected Success Response::

            "step3: Split by gene. DONE!"
    
    Warnings:
        "Warning of step3: .gen file should be sorted by chromosome and position"
    
    """
    
    print("Warning of step3: .gen file should be sorted by chromosome and position")

    int_window = 1000
    int_step = 200
    
    ### set default output path
    if str_outputFilePath == "":
        str_outputFilePath = os.path.join(os.path.dirname(str_inputFileName_genotype), "snpSubsets")
    ### if output folder doesn't exist then create it
    if not os.path.exists(str_outputFilePath):
        os.makedirs(str_outputFilePath)
    
    ### load UCSC Genome Database
    list_UCSCGenomeDatabase = []
    with open(str_inputFileName_UCSCDB, "r") as file_inputFile:
        for line in file_inputFile:
            list_UCSCGenomeDatabase.append(line.strip().split(","))
    np_UCSCGenomeDatabase = np.array(list_UCSCGenomeDatabase)
    
    ### scan all snp
    with open(str_inputFileName_genotype, "r") as file_inputFile:
        idx_gene = 0
        list_snpsOnGene = []
        for line in file_inputFile:
            ### get information of each snp
            list_thisSnp = line.strip().split(" ")
            int_chromosome = int(list_thisSnp[0])
            int_position = int(list_thisSnp[2])

            ### current gene is in next chromosome
            if int_chromosome < int(np_UCSCGenomeDatabase[idx_gene, 0]):
                continue
            ### current snp of genotype data is in next chromosome
            elif int_chromosome > int(np_UCSCGenomeDatabase[idx_gene, 0]):
                if len(list_snpsOnGene) != 0:
                    #### write gen file of current gene (output file name: geneSymbol_numOfSNPOnGene.gen)
                    #str_outputFileName = str(np_UCSCGenomeDatabase[idx_gene, 4]) + "_" + str(len(list_snpsOnGene)) + ".gen"
                    #with open(os.path.join(str_outputFilePath, str_outputFileName), "w") as file_outputFile:
                    #    for item in list_snpsOnGene:
                    #        file_outputFile.writelines(item)
                    str_outputFileName = str(np_UCSCGenomeDatabase[idx_gene, 4]) + "_" + str(len(list_snpsOnGene))
                    SplitMegaGene(list_snpsOnGene, int_window, int_step, str_outputFilePath, str_outputFileName)
                list_snpsOnGene = []
                while int_chromosome > int(np_UCSCGenomeDatabase[idx_gene, 0]):
                    ### jump to next gene
                    idx_gene = idx_gene + 1
                    ### if no next gene then break
                    if idx_gene == np_UCSCGenomeDatabase.shape[0]:
                        break
                    ### current snp on next gene
                    if int(np_UCSCGenomeDatabase[idx_gene, 1]) <= int_position and int_position <= int(np_UCSCGenomeDatabase[idx_gene, 2]) and int_chromosome == int(np_UCSCGenomeDatabase[idx_gene, 0]):
                        list_snpsOnGene.append(line)
    
            ### chromosome numbers of current snp and gene are match
            else:
                ### current snp on current gene
                if int(np_UCSCGenomeDatabase[idx_gene, 1]) <= int_position and int_position <= int(np_UCSCGenomeDatabase[idx_gene, 2]):
                    list_snpsOnGene.append(line)
                ### snp position exceed this gene
                elif int_position > int(np_UCSCGenomeDatabase[idx_gene, 2]):
                    if len(list_snpsOnGene) != 0:
                        #### write gen file of current gene (output file name: geneSymbol_numOfSNPOnGene.gen)
                        #str_outputFileName = str(np_UCSCGenomeDatabase[idx_gene, 4]) + "_" + str(len(list_snpsOnGene)) + ".gen"
                        #with open(os.path.join(str_outputFilePath, str_outputFileName), "w") as file_outputFile:
                        #    for item in list_snpsOnGene:
                        #        file_outputFile.writelines(item)
                        str_outputFileName = str(np_UCSCGenomeDatabase[idx_gene, 4]) + "_" + str(len(list_snpsOnGene))
                        SplitMegaGene(list_snpsOnGene, int_window, int_step, str_outputFilePath, str_outputFileName)
                    list_snpsOnGene = []
                    while int_position > int(np_UCSCGenomeDatabase[idx_gene, 2]) and int_chromosome == int(np_UCSCGenomeDatabase[idx_gene, 0]):
                        ### jump to next gene
                        idx_gene = idx_gene + 1
                        ### if no next gene then break
                        if idx_gene == np_UCSCGenomeDatabase.shape[0]:
                            break
                        ### snp on next gene
                        if int(np_UCSCGenomeDatabase[idx_gene, 1]) <= int_position and int_position <= int(np_UCSCGenomeDatabase[idx_gene, 2]) and int_chromosome == int(np_UCSCGenomeDatabase[idx_gene, 0]):
                            list_snpsOnGene.append(line)

            ### if the index of gene out of the boundary of DB then break
            if idx_gene >= np_UCSCGenomeDatabase.shape[0]:
                break
    
    print("step3: Split by gene. DONE!")