Source code for genepi.GenEpi

# -*- coding: utf-8 -*-
"""
Created on Apr 2019

@author: Chester (Yu-Chuan Chang)
"""

""""""""""""""""""""""""""""""
# import libraries
""""""""""""""""""""""""""""""
import argparse
import time
import os
import sys
import numpy as np
import multiprocessing as mp
from . import *

""""""""""""""""""""""""""""""
# define functions 
""""""""""""""""""""""""""""""
[docs]def ArgumentsParser():
    """

    To obtain and parse the arguments from user.

    Args:
        None

    Returns:
        argparse.ArgumentParser
    
    """

    ### define arguments
    str_description = ''
    'GenEpi is a package to uncover epistasis associated with phenotypes by a machine learning approach, '
    'developed by Yu-Chuan Chang at c4Lab of National Taiwan University. '
    'Github: https://github.com/Chester75321/GenEpi '
    'Reference: GenEpi: Gene-based Epistasis Discovery Using Machine Learning '
    '(https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-020-3368-2)'
    '(https://www.biorxiv.org/content/10.1101/421719v5) '
    parser = argparse.ArgumentParser(prog='GenEpi', description=str_description)
    
    ### define arguments for app
    parser.add_argument('--app', action='store_true', default=False, help="open AppGenEpi")

    ### define arguments for I/O
    parser.add_argument("-g", required=True, help="filename of the input .gen file")
    parser.add_argument("-p", required=True, help="filename of the input phenotype")
    parser.add_argument("-s", required=False, help="self-defined genome regions")
    parser.add_argument("-o", required=False, help="output file path")
    
    ### define arguments for modeling
    parser.add_argument("-m", required=False, default="c", choices=["c", "r"], help="choose model type: c for classification; r for regression")
    parser.add_argument("-k", required=False, default=2, help="k of k-fold cross validation")
    parser.add_argument("-t", required=False, default=mp.cpu_count(), help="number of threads")
    
    ### define arguments for step1_downloadUCSCDB
    parser_group_1 = parser.add_argument_group("update UCSC database")
    parser_group_1.add_argument('--updatedb', action='store_true', default=False, help="enable this function")
    parser_group_1.add_argument("-b", required=False, default="hg19", choices=["hg19", "hg38"], help="human genome build")
    
    ### define arguments for step2_estimateLD
    parser_group_2 = parser.add_argument_group("compress data by LD block")
    parser_group_2.add_argument('--compressld', action='store_true', default=False, help="enable this function")
    parser_group_2.add_argument("-d", required = False, default=0.9, type=float, help="threshold for compression: D prime")
    parser_group_2.add_argument("-r", required = False, default=0.9, type=float, help="threshold for compression: R square")

    ### define arguments for isolated test
    parser.add_argument('-i', action='store_true', default=False, help="enable isolated test")
 
    return parser

[docs]def InputChecking(str_inputFileName_genotype, str_inputFileName_phenotype, args):
    """

    To check the numbers of sample are consistent in genotype and phenotype data.

    Args:
        str_inputFileName_genotype (str): File name of input genotype data
        str_inputFileName_phenotype (str): File name of input phenotype data

    Returns:
        (tuple): tuple containing:

            - int_num_genotype (int): The sample number of genotype data
            - int_num_phenotype (int): The sample number of phenotype data 
    
    """

    ### check file name exist
    if str_inputFileName_genotype is None:
        sys.exit("There is no input genotype file.")
    if str_inputFileName_phenotype is None:
        sys.exit("There is no input phenotype file.")
    
    ### count lines of input files
    int_num_genotype = sum(1 for line in open(str_inputFileName_genotype))
    int_num_phenotype = sum(1 for line in open(str_inputFileName_phenotype))
    
    ### count sample num. in genotype file
    with open(str_inputFileName_genotype, 'r') as file_inputFile:
        list_line = file_inputFile.readline().strip().split(" ")
        int_num_genotype_sample = (len(list_line) - 5) / 3
        if int_num_genotype_sample != int_num_phenotype:
            sys.exit("The number of samples in genotype file does not match the number of samples in phenotype file.")
    
    ### get phenotype file
    if args.m=="c":
        list_phenotype = []
        with open(str_inputFileName_phenotype, 'r') as file_inputFile:
            for line in file_inputFile:
                list_phenotype.append(line.strip().split(","))
        np_phenotype = np.array(list_phenotype, dtype=np.float)
        del list_phenotype
        if set(list(np_phenotype[:,-1])) != {0.0, 1.0}:
            sys.exit("The phenotype value should be 0=control or 1=case in classification mode.")
    
    return int_num_genotype, int_num_phenotype

""""""""""""""""""""""""""""""
# main function
""""""""""""""""""""""""""""""
[docs]def main(args=None):
    """

    Main function for obtaining user arguments, controling workflow and recording log file.

    Args:
        None

    Returns:
        None
    
    """

    ### open app if need
    if '--app' in sys.argv:
        str_command = str(sys.executable) + " " + os.path.join(os.path.dirname(__file__), "AppGenEpi.py")
        os.system(str_command)
        return

    ### obtain arguments from argument parser
    args = ArgumentsParser().parse_args(args)

    ### get arguments for I/O
    str_inputFileName_genotype = os.path.abspath(args.g)
    str_inputFileName_phenotype = os.path.abspath(args.p)
    str_inputFileName_regions = ""
    if args.s is not None:
        str_inputFileName_regions = args.s
    else:
        str_inputFileName_regions = "None"
    str_outputFilePath = os.getcwd()
    if args.o is not None:
        str_outputFilePath = args.o
    elif str_inputFileName_genotype != "example":
        str_outputFilePath = os.path.dirname(str_inputFileName_genotype)
    int_thread = mp.cpu_count()
    if int(args.t) is not None:
        if int(args.t) < mp.cpu_count():
            int_thread = int(args.t)
        
    if str_inputFileName_genotype == "example" and str_inputFileName_phenotype == "example":
        str_command = "cp " + os.path.join(os.path.dirname(__file__), "example", "sample.csv") + " " + str_outputFilePath
        os.system(str_command)
        str_command = "cp " + os.path.join(os.path.dirname(__file__), "example", "sample.gen") + " " + str_outputFilePath
        os.system(str_command)
        str_command = "GenEpi -g " + os.path.join(str_outputFilePath, "sample.gen") + " "
        str_command += "-p " + os.path.join(str_outputFilePath, "sample.csv") + " "
        str_command += "-o " + str_outputFilePath + " "
        str_command += "--updatedb --compressld"
        os.system(str_command)
        return
    
    with open(os.path.join(str_outputFilePath, "GenEpi_Log_" + time.strftime("%Y%m%d-%H%M", time.localtime()) + ".txt"), "w") as file_outputFile:
        ### create log
        file_outputFile.writelines("start analysis at: " + time.strftime("%Y%m%d-%H:%M:%S", time.localtime()) + "\n" + "\n")
    
        ### log arguments
        file_outputFile.writelines("Arguments in effect:" + "\n")
        file_outputFile.writelines("\t" + "-g (input genotype filename): " + str_inputFileName_genotype + "\n")
        file_outputFile.writelines("\t" + "-p (input phenotype filename): " + str_inputFileName_phenotype + "\n")
        file_outputFile.writelines("\t" + "-s (self-defined genome regions): " + str_inputFileName_regions + "\n")
        file_outputFile.writelines("\t" + "-o (output filepath): " + str_outputFilePath + "\n" + "\n")
        
        file_outputFile.writelines("\t" + "-m (model type): " + "Classification" if args.m=="c" else "Regression"  + "\n")
        file_outputFile.writelines("\t" + "-k (k-fold cross validation): " + str(args.k) + "\n")
        file_outputFile.writelines("\t" + "-t (number of threads): " + str(int_thread) + "\n" + "\n")
        
        file_outputFile.writelines("\t" + "--updatedb (enable function of update UCSC database): " + str(args.updatedb) + "\n")
        file_outputFile.writelines("\t" + "-b (human genome build): " + args.b + "\n" + "\n")
        
        file_outputFile.writelines("\t" + "--compressld (enable function of LD data compression): " + str(args.compressld) + "\n")
        file_outputFile.writelines("\t" + "-d (D prime threshold): " + str(args.d) + "\n")
        file_outputFile.writelines("\t" + "-r (R square threshold): " + str(args.r) + "\n" + "\n")

        file_outputFile.writelines("\t" + "-i (enable isolated test): " + str(args.i) + "\n" + "\n")
        
        ### check input format
        int_num_genotype, int_num_phenotype = InputChecking(str_inputFileName_genotype, str_inputFileName_phenotype, args)
        file_outputFile.writelines("Number of variants: " + str(int_num_genotype) + "\n")
        file_outputFile.writelines("Number of samples: " + str(int_num_phenotype) + "\n")
        print("Number of variants: " + str(int_num_genotype))
        print("Number of samples: " + str(int_num_phenotype))
        
        ### step0_splittingDataAsIsolatedData
        if args.i:
            SplittingDataAsIsolatedData(str_inputFileName_genotype, str_inputFileName_phenotype, str_outputFilePath=str_outputFilePath, int_randomState = 0)
            str_inputFileName_genotype = os.path.join(str_outputFilePath, os.path.basename(str_inputFileName_genotype).replace(".gen", "_subset_1.gen"))
            str_inputFileName_phenotype = os.path.join(str_outputFilePath, os.path.basename(str_inputFileName_phenotype).replace(".csv", "_subset_1.csv"))

        ### step1_downloadUCSCDB
        if args.updatedb:
            DownloadUCSCDB(str_hgbuild=args.b)
    
        ### step2_estimateLD
        if args.compressld:
            EstimateLDBlock(str_inputFileName_genotype, str_outputFilePath=str_outputFilePath, float_threshold_DPrime=float(args.d), float_threshold_RSquare=float(args.r))
            str_inputFileName_genotype = os.path.join(str_outputFilePath, os.path.basename(str_inputFileName_genotype.replace(".gen", "_LDReduced.gen")))
        
        ### step3_splitByGene
        if str_inputFileName_regions == "None":
            SplitByGene(str_inputFileName_genotype, str_outputFilePath=os.path.join(str_outputFilePath, "snpSubsets"))
        else:
            SplitByGene(str_inputFileName_genotype, str_inputFileName_UCSCDB=str_inputFileName_regions, str_outputFilePath=os.path.join(str_outputFilePath, "snpSubsets"))
        
        if args.m=="c":
            ### step4_singleGeneEpistasis_Logistic (for case/control trial)
            BatchSingleGeneEpistasisLogistic(os.path.join(str_outputFilePath, "snpSubsets"), str_inputFileName_phenotype, int_kOfKFold=int(args.k), int_nJobs=int(int_thread))
            ### step5_crossGeneEpistasis_Logistic (for case/control trial)
            float_score_train, float_score_test = CrossGeneEpistasisLogistic(os.path.join(str_outputFilePath, "singleGeneResult"), str_inputFileName_phenotype, int_kOfKFold=int(args.k), int_nJobs=1)
            file_outputFile.writelines("Overall genetic feature performance (F1 score)" + "\n")
            file_outputFile.writelines("Training: " + str(float_score_train) + "\n")
            file_outputFile.writelines("Testing (" + str(args.k) + "-fold CV): " + str(float_score_test) + "\n" + "\n")
            ### step6_ensembleWithCovariates (for case/control trial)
            float_score_train, float_score_test = EnsembleWithCovariatesClassifier(os.path.join(str_outputFilePath, "crossGeneResult", "Feature.csv"), str_inputFileName_phenotype, int_kOfKFold=int(args.k), int_nJobs=1)
            file_outputFile.writelines("Ensemble with co-variate performance (F1 score)" + "\n")
            file_outputFile.writelines("Training: " + str(float_score_train) + "\n")
            file_outputFile.writelines("Testing (" + str(args.k) + "-fold CV): " + str(float_score_test) + "\n" + "\n")
            ### step7_validateByIsolatedData
            if args.i == True:
                str_inputFileName_genotype = os.path.join(str_outputFilePath, os.path.basename(args.g).replace(".gen", "_subset_2.gen"))
                str_inputFileName_phenotype = os.path.join(str_outputFilePath, os.path.basename(args.p).replace(".csv", "_subset_2.csv"))
                float_score_test_gen = ValidateByIsolatedDataClassifier(os.path.join(str_outputFilePath, "crossGeneResult", "Classifier.pkl"), os.path.join(str_outputFilePath, "crossGeneResult", "Feature.csv"), str_inputFileName_genotype, str_inputFileName_phenotype, str_outputFilePath=os.path.join(str_outputFilePath, "isolatedValidation"))
                float_score_test_cov = ValidateByIsolatedDataCovariateClassifier(os.path.join(str_outputFilePath, "crossGeneResult", "Classifier_Covariates.pkl"), os.path.join(str_outputFilePath, "crossGeneResult", "Feature.csv"), str_inputFileName_genotype, str_inputFileName_phenotype, str_outputFilePath=os.path.join(str_outputFilePath, "isolatedValidation"))
                file_outputFile.writelines("Performance on the isolated test data (F1 score)" + "\n")
                file_outputFile.writelines("Genetic feature: " + str(float_score_test_gen) + "\n")
                file_outputFile.writelines("With co-variate: " + str(float_score_test_cov) + "\n" + "\n")
        else:
            ### step4_singleGeneEpistasis_Lasso (for quantitative trial)
            BatchSingleGeneEpistasisLasso(os.path.join(str_outputFilePath, "snpSubsets"), str_inputFileName_phenotype, int_kOfKFold=int(args.k), int_nJobs=int(int_thread))
            ### step5_crossGeneEpistasis_Lasso (for quantitative trial)
            float_score_train, float_score_test = CrossGeneEpistasisLasso(os.path.join(str_outputFilePath, "singleGeneResult"), str_inputFileName_phenotype, int_kOfKFold=int(args.k), int_nJobs=1)
            file_outputFile.writelines("Overall genetic feature performance (Average of the Pearson and Spearman correlation)" + "\n")
            file_outputFile.writelines("Training: " + str(float_score_train) + "\n")
            file_outputFile.writelines("Testing (" + str(args.k) + "-fold CV): " + str(float_score_test) + "\n" + "\n")
            ### step6_ensembleWithCovariates (for quantitative trial)
            float_score_train, float_score_test = EnsembleWithCovariatesRegressor(os.path.join(str_outputFilePath, "crossGeneResult", "Feature.csv"), str_inputFileName_phenotype, int_kOfKFold=int(args.k), int_nJobs=1)
            file_outputFile.writelines("Ensemble with co-variate performance (Average of the Pearson and Spearman correlation)" + "\n")
            file_outputFile.writelines("Training: " + str(float_score_train) + "\n")
            file_outputFile.writelines("Testing (" + str(args.k) + "-fold CV): " + str(float_score_test) + "\n" + "\n")
            ### step7_validateByIsolatedData
            if args.i == True:
                str_inputFileName_genotype = os.path.join(str_outputFilePath, os.path.basename(args.g).replace(".gen", "_subset_2.gen"))
                str_inputFileName_phenotype = os.path.join(str_outputFilePath, os.path.basename(args.p).replace(".csv", "_subset_2.csv"))
                float_score_test_gen = ValidateByIsolatedDataRegressor(os.path.join(str_outputFilePath, "crossGeneResult", "Regressor.pkl"), os.path.join(str_outputFilePath, "crossGeneResult", "Feature.csv"), str_inputFileName_genotype, str_inputFileName_phenotype, str_outputFilePath=os.path.join(str_outputFilePath, "isolatedValidation"))
                float_score_test_cov = ValidateByIsolatedDataCovariateRegressor(os.path.join(str_outputFilePath, "crossGeneResult", "Regressor_Covariates.pkl"), os.path.join(str_outputFilePath, "crossGeneResult", "Feature.csv"), str_inputFileName_genotype, str_inputFileName_phenotype, str_outputFilePath=os.path.join(str_outputFilePath, "isolatedValidation"))
                file_outputFile.writelines("Performance on the isolated test data (F1 score)" + "\n")
                file_outputFile.writelines("Genetic feature: " + str(float_score_test_gen) + "\n")
                file_outputFile.writelines("With co-variate: " + str(float_score_test_cov) + "\n" + "\n")
        
        file_outputFile.writelines("end analysis at: " + time.strftime("%Y%m%d-%H:%M:%S", time.localtime()) + "\n")

if __name__ == "__main__":
    main()