Source code for genepi.step6_ensembleWithCovariates

# -*- coding: utf-8 -*-
"""
Created on Feb 2018

@author: Chester (Yu-Chuan Chang)
"""

""""""""""""""""""""""""""""""
# import libraries
""""""""""""""""""""""""""""""
import os
import warnings
warnings.filterwarnings('ignore')
# ignore all warnings
warnings.simplefilter("ignore")
os.environ["PYTHONWARNINGS"] = "ignore"

import os
import numpy as np
np.seterr(divide='ignore', invalid='ignore')

from sklearn import linear_model
from scipy.sparse import coo_matrix
from sklearn.utils import shuffle
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib

from genepi.step4_singleGeneEpistasis_Logistic import LogisticRegressionL1CV
from genepi.step4_singleGeneEpistasis_Lasso import LassoRegressionCV
from genepi.step5_crossGeneEpistasis_Logistic import LogisticRegressionL1
from genepi.step5_crossGeneEpistasis_Lasso import LassoRegression

""""""""""""""""""""""""""""""
# define functions 
""""""""""""""""""""""""""""""
[docs]def LoadDataForEnsemble(str_inputFileName_feature, str_inputFileName_phenotype):
    """

    Loading genetic features for ensembling with covariates

    Args:
        str_inputFilePath_feature (str): File path of input feature files from stage 2 - crossGeneEpistasis
        str_inputFileName_phenotype (str): File name of input phenotype data
        
    Returns:
        (tuple): tuple containing:

            - np_genotype (ndarray): 2D array containing genotype data with `int8` type
            - np_phenotype (ndarray): 2D array containing phenotype data with `float` type
    
    """

    ### check feature file exist
    if not os.path.isfile(str_inputFileName_feature):
        print("step6: There is no variant remained in previous step.")
        return None, None

    ### get all selected snp ids
    list_genotype_rsid = []
    with open(str_inputFileName_feature, "r") as file_inputFile:
        ### grep the header
        list_rsids = file_inputFile.readline().strip().split(",")
        for rsid in list_rsids:
            list_genotype_rsid.append(rsid)
    np_genotype_rsid = np.array(list_genotype_rsid)
    
    ### count lines of input files
    int_num_genotype = len(np_genotype_rsid)
    int_num_phenotype = sum(1 for line in open(str_inputFileName_phenotype))
    
    ### get phenotype file
    list_phenotype = []
    with open(str_inputFileName_phenotype, 'r') as file_inputFile:
        for line in file_inputFile:
            list_phenotype.append(line.strip().split(","))
    np_phenotype = np.array(list_phenotype)
    del list_phenotype
    
    if np_phenotype.shape[1] < 2:
        print("step6: There is no other factors exist.")
        return None, None
    
    ### get genotype file
    np_genotype = np.empty([int_num_phenotype, int_num_genotype], dtype=np.float16)
    with open(str_inputFileName_feature, "r") as file_inputFile:
        ### skip header
        file_inputFile.readline()
        idx_phenotype = 0
        ### read feaure and write into np_genotype
        for line in file_inputFile:
            np_genotype[idx_phenotype, :len(list_rsids)] = np.array([float(x) for x in line.strip().split(",")], dtype='int')
            idx_phenotype = idx_phenotype + 1
    
    ### concatenate genotype and other factors
    np_genotype = np.concatenate((np_genotype, np_phenotype[:, :-1]), axis=1).astype(float)
    
    return np_genotype, np_phenotype

[docs]def ClassifierModelPersistence(np_X, np_y, str_outputFilePath = "", int_nJobs = 1):
    """

    Dumping ensemble classifier for model persistence

    Args:
        np_X (ndarray): 2D array containing genotype data with `int8` type
        np_y (ndarray): 2D array containing phenotype data with `float` type
        str_outputFilePath (str): File path of output file
        int_nJobs (int): The number of thread (default: 1)

    Returns:
        None
    
    """

    X = np_X
    y = np_y
    X_sparse = coo_matrix(X)
    X, X_sparse, y = shuffle(X, X_sparse, y, random_state=0)
    
    cost = [2**x for x in range(-8, 8)]
    parameters = [{'C':cost, 'penalty':['l1'], 'dual':[False], 'class_weight':['balanced']}]
    kf_estimator = KFold(n_splits=2)
    estimator_logistic = linear_model.LogisticRegression(max_iter=100, solver='liblinear')
    estimator_grid = GridSearchCV(estimator_logistic, parameters, scoring='f1', n_jobs=int_nJobs, cv=kf_estimator)
    estimator_grid.fit(X, y)
    
    joblib.dump(estimator_grid.best_estimator_, os.path.join(str_outputFilePath, "Classifier_Covariates.pkl"))

[docs]def RegressorModelPersistence(np_X, np_y, str_outputFilePath = "", int_nJobs = 1):
    """

    Dumping ensemble regressor for model persistence

    Args:
        np_X (ndarray): 2D array containing genotype data with `int8` type
        np_y (ndarray): 2D array containing phenotype data with `float` type
        str_outputFilePath (str): File path of output file
        int_nJobs (int): The number of thread (default: 1)

    Returns:
        None
    
    """
    
    X = np_X
    y = np_y
    X_sparse = coo_matrix(X)
    X, X_sparse, y = shuffle(X, X_sparse, y, random_state=0)
    
    alpha = np.logspace(-10, 10, 200)
    parameters = [{'alpha':alpha}]
    kf_estimator = KFold(n_splits=2)
    estimator_lasso = linear_model.Lasso(max_iter=1000)
    estimator_grid = GridSearchCV(estimator_lasso, parameters, scoring='neg_mean_squared_error', n_jobs=int_nJobs, cv=kf_estimator)
    estimator_grid.fit(X, y)
    
    joblib.dump(estimator_grid.best_estimator_, os.path.join(str_outputFilePath, "Regressor_Covariates.pkl"))

""""""""""""""""""""""""""""""
# main function
""""""""""""""""""""""""""""""
[docs]def EnsembleWithCovariatesClassifier(str_inputFileName_feature, str_inputFileName_phenotype, str_outputFilePath = "", int_kOfKFold = 2, int_nJobs = 1):
    """

    A workflow to ensemble genetic features with covariates for L1-regularized Logistic regression.

    Args:
        str_inputFilePath_feature (str): File path of input feature files from stage 2 - crossGeneEpistasis
        str_inputFileName_phenotype (str): File name of input phenotype data
        str_outputFilePath (str): File path of output file
        int_kOfKFold (int): The k for k-fold cross validation (default: 2)
        int_nJobs (int): The number of thread (default: 1)

    Returns:
        (tuple): tuple containing:

            - float_f1Score_train (float): The F1 score of the model for training set
            - float_f1Score_test (float): The F1 score of the model for testing set

        - Expected Success Response::

            "step6: Ensemble with covariates. DONE!"
    
    """
    
    ### set default output path
    if str_outputFilePath == "":
        str_outputFilePath = os.path.dirname(str_inputFileName_feature)
    
    #-------------------------
    # load data
    #-------------------------
    np_genotype, np_phenotype = LoadDataForEnsemble(str_inputFileName_feature, str_inputFileName_phenotype)
    if np_genotype is None and np_phenotype is None:
        return 0.0, 0.0
    
    #-------------------------
    # build model
    #-------------------------
    float_f1Score_test, np_weight, dict_y = LogisticRegressionL1CV(np_genotype, np_phenotype[:, -1].astype(int), int_kOfKFold, int_nJobs)
    float_f1Score_train = LogisticRegressionL1(np_genotype, np_phenotype[:, -1].astype(int), int_nJobs)
    
    #-------------------------
    # dump persistent model
    #-------------------------
    ClassifierModelPersistence(np_genotype, np_phenotype[:, -1].astype(int), str_outputFilePath, int_nJobs)
    
    print("step6: Ensemble with covariates. DONE! (Training score:" + "{0:.2f}".format(float_f1Score_train) + "; " + str(int_kOfKFold) + "-fold Test Score:" + "{0:.2f}".format(float_f1Score_test) + ")")

    return float_f1Score_train, float_f1Score_test

[docs]def EnsembleWithCovariatesRegressor(str_inputFileName_feature, str_inputFileName_phenotype, str_outputFilePath = "", int_kOfKFold = 2, int_nJobs = 1):
    """

    A workflow to ensemble genetic features with covariates for L1-regularized Lasso regression.

    Args:
        str_inputFilePath_feature (str): File path of input feature files from stage 2 - crossGeneEpistasis
        str_inputFileName_phenotype (str): File name of input phenotype data
        str_outputFilePath (str): File path of output file
        int_kOfKFold (int): The k for k-fold cross validation (default: 2)
        int_nJobs (int): The number of thread (default: 1)

    Returns:
        (tuple): tuple containing:

            - float_AVG_S_P_train (float): The average of the Peason's and Spearman's correlation of the model for training set
            - float_AVG_S_P_test (float): The average of the Peason's and Spearman's correlation of the model for testing set

        - Expected Success Response::

            "step6: Ensemble with covariates. DONE!"
    
    """
    
    ### set default output path
    if str_outputFilePath == "":
        str_outputFilePath = os.path.dirname(str_inputFileName_feature)
    
    #-------------------------
    # load data
    #-------------------------
    np_genotype, np_phenotype = LoadDataForEnsemble(str_inputFileName_feature, str_inputFileName_phenotype)
    if np_genotype is None and np_phenotype is None:
        return 0.0, 0.0
    
    #-------------------------
    # build model
    #-------------------------
    float_AVG_S_P_test, np_weight = LassoRegressionCV(np_genotype, np_phenotype[:, -1].astype(float), int_kOfKFold, int_nJobs)
    float_AVG_S_P_train = LassoRegression(np_genotype, np_phenotype[:, -1].astype(float), int_nJobs)
    
    #-------------------------
    # dump persistent model
    #-------------------------
    RegressorModelPersistence(np_genotype, np_phenotype[:, -1].astype(int), str_outputFilePath, int_nJobs)
    
    print("step6: Ensemble with covariates. DONE! (Training score:" + "{0:.2f}".format(float_AVG_S_P_train) + "; " + str(int_kOfKFold) + "-fold Test Score:" + "{0:.2f}".format(float_AVG_S_P_test) + ")")
    
    return float_AVG_S_P_train, float_AVG_S_P_test