Source code for genepi.step6_ensembleWithCovariates

# -*- coding: utf-8 -*-
"""
Created on Feb 2018

@author: Chester (Yu-Chuan Chang)
"""

""""""""""""""""""""""""""""""
# import libraries
""""""""""""""""""""""""""""""
import os
import warnings
warnings.filterwarnings('ignore')
# ignore all warnings
warnings.simplefilter("ignore")
os.environ["PYTHONWARNINGS"] = "ignore"

import os
import numpy as np
np.seterr(divide='ignore', invalid='ignore')

from sklearn import linear_model
from scipy.sparse import coo_matrix
from sklearn.utils import shuffle
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib

from genepi.step4_singleGeneEpistasis_Logistic import LogisticRegressionL1CV
from genepi.step4_singleGeneEpistasis_Lasso import LassoRegressionCV
from genepi.step5_crossGeneEpistasis_Logistic import LogisticRegressionL1
from genepi.step5_crossGeneEpistasis_Lasso import LassoRegression

""""""""""""""""""""""""""""""
# define functions 
""""""""""""""""""""""""""""""
[docs]def LoadDataForEnsemble(str_inputFileName_feature, str_inputFileName_phenotype): """ Loading genetic features for ensembling with covariates Args: str_inputFilePath_feature (str): File path of input feature files from stage 2 - crossGeneEpistasis str_inputFileName_phenotype (str): File name of input phenotype data Returns: (tuple): tuple containing: - np_genotype (ndarray): 2D array containing genotype data with `int8` type - np_phenotype (ndarray): 2D array containing phenotype data with `float` type """ ### check feature file exist if not os.path.isfile(str_inputFileName_feature): print("step6: There is no variant remained in previous step.") return None, None ### get all selected snp ids list_genotype_rsid = [] with open(str_inputFileName_feature, "r") as file_inputFile: ### grep the header list_rsids = file_inputFile.readline().strip().split(",") for rsid in list_rsids: list_genotype_rsid.append(rsid) np_genotype_rsid = np.array(list_genotype_rsid) ### count lines of input files int_num_genotype = len(np_genotype_rsid) int_num_phenotype = sum(1 for line in open(str_inputFileName_phenotype)) ### get phenotype file list_phenotype = [] with open(str_inputFileName_phenotype, 'r') as file_inputFile: for line in file_inputFile: list_phenotype.append(line.strip().split(",")) np_phenotype = np.array(list_phenotype) del list_phenotype if np_phenotype.shape[1] < 2: print("step6: There is no other factors exist.") return None, None ### get genotype file np_genotype = np.empty([int_num_phenotype, int_num_genotype], dtype=np.float16) with open(str_inputFileName_feature, "r") as file_inputFile: ### skip header file_inputFile.readline() idx_phenotype = 0 ### read feaure and write into np_genotype for line in file_inputFile: np_genotype[idx_phenotype, :len(list_rsids)] = np.array([float(x) for x in line.strip().split(",")], dtype='int') idx_phenotype = idx_phenotype + 1 ### concatenate genotype and other factors np_genotype = np.concatenate((np_genotype, np_phenotype[:, :-1]), axis=1).astype(float) return np_genotype, np_phenotype
[docs]def ClassifierModelPersistence(np_X, np_y, str_outputFilePath = "", int_nJobs = 1): """ Dumping ensemble classifier for model persistence Args: np_X (ndarray): 2D array containing genotype data with `int8` type np_y (ndarray): 2D array containing phenotype data with `float` type str_outputFilePath (str): File path of output file int_nJobs (int): The number of thread (default: 1) Returns: None """ X = np_X y = np_y X_sparse = coo_matrix(X) X, X_sparse, y = shuffle(X, X_sparse, y, random_state=0) cost = [2**x for x in range(-8, 8)] parameters = [{'C':cost, 'penalty':['l1'], 'dual':[False], 'class_weight':['balanced']}] kf_estimator = KFold(n_splits=2) estimator_logistic = linear_model.LogisticRegression(max_iter=100, solver='liblinear') estimator_grid = GridSearchCV(estimator_logistic, parameters, scoring='f1', n_jobs=int_nJobs, cv=kf_estimator) estimator_grid.fit(X, y) joblib.dump(estimator_grid.best_estimator_, os.path.join(str_outputFilePath, "Classifier_Covariates.pkl"))
[docs]def RegressorModelPersistence(np_X, np_y, str_outputFilePath = "", int_nJobs = 1): """ Dumping ensemble regressor for model persistence Args: np_X (ndarray): 2D array containing genotype data with `int8` type np_y (ndarray): 2D array containing phenotype data with `float` type str_outputFilePath (str): File path of output file int_nJobs (int): The number of thread (default: 1) Returns: None """ X = np_X y = np_y X_sparse = coo_matrix(X) X, X_sparse, y = shuffle(X, X_sparse, y, random_state=0) alpha = np.logspace(-10, 10, 200) parameters = [{'alpha':alpha}] kf_estimator = KFold(n_splits=2) estimator_lasso = linear_model.Lasso(max_iter=1000) estimator_grid = GridSearchCV(estimator_lasso, parameters, scoring='neg_mean_squared_error', n_jobs=int_nJobs, cv=kf_estimator) estimator_grid.fit(X, y) joblib.dump(estimator_grid.best_estimator_, os.path.join(str_outputFilePath, "Regressor_Covariates.pkl"))
"""""""""""""""""""""""""""""" # main function """"""""""""""""""""""""""""""
[docs]def EnsembleWithCovariatesClassifier(str_inputFileName_feature, str_inputFileName_phenotype, str_outputFilePath = "", int_kOfKFold = 2, int_nJobs = 1): """ A workflow to ensemble genetic features with covariates for L1-regularized Logistic regression. Args: str_inputFilePath_feature (str): File path of input feature files from stage 2 - crossGeneEpistasis str_inputFileName_phenotype (str): File name of input phenotype data str_outputFilePath (str): File path of output file int_kOfKFold (int): The k for k-fold cross validation (default: 2) int_nJobs (int): The number of thread (default: 1) Returns: (tuple): tuple containing: - float_f1Score_train (float): The F1 score of the model for training set - float_f1Score_test (float): The F1 score of the model for testing set - Expected Success Response:: "step6: Ensemble with covariates. DONE!" """ ### set default output path if str_outputFilePath == "": str_outputFilePath = os.path.dirname(str_inputFileName_feature) #------------------------- # load data #------------------------- np_genotype, np_phenotype = LoadDataForEnsemble(str_inputFileName_feature, str_inputFileName_phenotype) if np_genotype is None and np_phenotype is None: return 0.0, 0.0 #------------------------- # build model #------------------------- float_f1Score_test, np_weight, dict_y = LogisticRegressionL1CV(np_genotype, np_phenotype[:, -1].astype(int), int_kOfKFold, int_nJobs) float_f1Score_train = LogisticRegressionL1(np_genotype, np_phenotype[:, -1].astype(int), int_nJobs) #------------------------- # dump persistent model #------------------------- ClassifierModelPersistence(np_genotype, np_phenotype[:, -1].astype(int), str_outputFilePath, int_nJobs) print("step6: Ensemble with covariates. DONE! (Training score:" + "{0:.2f}".format(float_f1Score_train) + "; " + str(int_kOfKFold) + "-fold Test Score:" + "{0:.2f}".format(float_f1Score_test) + ")") return float_f1Score_train, float_f1Score_test
[docs]def EnsembleWithCovariatesRegressor(str_inputFileName_feature, str_inputFileName_phenotype, str_outputFilePath = "", int_kOfKFold = 2, int_nJobs = 1): """ A workflow to ensemble genetic features with covariates for L1-regularized Lasso regression. Args: str_inputFilePath_feature (str): File path of input feature files from stage 2 - crossGeneEpistasis str_inputFileName_phenotype (str): File name of input phenotype data str_outputFilePath (str): File path of output file int_kOfKFold (int): The k for k-fold cross validation (default: 2) int_nJobs (int): The number of thread (default: 1) Returns: (tuple): tuple containing: - float_AVG_S_P_train (float): The average of the Peason's and Spearman's correlation of the model for training set - float_AVG_S_P_test (float): The average of the Peason's and Spearman's correlation of the model for testing set - Expected Success Response:: "step6: Ensemble with covariates. DONE!" """ ### set default output path if str_outputFilePath == "": str_outputFilePath = os.path.dirname(str_inputFileName_feature) #------------------------- # load data #------------------------- np_genotype, np_phenotype = LoadDataForEnsemble(str_inputFileName_feature, str_inputFileName_phenotype) if np_genotype is None and np_phenotype is None: return 0.0, 0.0 #------------------------- # build model #------------------------- float_AVG_S_P_test, np_weight = LassoRegressionCV(np_genotype, np_phenotype[:, -1].astype(float), int_kOfKFold, int_nJobs) float_AVG_S_P_train = LassoRegression(np_genotype, np_phenotype[:, -1].astype(float), int_nJobs) #------------------------- # dump persistent model #------------------------- RegressorModelPersistence(np_genotype, np_phenotype[:, -1].astype(int), str_outputFilePath, int_nJobs) print("step6: Ensemble with covariates. DONE! (Training score:" + "{0:.2f}".format(float_AVG_S_P_train) + "; " + str(int_kOfKFold) + "-fold Test Score:" + "{0:.2f}".format(float_AVG_S_P_test) + ")") return float_AVG_S_P_train, float_AVG_S_P_test