Source code for genepi.step4_singleGeneEpistasis_Lasso

# -*- coding: utf-8 -*-
"""
Created on Feb 2018

@author: Chester (Yu-Chuan Chang)
"""

""""""""""""""""""""""""""""""
# import libraries
""""""""""""""""""""""""""""""
import os
import warnings
warnings.filterwarnings('ignore')
# ignore all warnings
warnings.simplefilter("ignore")
os.environ["PYTHONWARNINGS"] = "ignore"

import sys
import itertools
import numpy as np
np.seterr(divide='ignore', invalid='ignore')
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import f_regression
from scipy.sparse import coo_matrix
from sklearn.utils import shuffle
from sklearn import linear_model
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
import scipy.stats as stats
import multiprocessing as mp

from genepi.tools import randomized_l1

""""""""""""""""""""""""""""""
# define functions 
""""""""""""""""""""""""""""""
[docs]def RandomizedLassoRegression(np_X, np_y): """ Implementation of the stability selection. Args: np_X (ndarray): 2D array containing genotype data with `int8` type np_y (ndarray): 2D array containing phenotype data with `float` type Returns: (ndarray): estimator.scores 1D array containing the scores of each genetic features with `float` type """ X = np_X y = np_y X_sparse = coo_matrix(X) X, X_sparse, y = shuffle(X, X_sparse, y, random_state=0) estimator = randomized_l1.RandomizedLasso(n_jobs=1, n_resampling=100) estimator.fit(X, y) return estimator.scores_
[docs]def LassoRegressionCV(np_X, np_y, int_kOfKFold = 2, int_nJobs = 1): """ Implementation of the L1-regularized Lasso regression with k-fold cross validation. Args: np_X (ndarray): 2D array containing genotype data with `int8` type np_y (ndarray): 2D array containing phenotype data with `float` type int_kOfKFold (int): The k for k-fold cross validation (default: 2) int_nJobs (int): The number of thread (default: 1) Returns: (ndarray): estimator.scores 1D array containing the scores of each genetic features with `float` type """ X = np_X y = np_y X_sparse = coo_matrix(X) X, X_sparse, y = shuffle(X, X_sparse, y, random_state=0) kf = KFold(n_splits=int_kOfKFold) list_target = [] list_predict = [] list_weight = [] for idxTr, idxTe in kf.split(X): alpha = np.logspace(-10, 10, 200) parameters = [{'alpha':alpha}] kf_estimator = KFold(n_splits=2) estimator_lasso = linear_model.Lasso(max_iter=1000) estimator_grid = GridSearchCV(estimator_lasso, parameters, scoring='neg_mean_squared_error', n_jobs=1, cv=kf_estimator) estimator_grid.fit(X[idxTr], y[idxTr]) list_label = estimator_grid.best_estimator_.predict(X[idxTe]) list_weight.append([float(item) for item in estimator_grid.best_estimator_.coef_]) for idx_y, idx_label in zip(list(y[idxTe]), list_label): list_target.append(float(idx_y)) list_predict.append(idx_label) np_weight = np.array(list_weight) np_weight = np.average(list_weight, axis=0) float_pearson = stats.stats.pearsonr(list_target, list_predict)[0] float_spearman = stats.stats.spearmanr(list_target, list_predict)[0] return (float_pearson + float_spearman) / 2, np_weight
[docs]def FeatureEncoderLasso(np_genotype_rsid, np_genotype, np_phenotype, int_dim): """ Implementation of the two-element combinatorial encoding. Args: np_genotype_rsid (ndarray): 1D array containing rsid of genotype data with `str` type np_genotype (ndarray): 2D array containing genotype data with `int8` type np_phenotype (ndarray): 2D array containing phenotype data with `float` type int_dim (int): The dimension of a variant (default: 3. AA, AB and BB) Returns: (tuple): tuple containing: - list_interaction_rsid (ndarray): 1D array containing rsid of genotype data with `str` type - np_interaction (ndarray): 2D array containing genotype data with `int8` type """ ### combinatorial encoding np_interaction = np_genotype list_interaction_rsid = list(np_genotype_rsid) list_combs = list(itertools.combinations(range(int(np_interaction.shape[1]/int_dim)), 2)) for idx_combs in range(len(list_combs)): try: ### generate interaction terms tuple_comb = list_combs[idx_combs] np_this_interaction = np.zeros([np_phenotype.shape[0], int_dim**2], dtype='int8') list_this_interaction_id = [] for idx_x in range(int_dim): for idx_y in range(int_dim): np_this_interaction_term = (np_genotype[:, tuple_comb[0] * int_dim + idx_x] * np_genotype[:, tuple_comb[1] * int_dim + idx_y]).astype(np.int8) if not(np.array_equal(np_this_interaction_term, np_genotype[:, tuple_comb[0] * int_dim + idx_x])) and not(np.array_equal(np_this_interaction_term, np_genotype[:, tuple_comb[1] * int_dim + idx_y])): np_this_interaction[:, idx_x * int_dim + idx_y] = np_this_interaction_term list_this_interaction_id.append(np_genotype_rsid[tuple_comb[0] * int_dim + idx_x] + "*" + np_genotype_rsid[tuple_comb[1] * int_dim + idx_y]) ### variance check (detect variance < 0.05) sk_variance = VarianceThreshold(threshold=(.95 * (1 - .95))) np_this_interaction = sk_variance.fit_transform(np_this_interaction) np_this_interaction_id = np.array(list_this_interaction_id) np_this_interaction_id = np.array(np_this_interaction_id[sk_variance.get_support()]) ### f regression feature selection np_fRegression = -np.log10(f_regression(np_this_interaction.astype(int), np_phenotype[:, -1].astype(float))[1]) np_selectedIdx = np.array([x > 2 for x in np_fRegression]) np_this_interaction = np_this_interaction[:, np_selectedIdx] np_this_interaction_id = np_this_interaction_id[np_selectedIdx] ### append insteraction terms int_num_interaction = np_this_interaction.shape[1] if int_num_interaction == 0: continue np_interaction_append = np.empty((np_interaction.shape[0], np_interaction.shape[1] + int_num_interaction), dtype='int') np_interaction_append[:,:-(int_num_interaction)] = np_interaction np_interaction_append[:,-(int_num_interaction):] = np_this_interaction np_interaction = np_interaction_append list_interaction_rsid.extend(list(np_this_interaction_id)) except: pass return np.array(list_interaction_rsid), np_interaction
[docs]def FilterInLoading(np_genotype, np_phenotype): """ This function is for filtering low quality varaint. Before modeling each subset of genotype features, two criteria were adopted to exclude low quality data. The first criterion is that the genotype frequency of a feature should exceed 5%, where the genotype frequency means the proportion of genotype among the total samples in the dataset. The second criterion is regarding the association between the feature and the phenotype. We used χ2 test to estimate the association between the feature and the phenotype, and the p-value should be smaller than 0.01. Args: np_genotype (ndarray): 2D array containing genotype data with `int8` type np_phenotype (ndarray): 2D array containing phenotype data with `float` type Returns: (ndarray): np_genotype 2D array containing genotype data with `int8` type """ try: ### variance check (detect variance < 0.05) sk_variance = VarianceThreshold(threshold=(.95 * (1 - .95))) np_genotype = sk_variance.fit_transform(np_genotype) ### f regression feature selection np_fRegression = -np.log10(f_regression(np_genotype.astype(int), np_phenotype[:, -1].astype(float))[1]) np_selectedIdx = np.array([x > 2 for x in np_fRegression]) np_genotype = np_genotype[:, np_selectedIdx] return np_genotype.shape[1] except: return 0
"""""""""""""""""""""""""""""" # main function """"""""""""""""""""""""""""""
[docs]def SingleGeneEpistasisLasso(str_inputFileName_genotype, str_inputFileName_phenotype, str_outputFilePath = "", int_kOfKFold = 2, int_nJobs = 1): """ A workflow to model a single gene containing two-element combinatorial encoding, stability selection, filtering low quality varaint and L1-regularized Lasso regression with k-fold cross validation. Args: str_inputFileName_genotype (str): File name of input genotype data str_inputFileName_phenotype (str): File name of input phenotype data str_outputFilePath (str): File path of output file int_kOfKFold (int): The k for k-fold cross validation (default: 2) int_nJobs (int): The number of thread (default: 1) Returns: (float): float_AVG_S_P The average of the Peason's and Spearman's correlation of the model """ ### set path of output file if str_outputFilePath == "": str_outputFilePath = os.path.dirname(str_inputFileName_genotype) #------------------------- # load data #------------------------- ### count lines of input files int_num_phenotype = sum(1 for line in open(str_inputFileName_phenotype)) ### get phenotype file list_phenotype = [] with open(str_inputFileName_phenotype, 'r') as file_inputFile: for line in file_inputFile: list_phenotype.append(line.strip().split(",")) np_phenotype = np.array(list_phenotype, dtype=np.float) del list_phenotype ### get genotype file list_genotype = [[] for x in range(int_num_phenotype)] list_genotype_rsid = [] with open(str_inputFileName_genotype, 'r') as file_inputFile: for line in file_inputFile: list_thisSnp = line.strip().split(" ") np_this_genotype = np.empty([int_num_phenotype, 3], dtype='int8') for idx_subject in range(0, int_num_phenotype): list_allelType = [0, 0, 0] list_allelType[np.argmax(list_thisSnp[idx_subject * 3 + 5 : idx_subject * 3 + 8])] = 1 np_this_genotype[idx_subject, :] = list_allelType if FilterInLoading(np_this_genotype, np_phenotype) == 0: continue for idx_subject in range(0, int_num_phenotype): list_allelType = [0, 0, 0] list_allelType[np.argmax(list_thisSnp[idx_subject * 3 + 5 : idx_subject * 3 + 8])] = 1 list_genotype[idx_subject].extend(list_allelType) list_genotype_rsid.append(list_thisSnp[1] + "_AA") list_genotype_rsid.append(list_thisSnp[1] + "_AB") list_genotype_rsid.append(list_thisSnp[1] + "_BB") np_genotype = np.array(list_genotype, dtype=np.int8) np_genotype_rsid = np.array(list_genotype_rsid) if np_genotype_rsid.shape[0] == 0: return 0.0 #------------------------- # preprocess data #------------------------- ### generate interaction terms np_genotype_rsid, np_genotype = FeatureEncoderLasso(np_genotype_rsid, np_genotype, np_phenotype, 3) #------------------------- # select feature #------------------------- ### random lasso feature selection np_randWeight = np.array(RandomizedLassoRegression(np_genotype, np_phenotype[:, -1].astype(float))) np_selectedIdx = np.array([x >= 0.1 for x in np_randWeight]) np_randWeight = np_randWeight[np_selectedIdx] np_genotype = np_genotype[:, np_selectedIdx] np_genotype_rsid = np_genotype_rsid[np_selectedIdx] if np_genotype_rsid.shape[0] == 0: return 0.0 #------------------------- # build model #------------------------- float_AVG_S_P, np_weight = LassoRegressionCV(np_genotype, np_phenotype[:, -1].astype(float), int_kOfKFold, int_nJobs) if float_AVG_S_P == 0.0: return 0.0 ### filter out zero-weight features np_selectedIdx = np.array([x != 0.0 for x in np_weight]) np_weight = np_weight[np_selectedIdx] np_genotype = np_genotype[:, np_selectedIdx] np_genotype_rsid = np_genotype_rsid[np_selectedIdx] if np_genotype_rsid.shape[0] == 0: return 0.0 #------------------------- # analyze result #------------------------- ### calculate student t-test p-value np_fRegression = -np.log10(f_regression(np_genotype.astype(int), np_phenotype[:, -1].astype(float))[1]) ### calculate genotype frequency np_genotypeFreq = np.sum(np_genotype, axis=0).astype(float) / np_genotype.shape[0] #------------------------- # output results #------------------------- ### output statistics of features with open(os.path.join(str_outputFilePath, os.path.basename(str_inputFileName_genotype).split("_")[0] + "_Result.csv"), "w") as file_outputFile: file_outputFile.writelines("rsID,weight,student-t-test_log_p-value,genotype_frequency" + "\n") for idx_feature in range(0, np_genotype_rsid.shape[0]): file_outputFile.writelines(str(np_genotype_rsid[idx_feature,]) + "," + str(np_weight[idx_feature,]) + "," + str(np_fRegression[idx_feature,]) + "," + str(np_genotypeFreq[idx_feature]) + "\n") ### output feature with open(os.path.join(str_outputFilePath, os.path.basename(str_inputFileName_genotype).split("_")[0] + "_Feature.csv"), "w") as file_outputFile: file_outputFile.writelines(",".join(np_genotype_rsid) + "\n") for idx_subject in range(0, np_genotype.shape[0]): file_outputFile.writelines(",".join(np_genotype[idx_subject, :].astype(str)) + "\n") return float_AVG_S_P
[docs]def BatchSingleGeneEpistasisLasso(str_inputFilePath_genotype, str_inputFileName_phenotype, str_outputFilePath = "", int_kOfKFold = 2, int_nJobs = mp.cpu_count()): """ Batch running for the single gene workflow. Args: str_inputFilePath_genotype (str): File path of input genotype data str_inputFileName_phenotype (str): File name of input phenotype data str_outputFilePath (str): File path of output file int_kOfKFold (int): The k for k-fold cross validation (default: 2) int_nJobs (int): The number of thread (default: 1) Returns: - Expected Success Response:: "step4: Detect single gene epistasis. DONE!" """ ### set default output path if str_outputFilePath == "": str_outputFilePath = os.path.abspath(os.path.join(str_inputFilePath_genotype, os.pardir)) + "/singleGeneResult/" ### if output folder doesn't exist then create it if not os.path.exists(str_outputFilePath): os.makedirs(str_outputFilePath) ### scan all of the gen file in path list_genotypeFileName = [] for str_fileName in os.listdir(str_inputFilePath_genotype): if ".gen" in str_fileName: list_genotypeFileName.append(str_fileName) ### batch PolyLassoRegression ### inital multiprocessing pool mp_pool = mp.Pool(int_nJobs) ### apply pool on the function that need be parallelizing dict_result = {} for int_count_gene, float_AVG_S_P in enumerate(mp_pool.starmap(SingleGeneEpistasisLasso, [(os.path.join(str_inputFilePath_genotype, gene), str_inputFileName_phenotype, str_outputFilePath, int_kOfKFold, int_nJobs) for gene in list_genotypeFileName]), 0): if list_genotypeFileName[int_count_gene] not in dict_result: dict_result[list_genotypeFileName[int_count_gene]] = float_AVG_S_P str_print = "step4: Processing: " + "{0:.2f}".format(float(int_count_gene) / len(list_genotypeFileName) * 100) + "% - " + list_genotypeFileName[int_count_gene] + ": " + "\t\t" sys.stdout.write('%s\r' % str_print) sys.stdout.flush() mp_pool.close() ### output result with open(str_outputFilePath + "All_Lasso_k" + str(int_kOfKFold) + ".csv", "w") as file_outputFile: file_outputFile.writelines("GeneSymbol,AVG_S_P" + "\n") for key, value in dict_result.items(): file_outputFile.writelines(key.split("_")[0] + "," + str(value) + "\n") ''' ### batch PolyLassoRegression int_count_gene = 0 with open(str_outputFilePath + "All_Lasso_k" + str(int_kOfKFold) + ".csv", "w") as file_outputFile: file_outputFile.writelines("GeneSymbol,AVG_S_P" + "\n") for item in list_genotypeFileName: int_count_gene = int_count_gene + 1 str_genotypeFileName = os.path.join(str_inputFilePath_genotype, item) float_AVG_S_P = SingleGeneEpistasisLasso(str_genotypeFileName, str_inputFileName_phenotype, str_outputFilePath, int_kOfKFold, int_nJobs) file_outputFile.writelines(item.split("_")[0] + "," + str(float_AVG_S_P) + "\n") str_print = "step4: Processing: " + "{0:.2f}".format(float(int_count_gene) / len(list_genotypeFileName) * 100) + "% - " + item + ": " + str(float_AVG_S_P) + "\t\t" sys.stdout.write('%s\r' % str_print) sys.stdout.flush() ''' print("step4: Detect single gene epistasis. DONE! \t\t\t\t")