# -*- coding: utf-8 -*-
"""
Created on Feb 2018
@author: Chester (Yu-Chuan Chang)
"""
""""""""""""""""""""""""""""""
# import libraries
""""""""""""""""""""""""""""""
import os
import warnings
warnings.filterwarnings('ignore')
# ignore all warnings
warnings.simplefilter("ignore")
os.environ["PYTHONWARNINGS"] = "ignore"
import sys
import itertools
import numpy as np
np.seterr(divide='ignore', invalid='ignore')
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import f_regression
from scipy.sparse import coo_matrix
from sklearn.utils import shuffle
from sklearn import linear_model
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
import scipy.stats as stats
import multiprocessing as mp
from genepi.tools import randomized_l1
""""""""""""""""""""""""""""""
# define functions
""""""""""""""""""""""""""""""
[docs]def RandomizedLassoRegression(np_X, np_y):
"""
Implementation of the stability selection.
Args:
np_X (ndarray): 2D array containing genotype data with `int8` type
np_y (ndarray): 2D array containing phenotype data with `float` type
Returns:
(ndarray): estimator.scores
1D array containing the scores of each genetic features with `float` type
"""
X = np_X
y = np_y
X_sparse = coo_matrix(X)
X, X_sparse, y = shuffle(X, X_sparse, y, random_state=0)
estimator = randomized_l1.RandomizedLasso(n_jobs=1, n_resampling=100)
estimator.fit(X, y)
return estimator.scores_
[docs]def LassoRegressionCV(np_X, np_y, int_kOfKFold = 2, int_nJobs = 1):
"""
Implementation of the L1-regularized Lasso regression with k-fold cross validation.
Args:
np_X (ndarray): 2D array containing genotype data with `int8` type
np_y (ndarray): 2D array containing phenotype data with `float` type
int_kOfKFold (int): The k for k-fold cross validation (default: 2)
int_nJobs (int): The number of thread (default: 1)
Returns:
(ndarray): estimator.scores
1D array containing the scores of each genetic features with `float` type
"""
X = np_X
y = np_y
X_sparse = coo_matrix(X)
X, X_sparse, y = shuffle(X, X_sparse, y, random_state=0)
kf = KFold(n_splits=int_kOfKFold)
list_target = []
list_predict = []
list_weight = []
for idxTr, idxTe in kf.split(X):
alpha = np.logspace(-10, 10, 200)
parameters = [{'alpha':alpha}]
kf_estimator = KFold(n_splits=2)
estimator_lasso = linear_model.Lasso(max_iter=1000)
estimator_grid = GridSearchCV(estimator_lasso, parameters, scoring='neg_mean_squared_error', n_jobs=1, cv=kf_estimator)
estimator_grid.fit(X[idxTr], y[idxTr])
list_label = estimator_grid.best_estimator_.predict(X[idxTe])
list_weight.append([float(item) for item in estimator_grid.best_estimator_.coef_])
for idx_y, idx_label in zip(list(y[idxTe]), list_label):
list_target.append(float(idx_y))
list_predict.append(idx_label)
np_weight = np.array(list_weight)
np_weight = np.average(list_weight, axis=0)
float_pearson = stats.stats.pearsonr(list_target, list_predict)[0]
float_spearman = stats.stats.spearmanr(list_target, list_predict)[0]
return (float_pearson + float_spearman) / 2, np_weight
[docs]def FeatureEncoderLasso(np_genotype_rsid, np_genotype, np_phenotype, int_dim):
"""
Implementation of the two-element combinatorial encoding.
Args:
np_genotype_rsid (ndarray): 1D array containing rsid of genotype data with `str` type
np_genotype (ndarray): 2D array containing genotype data with `int8` type
np_phenotype (ndarray): 2D array containing phenotype data with `float` type
int_dim (int): The dimension of a variant (default: 3. AA, AB and BB)
Returns:
(tuple): tuple containing:
- list_interaction_rsid (ndarray): 1D array containing rsid of genotype data with `str` type
- np_interaction (ndarray): 2D array containing genotype data with `int8` type
"""
### combinatorial encoding
np_interaction = np_genotype
list_interaction_rsid = list(np_genotype_rsid)
list_combs = list(itertools.combinations(range(int(np_interaction.shape[1]/int_dim)), 2))
for idx_combs in range(len(list_combs)):
try:
### generate interaction terms
tuple_comb = list_combs[idx_combs]
np_this_interaction = np.zeros([np_phenotype.shape[0], int_dim**2], dtype='int8')
list_this_interaction_id = []
for idx_x in range(int_dim):
for idx_y in range(int_dim):
np_this_interaction_term = (np_genotype[:, tuple_comb[0] * int_dim + idx_x] * np_genotype[:, tuple_comb[1] * int_dim + idx_y]).astype(np.int8)
if not(np.array_equal(np_this_interaction_term, np_genotype[:, tuple_comb[0] * int_dim + idx_x])) and not(np.array_equal(np_this_interaction_term, np_genotype[:, tuple_comb[1] * int_dim + idx_y])):
np_this_interaction[:, idx_x * int_dim + idx_y] = np_this_interaction_term
list_this_interaction_id.append(np_genotype_rsid[tuple_comb[0] * int_dim + idx_x] + "*" + np_genotype_rsid[tuple_comb[1] * int_dim + idx_y])
### variance check (detect variance < 0.05)
sk_variance = VarianceThreshold(threshold=(.95 * (1 - .95)))
np_this_interaction = sk_variance.fit_transform(np_this_interaction)
np_this_interaction_id = np.array(list_this_interaction_id)
np_this_interaction_id = np.array(np_this_interaction_id[sk_variance.get_support()])
### f regression feature selection
np_fRegression = -np.log10(f_regression(np_this_interaction.astype(int), np_phenotype[:, -1].astype(float))[1])
np_selectedIdx = np.array([x > 2 for x in np_fRegression])
np_this_interaction = np_this_interaction[:, np_selectedIdx]
np_this_interaction_id = np_this_interaction_id[np_selectedIdx]
### append insteraction terms
int_num_interaction = np_this_interaction.shape[1]
if int_num_interaction == 0:
continue
np_interaction_append = np.empty((np_interaction.shape[0], np_interaction.shape[1] + int_num_interaction), dtype='int')
np_interaction_append[:,:-(int_num_interaction)] = np_interaction
np_interaction_append[:,-(int_num_interaction):] = np_this_interaction
np_interaction = np_interaction_append
list_interaction_rsid.extend(list(np_this_interaction_id))
except:
pass
return np.array(list_interaction_rsid), np_interaction
[docs]def FilterInLoading(np_genotype, np_phenotype):
"""
This function is for filtering low quality varaint. Before modeling each subset of genotype features, two criteria were adopted to exclude low quality data. The first criterion is that the genotype frequency of a feature should exceed 5%, where the genotype frequency means the proportion of genotype among the total samples in the dataset. The second criterion is regarding the association between the feature and the phenotype. We used χ2 test to estimate the association between the feature and the phenotype, and the p-value should be smaller than 0.01.
Args:
np_genotype (ndarray): 2D array containing genotype data with `int8` type
np_phenotype (ndarray): 2D array containing phenotype data with `float` type
Returns:
(ndarray): np_genotype
2D array containing genotype data with `int8` type
"""
try:
### variance check (detect variance < 0.05)
sk_variance = VarianceThreshold(threshold=(.95 * (1 - .95)))
np_genotype = sk_variance.fit_transform(np_genotype)
### f regression feature selection
np_fRegression = -np.log10(f_regression(np_genotype.astype(int), np_phenotype[:, -1].astype(float))[1])
np_selectedIdx = np.array([x > 2 for x in np_fRegression])
np_genotype = np_genotype[:, np_selectedIdx]
return np_genotype.shape[1]
except:
return 0
""""""""""""""""""""""""""""""
# main function
""""""""""""""""""""""""""""""
[docs]def SingleGeneEpistasisLasso(str_inputFileName_genotype, str_inputFileName_phenotype, str_outputFilePath = "", int_kOfKFold = 2, int_nJobs = 1):
"""
A workflow to model a single gene containing two-element combinatorial encoding, stability selection, filtering low quality varaint and L1-regularized Lasso regression with k-fold cross validation.
Args:
str_inputFileName_genotype (str): File name of input genotype data
str_inputFileName_phenotype (str): File name of input phenotype data
str_outputFilePath (str): File path of output file
int_kOfKFold (int): The k for k-fold cross validation (default: 2)
int_nJobs (int): The number of thread (default: 1)
Returns:
(float): float_AVG_S_P
The average of the Peason's and Spearman's correlation of the model
"""
### set path of output file
if str_outputFilePath == "":
str_outputFilePath = os.path.dirname(str_inputFileName_genotype)
#-------------------------
# load data
#-------------------------
### count lines of input files
int_num_phenotype = sum(1 for line in open(str_inputFileName_phenotype))
### get phenotype file
list_phenotype = []
with open(str_inputFileName_phenotype, 'r') as file_inputFile:
for line in file_inputFile:
list_phenotype.append(line.strip().split(","))
np_phenotype = np.array(list_phenotype, dtype=np.float)
del list_phenotype
### get genotype file
list_genotype = [[] for x in range(int_num_phenotype)]
list_genotype_rsid = []
with open(str_inputFileName_genotype, 'r') as file_inputFile:
for line in file_inputFile:
list_thisSnp = line.strip().split(" ")
np_this_genotype = np.empty([int_num_phenotype, 3], dtype='int8')
for idx_subject in range(0, int_num_phenotype):
list_allelType = [0, 0, 0]
list_allelType[np.argmax(list_thisSnp[idx_subject * 3 + 5 : idx_subject * 3 + 8])] = 1
np_this_genotype[idx_subject, :] = list_allelType
if FilterInLoading(np_this_genotype, np_phenotype) == 0:
continue
for idx_subject in range(0, int_num_phenotype):
list_allelType = [0, 0, 0]
list_allelType[np.argmax(list_thisSnp[idx_subject * 3 + 5 : idx_subject * 3 + 8])] = 1
list_genotype[idx_subject].extend(list_allelType)
list_genotype_rsid.append(list_thisSnp[1] + "_AA")
list_genotype_rsid.append(list_thisSnp[1] + "_AB")
list_genotype_rsid.append(list_thisSnp[1] + "_BB")
np_genotype = np.array(list_genotype, dtype=np.int8)
np_genotype_rsid = np.array(list_genotype_rsid)
if np_genotype_rsid.shape[0] == 0:
return 0.0
#-------------------------
# preprocess data
#-------------------------
### generate interaction terms
np_genotype_rsid, np_genotype = FeatureEncoderLasso(np_genotype_rsid, np_genotype, np_phenotype, 3)
#-------------------------
# select feature
#-------------------------
### random lasso feature selection
np_randWeight = np.array(RandomizedLassoRegression(np_genotype, np_phenotype[:, -1].astype(float)))
np_selectedIdx = np.array([x >= 0.1 for x in np_randWeight])
np_randWeight = np_randWeight[np_selectedIdx]
np_genotype = np_genotype[:, np_selectedIdx]
np_genotype_rsid = np_genotype_rsid[np_selectedIdx]
if np_genotype_rsid.shape[0] == 0:
return 0.0
#-------------------------
# build model
#-------------------------
float_AVG_S_P, np_weight = LassoRegressionCV(np_genotype, np_phenotype[:, -1].astype(float), int_kOfKFold, int_nJobs)
if float_AVG_S_P == 0.0:
return 0.0
### filter out zero-weight features
np_selectedIdx = np.array([x != 0.0 for x in np_weight])
np_weight = np_weight[np_selectedIdx]
np_genotype = np_genotype[:, np_selectedIdx]
np_genotype_rsid = np_genotype_rsid[np_selectedIdx]
if np_genotype_rsid.shape[0] == 0:
return 0.0
#-------------------------
# analyze result
#-------------------------
### calculate student t-test p-value
np_fRegression = -np.log10(f_regression(np_genotype.astype(int), np_phenotype[:, -1].astype(float))[1])
### calculate genotype frequency
np_genotypeFreq = np.sum(np_genotype, axis=0).astype(float) / np_genotype.shape[0]
#-------------------------
# output results
#-------------------------
### output statistics of features
with open(os.path.join(str_outputFilePath, os.path.basename(str_inputFileName_genotype).split("_")[0] + "_Result.csv"), "w") as file_outputFile:
file_outputFile.writelines("rsID,weight,student-t-test_log_p-value,genotype_frequency" + "\n")
for idx_feature in range(0, np_genotype_rsid.shape[0]):
file_outputFile.writelines(str(np_genotype_rsid[idx_feature,]) + "," + str(np_weight[idx_feature,]) + "," + str(np_fRegression[idx_feature,]) + "," + str(np_genotypeFreq[idx_feature]) + "\n")
### output feature
with open(os.path.join(str_outputFilePath, os.path.basename(str_inputFileName_genotype).split("_")[0] + "_Feature.csv"), "w") as file_outputFile:
file_outputFile.writelines(",".join(np_genotype_rsid) + "\n")
for idx_subject in range(0, np_genotype.shape[0]):
file_outputFile.writelines(",".join(np_genotype[idx_subject, :].astype(str)) + "\n")
return float_AVG_S_P
[docs]def BatchSingleGeneEpistasisLasso(str_inputFilePath_genotype, str_inputFileName_phenotype, str_outputFilePath = "", int_kOfKFold = 2, int_nJobs = mp.cpu_count()):
"""
Batch running for the single gene workflow.
Args:
str_inputFilePath_genotype (str): File path of input genotype data
str_inputFileName_phenotype (str): File name of input phenotype data
str_outputFilePath (str): File path of output file
int_kOfKFold (int): The k for k-fold cross validation (default: 2)
int_nJobs (int): The number of thread (default: 1)
Returns:
- Expected Success Response::
"step4: Detect single gene epistasis. DONE!"
"""
### set default output path
if str_outputFilePath == "":
str_outputFilePath = os.path.abspath(os.path.join(str_inputFilePath_genotype, os.pardir)) + "/singleGeneResult/"
### if output folder doesn't exist then create it
if not os.path.exists(str_outputFilePath):
os.makedirs(str_outputFilePath)
### scan all of the gen file in path
list_genotypeFileName = []
for str_fileName in os.listdir(str_inputFilePath_genotype):
if ".gen" in str_fileName:
list_genotypeFileName.append(str_fileName)
### batch PolyLassoRegression
### inital multiprocessing pool
mp_pool = mp.Pool(int_nJobs)
### apply pool on the function that need be parallelizing
dict_result = {}
for int_count_gene, float_AVG_S_P in enumerate(mp_pool.starmap(SingleGeneEpistasisLasso, [(os.path.join(str_inputFilePath_genotype, gene), str_inputFileName_phenotype, str_outputFilePath, int_kOfKFold, int_nJobs) for gene in list_genotypeFileName]), 0):
if list_genotypeFileName[int_count_gene] not in dict_result:
dict_result[list_genotypeFileName[int_count_gene]] = float_AVG_S_P
str_print = "step4: Processing: " + "{0:.2f}".format(float(int_count_gene) / len(list_genotypeFileName) * 100) + "% - " + list_genotypeFileName[int_count_gene] + ": " + "\t\t"
sys.stdout.write('%s\r' % str_print)
sys.stdout.flush()
mp_pool.close()
### output result
with open(str_outputFilePath + "All_Lasso_k" + str(int_kOfKFold) + ".csv", "w") as file_outputFile:
file_outputFile.writelines("GeneSymbol,AVG_S_P" + "\n")
for key, value in dict_result.items():
file_outputFile.writelines(key.split("_")[0] + "," + str(value) + "\n")
'''
### batch PolyLassoRegression
int_count_gene = 0
with open(str_outputFilePath + "All_Lasso_k" + str(int_kOfKFold) + ".csv", "w") as file_outputFile:
file_outputFile.writelines("GeneSymbol,AVG_S_P" + "\n")
for item in list_genotypeFileName:
int_count_gene = int_count_gene + 1
str_genotypeFileName = os.path.join(str_inputFilePath_genotype, item)
float_AVG_S_P = SingleGeneEpistasisLasso(str_genotypeFileName, str_inputFileName_phenotype, str_outputFilePath, int_kOfKFold, int_nJobs)
file_outputFile.writelines(item.split("_")[0] + "," + str(float_AVG_S_P) + "\n")
str_print = "step4: Processing: " + "{0:.2f}".format(float(int_count_gene) / len(list_genotypeFileName) * 100) + "% - " + item + ": " + str(float_AVG_S_P) + "\t\t"
sys.stdout.write('%s\r' % str_print)
sys.stdout.flush()
'''
print("step4: Detect single gene epistasis. DONE! \t\t\t\t")