I intend to extract highly correlated features by measuring their Pearson correlation with the target, and I have computed a correlation matrix for that. However, to filter highly correlated features I picked a correlation coefficient cutoff arbitrarily, and I don't know the optimal threshold. I am thinking of first quantifying the positively and negatively correlated features, then using those figures to set a credible threshold. Can anyone show me how to quantify positive and negative correlations from the correlation matrix? Is there an efficient way to select an optimal threshold for filtering highly correlated features?
reproducible data
Here is the reproducible data I used, where rows are samples and columns are raw features:
> dput(my_df)
structure(list(SampleID = c("Tarca_001_P1A01", "Tarca_013_P1B01",
"Tarca_025_P1C01", "Tarca_037_P1D01", "Tarca_049_P1E01", "Tarca_061_P1F01",
"Tarca_051_P1E03", "Tarca_063_P1F03", "Tarca_075_P1G03", "Tarca_087_P1H03"
), GA = c(11, 15.3, 21.7, 26.7, 31.3, 32.1, 19.7, 23.6, 27.6,
30.6), `1_at` = c(6.06221469449721, 5.8755020052495, 6.12613148162098,
6.1345548976595, 6.28953417729806, 6.08561779473768, 6.25857984382111,
6.22016811759586, 6.22269236303877, 6.11986885253451), `10_at` = c(3.79648446367096,
3.45024474095539, 3.62841140410044, 3.51232455992681, 3.56819306931016,
3.54911765491621, 3.59024881523945, 3.69553021972333, 3.61860245801661,
3.74019994293802), `100_at` = c(5.84933778267459, 6.55052475296263,
6.42187743053935, 6.15489279092855, 6.34807354206396, 6.11780116002087,
6.24635169763079, 6.25479583503303, 6.16095987926232, 6.26979789563404
), `1000_at` = c(3.5677794435745, 3.31613364795286, 3.43245075704917,
3.63813996294905, 3.39904385276621, 3.54214650423219, 3.51532853598111,
3.50451431462302, 3.38965905673286, 3.54646930636612), `10000_at` = c(6.16681461038468,
6.18505928400759, 5.6337568741831, 5.14814946571171, 5.64064316609978,
6.25755205471611, 5.68110995701518, 5.14171528059565, 5.48711836247815,
5.69671814694115), `100009613_at` = c(4.44302662142323, 4.3934877055859,
4.6237834519809, 4.66743523288194, 4.97483476597509, 4.78673497541689,
4.77791032146269, 4.64089637146557, 4.4415876428342, 4.57411708287226
), `100009676_at` = c(5.83652223195279, 5.89836406552412, 6.01979203584278,
5.98400432133011, 6.1149144301085, 5.74573650612351, 6.04564052289621,
6.10594091413241, 5.89779877157418, 6.08906323192048), `10001_at` = c(6.33001755606083,
6.13798360106589, 5.78750241567476, 5.5920698678248, 5.84077907831575,
6.19490161026853, 5.80941714030283, 5.80320733931781, 6.05345724391988,
5.84386016796266), `10002_at` = c(4.92233877299356, 4.62812370798939,
4.79628294150335, 4.79729686531453, 4.91913790102029, 4.79997095951811,
4.90838062744781, 4.73415922096939, 4.77466915267328, 4.78437458350139
), `10003_at` = c(2.68934375273141, 2.55675627493564, 2.61341541015611,
2.69430042092269, 2.73207812554522, 2.65268941561582, 2.66697993437978,
2.59784138580729, 2.74247110877575, 2.511875309739), `100033411_at` = c(2.74561888109989,
2.70765553292035, 2.80774129091983, 2.8653583834812, 3.00137677271996,
2.83262780533507, 2.85563184073152, 2.9364732038239, 3.04291003006152,
2.87464057209658), `100033413_at` = c(2.76060893169324, 3.03645581534102,
2.64583376265592, 3.24800269901788, 2.62090678070501, 3.40648642432304,
2.3166708613396, 2.62819739311836, 2.97367900843303, 2.62634568261552
), `100033414_at` = c(3.79468365910661, 4.29971184424969, 3.81085169542991,
3.81895258294878, 4.03594900960396, 3.82989979044012, 3.29585327836005,
3.27434364943932, 3.10419531747282, 4.48509833313903), `100033418_at` = c(2.84818282222582,
2.48325694938049, 3.2386968734862, 2.72080210986981, 2.58058159047299,
2.53965338068817, 2.1940368933459, 2.39335155022896, 2.59875871802789,
2.1053634999615), `100033420_at` = c(2.81277398177906, 3.51308266658033,
2.78489562992621, 2.63705084722617, 2.63479468288161, 2.7893378666207,
2.57252259415358, 3.6809929352922, 3.33486815632383, 3.26518578675427
), `100033422_at` = c(2.14058977019523, 2.26880029802564, 2.3315210232915,
2.33064119419619, 2.24052626899434, 2.33982101586472, 2.18436254317561,
2.45046620859257, 2.56645806945223, 2.3405394322417), `100033423_at` = c(2.6928480064245,
3.03461160119094, 2.75618624035735, 2.77388400895015, 3.2286586324064,
2.93047368426024, 2.8187821442941, 3.056923038096, 2.90637516892824,
2.70751558441428), `100033424_at` = c(2.35292391447048, 2.3853610213164,
2.36292219228603, 2.46939314182722, 2.30413560438815, 2.61148325229634,
2.34045470681792, 2.48995835642741, 2.32083529534773, 2.40632218044949
), `100033425_at` = c(2.48476830655452, 2.28880889278209, 2.31409329648109,
2.28927162732448, 2.38147147362554, 2.33334530852942, 2.44322869233962,
2.34064030240538, 2.67362452592881, 2.33750820349888), `100033426_at` = c(6.53876010917445,
7.38935014141236, 6.89661896623484, 6.93808821971072, 6.58149585137493,
7.76996534217549, 6.08470562892749, 7.07455266815876, 6.94555867772862,
6.96998299746459)), class = "data.frame", row.names = c("Tarca_001_P1A01",
"Tarca_013_P1B01", "Tarca_025_P1C01", "Tarca_037_P1D01", "Tarca_049_P1E01",
"Tarca_061_P1F01", "Tarca_051_P1E03", "Tarca_063_P1F03", "Tarca_075_P1G03",
"Tarca_087_P1H03"))
my attempt:
Here is my attempt to compute the Pearson correlation of each feature with the target, with the intent of filtering out highly correlated features (the correlation coefficient cutoff was chosen arbitrarily):
target <- my_df$GA
raw_feats <- my_df[, -c(1:2)]  # drop SampleID and GA

# Pearson correlation of each feature with the target
corr_df <- do.call(rbind,
                   apply(raw_feats, 2, function(x) {
                     temp <- cor.test(target, as.numeric(x),
                                      alternative = "two.sided", method = "pearson")
                     data.frame(t = temp$statistic, p = temp$p.value,
                                cor_coef = temp$estimate)
                   }))
Then I picked correlation coefficient cutoffs arbitrarily as the threshold for filtering:
indx <- which(corr_df$cor_coef > 0.0785 | corr_df$cor_coef <= -0.01)
mydf_new <- my_df[, c(1:2, indx + 2)]  # indx indexes features, i.e. columns 3.. of my_df
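For instance, this is roughly what I mean by quantifying the positive and negative correlations (a sketch using the corr_df above; the 0.05 p-value cutoff is just an example):

table(sign(corr_df$cor_coef))  # how many features correlate positively vs negatively with GA

sum(corr_df$cor_coef > 0 & corr_df$p < 0.05)  # positive and nominally significant
sum(corr_df$cor_coef < 0 & corr_df$p < 0.05)  # negative and nominally significant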
I don't think this approach is accurate. Any idea?
I am curious how to quantify the positively and negatively correlated features and then find an optimal threshold for filtering. Is there an efficient way to quantify positive/negative correlated features? How can I select an optimal correlation coefficient as the filtering threshold? Any thoughts? Thanks in advance.
I doubt you want to select on correlations at all: features that are highly correlated with the target may also be highly correlated with each other, and so add little new information. Regularized regression with cross validation is a fairly standard way of handling this sort of thing. The following fits the data (artificially extended so the examples can run) using the glmnet package for regularized, cross-validated regression, and ends with an index of the features that are likely to be useful in a linear model.
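As an aside, if you really do want to prune features that are highly correlated with each other, caret::findCorrelation is a common helper; a minimal sketch (the 0.9 cutoff is as arbitrary as any other threshold):

# install.packages('caret')
library(caret)
feat_cor <- cor(my_df[, -c(1:2)])                    # feature-feature correlation matrix
drop_idx <- findCorrelation(feat_cor, cutoff = 0.9)  # columns suggested for removal
keep <- setdiff(seq_len(ncol(feat_cor)), drop_idx)
pruned <- my_df[, -c(1:2)][, keep]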
### using regularized regression
my_df2 <- my_df[, -1]  # drop SampleID
for (i in 1:2) {  # artificially extend the data so the example can run
  my_df2 <- rbind(my_df2, my_df2 + rnorm(length(my_df2), 0, .1))
}
# install.packages('glmnet')
library(glmnet)
res <- glmnet(y = my_df2$GA,
              x = as.matrix(my_df2[, -1]),  # glmnet wants a numeric matrix
              standardize = TRUE,
              alpha = 1)  # reduce alpha for less lasso-like, more ridge-like behavior
plot(res, label = TRUE)  # variable importance as the penalty decreases
### selecting the regularization level by cross validation
cvres <- cv.glmnet(y = my_df2$GA,
                   x = as.matrix(my_df2[, -1]),  # glmnet wants a numeric matrix
                   standardize = TRUE,
                   alpha = 1)
plot(cvres)  # out-of-sample prediction error as the penalty decreases
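# two conventional penalty choices come out of the cross validation
# (both are fields of the cv.glmnet object): lambda.min minimizes CV error,
# lambda.1se gives the sparsest model within one SE of that minimum
cvres$lambda.min
cvres$lambda.1se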
# install.packages('coefplot')
library(coefplot)
nonzero <- extract.coef(cvres)  # features with nonzero coefficients at the selected penalty
indx <- which(colnames(my_df) %in% nonzero$Coefficient)  # their column indices in my_df
ranked <- nonzero[order(abs(nonzero$Value), decreasing = TRUE), ]  # ranked list at the best penalty
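# optional sanity check (a sketch): refit just the selected features with plain lm;
# extract.coef also returns the intercept, which is dropped here
sel <- setdiff(nonzero$Coefficient, "(Intercept)")
summary(lm(GA ~ ., data = my_df2[, c("GA", sel)]))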
plot(res, xvar = 'lambda', label = TRUE)  # variable importance as the penalty increases
nonzeromatrix <- apply(res$beta, 2, function(x) x != 0)     # which features are active at each lambda
nonzerocount <- apply(nonzeromatrix, 2, sum)                # number of active features per lambda
nonzeroindices <- match(0:max(nonzerocount), nonzerocount)  # first lambda at which n vars are active
names(nonzeroindices) <- 0:(length(nonzeroindices) - 1)
nonzeroindices <- nonzeroindices[!is.na(nonzeroindices)]  # drop NAs in case n vars jumps by > 1
incrementalbeta <- res$beta[, nonzeroindices]  # beta weights as the number of variables changes
matplot(as.numeric(names(nonzeroindices)), t(incrementalbeta),
        type = 'l', xlab = 'N vars', ylab = 'beta')
rankpernvars <- lapply(nonzeroindices, function(x) {
  ret <- res$beta[nonzeromatrix[, x], x]
  ret[order(abs(ret), decreasing = TRUE)]  # active coefficients, largest magnitude first
})
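As a usage example, if the fit activates three variables at some penalty, you can pull their ranked coefficients out of the list by that count (the "3" key here is hypothetical; it depends on which variable counts actually occur in your fit):

rankpernvars[["3"]]  # coefficients at the first lambda where 3 variables are active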