I have a data.frame, delta_counts, with values for half a million genes by 500 samples, and another data.frame, ci_data, with confidence intervals for each gene (2 rows, half a million columns).
I want a data.frame/matrix of binary indicators showing if each value in delta_counts falls outside the confidence interval for that specific gene or not.
Here's a small working example but there must be a more efficient way to handle the generation of bin_ind.
#generate some data 5 samples by 5 genes
delta_counts <- data.frame(replicate(5,sample(-10:10,5,rep=TRUE)))
names(delta_counts) <- sprintf("sample[%s]",seq(1:5))
row.names(delta_counts) <- sprintf("gene[%s]",seq(1:5))
#c.i. function
c.i.func = function(c.i.data){
#input sample size, sample mean, and sample standard deviation
n <- length(c.i.data)
xbar <- mean(as.numeric(c.i.data))
s <- sd(c.i.data)
#calculate margin of error
margin <- qt(0.975,df=n-1)*s/sqrt(n)
#calculate lower and upper bounds of confidence interval
c.i.low <- xbar - margin
c.i.high <- xbar + margin
return(c(c.i.low,c.i.high))
}
#calculate c.i.
ci_data <- apply(delta_counts, 1, c.i.func)
row.names(ci_data) <- c("low","high")
#the data so far
delta_counts
ci_data
#make a binary indicator df/matrix showing which values of delta_counts fall outside the c.i. for that specific gene.
#so for the example 5x5 data shown above it would be
bin_ind <- delta_counts
bin_ind[] <- 0
for(gene in 1:5) {
for(sample in 1:5){
if(delta_counts[gene,sample]<ci_data["low",gene]|delta_counts[gene,sample]>ci_data["high",gene]){
bin_ind[gene,sample]<-1
}
}
}
bin_ind
#but how to do that efficiently over a delta_counts of half a million genes and 500 samples?
I think you should transpose the ci_data into long format (making it the same length as delta_counts
, and make these both data.tables. You can do a fast merge of the two, using gene
column, and then use between()
to check for each column being between the lower and upper bounds. This would return a data.table of boolean columns, which I negate using ==F
and then convert to integer by wrapping in +()
library(data.table)
df = merge(
data.table(delta_counts,keep.rownames="gene"),
data.table(t(ci_data), keep.rownames="gene"),
by="gene"
)
df[, .(gene, +(between(.SD, low, high)==F)), .SDcols = patterns("sample")]
Output:
gene sample.1. sample.2. sample.3. sample.4. sample.5.
1: gene[1] 0 0 1 0 0
2: gene[2] 0 0 0 1 0
3: gene[3] 0 0 1 0 0
4: gene[4] 1 0 0 0 0
5: gene[5] 0 0 1 0 0