I have two datasets, gt_df and weights:
dput(gt_df)
structure(list(rs1 = c(1, 1), rs2 = c(0, 0), rs8 = c(1, 1), rs21 = c(0,
0)), row.names = c("1117F", "1XZ7S"), class = "data.frame")
dput(weights)
structure(list(varID = c("rs12", "rs8", "rs2"), weight = c(0.119982752530867,
-0.0375093345760517, 0.0320329747257121)), row.names = c(NA,
3L), class = "data.frame")
What I want to do: for each sample ID (row) in gt_df, e.g. 1117F, compute a single value by summing across its columns. For each column, if the column name matches a varID in weights, multiply the gt_df value by the corresponding weight; otherwise multiply the gt_df value by 0. Do this for every gt_df column of that sample and add the resulting values together.
I wrote this code:
# Per-sample weighted sum of the gt_df values. A column whose name appears in
# weights$varID contributes value * weight; any other column contributes 0.
# The result `pred` is an nrow(gt_df) x 1 matrix with column "Predicted" and
# the row names of gt_df.

# Weight for every gt_df column, in column order. sum() over an empty match is
# 0, so columns missing from `weights` get weight 0; if a varID is listed more
# than once, its weights are added together.
col_weights <- vapply(
  colnames(gt_df),
  function(id) sum(weights$weight[weights$varID %in% id]),
  numeric(1)
)

# A single matrix product replaces the nested loops. It also avoids the NA
# trap of accumulating into matrix(nrow = ..., ncol = 1): such a matrix is
# filled with NA, and sum(NA, x) is NA, so the running total never leaves NA.
pred <- as.matrix(gt_df) %*% col_weights
dimnames(pred) <- list(rownames(gt_df), "Predicted")
In my case, pred ends up containing only NA values.
Is there another way to do this, i.e. to add up the weighted values across the columns for each sample ID?
gt_df: rs1 rs2 rs8 rs21
1117F 1 0 1 0
IIEI 1 0 1 1
weights file:
varID weights
rs21 0.119
rs8 -0.037
rs2 0.032
Expected outcome:
Predicted_Value
1117F -0.037
IIEI 0.082
Calculation: for 1117F: 1*0 + 0*0.032 +1*-0.037 + 0*0.119=-0.037
# Example value table: row names are the sample IDs, one column per varID.
gt_df <- data.frame(
  rs1 = c(1, 1),
  rs2 = c(0, 0),
  rs8 = c(1, 1),
  rs21 = c(0, 1),
  row.names = c("1117F", "IIEI")
)

# Example weight table: one row per varID with its weight.
weights <- data.frame(
  varID = c("rs21", "rs8", "rs2"),
  weight = c(0.119982752530867, -0.0375093345760517, 0.0320329747257121),
  stringsAsFactors = FALSE
)
library(dplyr)
library(tidyr)
# For each sample (row of gt_df), sum value * weight over all columns; columns
# with no entry in `weights` count as weight 0.
gt_df |>
  # Move the row names into a regular column called rowID so they survive the
  # reshape. rownames_to_column() lives in tibble, which library(dplyr) and
  # library(tidyr) do not attach, so it is called with its namespace.
  tibble::rownames_to_column("rowID") |>
  # Long format: one row per (rowID, varID) pair, with the cell in `value`.
  pivot_longer(-rowID, names_to = "varID") |>
  # Attach each varID's weight; varIDs absent from `weights` get NA.
  left_join(weights, by = "varID") |>
  # A missing weight becomes 0, so that variable contributes nothing.
  mutate(
    weight = replace_na(weight, 0),
    Predicted_value = value * weight
  ) |>
  group_by(rowID) |>
  # Collapse to one row per sample: the sum of its weighted values.
  summarize(Predicted_value = sum(Predicted_value))
# A tibble: 2 × 2
rowID Predicted_value
<chr> <dbl>
1 1117F -0.0375
2 IIEI 0.0825