使用 mutate 新增列以统计变体数量

时间:2019-01-28 07:22:38

标签: r dataframe apply

我是R语言的新手。我有一个数据框,其中包含若干参考基因型列(ref、het、risk)和很多样本列。我想编写一个函数来计算每个样本的变体数量:如果样本的基因型等于ref列,则为0;如果等于het列,则为1;如果等于risk列,则为2。

df:
SNP    ref  het risk Sample1 Sample2 ...
rs1     GG  AG  AA  AG  GG
rs2     AA  AG  GG  AG  AA
rs3     AA  AG  GG  AG  AG
rs4     GG  AG  AA  AG  AA
rs5     GG  AG  AA  AG  AA
rs6     GG  AG  AA  AG  AG
rs7     AA  AG  GG  AA  AA
rs8     CC  AC  AA  AC  CC
rs9     GG  AG  AA  GG  GG
rs10    GG  AG  AA  GG  AG
rs11    AA  AG  GG  AA  GG
rs12    GG  AG  AA  AA  AG
rs13    GG  AG  AA  AG  AA
rs14    AA  AG  GG  AG  AA
rs15    GG  AG  AA  AA  AA
rs16    AA  AC  CC  AA  AA
rs17    AA  AG  GG  AA  AA
rs18    GG  AG  AA  GG  GG
rs19    GG  AG  AA  GG  AG
rs20    GG  AG  AA  AG  AG
...

desired output:
SNP     ref het risk Sample1 Sample2 Sample1.vd Sample2.vd ...
rs1     GG  AG  AA  AG  GG  1   0
rs2     AA  AG  GG  AG  AA  1   0
rs3     AA  AG  GG  AG  AG  1   1
rs4     GG  AG  AA  AG  AA  1   2
rs5     GG  AG  AA  AG  AA  1   2
rs6     GG  AG  AA  AG  AG  1   1
rs7     AA  AG  GG  AA  AA  0   0
rs8     CC  AC  AA  AC  CC  1   0
rs9     GG  AG  AA  GG  GG  0   0
rs10    GG  AG  AA  GG  AG  0   1
rs11    AA  AG  GG  AA  GG  0   2
rs12    GG  AG  AA  AA  AG  2   1
rs13    GG  AG  AA  AG  AA  1   2
rs14    AA  AG  GG  AG  AA  1   0
rs15    GG  AG  AA  AA  AA  2   2
rs16    AA  AC  CC  AA  AA  0   0
rs17    AA  AG  GG  AA  AA  0   0
rs18    GG  AG  AA  GG  GG  0   0
rs19    GG  AG  AA  GG  AG  0   1
rs20    GG  AG  AA  AG  AG  1   1   
...

我尝试编写一个函数,然后用 apply 函数逐行调用它。

# Attempted row-wise scorer from the question; this version does not work.
# NOTE(review): apply(df, 1, ...) hands each row to the function as a plain
# vector, so the two-dimensional index `df[, ...]` below is not valid there.
# NOTE(review): the data shown above names its third genotype column "risk",
# not "risk_hom".
VariantDetected <- function(df) {
x <- which(df[5:length(df)] == df[,c("ref","het","risk_hom")])
return(x)
}
# Call the function once per row (MARGIN = 1).
apply(df, 1, VariantDetected)

但是运行时会报错,有什么建议吗?谢谢。

2 个答案:

答案 0 :(得分:2)

使用dplyr软件包更容易,因为它使您的代码更具可读性。如果您不介意列的数据类型,则可以删除函数中的倒数第二行。希望对您有帮助。

#Needed library---------
library(dplyr)
# Your function------------
VariantDetected <- function(dataset, sample_cols = c("Sample1", "Sample2")) {
  # Score each sample genotype against the per-SNP reference genotypes.
  #
  # Args:
  #   dataset: data frame with genotype columns `ref` and `het` plus one
  #     genotype column per sample.
  #   sample_cols: names of the sample columns to score. Defaults to the two
  #     samples used in the example; pass more names to score more samples.
  #
  # Returns:
  #   A copy of `dataset` with one extra numeric column "<sample>.vd" per
  #   sample: 0 if the genotype equals `ref`, 1 if it equals `het`, and 2
  #   otherwise. Every other column is returned as a factor.
  stopifnot(all(c("ref", "het", sample_cols) %in% names(dataset)))
  # lapply() rather than sapply(): sapply() collapses a one-row input into a
  # plain vector, which data.frame() would then turn into a single column.
  df1 <- data.frame(lapply(dataset, as.character),
                    stringsAsFactors = FALSE, check.names = FALSE)
  ref <- df1[["ref"]]
  het <- df1[["het"]]
  score <- function(g) ifelse(g == ref, 0, ifelse(g == het, 1, 2))
  # across() needs dplyr >= 1.0.0.
  df1 <- df1 %>%
    mutate(across(all_of(sample_cols), score, .names = "{.col}.vd"))
  is_geno <- !names(df1) %in% paste0(sample_cols, ".vd")
  # Optional: delete the next line if character columns are fine for you.
  df1[is_geno] <- lapply(df1[is_geno], as.factor)
  df1
}
# Execute it on your dataset; `df` is overwritten with the result ----------
df <- VariantDetected(df)

祝好 / Revanth Nemani

答案 1 :(得分:0)

我认为您需要在此处嵌套ifelse

# Score every sample column against the per-SNP reference genotypes:
# 0 = equals `ref`, 1 = equals `het`, 2 = anything else.
# Sample columns are taken to be everything after the first four columns,
# skipping "<sample>.vd" score columns left over from an earlier run.
sample_cols <- names(df)[seq_along(df)[-seq_len(4)]]
sample_cols <- sample_cols[!endsWith(sample_cols, ".vd")]
vd <- apply(df, 1, function(x) {
  genotypes <- x[sample_cols]
  ifelse(genotypes == x["ref"], 0,
         ifelse(genotypes == x["het"], 1, 2))
})
# apply() returns samples x SNPs (or a bare vector for a single sample);
# refilling by row gives one row per SNP and one column per sample either way.
df[paste0(sample_cols, ".vd")] <- matrix(vd, nrow = nrow(df), byrow = TRUE)


df
#    SNP ref het risk Sample1 Sample2 Sample1.vd Sample2.vd
#1   rs1  GG  AG   AA      AG      GG          1          0
#2   rs2  AA  AG   GG      AG      AA          1          0
#3   rs3  AA  AG   GG      AG      AG          1          1
#4   rs4  GG  AG   AA      AG      AA          1          2
#5   rs5  GG  AG   AA      AG      AA          1          2
#6   rs6  GG  AG   AA      AG      AG          1          1
#7   rs7  AA  AG   GG      AA      AA          0          0
#8   rs8  CC  AC   AA      AC      CC          1          0
#9   rs9  GG  AG   AA      GG      GG          0          0
#10 rs10  GG  AG   AA      GG      AG          0          1
#....

或者,如果您想把它写成一个函数:

VariantDetected <- function(x) {
  # Score the sample genotypes of one SNP row.
  #
  # Args:
  #   x: named character vector for a single row; must contain elements
  #     named "ref" and "het", with the sample genotypes in positions 5
  #     onward (this is what apply(df, 1, ...) passes in).
  #
  # Returns:
  #   A numeric vector, named after the sample elements: 0 where the genotype
  #   equals `ref`, 1 where it equals `het`, 2 otherwise. Empty when `x` has
  #   no element beyond the fourth.
  #
  # seq_along()[-seq_len(4)] instead of 5:length(x): the latter counts
  # backwards (5, 4, ...) when x has fewer than five elements.
  sample_idx <- seq_along(x)[-seq_len(4)]
  genotypes <- x[sample_idx]
  ifelse(genotypes == x["ref"], 0,
         ifelse(genotypes == x["het"], 1, 2))
}

# Score each SNP row with VariantDetected() and store the results in
# "<sample>.vd" columns named after the actual sample columns.
sample_cols <- names(df)[seq_along(df)[-seq_len(4)]]
# Skip score columns left over from an earlier run.
sample_cols <- sample_cols[!endsWith(sample_cols, ".vd")]
scores <- apply(df[c(names(df)[seq_len(4)], sample_cols)], 1, VariantDetected)
# apply() returns samples x SNPs (a bare vector for one sample); refilling by
# row gives one row per SNP and one column per sample in every case.
df[paste0(sample_cols, ".vd")] <- matrix(scores, nrow = nrow(df), byrow = TRUE)

数据

# Example data as dput() output: 20 SNP rows; SNP, ref, het, risk, Sample1
# and Sample2 are stored as factors.
# NOTE: this object already contains the numeric Sample1.vd / Sample2.vd
# score columns. Code that treats every column from the 5th onward as a
# sample will pick those two up as well, so drop them first (e.g. df[1:6])
# to reproduce the raw input.
df <- structure(list(SNP = structure(c(1L, 12L, 14L, 15L, 16L, 17L, 
18L, 19L, 20L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 13L), .Label = 
c("rs1", 
"rs10", "rs11", "rs12", "rs13", "rs14", "rs15", "rs16", "rs17", 
"rs18", "rs19", "rs2", "rs20", "rs3", "rs4", "rs5", "rs6", "rs7", 
"rs8", "rs9"), class = "factor"), ref = structure(c(3L, 1L, 1L, 
3L, 3L, 3L, 1L, 2L, 3L, 3L, 1L, 3L, 3L, 1L, 3L, 1L, 1L, 3L, 3L, 
3L), .Label = c("AA", "CC", "GG"), class = "factor"), het = 
structure(c(2L, 
2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 
2L, 2L, 2L), .Label = c("AC", "AG"), class = "factor"), risk = 
structure(c(1L, 
3L, 3L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 1L, 3L, 1L, 2L, 3L, 
1L, 1L, 1L), .Label = c("AA", "CC", "GG"), class = "factor"), 
Sample1 = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 1L, 2L, 4L, 
4L, 1L, 1L, 3L, 3L, 1L, 1L, 1L, 4L, 4L, 3L), .Label = c("AA", 
"AC", "AG", "GG"), class = "factor"), Sample2 = structure(c(4L, 
1L, 2L, 1L, 1L, 2L, 1L, 3L, 4L, 2L, 4L, 2L, 1L, 1L, 1L, 1L, 
1L, 4L, 2L, 2L), .Label = c("AA", "AG", "CC", "GG"), class = "factor"), 
Sample1.vd = c(1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 2, 1, 1, 
2, 0, 0, 0, 0, 1), Sample2.vd = c(0, 0, 1, 2, 2, 1, 0, 0, 
0, 1, 2, 1, 2, 0, 2, 0, 0, 0, 1, 1)), row.names = c(NA, -20L
), .Names = c("SNP", "ref", "het", "risk", "Sample1", "Sample2", 
"Sample1.vd", "Sample2.vd"), class = "data.frame")