将缺失的染色体的频率计数添加为0

时间:2018-12-12 08:06:35

标签: r bioinformatics

这是cn0数据(已按样本拆分),列出了每个样本中出现cn=0的染色体及其对应的频率值:

  $gw6.00033
 X  Sample_Name Chr_No Copy_No Frequence
  1 gw6.00033  chr1:    cn=0         1
 12 gw6.00033  chr2:    cn=0         1
 16 gw6.00033  chr4:    cn=0         1
 20 gw6.00033  chr6:    cn=0         1

   $gw6.0006
  X Sample_Name Chr_No Copy_No Frequence
  26 gw6.0006  chr1:    cn=0         1
  29 gw6.0006  chr10    cn=0         3
  31 gw6.0006  chr11    cn=0         2
  34 gw6.0006  chr13    cn=0         1
  37 gw6.0006  chr15    cn=0         1
  38 gw6.0006  chr16    cn=0         1
  41 gw6.0006  chr2:    cn=0         1
  47 gw6.0006  chr3:    cn=0         1
  57 gw6.0006  chr8:    cn=0         2

这是我使用的R代码:

# Split the input into one data frame per sample.
# NOTE(review): the code uses `sample_name` / `Chr_no`, while the printed data
# shows `Sample_Name` / `Chr_No`; `$` is case-sensitive, so confirm the real
# column names of `cn0`.
sp <- split(cn0, cn0$sample_name)
# Number of chromosome slots generated for every sample (labels 1..N below).
N <- 22
# Rebuild each sample as an N-row table whose Frequence defaults to 0.
sp <- lapply(sp, function(DF){
# M: numeric chromosome index of each row present in DF, taken from the digits
# in the label (leading non-digits and trailing non-digits are dropped).
M <- as.numeric(sub("[^[:digit:]]+([[:digit:]]+)[^[:digit:]]*", "\\1",     DF$Chr_no))
# Strip the digits and anything after them, leaving only the label prefix.
Chr_no <- sub("[[:digit:]]+[^[:digit:]]*$", "", DF$Chr_no)
# Append 1..N to the prefix (the prefix vector is recycled against 1:N).
Chr_no <- paste0(Chr_no, 1:N)
# Labels that are exactly 4 characters long get a trailing ":".
Chr_no <- ifelse(nchar(Chr_no) == 4, paste0(Chr_no, ":"), Chr_no)
sample_name <- rep(DF$sample_name[1], length(Chr_no))
res <- data.frame(sample_name, Chr_no)
res$Frequence <- 0
# NOTE(review): `1:N %in% M` is a logical mask, so the selected slots are
# filled in ascending chromosome order, while DF$Frequence is taken in DF's
# row order. The two only line up when DF's rows are already sorted by
# chromosome number (the printed gw6.0006 rows are not: chr1:, chr10, chr11,
# ...). Indexing by position, res$Frequence[M] <- DF$Frequence, would pair
# each value with its own chromosome.
res$Frequence[1:N %in% M] <- DF$Frequence
res
  })
# Collect the Frequence vectors into one row per sample, one column per slot.
   abc <- as.data.frame(t(sapply(sp, '[[', 'Frequence')))
# Column names are taken from the first sample's generated labels.
    names(abc) <- sp[[1]]$Chr_no

它给出如下输出:

 $gw6.00033
      Sample_Name Chr_No Frequence
    1  gw6.00033  chr1:         1
    2  gw6.00033  chr2:         1
    3  gw6.00033  chr3:         0
    4  gw6.00033  chr4:         1
    5  gw6.00033  chr5:         0
    6  gw6.00033  chr6:         1
    7  gw6.00033  chr7:         0
    8  gw6.00033  chr8:         0
    9  gw6.00033  chr9:         0


$gw6.0006
 Sample_Name Chr_No Frequence
1  gw6.0006  chr1:         1
2  gw6.0006  chr2:         3?
3  gw6.0006  chr3:         2?
4  gw6.0006  chr4:         0
5  gw6.0006  chr5:         0
6  gw6.0006  chr6:         0
7  gw6.0006  chr7:         0
8  gw6.0006  chr8:         1?

代码可以运行并给出结果,但在创建矩阵时频率被放错了位置:缺失的染色体虽然被补到了相应的位置上,频率却没有与各自的染色体对应起来,而缺失的染色体本应对应0。对于样本gw6.00033,得到的数据框是正确的,因为它的所有染色体都按升序排列;但对于样本gw6.0006,频率是错误的。需要对代码做哪些改进?

1 个答案:

答案 0 :(得分:1)

使用 `tidyr::complete`:

# Example input: one row per (sample, chromosome) cn=0 call with its frequency.
# Sample names and chromosome labels are kept as character, not factor.
df1 <- read.table(
  header = TRUE,
  stringsAsFactors = FALSE,
  text = "X  Sample_Name Chr_No Copy_No Frequence
1 gw6.00033  chr1:    cn=0         1
12 gw6.00033  chr2:    cn=0         1
16 gw6.00033  chr4:    cn=0         1
20 gw6.00033  chr6:    cn=0         1
26 gw6.0006  chr1:    cn=0         1
29 gw6.0006  chr10    cn=0         3
31 gw6.0006  chr11    cn=0         2
34 gw6.0006  chr13    cn=0         1
37 gw6.0006  chr15    cn=0         1
38 gw6.0006  chr16    cn=0         1
41 gw6.0006  chr2:    cn=0         1
47 gw6.0006  chr3:    cn=0         1
57 gw6.0006  chr8:    cn=0         2"
)

library(dplyr)
library(tidyr)

# All 22 chromosome labels (chr1: ... chr9:, chr10 ... chr22) as a factor whose
# levels are in numeric chromosome order rather than alphabetical order.
allChroms <- c(paste0("chr", 1:9, ":"), paste0("chr", 10:22))
allChroms <- factor(allChroms, levels = allChroms)

# Drop the first column (X), then expand to every Sample_Name x allChroms
# combination; combinations absent from the input get Copy_No "cn=0" and
# Frequence 0.
res <- complete(
  df1[-1],
  Sample_Name,
  Chr_No = allChroms,
  fill = list(Frequence = 0, Copy_No = "cn=0")
)

res
# # A tibble: 44 x 4
#    Sample_Name Chr_No Copy_No Frequence
#    <chr>       <chr>  <chr>       <dbl>
#  1 gw6.00033   chr1:  cn=0            1
#  2 gw6.00033   chr2:  cn=0            1
#  3 gw6.00033   chr3:  cn=0            0
#  4 gw6.00033   chr4:  cn=0            1
#  5 gw6.00033   chr5:  cn=0            0
#  6 gw6.00033   chr6:  cn=0            1
#  7 gw6.00033   chr7:  cn=0            0
#  8 gw6.00033   chr8:  cn=0            0
#  9 gw6.00033   chr9:  cn=0            0
# 10 gw6.00033   chr10  cn=0            0

编辑:检查结果是否正确……

# Spot check: chr10 does not appear for gw6.00033 in the input, so it is
# filled with 0; gw6.0006 keeps its input Frequence of 3.
res[ res$Chr_No == "chr10", ]
#   Sample_Name Chr_No Copy_No Frequence
# 1 gw6.00033   chr10  cn=0            0
# 2 gw6.0006    chr10  cn=0            3

# Spot check: chr1: is present in the input for both samples, so both keep
# their input Frequence of 1.
res[ res$Chr_No == "chr1:", ]
#   Sample_Name Chr_No Copy_No Frequence
# 1 gw6.00033   chr1:  cn=0            1
# 2 gw6.0006    chr1:  cn=0            1