Question

我有一个基因组区间的数据框（data.frame，包含染色体、起始和终止位置）。我想合并同时满足以下两个条件的行：1）位置相邻；2）在另外两列（SampleID 和 State）中的值相同。注意：我需要一种高效的方法，因为我的真实数据超过 1000 万行（> 10,000,000）。（如果可能的话，请使用 data.table。）

玩具数据：

# Toy input: one row per genomic range, already sorted by SampleID, Chr, Start.
DF <- data.frame(
  SampleID = c(1, 1, 1, 1, 1, 2, 2),
  Chr      = c(1, 1, 1, 1, 2, 1, 1),
  Start    = c(1, 101, 201, 401, 500, 1, 101),
  End      = c(100, 200, 300, 499, 599, 100, 200),
  State    = c(3, 3, 2, 3, 3, 2, 2)
)
DF
   SampleID Chr Start End State
1:        1   1     1 100     3
2:        1   1   101 200     3
3:        1   1   201 300     2
4:        1   1   401 499     3
5:        1   2   500 599     3
6:        2   1     1 100     2
7:        2   1   101 200     2

第1行和第2行可以合并，因为它们相邻（1-100 和 101-200），并且具有相同的SampleID（1）和State（3）。

以下内容无法合并：

第2行和第3行：State不同
第3行和第4行：不相邻，且State不同
第4行和第5行：染色体（Chr）不同
第5行和第6行：SampleID不同。

依此类推。应用所有这些规则之后，我们得到下面的最终表格。

# Expected result: adjacent ranges sharing SampleID, Chr and State are merged.
FinalDF <- data.frame(
  SampleID = c(1, 1, 1, 1, 2),
  Chr      = c(1, 1, 1, 2, 1),
  Start    = c(1, 201, 401, 500, 1),
  End      = c(200, 300, 499, 599, 200),
  State    = c(3, 2, 3, 3, 2)
)
FinalDF
  SampleID Chr Start End State
1        1   1     1 200     3
2        1   1   201 300     2
3        1   1   401 499     3
4        1   2   500 599     3
5        2   1     1 200     2

所以，到目前为止，我已尝试使用GenomicRanges包中的reduce函数，但它不起作用。

INCORRECT OUTPUT

reduce(DF2)
GRanges object with 3 ranges and 0 metadata columns:
      seqnames     ranges strand
         <Rle>  <IRanges>  <Rle>
  [1]        1 [  1, 300]      *
  [2]        1 [401, 499]      *
  [3]        2 [500, 501]      *
  -------
  seqinfo: 2 sequences from an unspecified genome; no seqlengths

我试图用data.table做一些事情，因为我的data.frames是1000万行或更长，但还是无法解决。

以下问题沿着相同的路线（可能有点复杂），但没有解决方案。 R- collapse rows based on contents of two columns

Answer 1

library(data.table)

# Work on a data.table copy of DF (setDT(DF) would convert in place instead).
dt <- as.data.table(DF)

# Collapse each run of rows that share SampleID, Chr and State and whose ranges
# touch. Two helper keys are computed over the whole table, in row order:
#   rleid  - run id that changes every time State changes;
#   cumsum - running count of gaps, i.e. rows where previous End + 1 < Start.
# The rows are expected to be ordered by SampleID, Chr and Start.
dt[
  ,
  .(Start = min(Start), End = max(End), State = State[1]),
  by = .(
    SampleID,
    Chr,
    rleid = rleid(State),
    cumsum = cumsum(c(FALSE, head(End + 1, -1) < tail(Start, -1)))
  )
]
#   SampleID Chr rleid cumsum Start End State
#1:        1   1     1      0     1 200     3
#2:        1   1     2      0   201 300     2
#3:        1   1     3      1   401 499     3
#4:        1   2     3      1   500 599     3
#5:        2   1     4      1     1 200     2

Answer 2

如果我正确地理解了您要执行的操作，我建议这样做：使用dplyr按要保留的元数据进行分组，然后使用GenomicRanges计算每个组中合并后的范围（如果遇到性能问题，您可能希望绕开GenomicRanges所需的data.frame转换，手动实现合并步骤，以便利用dplyr配合data.table的性能）。下面是一个示例（使用管道%>%，以便更容易看清每一步发生了什么）：

# Toy input from the question: one row per genomic range.
DF <- data.frame(SampleID = c(1,1,1,1,1,2,2),
                 Chr = c(1,1,1,1,2,1,1),
                 Start = c(1, 101, 201, 401, 500, 1, 101),
                 End = c(100, 200, 300, 499, 599, 100, 200),
                 State = c(3,3,2,3,3,2,2)
)

library(dplyr)
# Collapse the ranges of every (SampleID, Chr, State) group and print the
# result; nothing is assigned, so DF itself is left untouched.
# Start from the data frame
DF %>% 
  # group it by the metadata columns that must match for rows to be merged
  group_by(SampleID, Chr, State) %>% 
  # run the expression below once per group; inside do(), `.` is the current
  # group's rows
  do(
    # turn the group's rows into a GRanges object
    as(as.data.frame(.), "GRanges") %>%
      # reduce the ranges (per the output shown in this answer, ranges that
      # touch, such as 1-100 and 101-200, come back as a single range)
      GenomicRanges::reduce() %>% 
      # turn back into a data frame for dplyr to stitch together
      as.data.frame() %>% 
      # keep only the coordinates of the reduced ranges
      select(start, end, width)
  ) %>% 
  # drop the grouping so later operations are not applied per group
  ungroup() %>% 
  # sort by what makes most sense for your set
  arrange(SampleID, Chr, start)

输出：

Source: local data frame [5 x 6]

SampleID Chr State start end width
(dbl) (dbl) (dbl) (int) (int) (int)
 1     1     3     1   200   200
 1     1     2   201   300   100
 1     1     3   401   499    99
 1     2     3   500   599   100
 2     1     2     1   200   200

Answer 3

# Brute-force base R approach: walk the rows in order and fold each row into
# the run currently being built whenever test_and_combine() accepts the pair.
# It is simple rather than fast; the rows must already be sorted by SampleID,
# Chr and Start.

DF <- data.frame(
  SampleID = c(1, 1, 1, 1, 1, 2, 2),
  Chr = c(1, 1, 1, 1, 2, 1, 1),
  Start = c(1, 101, 201, 401, 500, 1, 101),
  End = c(100, 200, 300, 499, 599, 100, 200),
  State = c(3, 3, 2, 3, 3, 2, 2)
)

#' Merge two single-row data frames that describe adjacent ranges
#'
#' Columns are addressed by position: 1 = SampleID, 2 = Chr, 3 = Start,
#' 4 = End, 5 = State.
#'
#' @param r1 One-row data frame holding the earlier range.
#' @param r2 One-row data frame holding the later range.
#' @return `r1` with its End replaced by the End of `r2` when both rows share
#'   SampleID, Chr and State and `r2` starts right after `r1` ends; `NA`
#'   otherwise (including when any compared value is missing).
test_and_combine <- function(r1, r2) {
  # isTRUE() turns an NA comparison into "no match" instead of an error, and
  # && short-circuits on the first failed check.
  can_merge <-
    isTRUE(r1[, 1] == r2[, 1]) &&       # "SampleID" column matches
    isTRUE(r1[, 2] == r2[, 2]) &&       # "Chr" column matches
    isTRUE((r1[, 4] + 1) == r2[, 3]) && # r2 starts right after r1 ends
    isTRUE(r1[, 5] == r2[, 5])          # "State" column matches
  if (!can_merge) {
    return(NA)
  }
  DF_comb <- r1
  DF_comb[1, 4] <- r2[, 4]
  DF_comb
}

#' Collapse every run of mergeable consecutive rows into a single row
#'
#' @param df Data frame laid out like `DF`, sorted by SampleID, Chr and Start.
#' @return A data frame with one row per run: rows that cannot be merged are
#'   kept unchanged, and runs of any length become one row spanning from the
#'   first Start to the last End. Row names are reset.
collapse_adjacent_rows <- function(df) {
  n_rows <- nrow(df)
  if (n_rows == 0L) {
    return(df)
  }
  # Preallocate: there can never be more runs than rows.
  runs <- vector("list", n_rows)
  n_runs <- 0L
  current <- df[1, , drop = FALSE]
  for (i in seq_len(n_rows)[-1]) {
    candidate <- test_and_combine(current, df[i, , drop = FALSE])
    if (is.data.frame(candidate)) {
      # Row i extends the run being built.
      current <- candidate
    } else {
      # Row i starts a new run: store the finished one first.
      n_runs <- n_runs + 1L
      runs[[n_runs]] <- current
      current <- df[i, , drop = FALSE]
    }
  }
  n_runs <- n_runs + 1L
  runs[[n_runs]] <- current
  collapsed <- do.call(rbind, runs[seq_len(n_runs)])
  rownames(collapsed) <- NULL
  collapsed
}

DF_comb_final <- collapse_adjacent_rows(DF)

R - 比较data.frame的行（根据条件组合）

3 个答案: