对行进行分组直到满足特定条件

时间:2021-03-23 06:27:32

标签: r dataframe grouping sequence

我有一个问题:如何把连续的行归为同一组,直到满足某个条件为止。这是我的数据框。

| Gene     | directon       |intergenic_distance|
| -------- | -------------- |-------------------|
| fixA     | 11             |NA                 |
| fixB     | 11             |15                 |
| fixC     | 11             |51                 |
| fixX     | 11             |-3                 |
| kefF     | 11             |108                |
| kefC     | 11             |-7                 |
| apaH     | 12             |NA                 |
| apaG     | 12             |7                  |

我想在同一个 directon 内对行进行分组:每当 intergenic_distance > 50 时就开始一个新的分组,期望结果如下所示。

| Gene     | directon       |intergenic_distance|operon|
| -------- | -------------- |-------------------|------|
| fixA     | 11             |NA                 |1     |
| fixB     | 11             |15                 |1     |
| fixC     | 11             |51                 |2     |
| fixX     | 11             |-3                 |2     |
| kefF     | 11             |108                |3     |
| kefC     | 11             |-7                 |3     |
| apaH     | 12             |NA                 |4     |
| apaG     | 12             |7                  |4     |

我正在考虑使用 with、rle、rep、seq_along,但我不知道如何使用。提前致谢!

dput(head(e_coli_operon,10))
# dput() output for the first 10 rows of e_coli_operon.
# The class vector marks this as a grouped_df, and the `groups` attribute
# shows it is grouped by `directon` (5 groups: "1", "4", "6", "8", "9").
structure(list(name = c("thrA", "thrB", "thrC", "yaaW", "yaaI", 
"mokC", "hokC", "insB", "insA", "yaaY"), start = c(337, 2801, 
3734, 10643, 11382, 16751, 16751, 19811, 20233, 21181), end = c(2799, 
3733, 5020, 11356, 11786, 16960, 16903, 20314, 20508, 21399), 
    strand = c(1, 1, 1, -1, -1, -1, -1, -1, -1, 1), length = c(820L, 
    310L, 428L, 237L, 134L, 69L, 50L, 167L, 91L, 72L), pid = c(16127996L, 
    16127997L, 16127998L, 16128005L, 16128007L, 16128012L, 49175991L, 
    16128015L, 16128016L, 16128018L), gene = c("thrA", "thrB", 
    "thrC", "yaaW", "yaaI", "mokC", "hokC", "insB", "insA", "yaaY"
    ), synonym = c("b0002", "b0003", "b0004", "b0011", "b0013", 
    "b0018", "b4412", "b0021", "b0022", "b0024"), code = c("-", 
    "-", "-", "-", "-", "-", "-", "-", "-", "-"), cog = c("COG0527E", 
    "COG0083E", "COG0498E", "COG4735S", "-", "-", "-", "COG1662L", 
    "COG3677L", "-"), product = c("fused aspartokinase I and homoserine dehydrogenase I", 
    "homoserine kinase", "threonine synthase", "conserved protein, UPF0174 family", 
    "conserved protein, UPF0412 family", "regulatory protein for HokC, overlaps CDS of hokC", 
    "toxic membrane protein, small", "IS1 transposase B", "IS1 repressor TnpA", 
    "predicted protein"), col = c("blue", "blue", "blue", "blue", 
    "blue", "blue", "blue", "blue", "blue", "blue"), fill = c("blue", 
    "blue", "blue", "blue", "blue", "blue", "blue", "blue", "blue", 
    "blue"), lty = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1), lwd = c(1, 
    1, 1, 1, 1, 1, 1, 1, 1, 1), pch = c(8, 8, 8, 8, 8, 8, 8, 
    8, 8, 8), cex = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1), gene_type = c("arrows", 
    "arrows", "arrows", "arrows", "arrows", "arrows", "arrows", 
    "arrows", "arrows", "arrows"), directon = c("1", "1", "1", 
    "4", "4", "6", "6", "8", "8", "9"), intergenic_distance = c(82, 
    2, 1, 149, 26, NA, -209, NA, -81, NA)), row.names = c(NA, 
-10L), groups = structure(list(directon = c("1", "4", "6", "8", 
"9"), .rows = structure(list(1:3, 4:5, 6:7, 8:9, 10L), ptype = integer(0), class = c("vctrs_list_of", 
"vctrs_vctr", "list"))), row.names = c(NA, 5L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"))

1 个答案:

答案 0 :(得分:0)

您可以在以下情况下将 operon 值增加 1:

  • intergenic_distance 大于 50 并且不是 NA,或者
  • 当前的 directon 值与之前的 directon 值不同。
library(dplyr)

# Number the operons: the counter goes up by one at every row where either
#   * intergenic_distance is known (not NA) and greater than 50, or
#   * directon differs from the previous row's directon.
# cumsum() over that logical vector counts the breaks seen so far; + 1 makes
# the ids start at 1.
df %>%
  # Drop any existing grouping first. On a data frame grouped by directon,
  # lag(), first() and cumsum() would be evaluated per group, so the directon
  # comparison would never be TRUE and the counter would restart in every
  # group. ungroup() leaves an ungrouped data frame unchanged.
  ungroup() %>%
  mutate(operon = cumsum(
    # is.na() is checked explicitly so that an NA distance yields FALSE
    # instead of propagating NA into cumsum().
    (!is.na(intergenic_distance) & intergenic_distance > 50) |
      # default = first(directon) makes the first row compare equal to itself,
      # so it never counts as a directon change.
      directon != lag(directon, default = first(directon))
  ) + 1)

#  Gene directon intergenic_distance operon
#1 fixA       11                  NA      1
#2 fixB       11                  15      1
#3 fixC       11                  51      2
#4 fixX       11                  -3      2
#5 kefF       11                 108      3
#6 kefC       11                  -7      3
#7 apaH       12                  NA      4
#8 apaG       12                   7      4

数据

# Example input: eight genes, six in directon 11 followed by two in directon 12.
# The first row of each directon has an NA intergenic_distance.
df <- data.frame(
  Gene = c("fixA", "fixB", "fixC", "fixX", "kefF", "kefC", "apaH", "apaG"),
  directon = rep(c(11L, 12L), times = c(6L, 2L)),
  intergenic_distance = c(NA, 15L, 51L, -3L, 108L, -7L, NA, 7L),
  # Keep Gene as character on R versions where factors were the default.
  stringsAsFactors = FALSE
)
相关问题