Question

这是我的玩具数据框，真实的可能有40K-1M记录和另外五列

        animal1     version1    animal2     version2    sim             
53      20154620    TRUSEQ.v1   20104647    F250v1  0.3663569
854     20145687    TRUSEQ.v1   20105551    F250v1  0.5732854
3662    20154620    TRUSEQ.v1   20114509    F250v1  0.3374918
4063    20154620    TRUSEQ.v1   20114578    F250v1  0.3732692
4464    20154620    TRUSEQ.v1   20114595    F250v1  0.3772367
5262    20144516    TRUSEQ.v1   20115051    770k.v1 0.6034206
5663    20144516    TRUSEQ.v1   20115051    F250v1  0.6164795
5664    20145008    TRUSEQ.v1   20115051    F250v1  0.3146651
6064    20144516    TRUSEQ.v1   20115059    F250v1  0.3043295
6471    20165119    F250v1      20115096    F250v1  0.388435
9677    20154620    TRUSEQ.v1   20118095    F250v1  0.3079702
11281   20154620    TRUSEQ.v1   20134529    F250v1  0.3188631
12486   20165119    F250v1      20135032    F250v1  0.6091486
13282   20144516    TRUSEQ.v1   20135047    F250v1  0.3098507
14090   20165119    F250v1      20135072    F250v1  0.3025007
14892   20165119    F250v1      20135122    F250v1  0.345238

对于每个 animal1，我需要 sim 值最高的前 3 个不同 animal2 所对应的所有行……我期望得到的结果如下。

        animal1     version1    animal2     version2    sim
5663    20144516    TRUSEQ.v1   20115051    F250v1  0.6164795
5262    20144516    TRUSEQ.v1   20115051    770k.v1 0.6034206
13282   20144516    TRUSEQ.v1   20135047    F250v1  0.3098507
6064    20144516    TRUSEQ.v1   20115059    F250v1  0.3043295
5664    20145008    TRUSEQ.v1   20115051    F250v1  0.3146651
854     20145687    TRUSEQ.v1   20105551    F250v1  0.5732854
4464    20154620    TRUSEQ.v1   20114595    F250v1  0.3772367
4063    20154620    TRUSEQ.v1   20114578    F250v1  0.3732692
53      20154620    TRUSEQ.v1   20104647    F250v1  0.3663569
12486   20165119    F250v1      20135032    F250v1  0.6091486
6471    20165119    F250v1      20115096    F250v1  0.388435
14892   20165119    F250v1      20135122    F250v1  0.345238

因此，在子集中，每个 animal1 可能有 1 到 20 条观测，但其中 animal2 的唯一值个数 <= n（本例中 n = 3）。

我可以通过sim和animal1对df进行排序，就像这样

# Sort by animal1 (descending), breaking ties by sim (descending).
# xtfrm() yields a numeric sort key for animal1 so that it can be negated.
mydf <- mydf[with(mydf, order(-xtfrm(animal1), -sim)), ]

我也可以像这样取出每只动物的前 n 条观测

# Take the first row of every animal1 group, then stack the per-group
# pieces back into a single data frame.
mydf2 <- Reduce(
  rbind,
  by(mydf, mydf["animal1"], function(grp) head(grp, n = 1))
)

但是，如何把 n 应用到第三列（即 animal2 的不同取值个数），而不是观测的行数？如果这是重复问题，我先道歉；答案可能就藏在这里：how to find the top N values by group or within category (groupwise) in an R data.frame，但我没能从那些回答中找到解决我问题的方法。

Answer 1

尽管我对动物 "20144516" 的期望结果还有疑问，但这里有两个解决方案，使用 dat 作为您上面的样本数据（已附在底部以便重现）。我提供了 base R 和 dplyr 两种写法，不过正如 @Balter 在评论中所建议的，可能还有一种简单的 data.table 方法。

基础R

# Put the highest-sim rows first; sorting by animal1 is not needed because
# sim is the only priority here.
dat <- dat[rev(order(dat[["sim"]])), ]
# Keep the first (i.e. best-sim) row of every animal1/animal2 pairing.
dat2 <- do.call(
  rbind,
  by(
    dat,
    list(dat[["animal1"]], dat[["animal2"]]),
    function(grp) head(grp, n = 1)
  )
)
# by() returns the groups in group order, so restore the descending-sim order.
dat2 <- dat2[rev(order(dat2[["sim"]])), ]
head(dat2)
#        animal1  version1  animal2 version2       sim
# 5663  20144516 TRUSEQ.v1 20115051   F250v1 0.6164795
# 12486 20165119    F250v1 20135032   F250v1 0.6091486
# 854   20145687 TRUSEQ.v1 20105551   F250v1 0.5732854
# 6471  20165119    F250v1 20115096   F250v1 0.3884350
# 4464  20154620 TRUSEQ.v1 20114595   F250v1 0.3772367
# 4063  20154620 TRUSEQ.v1 20114578   F250v1 0.3732692

这为我们提供了animal1和animal2的前1个配对，按sim排序（降序）。现在我们仅使用animal1有效地重复该过程：

# Keep at most three rows per animal1. This step must start from dat2 (one
# row per animal1/animal2 pairing, best sim first) rather than dat: taking
# head() of dat would let a repeated animal2 fill several of the three slots.
dat3 <- do.call(rbind, by(dat2, list(dat2$animal1), head, n = 3))
dat3
#                 animal1  version1  animal2 version2       sim
# 20144516.5663  20144516 TRUSEQ.v1 20115051   F250v1 0.6164795
# 20144516.13282 20144516 TRUSEQ.v1 20135047   F250v1 0.3098507
# 20144516.6064  20144516 TRUSEQ.v1 20115059   F250v1 0.3043295
# 20145008       20145008 TRUSEQ.v1 20115051   F250v1 0.3146651
# 20145687       20145687 TRUSEQ.v1 20105551   F250v1 0.5732854
# 20154620.4464  20154620 TRUSEQ.v1 20114595   F250v1 0.3772367
# 20154620.4063  20154620 TRUSEQ.v1 20114578   F250v1 0.3732692
# 20154620.53    20154620 TRUSEQ.v1 20104647   F250v1 0.3663569
# 20165119.12486 20165119    F250v1 20135032   F250v1 0.6091486
# 20165119.6471  20165119    F250v1 20115096   F250v1 0.3884350
# 20165119.14892 20165119    F250v1 20135122   F250v1 0.3452380

（不幸的是，这些 rownames 被改乱了。如果它们有意义，我建议您先把 rownames(dat) 存入单独的一列并一直保留。）

`dplyr`

您也可以使用dplyr。

library(dplyr)
# Same selection that top_n() performs, written out explicitly: keep rows
# whose descending min-rank of sim is within the requested count.
dat %>%
  group_by(animal1, animal2) %>%
  filter(min_rank(desc(sim)) <= 1) %>%  # best row of each pairing
  group_by(animal1) %>%
  filter(min_rank(desc(sim)) <= 3) %>%  # three best pairings per animal1
  ungroup()
# # A tibble: 11 × 5
#     animal1  version1  animal2 version2       sim
#       <int>    <fctr>    <int>   <fctr>     <dbl>
# 1  20144516 TRUSEQ.v1 20115051   F250v1 0.6164795
# 2  20165119    F250v1 20135032   F250v1 0.6091486
# 3  20145687 TRUSEQ.v1 20105551   F250v1 0.5732854
# 4  20165119    F250v1 20115096   F250v1 0.3884350
# 5  20154620 TRUSEQ.v1 20114595   F250v1 0.3772367
# 6  20154620 TRUSEQ.v1 20114578   F250v1 0.3732692
# 7  20154620 TRUSEQ.v1 20104647   F250v1 0.3663569
# 8  20165119    F250v1 20135122   F250v1 0.3452380
# 9  20145008 TRUSEQ.v1 20115051   F250v1 0.3146651
# 10 20144516 TRUSEQ.v1 20135047   F250v1 0.3098507
# 11 20144516 TRUSEQ.v1 20115059   F250v1 0.3043295

PS：使用do.call(rbind, ...)替代Reduce(rbind, ...)明显加快速度：

# Benchmark two ways of stacking the per-group pieces returned by by().
library(microbenchmark)
# First row of each (animal1, animal2) group, as a by object.
x <- by(dat, list(dat$animal1, dat$animal2), head, n = 1)
microbenchmark(
  # rbind is invoked a single time with every piece as an argument
  docall = do.call(rbind, x),
  # rbind is invoked repeatedly, folding one piece at a time into the result
  reduce = Reduce(rbind, x)
)
# Unit: milliseconds
#    expr       min        lq      mean    median        uq       max neval
#  docall  1.418577  1.493335  1.809469  1.551136  1.731466  5.216277   100
#  reduce 11.119961 11.829614 13.302388 12.727255 13.401535 26.897520   100

这种差异随着更多不同动物的增加而增加。（这是因为Reduce方法会为每个唯一动物调用rbind一次，而do.call只调用rbind一次。）

此处使用的样本数据：

# Sample data: 16 rows with integer animal1/animal2 ids, factor
# version1/version2 labels and a numeric sim score. Row names keep the
# original row labels ("53", "854", ...). Looks like dput() output.
dat <- structure(list(animal1 = c(20154620L, 20145687L, 20154620L, 20154620L, 
20154620L, 20144516L, 20144516L, 20145008L, 20144516L, 20165119L, 
20154620L, 20154620L, 20165119L, 20144516L, 20165119L, 20165119L
), version1 = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
1L, 2L, 2L, 1L, 2L, 1L, 1L), .Label = c("F250v1", "TRUSEQ.v1"
), class = "factor"), animal2 = c(20104647L, 20105551L, 20114509L, 
20114578L, 20114595L, 20115051L, 20115051L, 20115051L, 20115059L, 
20115096L, 20118095L, 20134529L, 20135032L, 20135047L, 20135072L, 
20135122L), version2 = structure(c(2L, 2L, 2L, 2L, 2L, 1L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("770k.v1", "F250v1"
), class = "factor"), sim = c(0.3663569, 0.5732854, 0.3374918, 
0.3732692, 0.3772367, 0.6034206, 0.6164795, 0.3146651, 0.3043295, 
0.388435, 0.3079702, 0.3188631, 0.6091486, 0.3098507, 0.3025007, 
0.345238)), .Names = c("animal1", "version1", "animal2", "version2", 
"sim"), class = "data.frame", row.names = c("53", "854", "3662", 
"4063", "4464", "5262", "5663", "5664", "6064", "6471", "9677", 
"11281", "12486", "13282", "14090", "14892"))

Answer 2

以下代码只有在 sim 较低的那条 animal1–animal2 记录本身"也会"排进前 3 名时，才会保留同一 animal1–animal2 组合的多行。如果我理解有误，请告诉我。

library(dplyr)

# Step 1: for each animal1, take the best row of every animal1/animal2
# pairing, keep the three highest sims among them, and record the lowest of
# those as that animal1's cut-off.
selected <- dat %>%
  arrange(animal1, animal2, desc(sim)) %>%
  group_by(animal1, animal2) %>%
  mutate(rank = row_number()) %>%
  filter(rank == 1) %>%  # first row after the sort = best sim of the pairing
  ungroup() %>%
  group_by(animal1) %>%
  top_n(3, sim) %>%
  summarise(threshold = min(sim))

# Step 2: keep every original row whose sim reaches its animal1's cut-off, so
# a repeated animal2 row survives when it also clears the threshold.
out <- dat %>%
  inner_join(selected, by = c("animal1" = "animal1")) %>%
  filter(sim >= threshold) %>%
  arrange(animal1, animal2, desc(sim)) %>%
  select(-threshold)
out
#     animal1  version1  animal2 version2       sim
# 1  20144516 TRUSEQ.v1 20115051   F250v1 0.6164795
# 2  20144516 TRUSEQ.v1 20115051  770k.v1 0.6034206
# 3  20144516 TRUSEQ.v1 20115059   F250v1 0.3043295
# 4  20144516 TRUSEQ.v1 20135047   F250v1 0.3098507
# 5  20145008 TRUSEQ.v1 20115051   F250v1 0.3146651
# 6  20145687 TRUSEQ.v1 20105551   F250v1 0.5732854
# 7  20154620 TRUSEQ.v1 20104647   F250v1 0.3663569
# 8  20154620 TRUSEQ.v1 20114578   F250v1 0.3732692
# 9  20154620 TRUSEQ.v1 20114595   F250v1 0.3772367
# 10 20165119    F250v1 20115096   F250v1 0.3884350
# 11 20165119    F250v1 20135032   F250v1 0.6091486
# 12 20165119    F250v1 20135122   F250v1 0.3452380

按多个列对数据框取子集并返回前 n 个结果

2 个答案:

基础R

`dplyr`