在R中提取每个观察的第一行

时间:2014-08-27 17:35:44

标签: r dataframe subset

我有数据框:

Observations Value
obs 1     1
obs 1     2
obs 1     3
obs 1     4
obs 2     5
obs 2     6
obs 2     7
obs 3     8
obs 3     9

是否可以只提取每个观察的第一行作为子集?

这样我得到了:

Observations Value
obs 1     1
obs 2     5
obs 3     8

4 个答案:

答案 0 :(得分:3)

我认为你需要的是duplicated函数。

> df[!duplicated(df$Observations), ]
#   Observations Value
# 1        obs 1     1
# 5        obs 2     5
# 8        obs 3     8

另一种选择是

> unsplit(lapply(split(df, df$Observations), `[`, 1,), levels(df$Observations))
#   Observations Value
# 1        obs 1     1
# 5        obs 2     5
# 8        obs 3     8

不过后者使用了循环,会比duplicated方法慢很多。

do.call("rbind", lapply(split(df, df$Observations), "[", 1,))也可以使用。

答案 1 :(得分:2)

还可以使用dplyr

library(dplyr)
group_by(dat, Observations) %>% filter(row_number() == 1)
#Source: local data frame [3 x 2]
#Groups: Observations
#
#  Observations Value
#1        obs_1     1
#2        obs_2     5
#3        obs_3     8

使用@akrun的样本数据:

# Packages for timing and for the keyed data.table variant. group_by(),
# filter(), row_number() and %>% are expected to be attached already (dplyr).
library(microbenchmark)
library(data.table)

# Reproducible sample data: 1e5 rows, a random capital letter per row as the
# grouping column and an rnorm() draw as the value.
set.seed(49)
dat <- data.frame(
  Observations = sample(LETTERS, 1e5, replace = TRUE),
  value = rnorm(1e5)
)

# f1: base R aggregate(), keeping head(x, 1) of `value` within each group.
f1 <- function() {
  aggregate(value ~ Observations, data = dat, FUN = head, 1)
}

# f2: base R, keep the rows whose Observations value has not been seen before.
f2 <- function() {
  dat[!duplicated(dat$Observations), ]
}

# f3: data.table, key on Observations and take the first match per unique key.
f3 <- function() {
  keyed <- data.table(dat, key = "Observations")
  keyed[J(unique(Observations)), mult = "first"]
}

# f4: dplyr, keep row number 1 within each Observations group.
f4 <- function() {
  dat %>%
    group_by(Observations) %>%
    filter(row_number() == 1)
}

# unit = "relative" prints the timings as ratios instead of absolute times.
microbenchmark(f1(), f2(), f3(), f4(), unit = "relative")
#Unit: relative
#expr         min         lq     median         uq        max neval
#f1() 149.0736206 145.881588 143.122352 138.611025 108.063314   100
#f2()   1.8248371   1.805648   1.783553   1.736195   1.554765   100
#f3()   0.9861738   1.259007   1.279011   1.270937  11.535428   100
#f4()   1.0000000   1.000000   1.000000   1.000000   1.000000   100

在@Arun评论之后,补充了另一种data.table方法:

# Only microbenchmark is attached here; the data.table and dplyr functions used
# below are expected to be attached already.
library(microbenchmark)

# Reproducible sample data: 1e5 rows, a random capital letter per row as the
# grouping column and an rnorm() draw as the value.
set.seed(49)
dat <- data.frame(
  Observations = sample(LETTERS, 1e5, replace = TRUE),
  value = rnorm(1e5)
)

# f1: base R aggregate(), keeping head(x, 1) of `value` within each group.
f1 <- function() {
  aggregate(value ~ Observations, data = dat, FUN = head, 1)
}

# f2: base R, keep the rows whose Observations value has not been seen before.
f2 <- function() {
  dat[!duplicated(dat$Observations), ]
}

# f3: data.table, key on Observations and take the first match per unique key.
f3 <- function() {
  keyed <- data.table(dat, key = "Observations")
  keyed[J(unique(Observations)), mult = "first"]
}

# f4: dplyr, keep row number 1 within each Observations group.
f4 <- function() {
  dat %>%
    group_by(Observations) %>%
    filter(row_number() == 1)
}

# f5: data.table, convert and de-duplicate on the Observations column.
f5 <- function() {
  unique(as.data.table(dat), by = "Observations")
}

# unit = "relative" prints the timings as ratios instead of absolute times.
microbenchmark(f1(), f2(), f3(), f4(), f5(), unit = "relative")
#Unit: relative
#expr        min         lq     median         uq        max neval
#f1() 274.036916 247.499012 234.616587 227.094582 8.54993826   100
#f2()   3.065027   3.059164   2.881088   2.797630 0.10404962   100
#f3()   2.122190   2.197721   2.105737   2.056280 0.08284540   100
#f4()   1.731631   1.703298   1.616957   1.584485 0.07353602   100
#f5()   1.000000   1.000000   1.000000   1.000000 1.00000000   100

答案 2 :(得分:1)

这是使用base R函数的一种方法

> aggregate(Value~Observations, head, 1, data=df)  # df is your data.frame
  Observations Value
1         obs1     1
2         obs2     5
3         obs3     8

这个会给你相同的输出

> aggregate(Value~Observations, function(x) x[1], data=df)

答案 3 :(得分:1)

 library(data.table) 
 setDT(dat)[, .SD[1], by=Observations]
 #  Observations Value
 #1:        obs 1     1
 #2:        obs 2     5
 #3:        obs 3     8

或者更快的方式:

DT <- data.table(dat, key='Observations')
DT[J(unique(Observations)), mult="first"]
#   Observations Value
#1:        obs 1     1
#2:        obs 2     5
#3:        obs 3     8

基准测试

在一个不算很大的数据集上

# microbenchmark provides the timing harness; data.table() and J() are expected
# to be attached already.
library(microbenchmark)

# Reproducible sample data: 1e6 rows, a random capital letter per row as the
# grouping column and an rnorm() draw as the value.
set.seed(49)
dat <- data.frame(
  Observations = sample(LETTERS, 1e6, replace = TRUE),
  value = rnorm(1e6)
)

# f1: base R aggregate(), keeping head(x, 1) of `value` within each group.
f1 <- function() {
  aggregate(value ~ Observations, data = dat, FUN = head, 1)
}

# f2: base R, keep the rows whose Observations value has not been seen before.
f2 <- function() {
  dat[!duplicated(dat$Observations), ]
}

# f3: data.table, key on Observations and take the first match per unique key.
f3 <- function() {
  keyed <- data.table(dat, key = "Observations")
  keyed[J(unique(Observations)), mult = "first"]
}

# unit = "relative" prints the timings as ratios instead of absolute times.
microbenchmark(f1(), f2(), f3(), unit = "relative")
#    Unit: relative
# expr        min         lq    median        uq       max neval
# f1() 351.098220 365.803821 356.66198 302.875946 102.496097   100
# f2()   2.299184   2.218348   2.35962   1.995709   1.701758   100
# f3()   1.000000   1.000000   1.00000   1.000000   1.000000   100

更新

或者按照@Arun的建议:
 unique(DT, by="Observations")