查找唯一列

时间:2016-06-02 15:56:39

标签: r dataframe unique

请帮忙。我有一个如下数据框:

df <- data.frame("G"=c(1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7), 
                  "C"=c(1,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,1,0,0,1,0),
                "SKU"=c("a","b","c","a","c","d","a","c","d","a","b","c","a","b","c","b","c","d","a","b","c"))
df
 G C SKU
 1 1   a
 1 0   b
 1 0   c
 2 0   a
 2 1   c
 2 0   d
 3 1   a
 3 0   c
 3 0   d
 4 0   a
 4 1   b
 4 0   c
 5 1   a
 5 0   b
 5 0   c
 6 0   b
 6 1   c
 6 0   d
 7 0   a
 7 1   b
 7 0   c 

我想找到独特的&#34;块&#34;在这个数据框中。例如,这里我们有三个块:(a,b,c),(a,c,d)和(b,c,d)。我想用这些独特的块创建表,并对变量&#34; C&#34;在所有&#34; G&#34;对于某个区块中的每个SKU。最后得到这样的数据框:

     New_G SKU New_C
 1       1   a     2
 2       1   b     2
 3       1   c     0
 4       2   a     1
 5       2   c     1
 6       2   d     0
 7       3   b     0
 8       3   c     1
 9       3   d     0

正如我所说,这里有三个独特的块,每个块的New_G - 识别器,以及New_C - &#34; C&#34;的总和。对于某个区块中的每个SKU。 (例如,请参阅第一行.SKU =&#34; a&#34;,NEW_C = 2 - 这意味着在旧数据框SKU&#34; a&#34;同时在块中(a,b) ,c)变量&#34; C&#34; = 1两次)(另一个例子:见第四行。再次SKU =&#34; a&#34;,但NEW_C = 1 - 这意味着在旧数据框SKU&#34; a&#34;在块(a,c,d)中有变量&#34; C&#34; = 1一次)

如果我的问题不明确请立即告诉我。

3 个答案:

答案 0 :(得分:4)

您可以使用toString创建索引,并从那里按组使用简单求和。挑战在于获得独特群体的索引:

ind <- df %>% group_by(G) %>% summarise(temp=toString(SKU)) %>% mutate(fac=as.numeric(as.factor(temp)))
ind <- rep(ind$fac, each=3)
df$ind <- ind
df %>% group_by(ind, SKU) %>% summarise(New_C = sum(C))
# Source: local data frame [9 x 3]
# Groups: ind [?]
# 
#     ind    SKU New_C
#   (dbl) (fctr) (dbl)
# 1     1      a     2
# 2     1      b     2
# 3     1      c     0
# 4     2      a     1
# 5     2      c     1
# 6     2      d     0
# 7     3      b     0
# 8     3      c     1
# 9     3      d     0

修改

这可能更快:

df %>% group_by(G) %>% 
  mutate(temp=toString(SKU)) %>%
  group_by(temp, SKU) %>%
  summarise(New_C = sum(C))

答案 1 :(得分:2)

使用dplyr:

library(dplyr)
df %>%
  group_by(G) %>%
  summarize(bin = paste(SKU, collapse=',')) %>%
  left_join(df, by=c('G' = 'G')) %>%
  group_by(bin, SKU) %>%
  summarize(New_C = sum(C))

输出:

    bin    SKU New_C
  (chr) (fctr) (dbl)
1 a,b,c      a     2
2 a,b,c      b     2
3 a,b,c      c     0
4 a,c,d      a     1
5 a,c,d      c     1
6 a,c,d      d     0
7 b,c,d      b     0
8 b,c,d      c     1
9 b,c,d      d     0

答案 2 :(得分:2)

这是基础R的解决方案。

Grp <- vapply(unique(df$G), function(x) paste(df$SKU[which(df$G==x)], collapse = ""), "abc", USE.NAMES = FALSE)
ID <- vapply(1:nrow(df), function(x) paste(df$SKU[x],Grp[df$G[x]], collapse=""), "a abc", USE.NAMES = FALSE)
UniG <- unique(Grp)
New_G <- do.call(c, lapply(1:length(UniG), function(x) rep(x, nchar(UniG[x]))))
Newdf <- data.frame(New_G, t(sapply(unique(ID), function(x) list(SKU = strsplit(x,split = " ")[[1]][1], New_C = sum(df$C[which(ID==x)])), USE.NAMES = FALSE)))

> Newdf
  New_G SKU New_C
1     1   a     2
2     1   b     2
3     1   c     0
4     2   a     1
5     2   c     1
6     2   d     0
7     3   b     0
8     3   c     1
9     3   d     0

Pierre Lafortune和Edward R. Mazurek提供的dplyr解决方案要快得多。下面的BuildRandomDF构建的数据框与OP发布的数据框非常相似。

library(gtools)
BuildRandomDF <- function(n) {

    set.seed(117)
    samp1 <- sample(3:5, n, replace = TRUE)

    Len5 <- length(which(samp1==5))
    Len4 <- length(which(samp1==4))
    Len3 <- length(which(samp1==3))
    perm5 <- permutations(5,5,letters[1:5])
    perm4 <- permutations(4,4,letters[1:4])
    perm3 <- permutations(3,3,letters[1:3])
    sampPerm5 <- sample(nrow(perm5), Len5, replace = TRUE)
    sampPerm4 <- sample(nrow(perm4), Len4, replace = TRUE)
    sampPerm3 <- sample(nrow(perm3), Len3, replace = TRUE)

    G <- do.call(c, lapply(1:n, function(x) rep(x, samp1[x])))
    i <- j <- k <- 0L

    SKU <- do.call(c, lapply(1:n, function(x) {
        if (samp1[x]==3) {
            perm3[sampPerm3[j <<- j+1L],]
        } else if (samp1[x]==4) {
            perm4[sampPerm4[k <<- k+1L],]
        } else {
            perm5[sampPerm5[i <<- i+1L],]
        }}))

    C <- sample(0:1, length(SKU), replace = TRUE)
    data.frame(G, C, SKU)
}

以下是功能:

library(dplyr)
DplyrTest <- function(df) {
    df %>% group_by(G) %>% 
        mutate(temp=toString(SKU)) %>%
        group_by(temp, SKU) %>%
        summarise(New_C = sum(C))
}

DplyrCheck2 <- function(df) {
    df %>%
        group_by(G) %>%
        summarize(bin = paste(SKU, collapse=',')) %>%
        left_join(df, by=c('G' = 'G')) %>%
        group_by(bin, SKU) %>%
        summarize(New_C = sum(C))
}

BaseTest <- function(df) {
    Grp <- vapply(unique(df$G), function(x) paste(df$SKU[which(df$G==x)], collapse = ""), "abc", USE.NAMES = FALSE)
    ID <- vapply(1:nrow(df), function(x) paste(df$SKU[x],Grp[df$G[x]], collapse=""), "a abc", USE.NAMES = FALSE)
    UniG <- unique(Grp)
    New_G <- do.call(c, lapply(1:length(UniG), function(x) rep(x, nchar(UniG[x]))))
    Newdf <- data.frame(New_G, t(sapply(unique(ID), function(x) list(SKU = strsplit(x,split = " ")[[1]][1], New_C = sum(df$C[which(ID==x)])), USE.NAMES = FALSE)))
    Newdf
}

以下是时间:

df <- BuildRandomDF(10^4)

system.time(df1 <- DplyrCheck(df))
user  system elapsed 
0.43    0.00    0.43

system.time(df2 <- DplyrCheck2(df))
user  system elapsed 
0.39    0.00    0.39 

system.time(df3 <- BaseTest(df))
user  system elapsed 
5.15    0.00    5.19 

all(sort(unlist(df3$New_C))==sort(df1$New_C))
[1] TRUE
all(sort(df1$New_C)==sort(df2$New_C))
[1] TRUE