R - 聚合两列

时间:2014-07-25 12:21:27

标签: r aggregate

我有一个看起来像这样的数据框

 id1    id2    attr   
 ------------------
 11              a     
 11              a    
         11      a   
         11      b   
         11      c   
 22              a   
 22              a
         22      a
         22      a
 33              d
 44              e

我希望它看起来像这样。 id1,id2是计数(频率)。

id1    id2    attr   
 ------------------
 2              a     
        1       a  
        1       b
        1       c
 2              a
        2       a
 1              d
 1              e

空白处没有值,如有需要,我可以用 NA 填充。我尝试过使用 aggregate(聚合)函数,但无法得到所需的输出。感谢您的帮助。

4 个答案:

答案 0 :(得分:3)

这是您的数据

# Sample data from the question: each row carries a value in exactly one of
# id1 / id2 (the other is NA) together with an attribute label.
dat <- data.frame(
  id1 = c(11L, 11L, NA, NA, NA, 22L, 22L, NA, NA, 33L, 44L),
  id2 = c(NA, NA, 11L, 11L, 11L, NA, NA, 22L, 22L, NA, NA),
  attr = factor(
    c("a", "a", "a", "b", "c", "a", "a", "a", "a", "d", "e"),
    levels = c("a", "b", "c", "d", "e")
  )
)

所需的输出格式并不常见,但使用 'plyr' 似乎可以实现

library(plyr)

# Count how often each (id1, id2, attr) combination occurs.
# Grouping on all three columns means every group holds a single attr value,
# so the group size is the frequency and transform() can safely recycle that
# one number onto each row of the group. (Calling count(attr) inside groups of
# id1/id2 only lines up when a group's rows happen to be in the same order as
# the counted values, and fails or misassigns counts otherwise.)
temp <- ddply(dat, .(id1, id2, attr), transform, freq = length(attr))

# Collapse repeated rows so that each combination appears exactly once
temp <- unique(temp)

# Single key holding the id value regardless of which of id1 / id2 carries it
temp$id <- pmax(temp$id1, temp$id2, na.rm = TRUE)

# Sort by key, then by attr; order() is stable, so tied rows keep the order
# in which ddply() returned them
temp <- temp[order(temp$id, temp$attr), ]

# Overwrite the populated id column with the frequency; the empty one stays NA
temp$id1 <- ifelse(is.na(temp$id1), NA, temp$freq)
temp$id2 <- ifelse(is.na(temp$id2), NA, temp$freq)

# Keep only the requested columns, selected by name rather than by position
temp <- temp[c("id1", "id2", "attr")]

temp



   id1 id2 attr
1    2  NA    a
7   NA   1    a
8   NA   1    b
9   NA   1    c
3    2  NA    a
10  NA   2    a
5    1  NA    d
6    1  NA    e

答案 1 :(得分:2)

与 @jfreels 的做法类似,对上面的 dat 使用 dplyr 中的 tally

library(dplyr)

# Number of rows for every (id1, id2, attr) combination, stored in column n
dat1 <- tally(group_by(dat, id1, id2, attr))

# Distinct rows of the input, kept in their order of first appearance
dat2 <- unique(dat)

# Attach the counts to the distinct rows (joining on all shared columns), put
# the count into whichever id column is populated, then drop the helper column
dat2 %>%
  left_join(dat1) %>%
  mutate(
    id1 = ifelse(is.na(id1), NA, n),
    id2 = ifelse(is.na(id2), NA, n)
  ) %>%
  select(-n)
#Joining by: c("id1", "id2", "attr")
 #  id1 id2 attr
#1   2  NA    a
#2  NA   1    a
#3  NA   1    b
#4  NA   1    c
#5   2  NA    a
#6  NA   2    a
#7   1  NA    d
#8   1  NA    e

答案 2 :(得分:1)

此方法的结果并不是您想要的格式,但可能更容易理解。

# load library
library(dplyr)

# Sample data: one of id1 / id2 is filled per row, the other is NA
dat <- data.frame(
  id1 = c(11L, 11L, NA, NA, NA, 22L, 22L, NA, NA, 33L, 44L),
  id2 = c(NA, NA, 11L, 11L, 11L, NA, NA, 22L, 22L, NA, NA),
  attr = factor(
    c("a", "a", "a", "b", "c", "a", "a", "a", "a", "d", "e"),
    levels = c("a", "b", "c", "d", "e")
  )
)

# Group by every column and let tally() report the number of rows per group
tally(group_by(dat, id1, id2, attr))

# output
Source: local data frame [8 x 4]
Groups: id1, id2

  id1 id2 attr n
1  11  NA    a 2
2  22  NA    a 2
3  33  NA    d 1
4  44  NA    e 1
5  NA  11    a 1
6  NA  11    b 1
7  NA  11    c 1
8  NA  22    a 2

答案 3 :(得分:0)

请原谅我的 R 代码写得不够优雅,为了得到您想要的结果,我不得不采用一些非常规的做法。遗憾的是,这段代码的可扩展性不高,肯定还有改进的余地,但它能生成示例中的输出。唯一的区别是:这里假设输入数据(代码中的 rawdata)在空白处的值为 NA。

# Build a frequency table of the distinct rows of rawdata, showing the count
# in whichever of id1 / id2 is populated and an empty string in the other.
# NOTE(review): rawdata is not defined in this snippet; it is assumed to be
# the input data frame with columns id1, id2 and attr, using NA in the blank
# cells - confirm against the caller.

#Flag the first occurrence of every distinct row (computed once, reused below)
first.seen<-!duplicated(rawdata)

#Concatenate each row to a single key; pasting column-wise avoids the
#character-matrix coercion that apply() performs on a data frame
pasted.rows<-do.call(paste,c(unname(as.list(rawdata)),sep="-"))

#Keys of the unique rows, in order of first appearance
unique.pasted<-pasted.rows[first.seen]

#Get frequencies and maintain row order. tabulate() returns a plain integer
#vector; a subsetted 1-d table keeps class "table" and would be expanded into
#several columns by data.frame()
frequencies<-tabulate(match(pasted.rows,unique.pasted),nbins=length(unique.pasted))

#Separate id1 and id2: blank out the count where the id column itself is NA
id1.freq<-frequencies
id1.freq[is.na(rawdata[first.seen,"id1"])]<-NA
id2.freq<-frequencies
id2.freq[is.na(rawdata[first.seen,"id2"])]<-NA

#Obtain the final table
final.table<-data.frame(id1=id1.freq,id2=id2.freq,attr=rawdata[first.seen,"attr"])

#Remove row names
row.names(final.table)<-NULL

#Replace NA with empty values (this turns the count columns into character)
final.table[is.na(final.table)]<-""
final.table

 id1 id2 attr
1   2        a
2       1    a
3       1    b
4       1    c
5   2        a
6       2    a
7   1        d
8   1        e
相关问题