比较两个数据框的列联表

时间:2018-08-11 07:45:04

标签: r

这是原始数据框:

# Reproducible toy data: 50 rows and three columns (A, B, C), each drawn with
# replacement from 1..50. The seed is fixed so every run gives the same frame.
set.seed(100)
toydata <- data.frame(
  A = sample(1:50, 50, replace = TRUE),  # TRUE, not T: T can be reassigned
  B = sample(1:50, 50, replace = TRUE),
  C = sample(1:50, 50, replace = TRUE)
)

下面是可以交换值的函数:

#' Randomly permute a vector, allowing only a few values to stay in place
#'
#' Draws random permutations of `x` until one is found in which at most
#' `max_fixed` positions still hold their original value.
#'
#' @param x Vector to shuffle.
#' @param max_fixed Largest number of positions allowed to keep their original
#'   value. The default of 2 matches the former hard-coded `sum(y == x) < 3`;
#'   use 0 for a strict derangement.
#' @return A permutation of `x`; `x` itself when it is empty; or a single `NA`
#'   when one value makes up more than half of `x` (which includes every
#'   length-one input).
derangement <- function(x, max_fixed = 2) {
  # Nothing to shuffle. Returning early also avoids the warning that
  # max(table(x)) raises on an empty table.
  if (length(x) == 0L) {
    return(x)
  }
  # If one value fills more than half the positions, no permutation can move
  # all of its copies, so give up rather than risk looping forever.
  if (max(table(x)) > length(x) / 2) {
    return(NA)
  }
  repeat {
    # Index-based shuffle: the same draw sample(x) makes for length >= 2,
    # without sample()'s special treatment of a single number.
    y <- x[sample.int(length(x))]
    if (sum(y == x) <= max_fixed) {
      return(y)
    }
  }
}

#' Shuffle the small values of a vector among their own positions
#'
#' Finds the positions of `x` whose value is below `n`, passes those values to
#' `derangement()`, and writes the shuffled values back into the same
#' positions. All other entries are left untouched.
#'
#' @param x Vector to process.
#' @param n Threshold; only entries with `x < n` take part in the swap.
#' @return `x` with the selected entries shuffled, or a single `NA` when
#'   `derangement()` returns a length-one result.
swapFun <- function(x, n = 10) {
  below <- which(x < n)
  shuffled <- derangement(x[below])
  # A length-one result is derangement()'s "cannot shuffle" signal.
  if (length(shuffled) == 1) {
    return(NA)
  }
  replace(x, below, shuffled)
}

toy是交换后的新数据框

# Start from a copy of the original frame, then overwrite every column with
# the result of swapFun() applied to the matching original column.
toy <- toydata
toy[] <- Map(swapFun, toydata)

我想通过两个列联表之差的绝对值之和来比较这两个数据框的列联表,也就是:

# For each pair of columns, build the contingency table in the original frame
# (toydata) and in the swapped frame (toy), then sum the absolute cell-wise
# differences. The subtraction needs both tables to have the same dimensions.

# Columns A and B
table1 <- table(toydata$A, toydata$B)
table2 <- table(toy$A, toy$B)
SUM1 <- sum(abs(table1 - table2))

# Columns A and C
table3 <- table(toydata$A, toydata$C)
table4 <- table(toy$A, toy$C)
SUM2 <- sum(abs(table3 - table4))

# Columns B and C. The swapped table must pair B with C as well (it
# previously crossed toy$C with itself), so that it is comparable to table5.
table5 <- table(toydata$B, toydata$C)
table6 <- table(toy$B, toy$C)
SUM3 <- sum(abs(table5 - table6))
我想要的是

SUM1 + SUM2 + SUM3。有没有更方便的方法来得到它?因为有时数据框会有很多列。

如何解决?谢谢。

1 个答案:

答案 0 :(得分:1)

library(dplyr)

# Compare the contingency tables of one pair of columns before and after the
# swap.
#   x, y: column selectors (e.g. column names) used on both of the global
#         data frames `toydata` and `toy`.
# Returns the sum of absolute cell-wise differences between the two tables.
f <- function(x, y) {
  cell_diff <- table(toydata[, x], toydata[, y]) - table(toy[, x], toy[, y])
  sum(abs(cell_diff))
}

# Vectorise f so that mutate() can call it once with whole columns of names.
f <- Vectorize(f)

# combn(x, 2) already returns every unordered pair of distinct column names
# exactly once. The former `y = names(toydata)` argument was not a parameter of
# combn(); it was swallowed by `...` and had no effect, so it is dropped.
combn(names(toydata), 2) %>%
  # combn() returns one pair per column; transpose to one pair per row
  t() %>%
  # columns X1 and X2, kept as character
  data.frame(., stringsAsFactors = FALSE) %>%
  # defensive only: combn() never pairs a position with itself
  filter(X1 != X2) %>%
  # sum of absolute table differences for each pair
  mutate(SumAbs = f(X1, X2))

#   X1 X2 SumAbs
# 1  A  B     14
# 2  A  C     26
# 3  B  C     22