比较两个表之间的列类型

时间:2018-06-18 10:50:39

标签: r validation difference

如果我有两个表(File1)和(File2)

> dput(File1)
structure(list(Column.1 = structure(1:3, .Label = c("Row 1", 
"Row 2", "Row 3"), class = "factor"), Column.2 = c(NA, NA, NA
), Column.3 = c(NA, NA, NA), colNames = c(TRUE, TRUE, TRUE)), class = "data.frame", row.names = c(NA, 
-3L))
> dput(File2)
structure(list(Column.1 = structure(1:3, .Label = c("Row 1", 
"Row 2", "Row 3"), class = "factor"), Column.2 = c(1, 2, 34), 
    Column.3 = c(NA, NA, NA), colNames = c(TRUE, TRUE, TRUE)), class = "data.frame", row.names = c(NA, 
-3L))

我想确认 File1 和 File2 的列名、列类型以及行数和列数是否一致:如果全部相同则返回 TRUE,否则返回 FALSE。如何在我写的这段代码中加入这一检查?

我尝试了“Compare column types between two data frames”这个问题下的一些答案,但我只需要一个 TRUE 或 FALSE 的结果。这是我目前的代码。

#' Check whether two data frames share the same structure.
#'
#' Compares, in order, the number of rows, the number of columns, the column
#' names and the class of every column. A short diagnostic is printed for the
#' first mismatch that is found.
#'
#' @param File1,File2 Data frames to compare.
#' @return `TRUE` when every check passes, otherwise `FALSE`.
check_file <- function(File1, File2) {
  if (nrow(File1) != nrow(File2)) {
    print("Non matching number of rows")
    return(FALSE)
  }
  if (ncol(File1) != ncol(File2)) {
    print("non matching number of columns")
    return(FALSE)
  }
  if (!identical(names(File1), names(File2))) {
    print("Non matching names of columns")
    return(FALSE)
  }
  # class(File1) is "data.frame" for both inputs, so the classes have to be
  # collected column by column to detect e.g. logical (all NA) vs numeric.
  if (!identical(lapply(File1, class), lapply(File2, class))) {
    print("Non matching column types")
    return(FALSE)
  }
  TRUE
}


# Run the structural comparison once and keep its verdict.
check <- check_file(File1, File2)

# Hand back a plain TRUE / FALSE that mirrors the verdict.
return(if (check) TRUE else FALSE)

我认为剩下要检查的就是各列的类型。例如,在上面的 dput 输出中,File2 的 Column.2 是数字,而 File1 的 Column.2 全是 NA。两者的数值不必相同,但这种情况应返回 FALSE,因为 File1 的该列是 NA;如果 File1 的该列是 3, 2, 564,则应返回 TRUE。

2 个答案:

答案 0 :(得分:1)

第一个解决方案:

# Compare File1 and File2 on three aspects: column names, dimensions and the
# class of every column. identical() is used instead of `==` so that inputs of
# different length are never recycled and multi-class columns are handled;
# each aspect therefore yields exactly one TRUE / FALSE.
all(
  vapply(
    list(colnames, dim, function(x) lapply(x, class)), # aspects to compare
    function(f) identical(f(File1), f(File2)),         # one flag per aspect
    logical(1)
  )
)
#[1] FALSE                                             # numeric != logical
# The same comparison of File1 with itself:
#all(
#  vapply(
#    list(colnames, dim, function(x) lapply(x, class)),
#    function(f) identical(f(File1), f(File1)),
#    logical(1)
#  )
#)
#[1] TRUE

[编辑0] dim而不是nrow。 [编辑1]

第二种解决方案

如果两列有不同的类,但其中一列为空 - 返回TRUE

数据集

# Example data ----

# df1: four columns; Column2 is an integer sequence, Column3 is all NA.
df1 <- data.frame(
  Column1 = paste("Row", 1:3),
  Column2 = 1:3,
  Column3 = NA,
  colNames = TRUE
)

# df2: copy of df1 whose second column is overwritten with doubles.
df2 <- df1
df2[, 2] <- c(1, 2, 34)

# df3: three columns; Column2 and Column3 contain only NA.
df3 <- data.frame(
  Column1 = paste("Row", 1:3),
  Column2 = NA,
  Column3 = NA
)

# df4: copy of df3 with columns 2 and 3 filled in.
df4 <- df3
df4[, 2] <- "ddd"
df4[, 3] <- c(3, 4, 2)

df1
#  Column1 Column2 Column3 colNames
#1   Row 1       1      NA     TRUE
#2   Row 2       2      NA     TRUE
#3   Row 3       3      NA     TRUE

df2
#  Column1 Column2 Column3 colNames
#1   Row 1       1      NA     TRUE
#2   Row 2       2      NA     TRUE
#3   Row 3      34      NA     TRUE

请注意:class(df1[,2]) == "integer",而 class(df2[,2]) == "numeric"。

df3
#  Column1 Column2 Column3
#1   Row 1      NA      NA
#2   Row 2      NA      NA
#3   Row 3      NA      NA

df4
#  Column1 Column2 Column3
#1   Row 1     ddd       3
#2   Row 2     ddd       4
#3   Row 3     ddd       2

函数定义

#' Test whether two data frames have a compatible structure.
#'
#' Two data frames are considered compatible when they have identical column
#' names, identical dimensions and, for every column position, either the
#' same class or at least one of the two columns consists only of `NA`.
#'
#' @param x,y Data frames to compare.
#' @return A single `TRUE` or `FALSE`.
identical_df <- function(x, y) {
  # Scalar conditions: plain `if` with early exits instead of nested ifelse().
  if (!identical(colnames(x), colnames(y))) {
    return(FALSE)
  }
  if (!identical(dim(x), dim(y))) {
    return(FALSE)
  }

  cols <- seq_along(x)

  # identical() on the full class vector also copes with multi-class columns
  # (e.g. c("POSIXct", "POSIXt")), where an elementwise `==` would not.
  same_class <- vapply(
    cols,
    function(i) identical(class(x[[i]]), class(y[[i]])),
    logical(1)
  )
  # A column that holds nothing but NA carries no type information.
  x_all_na <- vapply(cols, function(i) all(is.na(x[[i]])), logical(1))
  y_all_na <- vapply(cols, function(i) all(is.na(y[[i]])), logical(1))

  all(same_class | x_all_na | y_all_na)
}

在 df1 与 df2、df1 与 df3、df3 与 df4 上测试该函数:

identical_df(df1, df1) # identical 
#[1] TRUE              #
identical_df(df1, df2) # class(df1[,2]) != class(df2[,2])
#[1] FALSE
identical_df(df1, df3) # dim(df1) != dim(df3)
#[1] FALSE
identical_df(df3, df4) # different classes for cols 2, 3
#[1] TRUE              # however both cols 2, 3 in df3 are empty (NAs)
# ==============================================================================
# Evaluation of 
# all((sapply(x, class) == sapply(y, class)) |
#     (apply(is.na(x), 2, prod) | apply(is.na(y), 2, prod))
# )
# for x = df3, y = df4
#
# +-------------------------------------------------+--------+--------+--------+
# |Expression                                       |Column1 |Column2 |Column3 |
# +-------------------------------------------------+--------+--------+--------+
# |sapply(x, class) == sapply(y, class)   +--------<|TRUE    |FALSE   |FALSE   |
# +                                       |         +--------+--------+--------+
# |apply(is.na(x), 2, prod)               |     +--<|0       |1       |1       |
# +                                       OR-+<OR   |        |        |        |
# |apply(is.na(y), 2, prod)               |  |  +--<|0       |0       |0       |
# |                                       |  |      |        |        |        |
# |                                       |  +----->|FALSE   |TRUE    |TRUE    |
# |                                       |         |        |        |        |
# |                                       |         +--------+--------+--------+
# |                                       +-------->|TRUE    |TRUE    |TRUE    |
# +-------------------------------------------------+--------+--------+--------+ 

答案 1 :(得分:0)

您可以使用 identical 函数。

基于您的函数:

#' Check whether two data frames share the same structure.
#'
#' Runs a sequence of checks and prints a diagnostic for the first one that
#' fails: per-column storage mode, class of the container, number of rows,
#' number of columns, column names and per-column class.
#'
#' @param File1,File2 Data frames to compare.
#' @return `TRUE` when every check passes, otherwise `FALSE`.
check_file <- function(File1, File2) {
  # Storage mode of every column. Names are dropped so that a pure naming
  # difference is reported by the dedicated column-name check further down.
  column_modes <- function(df) unname(vapply(df, mode, character(1)))

  if (!identical(column_modes(File1), column_modes(File2))) {
    print("Not Same Str")
    return(FALSE)
  }
  if (!identical(class(File1), class(File2))) {
    print("Not Same Class")
    return(FALSE)
  }
  if (nrow(File1) != nrow(File2)) {
    print("Non matching number of rows")
    return(FALSE)
  }
  if (ncol(File1) != ncol(File2)) {
    print("non matching number of columns")
    return(FALSE)
  }
  if (!identical(names(File1), names(File2))) {
    print("Non matching names of columns")
    return(FALSE)
  }
  # Same mode does not imply same class (integer vs. double are both
  # "numeric" in mode), so the column classes are compared explicitly.
  if (!identical(lapply(File1, class), lapply(File2, class))) {
    print("Non matching column types")
    return(FALSE)
  }
  TRUE
}

测试您的data.frames:

# File1: Column.2 and Column.3 hold only (logical) NA values.
File1 <- structure(
  list(
    Column.1 = structure(
      1:3,
      .Label = c("Row 1", "Row 2", "Row 3"),
      class = "factor"
    ),
    Column.2 = c(NA, NA, NA),
    Column.3 = c(NA, NA, NA),
    colNames = c(TRUE, TRUE, TRUE)
  ),
  class = "data.frame",
  row.names = c(NA, -3L)
)

# File2: same layout as File1, but Column.2 holds numbers.
File2 <- structure(
  list(
    Column.1 = structure(
      1:3,
      .Label = c("Row 1", "Row 2", "Row 3"),
      class = "factor"
    ),
    Column.2 = c(1, 2, 34),
    Column.3 = c(NA, NA, NA),
    colNames = c(TRUE, TRUE, TRUE)
  ),
  class = "data.frame",
  row.names = c(NA, -3L)
)

check <- check_file(File1, File2)
check

[1] TRUE

或者使用不匹配的行数:

df1 <- data.frame(x = 1:20)
df2 <- data.frame(x = 1:10)
check <- check_file(df1, df2)
[1] "non matching number of columns"
check 
[1] FALSE