从两个数据框中为不平衡的类别抽取相同数量的样本

时间:2021-07-13 07:20:13

标签: r dplyr tidyverse

我正在尝试从两个不同大小的数据框中采样相同数量的类。我可以手动执行此操作,但我的某些数据框中的类数量非常大。

我已经能够使用 dplyr::count 函数从较小的数据框中获取感兴趣的类的列表及其计数,并将这些类和计数分别提取为向量。接着我尝试编写一个使用这些向量的函数,并通过 mapply 调用它,以便为每个类创建过滤后的切片,再用 do.call 将列表重新合并;但是当我尝试运行 mapply 时收到了错误。

以下是示例数据集。df 是较小的数据框,其中 6 行的 ControlVarA == "Group_1",10 行的 ControlVarA == "Group_2"。我想从较大的数据框 df2(其中 6 行的 ControlVarA == "Group_1",20 行的 ControlVarA == "Group_2")中,为每个类提取与 df 中相同数量的行。

# Smaller reference data frame: 16 rows, 6 in "Group_1" and 10 in "Group_2".
# Built in a single data.frame() call; rep() spells out the repeated runs.
df <- data.frame(
  ID = 1:16,
  VarA = c(rep(1, 11), 14, rep(NA_real_, 3), 16),
  VarB = c(10, 0, 0, 0, 12, 12, 12, 12, 0, 14, NA_real_, 14, rep(16, 4)),
  VarC = rep(c(10, 12, 14, 16), times = 4),
  VarD = rep(c(10, 12, 14, 16), times = 4),
  ControlVarA = factor(rep(c("Group_1", "Group_2"), times = c(6, 10)))
)
df


# Larger data frame to sample from: 26 rows, 6 in "Group_1" and 20 in
# "Group_2". The first 16 rows repeat the values of the smaller example.
df2 <- data.frame(
  ID = 1:26,
  VarA = c(rep(1, 11), 14, rep(NA_real_, 3), rep(16, 11)),
  VarB = c(10, 0, 0, 0, 12, 12, 12, 12, 0, 14, NA_real_, 14, rep(16, 14)),
  VarC = c(rep(c(10, 12, 14, 16), times = 4), rep(16, 10)),
  VarD = c(rep(c(10, 12, 14, 16), times = 4), rep(16, 10)),
  ControlVarA = factor(rep(c("Group_1", "Group_2"), times = c(6, 20)))
)
df2

为了提取类名和类计数,我使用下面的代码。

# Class labels found in the smaller data frame, as a character vector.
# count() returns one row per class, ordered by factor level. The NA check
# targets the class column itself, so a missing class is dropped. The labels
# are read directly with pull() rather than via a transposed matrix.
slice_vars <- df %>%
  count(ControlVarA) %>%
  filter(!is.na(ControlVarA)) %>%
  pull(ControlVarA) %>%
  as.character()

# Row count of each class in the smaller data frame, as an integer vector.
# Uses the same count() + filter() steps as the class labels, so the two
# vectors stay aligned element by element. pull() reads the counts directly,
# so they are never converted to text and parsed back.
slice_nums <- df %>%
  count(ControlVarA) %>%
  filter(!is.na(ControlVarA)) %>%
  pull(n)

我创建的函数和 mapply 语句如下

#' Randomly sample rows of a single class
#'
#' Keeps only the rows of `dataset` whose `ControlVarA` equals `y`, then
#' draws `x` of them at random without replacement.
#'
#' @param dataset A data frame containing a `ControlVarA` column.
#' @param x Number of rows to draw (a single non-negative number).
#' @param y Class label to keep (a single value comparable to `ControlVarA`).
#' @return A data frame with the sampled rows of class `y`.
func_group <- function(dataset, x, y) {
  dataset %>%
    # `.env$y` forces the function argument, even if `dataset` happens to
    # have a column that is also called `y`.
    filter(ControlVarA == .env$y) %>%
    # `n` takes a plain number; tidyselect helpers such as all_of() are only
    # meant for column selection.
    slice_sample(n = x)
}

# Draw one sample per class. SIMPLIFY = FALSE is required here: with the
# default, mapply() collapses equal-length data-frame results into a
# list-matrix, which can no longer be stacked row-wise.
combine_lists <- mapply(
  func_group,
  x = slice_nums,
  y = slice_vars,
  MoreArgs = list(dataset = df2),
  SIMPLIFY = FALSE
)

# Stack the per-class samples into a single data frame.
bind_rows(combine_lists)

2 个答案:

答案 0 :(得分:2)

library(tidyverse)

# Smaller data frame: 16 rows, 6 in "Group_1" and 10 in "Group_2".
df <- data.frame(
  ID = 1:16,
  VarA = c(rep(1, 11), 14, rep(NA_real_, 3), 16),
  VarB = c(10, 0, 0, 0, 12, 12, 12, 12, 0, 14, NA_real_, 14, rep(16, 4)),
  VarC = rep(c(10, 12, 14, 16), times = 4),
  VarD = rep(c(10, 12, 14, 16), times = 4),
  ControlVarA = factor(rep(c("Group_1", "Group_2"), times = c(6, 10)))
)
# Larger data frame: 26 rows, 6 in "Group_1" and 20 in "Group_2".
df2 <- data.frame(
  ID = 1:26,
  VarA = c(rep(1, 11), 14, rep(NA_real_, 3), rep(16, 11)),
  VarB = c(10, 0, 0, 0, 12, 12, 12, 12, 0, 14, NA_real_, 14, rep(16, 14)),
  VarC = c(rep(c(10, 12, 14, 16), times = 4), rep(16, 10)),
  VarD = c(rep(c(10, 12, 14, 16), times = 4), rep(16, 10)),
  ControlVarA = factor(rep(c("Group_1", "Group_2"), times = c(6, 20)))
)
# Convert both inputs to tibbles and tag every row with its source table, so
# the origin of each row is still known after the two are stacked.
df <- mutate(as_tibble(df), table = "df")
df2 <- mutate(as_tibble(df2), table = "df2")

# Stack the smaller table on top of the larger one.
final_df <- bind_rows(df, df2)

# Fix the RNG seed so the random draw below is reproducible.
set.seed(2021)
final_df %>%
  # Keep only rows where none of VarA..VarD is missing.
  filter(if_all(.cols = VarA:VarD, .fns = ~ !is.na(.x))) %>%
  # Draw 5 rows from every source-table / class combination.
  group_by(table, ControlVarA) %>%
  slice_sample(n = 5)
#> # A tibble: 20 x 7
#> # Groups:   table, ControlVarA [4]
#>       ID  VarA  VarB  VarC  VarD ControlVarA table
#>    <int> <dbl> <dbl> <dbl> <dbl> <fct>       <chr>
#>  1     6     1    12    12    12 Group_1     df   
#>  2     2     1     0    12    12 Group_1     df   
#>  3     3     1     0    14    14 Group_1     df   
#>  4     5     1    12    10    10 Group_1     df   
#>  5     4     1     0    16    16 Group_1     df   
#>  6    16    16    16    16    16 Group_2     df   
#>  7     9     1     0    10    10 Group_2     df   
#>  8     8     1    12    16    16 Group_2     df   
#>  9    10     1    14    12    12 Group_2     df   
#> 10     7     1    12    14    14 Group_2     df   
#> 11     1     1    10    10    10 Group_1     df2  
#> 12     4     1     0    16    16 Group_1     df2  
#> 13     3     1     0    14    14 Group_1     df2  
#> 14     2     1     0    12    12 Group_1     df2  
#> 15     6     1    12    12    12 Group_1     df2  
#> 16    22    16    16    16    16 Group_2     df2  
#> 17    23    16    16    16    16 Group_2     df2  
#> 18     9     1     0    10    10 Group_2     df2  
#> 19    18    16    16    16    16 Group_2     df2  
#> 20    20    16    16    16    16 Group_2     df2

由 reprex 包 (v2.0.0) 于 2021 年 7 月 13 日创建

答案 1 :(得分:2)

使用 count 获取 df 中 ControlVarA 每个取值的行数,通过 left_join 将其与 df2 连接,然后使用 sample_n 从每个组中随机抽取 n 行。(遗憾的是,slice_sample(n = first(n)) 会返回错误。)

library(dplyr)

# Per-class row counts from the smaller table are joined onto the larger
# table, so every df2 row carries its class's target sample size in `n`.
count(df, ControlVarA) %>%
  left_join(df2, by = "ControlVarA") %>%
  group_by(ControlVarA) %>%
  # Within each class, draw as many rows as that class has in `df`.
  sample_n(size = first(n)) %>%
  ungroup() %>%
  # The helper count column is no longer needed once sampling is done.
  select(!n)

#   ControlVarA    ID  VarA  VarB  VarC  VarD
#   <fct>       <int> <dbl> <dbl> <dbl> <dbl>
# 1 Group_1         1     1    10    10    10
# 2 Group_1         4     1     0    16    16
# 3 Group_1         3     1     0    14    14
# 4 Group_1         2     1     0    12    12
# 5 Group_1         5     1    12    10    10
# 6 Group_1         6     1    12    12    12
# 7 Group_2        12    14    14    16    16
# 8 Group_2        25    16    16    16    16
# 9 Group_2        15    NA    16    14    14
#10 Group_2        22    16    16    16    16
#11 Group_2         9     1     0    10    10
#12 Group_2         8     1    12    16    16
#13 Group_2        24    16    16    16    16
#14 Group_2        21    16    16    16    16
#15 Group_2         7     1    12    14    14
#16 Group_2        14    NA    16    12    12
相关问题