我正在尝试从两个大小不同的数据框中,为每个类抽取相同数量的行。我可以手动完成,但我的某些数据框中类的数量非常多。
我已经能够使用 `dplyr::count` 函数从较小的数据框中获取感兴趣的类及其计数,然后把这些类名和计数分别提取为向量。接着我编写了一个使用这些向量的函数,并用 `mapply` 调用它,以便为每个类生成过滤后的切片,再用 `do.call` 把结果列表重新合并;但是在运行 `mapply` 时我收到了错误。
以下是示例数据集。`df` 是较小的数据框,其中 6 行满足 `ControlVarA == "Group_1"`,10 行满足 `ControlVarA == "Group_2"`。我想从较大的数据框 `df2`(其中 6 行满足 `ControlVarA == "Group_1"`,20 行满足 `ControlVarA == "Group_2"`)中,为每个类提取与 `df` 中相同数量的行。
# Smaller example data frame: 16 rows, 6 in "Group_1" and 10 in "Group_2".
# VarA and VarB contain a few missing values.
df <- data.frame(
  ID = 1:16,
  VarA = c(rep(1, 11), 14, rep(NA_real_, 3), 16),
  VarB = c(10, 0, 0, 0, rep(12, 4), 0, 14, NA_real_, 14, rep(16, 4)),
  VarC = rep(c(10, 12, 14, 16), times = 4),
  VarD = rep(c(10, 12, 14, 16), times = 4),
  ControlVarA = factor(rep(c("Group_1", "Group_2"), times = c(6, 10)))
)
df
# Larger example data frame: 26 rows, 6 in "Group_1" and 20 in "Group_2".
# The first 16 rows repeat the values of `df`; the remaining 10 rows are all 16.
df2 <- data.frame(
  ID = 1:26,
  VarA = c(rep(1, 11), 14, rep(NA_real_, 3), rep(16, 11)),
  VarB = c(10, 0, 0, 0, rep(12, 4), 0, 14, NA_real_, 14, rep(16, 14)),
  VarC = c(rep(c(10, 12, 14, 16), times = 4), rep(16, 10)),
  VarD = c(rep(c(10, 12, 14, 16), times = 4), rep(16, 10)),
  ControlVarA = factor(rep(c("Group_1", "Group_2"), times = c(6, 20)))
)
df2
为了提取类名和类计数,我使用下面的代码。
# Class labels present in `df`, as a character vector.
# The NA check targets the class column itself; `is.na(.)` on the whole data
# frame yields a logical matrix, which is not a valid row filter.
slice_vars <- df %>%
  count(ControlVarA) %>%
  filter(!is.na(ControlVarA)) %>%
  pull(ControlVarA) %>%
  as.character()

# Number of rows per class in `df`, in the same order as `slice_vars`.
# Pulling the `n` column directly avoids the transpose-to-character-matrix
# round trip (and the janitor dependency) that the counts went through before.
slice_nums <- df %>%
  count(ControlVarA) %>%
  filter(!is.na(ControlVarA)) %>%
  pull(n) %>%
  as.numeric()
我创建的函数和 `mapply` 调用如下:
# Draw a random sample of rows belonging to a single class.
#
# dataset: data frame with a `ControlVarA` column.
# x:       number of rows to sample (a single number).
# y:       class label to sample from (a single string).
#
# Returns the sampled rows of `dataset` whose ControlVarA equals `y`.
func_group <- function(dataset, x, y) {
  dataset %>%
    # Compare against the label itself; `data[[y]]` subsets the base function
    # `data` and fails with "object of type 'closure' is not subsettable".
    filter(ControlVarA == y) %>%
    # `n` takes a plain number; `all_of()` is a column-selection helper and
    # does not belong here.
    slice_sample(n = x)
}

# SIMPLIFY = FALSE keeps one data frame per class in a list; with the default
# simplification mapply would collapse the results into a matrix and the
# rbind below would no longer stack rows.
combine_lists <- mapply(
  func_group,
  x = slice_nums,
  y = slice_vars,
  MoreArgs = list(dataset = df2),
  SIMPLIFY = FALSE
)
do.call(rbind, combine_lists)
答案 0 :(得分:2)
library(tidyverse)
# Smaller example data frame: 16 rows, 6 in "Group_1" and 10 in "Group_2".
df <- data.frame(
  ID = 1:16,
  VarA = c(rep(1, 11), 14, rep(NA_real_, 3), 16),
  VarB = c(10, 0, 0, 0, rep(12, 4), 0, 14, NA_real_, 14, rep(16, 4)),
  VarC = rep(c(10, 12, 14, 16), times = 4),
  VarD = rep(c(10, 12, 14, 16), times = 4),
  ControlVarA = factor(rep(c("Group_1", "Group_2"), times = c(6, 10)))
)
# Larger example data frame: 26 rows, 6 in "Group_1" and 20 in "Group_2".
df2 <- data.frame(
  ID = 1:26,
  VarA = c(rep(1, 11), 14, rep(NA_real_, 3), rep(16, 11)),
  VarB = c(10, 0, 0, 0, rep(12, 4), 0, 14, NA_real_, 14, rep(16, 14)),
  VarC = c(rep(c(10, 12, 14, 16), times = 4), rep(16, 10)),
  VarD = c(rep(c(10, 12, 14, 16), times = 4), rep(16, 10)),
  ControlVarA = factor(rep(c("Group_1", "Group_2"), times = c(6, 20)))
)
# Tag each table with its origin so both can live in one stacked tibble.
df <- mutate(as_tibble(df), table = "df")
df2 <- mutate(as_tibble(df2), table = "df2")
final_df <- bind_rows(df, df2)

# Fixed seed so the sampled rows are reproducible.
set.seed(2021)

# Keep only rows with no missing value in VarA..VarD, then draw 5 rows from
# every (table, class) combination. The result is left grouped.
final_df %>%
  filter(if_all(.cols = VarA:VarD, ~ !is.na(.x))) %>%
  group_by(table, ControlVarA) %>%
  slice_sample(n = 5)
#> # A tibble: 20 x 7
#> # Groups: table, ControlVarA [4]
#> ID VarA VarB VarC VarD ControlVarA table
#> <int> <dbl> <dbl> <dbl> <dbl> <fct> <chr>
#> 1 6 1 12 12 12 Group_1 df
#> 2 2 1 0 12 12 Group_1 df
#> 3 3 1 0 14 14 Group_1 df
#> 4 5 1 12 10 10 Group_1 df
#> 5 4 1 0 16 16 Group_1 df
#> 6 16 16 16 16 16 Group_2 df
#> 7 9 1 0 10 10 Group_2 df
#> 8 8 1 12 16 16 Group_2 df
#> 9 10 1 14 12 12 Group_2 df
#> 10 7 1 12 14 14 Group_2 df
#> 11 1 1 10 10 10 Group_1 df2
#> 12 4 1 0 16 16 Group_1 df2
#> 13 3 1 0 14 14 Group_1 df2
#> 14 2 1 0 12 12 Group_1 df2
#> 15 6 1 12 12 12 Group_1 df2
#> 16 22 16 16 16 16 Group_2 df2
#> 17 23 16 16 16 16 Group_2 df2
#> 18 9 1 0 10 10 Group_2 df2
#> 19 18 16 16 16 16 Group_2 df2
#> 20 20 16 16 16 16 Group_2 df2
由 reprex package (v2.0.0) 于 2021 年 7 月 13 日创建
答案 1 :(得分:2)
用 `count` 统计 `df` 中 `ControlVarA` 每个取值的行数,将结果与 `df2` 连接,然后使用 `sample_n` 从每个组中随机选取 `n` 行。(遗憾的是,`slice_sample(n = first(n))` 会返回错误。)
library(dplyr)
# Count rows per class in `df`, attach every matching row of `df2`, then draw
# as many rows per class as `df` had. The helper count column `n` is dropped
# from the final result.
df %>%
  count(ControlVarA) %>%
  left_join(df2, by = "ControlVarA") %>%
  group_by(ControlVarA) %>%
  # `n` is constant within a group after the join, so its first value is the
  # per-class sample size.
  sample_n(size = first(n)) %>%
  ungroup() %>%
  select(!n)
# ControlVarA ID VarA VarB VarC VarD
# <fct> <int> <dbl> <dbl> <dbl> <dbl>
# 1 Group_1 1 1 10 10 10
# 2 Group_1 4 1 0 16 16
# 3 Group_1 3 1 0 14 14
# 4 Group_1 2 1 0 12 12
# 5 Group_1 5 1 12 10 10
# 6 Group_1 6 1 12 12 12
# 7 Group_2 12 14 14 16 16
# 8 Group_2 25 16 16 16 16
# 9 Group_2 15 NA 16 14 14
#10 Group_2 22 16 16 16 16
#11 Group_2 9 1 0 10 10
#12 Group_2 8 1 12 16 16
#13 Group_2 24 16 16 16 16
#14 Group_2 21 16 16 16 16
#15 Group_2 7 1 12 14 14
#16 Group_2 14 NA 16 12 12