我想知道是否有更好的方式来做我正在做的事。
我有个小标题(这里是示例):
library(tidyverse)
library(Hmisc) # for the weighted values
df2 <- structure(list(Q31_A_1 = c(9L, 3L, 2L, 2L, 2L, 3L, 5L, 3L, 1L,
3L, 4L, 4L, 1L, 3L, 9L, 2L, 4L, 2L, 3L, 2L, 9L, 2L, 4L, 3L, 3L,
3L, 9L, 2L, 3L, NA), Q31_A_2 = c(9L, 4L, 2L, 2L, 2L, 3L, 4L,
3L, 1L, 3L, 5L, 4L, 1L, 3L, 9L, 2L, 3L, 2L, 3L, 9L, 9L, 2L, 4L,
3L, 3L, 3L, 4L, 2L, 3L, NA), Q31_A_3 = c(9L, 4L, 2L, 2L, 2L,
3L, NA, 3L, 1L, 3L, NA, 4L, 1L, 2L, 9L, 2L, 3L, 2L, 2L, 2L, 9L,
2L, 4L, 3L, 3L, 2L, 3L, 2L, 2L, 2L), Q31_A_4 = c(9L, 3L, 2L,
2L, NA, 3L, 4L, 3L, 3L, 3L, 5L, 4L, 3L, 3L, 4L, NA, 4L, 2L, 3L,
9L, 9L, 2L, 4L, 3L, 4L, 4L, 9L, 2L, 3L, 2L), Q31_B_1 = c(9L,
2L, 2L, 2L, 1L, 2L, 9L, 3L, NA, 3L, 4L, 4L, 2L, 9L, 9L, NA, 9L,
2L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 4L), Q31_B_2 = c(9L,
9L, 9L, 2L, 1L, 2L, 9L, 3L, 1L, 3L, 4L, 9L, 2L, 9L, 9L, 2L, 9L,
2L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 4L), Q31_B_3 = c(9L,
9L, 9L, 2L, 1L, 2L, 9L, 3L, NA, 3L, 4L, 9L, 1L, 9L, 9L, NA, 9L,
9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 3L), ages = c("50-64 years",
"35-49 years", "35-49 years", "50-64 years", "65+ years", "65+ years",
"65+ years", "65+ years", "65+ years", "65+ years", "65+ years",
"35-49 years", "65+ years", "50-64 years", "65+ years", "65+ years",
"50-64 years", "35-49 years", "65+ years", "65+ years", "65+ years",
"65+ years", "65+ years", "50-64 years", "50-64 years", "50-64 years",
"50-64 years", "65+ years", "50-64 years", "35-49 years"), wt = c(0.64708755364565,
0.921064359620811, 1.3907697993331, 0.974726729781105, 0.576703486333466,
0.489053964840285, 0.489053964840285, 0.576703486333466, 0.576703486333466,
0.489053964840285, 0.489053964840285, 0.921064359620811, 0.489053964840285,
0.974726729781105, 0.489053964840285, 0.489053964840285, 0.64708755364565,
0.921064359620811, 0.489053964840285, 0.489053964840285, 0.576703486333466,
0.489053964840285, 0.576703486333466, 0.974726729781105, 0.64708755364565,
0.974726729781105, 0.974726729781105, 0.489053964840285, 0.974726729781105,
0.921064359620811)), row.names = c(NA, -30L), class = c("tbl_df",
"tbl", "data.frame"))
这是什么:
# A tibble: 30 x 9
Q31_A_1 Q31_A_2 Q31_A_3 Q31_A_4 Q31_B_1 Q31_B_2 Q31_B_3 ages wt
<int> <int> <int> <int> <int> <int> <int> <chr> <dbl>
1 9 9 9 9 9 9 9 50-64 years 0.647
2 3 4 4 3 2 9 9 35-49 years 0.921
3 2 2 2 2 2 9 9 35-49 years 1.39
4 2 2 2 2 2 2 2 50-64 years 0.975
5 2 2 2 NA 1 1 1 65+ years 0.577
6 3 3 3 3 2 2 2 65+ years 0.489
7 5 4 NA 4 9 9 9 65+ years 0.489
8 3 3 3 3 3 3 3 65+ years 0.577
9 1 1 1 3 NA 1 NA 65+ years 0.577
10 3 3 3 3 3 3 3 65+ years 0.489
# ... with 20 more rows
我想对Q31_A_1到Q31_B_3列应用一个函数(在整个数据集中,有更多的列和行)。这是来自调查的数据。我想将值连接到索引值:
index5 <- tibble(
int = 1:5,
factor = c(100, 75, 50, 25, 0))
这是在函数中完成的:
group_scores2 <- function(field) {
field <- enquo(field)
df <- df2 %>% select(!!field, ages, wt) %>%
filter(UQ(field) <=5) %>%
mutate(int = as.integer(!!field))
df
df <- left_join(df,index5, by = "int",
copy=FALSE)
df
ov <- df %>% summarise(mean = wtd.mean(factor, wt), var = wtd.var(factor, wt), sd = sqrt(var), count = sum(wt)) %>%
mutate(cat = "Overall") %>%
mutate(group = "Overall (2018)")
ag <- df %>%
group_by(ages) %>%
summarise(mean = wtd.mean(factor, wt), var = wtd.var(factor, wt), sd = sqrt(var), count = sum(wt)) %>%
mutate(cat = "Age Group") %>%
rename(group = ages)
combined <- bind_rows(ov, ag)
}
例如,当我运行此命令时:
group_scores2(Q31_A_1)
这是输出。
# A tibble: 4 x 6
mean var sd count cat group
<dbl> <dbl> <dbl> <dbl> <chr> <chr>
1 56.2 514. 22.7 17.5 Overall Overall (2018)
2 58.4 548. 23.4 4.15 Age Group 35-49 years
3 51.3 194. 13.9 6.17 Age Group 50-64 years
4 59.1 894. 29.9 7.20 Age Group 65+ years
我尝试使用purrr
和apply
系列函数,但似乎无法正确使用。例如:
df3 <- df2 %>% gather(ind, value, Q31_A_1:Q31_B_3)
df3 %>% map(group_scores2)
哪个返回错误。
我不知道从apply
开始。
我想知道是否有更有效的方法。
答案 0 :(得分:1)
这是一种解决方法。首先,我将您的函数重新编写为group_scores3
,该函数可以实现相同的功能,但会将某些部分替换为基R Syntex。我还在最终输出中添加了一个列,以显示哪一列是来自df2
的输入列。
group_scores3 <- function(field) {
# The following four lines do the same things as the first chunk in your function
df <- df2[, c(field, "ages", "wt")]
df <- df[df[[field]] <= 5 & !is.na(df[[field]]), ]
df$int = as.integer(df[[field]])
df <- left_join(df, index5, by = "int", copy=FALSE)
ov <- df %>% summarise(mean = wtd.mean(factor, wt), var = wtd.var(factor, wt), sd = sqrt(var), count = sum(wt)) %>%
mutate(cat = "Overall") %>%
mutate(group = "Overall (2018)")
ag <- df %>%
group_by(ages) %>%
summarise(mean = wtd.mean(factor, wt), var = wtd.var(factor, wt), sd = sqrt(var), count = sum(wt)) %>%
mutate(cat = "Age Group") %>%
rename(group = ages)
combined <- bind_rows(ov, ag)
# Add a column to show which question
combined$Q <- field
return(combined)
}
然后我创建了一个显示所有目标名称的向量。
# Create a vector with target column names
cols <- str_subset(names(df2), "^Q")
最后,我们可以使用map_dfr
遍历各列。输出是一个数据帧,其中所有单独的输出组合在一起。请注意,使用map_dfr
时警告消息不是来自group_scores3
,而是来自某些单独的列。
# Perform the analysis
map_dfr(cols, ~group_scores3(.))
# A tibble: 28 x 7 mean var sd count cat group Q <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr> 1 56.2 514. 22.7 17.5 Overall Overall (2018) Q31_A_1 2 58.4 548. 23.4 4.15 Age Group 35-49 years Q31_A_1 3 51.3 194. 13.9 6.17 Age Group 50-64 years Q31_A_1 4 59.1 894. 29.9 7.20 Age Group 65+ years Q31_A_1 5 53.6 553. 23.5 18.0 Overall Overall (2018) Q31_A_2 6 52.8 813. 28.5 4.15 Age Group 35-49 years Q31_A_2 7 50 198. 14.1 7.14 Age Group 50-64 years Q31_A_2 8 57.9 947. 30.8 6.71 Age Group 65+ years Q31_A_2 9 63.4 414. 20.4 18.4 Overall Overall (2018) Q31_A_3 10 56.9 720. 26.8 5.08 Age Group 35-49 years Q31_A_3 # ... with 18 more rows Warning messages: 1: In wtd.var(factor, wt) : only one effective observation; variance estimate undefined 2: In wtd.var(factor, wt) : only one effective observation; variance estimate undefined 3: In wtd.var(factor, wt) : only one effective observation; variance estimate undefined 4: In wtd.var(factor, wt) : only one effective observation; variance estimate undefined
答案 1 :(得分:1)
当您传递数据框以使用library(tidyverse)
library(purrr)
library(microbenchmark)
set.seed(42)
n <- 1e4
p <- 100
x <- runif(n*p); x[x < 0.8] <- -1
col_no <- paste0("R", rep(seq(1, p), n))
id <- rep(1:n, each = p)
df <- data.frame(id, x, col_no)
df <- df %>% spread(col_no, x)
foo <- function(df, mycols) {
bind_cols(df, somefeature = df %>%
select(mycols) %>%
rowwise() %>%
do( (.) %>% as.data.frame %>%
mutate(temp = all(. == -1))) %>%
pull(temp))
}
bar <- function(df, mycols) {
df$somefeature = rowSums(df[mycols] != -1) == 0
df
}
baz <- function(df, mycols) {
df %>%
mutate(somefeature = map(.[mycols], `==`, -1) %>%
reduce(`+`) %>%
{. == length(mycols) })
}
mycols <- paste0("R", c(1:50))
res1 <- foo(df, mycols) # Takes roughly a minute on my machine
res2 <- bar(df, mycols)
res3 <- baz(df, mycols)
# Verify all methods give the same solution
stopifnot(ncol(res1) == ncol(res2))
stopifnot(ncol(res1) == ncol(res3))
stopifnot(all(res1$somefeature == res2$somefeature))
stopifnot(all(res1$somefeature == res3$somefeature))
# Time the methods (not foo, as it is much slower than the other two)
microbenchmark(bar(df, mycols), baz(df, mycols))
Unit: milliseconds
expr min lq mean median uq max neval
bar(df, mycols) 3.926076 5.534273 6.782348 6.468424 7.019863 30.70699 100
baz(df, mycols) 8.289160 9.598482 11.726803 10.208659 10.909052 72.72334 100
进行映射时,df3 %>% map(group_scores2)
函数会尝试用map
的每一列调用group_scores2
-我想那不是您想要的与df3
ed数据帧有关?
使用purr :: map
修改函数,例如,因为它应该使用gather
。由于函数以某种未记录的方式依赖全局变量(data.frame
),因此我不会走这条路,而且命名变量index5
似乎可能会引起麻烦。 。但是它可以工作; factor
期望函数arg根据您的意图返回可以map_dfr
在一起的数据帧。
rbind
只是dplyr动词
也许更直接的是老式的拆分应用组合。我不确定是否有按年龄/问题和公正问题进行总结的“简单”方法;如果您要进行许多这样的总结,那么也许group_scores3 <- function(ds) {
df = ds %>% filter(value <=5) %>%
rename(int = value) %>%
left_join(index5, by = "int",copy=FALSE)
ov <- df %>% summarise(mean = wtd.mean(factor, wt), var = wtd.var(factor, wt),
sd = sqrt(var), count = sum(wt)) %>%
mutate(cat = "Overall") %>%
mutate(group = "Overall (2018)")
ag <- df %>%
group_by(ages) %>%
summarise(mean = wtd.mean(factor, wt), var = wtd.var(factor, wt),
sd = sqrt(var), count = sum(wt)) %>%
mutate(cat = "Age Group") %>%
rename(group = ages)
bind_rows(ov, ag)
}
# df3 as before
df3 <- df2 %>% gather(ind, value, Q31_A_1:Q31_B_3)
# summarize each question and concatenate the results:
df3 %>% split(.$ind) %>%
map_dfr(.f = group_scores3,.id = "ind")
部分将成为分解为函数并通过summarise
进行调用的部分。
purrr::map_***
这两个给我相同的答案。总结不同问题子集所需的唯一更改是在# like df3, but take care of filter/merge once instead of repeating every time
df4 = df2 %>% gather(ind,value,Q31_A_1:Q31_B_3) %>%
filter(value <= 5) %>%
rename(int = value) %>%
inner_join(index5,by="int")
# scores per age group:
output1 = df4 %>%
group_by(ind,ages) %>%
summarise(mean = wtd.mean(factor, wt),
var = wtd.var(factor, wt),
sd = sqrt(var),
count = sum(wt)) %>%
mutate(category = "Age Group")
# overall scores:
output2 = df4 %>%
group_by(ind) %>%
summarise(mean = wtd.mean(factor, wt),
var = wtd.var(factor, wt),
sd = sqrt(var),
count = sum(wt)) %>%
mutate(category = "Overall")
bind_rows(output1,output2) %>%
mutate(ages = ifelse(is.na(ages),"Overall (2018)",ages)) %>%
arrange(ind,desc(category))
调用中。