使用功能

时间:2018-12-19 00:27:19

标签: r function dplyr apply purrr

我想知道是否有更好的方式来做我正在做的事。

我有个小标题(这里是示例):

library(tidyverse)
library(Hmisc) # for the weighted values 

df2 <- structure(list(Q31_A_1 = c(9L, 3L, 2L, 2L, 2L, 3L, 5L, 3L, 1L, 
    3L, 4L, 4L, 1L, 3L, 9L, 2L, 4L, 2L, 3L, 2L, 9L, 2L, 4L, 3L, 3L, 
    3L, 9L, 2L, 3L, NA), Q31_A_2 = c(9L, 4L, 2L, 2L, 2L, 3L, 4L, 
    3L, 1L, 3L, 5L, 4L, 1L, 3L, 9L, 2L, 3L, 2L, 3L, 9L, 9L, 2L, 4L, 
    3L, 3L, 3L, 4L, 2L, 3L, NA), Q31_A_3 = c(9L, 4L, 2L, 2L, 2L, 
    3L, NA, 3L, 1L, 3L, NA, 4L, 1L, 2L, 9L, 2L, 3L, 2L, 2L, 2L, 9L, 
    2L, 4L, 3L, 3L, 2L, 3L, 2L, 2L, 2L), Q31_A_4 = c(9L, 3L, 2L, 
    2L, NA, 3L, 4L, 3L, 3L, 3L, 5L, 4L, 3L, 3L, 4L, NA, 4L, 2L, 3L, 
    9L, 9L, 2L, 4L, 3L, 4L, 4L, 9L, 2L, 3L, 2L), Q31_B_1 = c(9L, 
    2L, 2L, 2L, 1L, 2L, 9L, 3L, NA, 3L, 4L, 4L, 2L, 9L, 9L, NA, 9L, 
    2L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 4L), Q31_B_2 = c(9L, 
    9L, 9L, 2L, 1L, 2L, 9L, 3L, 1L, 3L, 4L, 9L, 2L, 9L, 9L, 2L, 9L, 
    2L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 4L), Q31_B_3 = c(9L, 
    9L, 9L, 2L, 1L, 2L, 9L, 3L, NA, 3L, 4L, 9L, 1L, 9L, 9L, NA, 9L, 
    9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 3L), ages = c("50-64 years", 
    "35-49 years", "35-49 years", "50-64 years", "65+ years", "65+ years", 
    "65+ years", "65+ years", "65+ years", "65+ years", "65+ years", 
    "35-49 years", "65+ years", "50-64 years", "65+ years", "65+ years", 
    "50-64 years", "35-49 years", "65+ years", "65+ years", "65+ years", 
    "65+ years", "65+ years", "50-64 years", "50-64 years", "50-64 years", 
    "50-64 years", "65+ years", "50-64 years", "35-49 years"), wt = c(0.64708755364565, 
    0.921064359620811, 1.3907697993331, 0.974726729781105, 0.576703486333466, 
    0.489053964840285, 0.489053964840285, 0.576703486333466, 0.576703486333466, 
    0.489053964840285, 0.489053964840285, 0.921064359620811, 0.489053964840285, 
    0.974726729781105, 0.489053964840285, 0.489053964840285, 0.64708755364565, 
    0.921064359620811, 0.489053964840285, 0.489053964840285, 0.576703486333466, 
    0.489053964840285, 0.576703486333466, 0.974726729781105, 0.64708755364565, 
    0.974726729781105, 0.974726729781105, 0.489053964840285, 0.974726729781105, 
    0.921064359620811)), row.names = c(NA, -30L), class = c("tbl_df", 
    "tbl", "data.frame"))

这是什么:

# A tibble: 30 x 9
   Q31_A_1 Q31_A_2 Q31_A_3 Q31_A_4 Q31_B_1 Q31_B_2 Q31_B_3 ages           wt
     <int>   <int>   <int>   <int>   <int>   <int>   <int> <chr>       <dbl>
 1       9       9       9       9       9       9       9 50-64 years 0.647
 2       3       4       4       3       2       9       9 35-49 years 0.921
 3       2       2       2       2       2       9       9 35-49 years 1.39 
 4       2       2       2       2       2       2       2 50-64 years 0.975
 5       2       2       2      NA       1       1       1 65+ years   0.577
 6       3       3       3       3       2       2       2 65+ years   0.489
 7       5       4      NA       4       9       9       9 65+ years   0.489
 8       3       3       3       3       3       3       3 65+ years   0.577
 9       1       1       1       3      NA       1      NA 65+ years   0.577
10       3       3       3       3       3       3       3 65+ years   0.489
# ... with 20 more rows

我想对Q31_A_1到Q31_B_3列应用一个函数(在整个数据集中,有更多的列和行)。这是来自调查的数据。我想将值连接到索引值:

index5 <- tibble(
  int = 1:5,
  factor = c(100, 75, 50, 25, 0))

这是在函数中完成的:

group_scores2 <- function(field) {
  field <- enquo(field)
  df <- df2 %>%  select(!!field, ages, wt) %>% 
    filter(UQ(field) <=5) %>% 
    mutate(int = as.integer(!!field))
  df
  df <- left_join(df,index5, by = "int",
                  copy=FALSE)
  df

  ov <- df %>% summarise(mean = wtd.mean(factor, wt), var = wtd.var(factor, wt), sd = sqrt(var), count = sum(wt)) %>% 
    mutate(cat = "Overall") %>% 
    mutate(group = "Overall (2018)")

  ag <- df %>%
    group_by(ages) %>%
    summarise(mean = wtd.mean(factor, wt), var = wtd.var(factor, wt), sd = sqrt(var), count = sum(wt)) %>% 
    mutate(cat = "Age Group") %>% 
    rename(group = ages)

  combined <- bind_rows(ov, ag)
}

例如,当我运行此命令时: group_scores2(Q31_A_1)

这是输出。

# A tibble: 4 x 6
   mean   var    sd count cat       group         
  <dbl> <dbl> <dbl> <dbl> <chr>     <chr>         
1  56.2  514.  22.7 17.5  Overall   Overall (2018)
2  58.4  548.  23.4  4.15 Age Group 35-49 years   
3  51.3  194.  13.9  6.17 Age Group 50-64 years   
4  59.1  894.  29.9  7.20 Age Group 65+ years  

我尝试使用purrrapply系列函数,但似乎无法正确使用。例如:

df3 <- df2 %>% gather(ind, value, Q31_A_1:Q31_B_3)
df3 %>% map(group_scores2)

哪个返回错误。

我不知道从apply开始。

我想知道是否有更有效的方法。

2 个答案:

答案 0 :(得分:1)

这是一种解决方法。首先,我将您的函数重新编写为group_scores3,该函数可以实现相同的功能,但会将某些部分替换为基R Syntex。我还在最终输出中添加了一个列,以显示哪一列是来自df2的输入列。

group_scores3 <- function(field) {

  # The following four lines do the same things as the first chunk in your function
  df <- df2[, c(field, "ages", "wt")]
  df <- df[df[[field]] <= 5 & !is.na(df[[field]]), ]
  df$int = as.integer(df[[field]])
  df <- left_join(df, index5, by = "int", copy=FALSE)

  ov <- df %>% summarise(mean = wtd.mean(factor, wt), var = wtd.var(factor, wt), sd = sqrt(var), count = sum(wt)) %>% 
    mutate(cat = "Overall") %>% 
    mutate(group = "Overall (2018)")

  ag <- df %>%
    group_by(ages) %>%
    summarise(mean = wtd.mean(factor, wt), var = wtd.var(factor, wt), sd = sqrt(var), count = sum(wt)) %>% 
    mutate(cat = "Age Group") %>% 
    rename(group = ages)

  combined <- bind_rows(ov, ag)

  # Add a column to show which question
  combined$Q <- field

  return(combined)
}

然后我创建了一个显示所有目标名称的向量。

# Create a vector with target column names
cols <- str_subset(names(df2), "^Q")

最后,我们可以使用map_dfr遍历各列。输出是一个数据帧,其中所有单独的输出组合在一起。请注意,使用map_dfr时警告消息不是来自group_scores3,而是来自某些单独的列。

# Perform the analysis
map_dfr(cols, ~group_scores3(.))
# A tibble: 28 x 7
    mean   var    sd count cat       group          Q      
   <dbl> <dbl> <dbl> <dbl> <chr>     <chr>          <chr>  
 1  56.2  514.  22.7 17.5  Overall   Overall (2018) Q31_A_1
 2  58.4  548.  23.4  4.15 Age Group 35-49 years    Q31_A_1
 3  51.3  194.  13.9  6.17 Age Group 50-64 years    Q31_A_1
 4  59.1  894.  29.9  7.20 Age Group 65+ years      Q31_A_1
 5  53.6  553.  23.5 18.0  Overall   Overall (2018) Q31_A_2
 6  52.8  813.  28.5  4.15 Age Group 35-49 years    Q31_A_2
 7  50    198.  14.1  7.14 Age Group 50-64 years    Q31_A_2
 8  57.9  947.  30.8  6.71 Age Group 65+ years      Q31_A_2
 9  63.4  414.  20.4 18.4  Overall   Overall (2018) Q31_A_3
10  56.9  720.  26.8  5.08 Age Group 35-49 years    Q31_A_3
# ... with 18 more rows
Warning messages:
1: In wtd.var(factor, wt) :
  only one effective observation; variance estimate undefined
2: In wtd.var(factor, wt) :
  only one effective observation; variance estimate undefined
3: In wtd.var(factor, wt) :
  only one effective observation; variance estimate undefined
4: In wtd.var(factor, wt) :
  only one effective observation; variance estimate undefined

答案 1 :(得分:1)

当您传递数据框以使用library(tidyverse) library(purrr) library(microbenchmark) set.seed(42) n <- 1e4 p <- 100 x <- runif(n*p); x[x < 0.8] <- -1 col_no <- paste0("R", rep(seq(1, p), n)) id <- rep(1:n, each = p) df <- data.frame(id, x, col_no) df <- df %>% spread(col_no, x) foo <- function(df, mycols) { bind_cols(df, somefeature = df %>% select(mycols) %>% rowwise() %>% do( (.) %>% as.data.frame %>% mutate(temp = all(. == -1))) %>% pull(temp)) } bar <- function(df, mycols) { df$somefeature = rowSums(df[mycols] != -1) == 0 df } baz <- function(df, mycols) { df %>% mutate(somefeature = map(.[mycols], `==`, -1) %>% reduce(`+`) %>% {. == length(mycols) }) } mycols <- paste0("R", c(1:50)) res1 <- foo(df, mycols) # Takes roughly a minute on my machine res2 <- bar(df, mycols) res3 <- baz(df, mycols) # Verify all methods give the same solution stopifnot(ncol(res1) == ncol(res2)) stopifnot(ncol(res1) == ncol(res3)) stopifnot(all(res1$somefeature == res2$somefeature)) stopifnot(all(res1$somefeature == res3$somefeature)) # Time the methods (not foo, as it is much slower than the other two) microbenchmark(bar(df, mycols), baz(df, mycols)) Unit: milliseconds expr min lq mean median uq max neval bar(df, mycols) 3.926076 5.534273 6.782348 6.468424 7.019863 30.70699 100 baz(df, mycols) 8.289160 9.598482 11.726803 10.208659 10.909052 72.72334 100 进行映射时,df3 %>% map(group_scores2)函数会尝试用map的每一列调用group_scores2-我想那不是您想要的与df3 ed数据帧有关?

使用purr :: map 修改函数,例如,因为它应该使用gather。由于函数以某种未记录的方式依赖全局变量(data.frame),因此我不会走这条路,而且命名变量index5似乎可能会引起麻烦。 。但是它可以工作; factor期望函数arg根据您的意图返回可以map_dfr在一起的数据帧。

rbind

只是dplyr动词

也许更直接的是老式的拆分应用组合。我不确定是否有按年龄/问题和公正问题进行总结的“简单”方法;如果您要进行许多这样的总结,那么也许group_scores3 <- function(ds) { df = ds %>% filter(value <=5) %>% rename(int = value) %>% left_join(index5, by = "int",copy=FALSE) ov <- df %>% summarise(mean = wtd.mean(factor, wt), var = wtd.var(factor, wt), sd = sqrt(var), count = sum(wt)) %>% mutate(cat = "Overall") %>% mutate(group = "Overall (2018)") ag <- df %>% group_by(ages) %>% summarise(mean = wtd.mean(factor, wt), var = wtd.var(factor, wt), sd = sqrt(var), count = sum(wt)) %>% mutate(cat = "Age Group") %>% rename(group = ages) bind_rows(ov, ag) } # df3 as before df3 <- df2 %>% gather(ind, value, Q31_A_1:Q31_B_3) # summarize each question and concatenate the results: df3 %>% split(.$ind) %>% map_dfr(.f = group_scores3,.id = "ind") 部分将成为分解为函数并通过summarise进行调用的部分。

purrr::map_***

这两个给我相同的答案。总结不同问题子集所需的唯一更改是在# like df3, but take care of filter/merge once instead of repeating every time df4 = df2 %>% gather(ind,value,Q31_A_1:Q31_B_3) %>% filter(value <= 5) %>% rename(int = value) %>% inner_join(index5,by="int") # scores per age group: output1 = df4 %>% group_by(ind,ages) %>% summarise(mean = wtd.mean(factor, wt), var = wtd.var(factor, wt), sd = sqrt(var), count = sum(wt)) %>% mutate(category = "Age Group") # overall scores: output2 = df4 %>% group_by(ind) %>% summarise(mean = wtd.mean(factor, wt), var = wtd.var(factor, wt), sd = sqrt(var), count = sum(wt)) %>% mutate(category = "Overall") bind_rows(output1,output2) %>% mutate(ages = ifelse(is.na(ages),"Overall (2018)",ages)) %>% arrange(ind,desc(category)) 调用中。