Question

由于我的数据集非常庞大，我在操作上苦苦挣扎，我提供了我想要的示例。

我有两个数据框。

df1-包含定义为列名（10,000行）的变量的每个参数的采样衍生迭代

df2-包含定义为列名（4,000行）的每个变量的实际值

我想要一个df3，它实际上是df2的每行与df1的乘积，因此将是4000 * 10000行

作为一个简短示例，我提供了df1和df2的最小示例。我已经提供了将在df3中显示的输出。

df1 <- structure(list(intercept = c(3.4, 3.6, 3.7), age = c(0.08, 0.05, 
0.06), male = c(0.07, 0.06, 0.07)), class = "data.frame", row.names = c(NA, 
-3L))

df2 <- structure(list(id = structure(1:2, .Label = c("a", "b"), class = "factor"), 
intercept = c(1L, 1L), age = c(40L, 45L), male = 1:0), class = "data.frame", row.names = c(NA, 
-2L))

df3 <- structure(list(id = structure(c(1L, 1L, 1L, 2L, 2L, 2L), .Label = c("a", 
"b"), class = "factor"), intercept = c(3.4, 3.6, 3.7, 3.4, 3.6, 
3.7), age = c(3.2, 2, 2.4, 3.6, 2.25, 2.7), male = c(0.07, 0.06, 
0.07, 0, 0, 0)), class = "data.frame", row.names = c(NA, -6L))

有人可以指出我在R中执行此操作的有效方法吗？

Answer 1

使用outer，

data.frame(id = rep(df2$id, each = nrow(df1)), 
           mapply(function(x, y)c(outer(x, y, `*`)), df1, df2[-1])
           )

给出，

  id intercept  age male
1  a       3.4 3.20 0.07
2  a       3.6 2.00 0.06
3  a       3.7 2.40 0.07
4  b       3.4 3.60 0.00
5  b       3.6 2.25 0.00
6  b       3.7 2.70 0.00

Answer 2

您可以按如下所示执行逐行Kronecker产品（来自软件包MGLM）

out <- data.frame(id = rep(df2$id,each=nrow(df1)),
                  t(MGLM::kr(t(df2[-1]),t(df1))))

这样

> out
  id intercept  age male
1  a       3.4 3.20 0.07
2  a       3.6 2.00 0.06
3  a       3.7 2.40 0.07
4  b       3.4 3.60 0.00
5  b       3.6 2.25 0.00
6  b       3.7 2.70 0.00

基准化（到目前为止，https://doi.org/10.13140/RG.2.2.15187.02084的方法是赢家）

df1 <- do.call(rbind,replicate(500,structure(list(intercept = c(3.4, 3.6, 3.7), age = c(0.08, 0.05, 
                                                            0.06), male = c(0.07, 0.06, 0.07)), class = "data.frame", row.names = c(NA, 
                                                                                                                                    -3L)),simplify = F))

df2 <- do.call(rbind,replicate(100,structure(list(id = structure(1:2, .Label = c("a", "b"), class = "factor"), 
                      intercept = c(1L, 1L), age = c(40L, 45L), male = 1:0), class = "data.frame", row.names = c(NA, 
                                                                                                                 -2L)),simplify = F))

library(MGLM)
library(purrr)

f_ThomasIsCoding <- function() {
  data.frame(id = rep(df2$id,each=nrow(df1)),
                    t(MGLM::kr(t(df2[-1]),t(df1))))
}

f_tmfmnk_1 <- function() {
  map_dfr(.x = asplit(df2[-1], 1), ~ sweep(df1, 2, FUN = `*`, .x))
}

f_tmfmnk_2 <- function() {
  data.frame(do.call(rbind, lapply(asplit(df2[-1], 1), function(x) sweep(df1, 2, FUN = `*`, x))),
             id = rep(df2$id, each = nrow(df1)))
}

f_RonakShah <- function() {
  new1 <- df1[rep(seq(nrow(df1)), nrow(df2)), ] 
  new2 <- df2[rep(seq(nrow(df2)), each = nrow(df1)),]
  out <- cbind(new2[1], new1 * new2[-1])
  rownames(out) <- NULL
  out
}

f_Sotos <- function() {
  data.frame(id = rep(df2$id, each = nrow(df1)), 
             mapply(function(x, y)c(outer(x, y, `*`)), df1, df2[-1])
  )
}

bmk <- microbenchmark(times = 20,
               unit = "relative",
               f_ThomasIsCoding(),
               f_tmfmnk_1(),
               f_tmfmnk_2(),
               f_RonakShah(),
               f_Sotos())

给出

> bmk
Unit: relative
               expr       min        lq      mean    median       uq       max neval
 f_ThomasIsCoding()  1.186124  1.218201  1.197346  1.321731 1.042721  1.077854    20
       f_tmfmnk_1()  7.594520  7.572723  4.539698  7.297610 2.437621  3.446436    20
       f_tmfmnk_2()  9.670286 12.212220  6.583183 11.888061 3.370593  4.088534    20
      f_RonakShah() 28.918724 28.861437 16.707258 27.889563 8.403161 11.668252    20
          f_Sotos()  1.000000  1.000000  1.000000  1.000000 1.000000  1.000000    20

Answer 3

涉及@RunWith(CamelSpringBootRunner.class) @SpringBootTest( classes = { [Classes to load for the test] } ) @EnableAutoConfiguration @UseAdviceWith public class MyTestClass { ... }的一个选项可能是：

purrr

id列是否也很重要：

map_dfr(.x = asplit(df2[-1], 1), ~ sweep(df1, 2, FUN = `*`, .x))

  intercept  age male
1       3.4 3.20 0.07
2       3.6 2.00 0.06
3       3.7 2.40 0.07
4       3.4 3.60 0.00
5       3.6 2.25 0.00
6       3.7 2.70 0.00

与data.frame(map_dfr(.x = asplit(df2[-1], 1), ~ sweep(df1, 2, FUN = `*`, .x)), id = rep(df2$id, each = nrow(df1))) intercept age male id 1 3.4 3.20 0.07 a 2 3.6 2.00 0.06 a 3 3.7 2.40 0.07 a 4 3.4 3.60 0.00 b 5 3.6 2.25 0.00 b 6 3.7 2.70 0.00 b相同：

base R

或者：

do.call(rbind, lapply(asplit(df2[-1], 1), function(x) sweep(df1, 2, FUN = `*`, x)))

Answer 4

您可以根据其他数据框中的行数在两个数据框中重复行并直接相乘

df1[rep(seq(nrow(df1)), nrow(df2)),] * df2[rep(seq(nrow(df2)), each = nrow(df1)),-1]

#    intercept  age male
#1         3.4 3.20 0.07
#2         3.6 2.00 0.06
#3         3.7 2.40 0.07
#1.1       3.4 3.60 0.00
#2.1       3.6 2.25 0.00
#3.1       3.7 2.70 0.00

还要获得id列

new1 <- df1[rep(seq(nrow(df1)), nrow(df2)), ] 
new2 <- df2[rep(seq(nrow(df2)), each = nrow(df1)),]
out <- cbind(new2[1], new1 * new2[-1])
rownames(out) <- NULL

out
#  id intercept  age male
#1  a       3.4 3.20 0.07
#2  a       3.6 2.00 0.06
#3  a       3.7 2.40 0.07
#4  b       3.4 3.60 0.00
#5  b       3.6 2.25 0.00
#6  b       3.7 2.70 0.00

将一个数据框的每一行乘以第二个数据框的所有行

4 个答案: