Question

我刚接触重采样技术以及 R 中的自助法（bootstrap），所以任何帮助我都将不胜感激。我有一个由不同个体组成的数据框，其中有些个体被测量了不止一次。我想从总体中的每个个体里随机抽取一个测量值，然后计算总体平均值和标准差。我想把这个过程重复多次（500 次左右），并得到两个新的数据框：一个包含每次抽样得到的总体平均值，另一个包含每个变量（Dim.1 到 Dim.4）每次抽样得到的总体标准差。这样，我就可以从中提取全局均值和标准差，用于下游分析。以下是数据框的样子（“ID”是每个个体的唯一编号，可以看到各个个体的重复测量次数不同）：

ID      Dim.1       Dim.2        Dim.3        Dim.4
41  0.4001945  1.15899378  0.269197195  0.184791153
14  2.1615710  1.15712356 -0.096055808  0.450943821
63  0.4325496  0.75521068  0.085588532 -0.233144806
53  1.2459718  0.97450610 -0.069171367 -0.613423267
63  1.3380629  0.22606572 -0.061178395 -0.304960508
42  1.6048214  0.94184036  0.232863647 -0.201738198
57  1.3306709  0.80440736 -0.955949551 -0.734022636
53  0.7019118  0.87285991 -0.042557052 -0.146748989
51  0.7235493  0.29946448  0.474477629  0.305810371
53  1.2431220  1.20252749 -0.073627812  0.237740020
41  1.1788653  0.55536570 -0.017354302  0.119014260
14  2.5769809  0.18551630  0.634304132  0.617288243
67  1.0445458  1.47107481  0.024383348  0.111808376
31  0.9759513  1.31091796 -0.008660192  0.189962355
63  1.8621687  0.97137412  0.317014897 -0.390871248
76  0.5905190  1.49817641 -0.374503265  0.142478388
90  2.4323563  0.87696545  0.467220123  0.513197279
67  2.2378032  0.35682721  0.400233674 -0.926848226
41  1.7098808  0.40470067  0.050950910 -0.153059068
97  1.5351169  1.11597681  0.011878347 -0.092047152
63  1.2647155  0.80006707  0.730022680 -0.089726522
57  1.7200676  0.01358165  0.450075592  0.038352174
76  0.6949196  1.36741272 -1.286488394  0.477345585
123  2.4235534  1.69165605  0.528863655  0.447856674
76 -2.4022432 -0.27531557 -1.850999153  2.194893741
117  1.6955740 -1.86088122  1.502655438  0.856026945
117  0.7130716  1.44198379 -1.495098987 -1.021981479
131  0.8425548  1.22970621 -0.160634720  0.005202717
117  1.0913048  1.19834030 -0.240309947  0.279379075
90  2.5787954  0.21638781  0.973339314  0.853752379
105  1.4989440  1.31525062  0.233114414  0.082557111
45  0.4749492  0.36264159  0.016554066  0.434416650
14  1.9841503 -0.18133091 -0.517021686  0.131796394

这是 dput 版本（注意：其中个体编号列名为 AnID，而不是上表中的 ID）：

structure(list(AnID = structure(c(3L, 1L, 9L, 7L, 9L, 4L, 8L, 
7L, 6L, 7L, 3L, 1L, 10L, 2L, 9L, 11L, 12L, 10L, 3L, 13L, 9L, 
8L, 11L, 16L, 11L, 15L, 15L, 17L, 15L, 12L, 14L, 5L, 1L), .Label = c("14", 
"31", "41", "42", "45", "51", "53", "57", "63", "67", "76", "90", 
"97", "105", "117", "123", "131"), class = "factor"), Dim.1 = c(0.400194544195721, 
2.16157096683054, 0.432549610256816, 1.24597182598991, 1.33806287869605, 
1.60482137307563, 1.33067093524332, 0.701911835019105, 0.723549265733465, 
1.24312199041168, 1.17886527411877, 2.57698094739979, 1.04454579781695, 
0.975951278566957, 1.86216869726173, 0.590519015534528, 2.43235630542313, 
2.23780317751189, 1.70988079418724, 1.53511692947232, 1.26471553939687, 
1.72006761902848, 0.694919562457936, 2.42355344632234, -2.40224317003857, 
1.69557401848893, 0.713071563313831, 0.84255475961074, 1.09130484807346, 
2.57879543707134, 1.49894397171646, 0.474949215360165, 1.9841503256016
), Dim.2 = c(1.15899377720071, 1.15712355628702, 0.755210676050028, 
0.974506103663373, 0.226065715930444, 0.941840360304357, 0.804407356238532, 
0.872859912826886, 0.299464475124326, 1.2025274866889, 0.55536570304097, 
0.185516296049789, 1.47107481283135, 1.31091795925695, 0.971374119614307, 
1.49817640676682, 0.876965451353274, 0.356827207847936, 0.404700668672103, 
1.11597680662439, 0.800067070614603, 0.0135816493815426, 1.36741271705742, 
1.69165605426992, -0.275315573666507, -1.86088122056554, 1.44198379044125, 
1.229706212058, 1.19834030462339, 0.216387812905091, 1.31525061699366, 
0.362641590025834, -0.181330912913297), Dim.3 = c(0.269197195180612, 
-0.0960558078596061, 0.0855885321454752, -0.0691713671666404, 
-0.0611783947257435, 0.232863646917399, -0.955949551451659, -0.0425570523689114, 
0.474477629049467, -0.0736278121798866, -0.0173543018324465, 
0.634304131880689, 0.0243833483864922, -0.00866019164798527, 
0.317014896588811, -0.374503264871839, 0.467220123029729, 0.400233673552903, 
0.0509509097106227, 0.0118783465387495, 0.730022679967163, 0.450075591988245, 
-1.28648839432794, 0.528863655457902, -1.85099915345691, 1.50265543792412, 
-1.49509898726221, -0.160634720376254, -0.24030994662375, 0.973339313851613, 
0.233114414466102, 0.0165540663395682, -0.517021685999838), Dim.4 = c(0.184791153018369, 
0.45094382124022, -0.233144806193005, -0.613423266807646, -0.304960507895512, 
-0.201738198311526, -0.734022636110577, -0.146748988783387, 0.305810371055691, 
0.237740020179384, 0.11901425952943, 0.61728824337695, 0.111808376374363, 
0.189962354663836, -0.390871248426407, 0.14247838773032, 0.513197279323348, 
-0.926848226311571, -0.153059067639092, -0.0920471522899872, 
-0.0897265219239891, 0.0383521738356584, 0.477345585143069, 0.447856673901548, 
2.19489374105159, 0.856026944966164, -1.02198147948597, 0.00520271670521917, 
0.279379074573862, 0.853752378937349, 0.0825571109781094, 0.434416649778733, 
0.131796393683415)), .Names = c("AnID", "Dim.1", "Dim.2", "Dim.3", 
"Dim.4"), class = "data.frame", row.names = c("20", "26", "36", 
"46", "49", "52", "75", "93", "94", "110", "118", "124", "132", 
"143", "157", "168", "185", "199", "210", "211", "215", "225", 
"240", "245", "248", "250", "254", "270", "272", "281", "297", 
"322", "337"))

Answer 1

可以这样做：

#' Draw one random row from a data frame.
#'
#' Intended as the per-individual sampler: when called on the rows that belong
#' to a single individual, it keeps one randomly chosen measurement.
#'
#' @param df A data.frame or data.table (e.g. `.SD` inside a grouped
#'   data.table call).
#' @return The randomly chosen row, extracted as `df[s, ]`. A zero-row `df` is
#'   returned as is.
boot_id <- function(df) {
  n_rows <- nrow(df)
  # Guard the empty case: `1:nrow(df)` would be c(1, 0) here, so sampling from
  # it would randomly yield either an all-NA row or an empty selection.
  if (n_rows == 0L) {
    return(df)
  }
  # sample.int() draws from 1..n_rows without materialising the index vector.
  s <- sample.int(n_rows, size = 1L)
  df[s, ]
}

#' Bootstrap population-level summaries from repeated measurements.
#'
#' On each of `n` iterations, one row is drawn at random for every individual
#' (grouping column `ID`) via `boot_id()`, and the chosen statistic is then
#' computed for each remaining column across the sampled rows.
#'
#' @param df A data.table with an `ID` column identifying individuals. A plain
#'   data.frame is accepted and converted. The remaining columns are summarised
#'   with `colMeans()` / `sd()`.
#' @param n Number of resampling iterations (a single non-negative number).
#' @param f Statistic to compute per column: `"mean"` (default) or `"sd"`.
#' @return A matrix with `n` rows (one per iteration) and one column per
#'   non-`ID` column of `df`, named after those columns.
boot_dat <- function(df, n = 500, f = c("mean", "sd")) {
  f <- match.arg(f)
  stopifnot(
    is.numeric(n), length(n) == 1L, !is.na(n), n >= 0,
    "ID" %in% names(df)
  )
  # The grouped `[` call below is data.table syntax; convert plain data frames
  # so that they are accepted as well.
  if (!data.table::is.data.table(df)) {
    df <- data.table::as.data.table(df)
  }

  value_cols <- setdiff(names(df), "ID")
  res <- matrix(
    NA_real_,
    nrow = n, ncol = length(value_cols),
    dimnames = list(NULL, value_cols)
  )

  # seq_len() keeps the loop empty for n = 0 (1:0 would iterate over 1 and 0).
  for (i in seq_len(n)) {
    # One randomly chosen row per individual; the grouping column comes back
    # as the first column and is dropped before summarising.
    df2 <- df[, boot_id(.SD), by = "ID"]
    df2$ID <- NULL
    res[i, ] <- if (f == "mean") {
      colMeans(df2)
    } else {
      vapply(df2, stats::sd, numeric(1))
    }
  }
  res
}

# Paste the dput() output from the question here first:
# dt <- <your structure>
library(data.table)

# Convert to a data.table, give the columns short names by position, and key
# the table on the individual identifier.
dt <- data.table(dt)
setnames(dt, c("ID", "d1", "d2", "d3", "d4"))
setkeyv(dt, "ID")

# Each result has one row per bootstrap iteration and one column per variable.
dat_means <- boot_dat(dt, f = "mean")
dat_sds <- boot_dat(dt, f = "sd")

R 中重复测量数据的随机重采样

1 个答案: