R:用更快的函数重复对数据取子集

时间:2017-06-08 12:02:13

标签: r data.table subset

我有两个 data.table:entext.new 和 transit.service1(下面以数据框的 dput 输出形式给出)。

> dput(data.frame(entext.new))
structure(list(person = c(1701L, 1701L), vehicle = c("tr_71_3", 
"tr_5_7"), atstop = c(108557L, 108536L), time1 = c(31931, 32560
), acttype1 = structure(c(1L, 1L), .Label = c("PersonEntersVehicle", 
"PersonLeavesVehicle", "waitingForPt"), class = "factor"), person2 = c(1701L, 
1701L), vehicle2 = c("tr_71_3", "tr_5_7"), deststop = c(108558L, 
100905L), time2 = c(31998, 32620), acttype2 = structure(c(2L, 
2L), .Label = c("PersonEntersVehicle", "PersonLeavesVehicle", 
"waitingForPt"), class = "factor")), .Names = c("person", "vehicle", 
"atstop", "time1", "acttype1", "person2", "vehicle2", "deststop", 
"time2", "acttype2"), row.names = c(NA, -2L), class = "data.frame")

> dput(data.frame(entext.new))
structure(list(person = c(1701L, 1701L), vehicle = c("tr_71_3", 
"tr_5_7"), atstop = c(108557L, 108536L), time1 = c(31931, 32560
), acttype1 = structure(c(1L, 1L), .Label = c("PersonEntersVehicle", 
"PersonLeavesVehicle", "waitingForPt"), class = "factor"), person2 = c(1701L, 
1701L), vehicle2 = c("tr_71_3", "tr_5_7"), deststop = c(108558L, 
100905L), time2 = c(31998, 32620), acttype2 = structure(c(2L, 
2L), .Label = c("PersonEntersVehicle", "PersonLeavesVehicle", 
"waitingForPt"), class = "factor")), .Names = c("person", "vehicle", 
"atstop", "time1", "acttype1", "person2", "vehicle2", "deststop", 
"time2", "acttype2"), row.names = c(NA, -2L), class = "data.frame")
> dput(data.frame(transit.service1))
structure(list(id = c(725531L, 725532L, 726871L, 728273L, 728274L, 
728825L, 728826L, 729489L, 729490L, 730106L, 730109L, 730315L, 
730316L, 732297L, 732298L, 734989L, 734990L, 735945L, 735948L, 
736878L, 736879L, 737807L, 737808L, 737834L, 737835L, 738292L, 
738293L, 738314L, 738315L, 739275L, 739276L, 740407L, 740408L, 
741248L, 741249L, 700159L, 700160L, 700244L, 700245L, 700292L, 
700490L, 700526L, 700527L, 702052L, 702053L, 702725L, 702726L, 
702812L, 702815L, 702872L, 702991L), vehicle = c("tr_5_7", "tr_5_7", 
"tr_5_7", "tr_5_7", "tr_5_7", "tr_5_7", "tr_5_7", "tr_5_7", "tr_5_7", 
"tr_5_7", "tr_5_7", "tr_5_7", "tr_5_7", "tr_5_7", "tr_5_7", "tr_5_7", 
"tr_5_7", "tr_5_7", "tr_5_7", "tr_5_7", "tr_5_7", "tr_5_7", "tr_5_7", 
"tr_5_7", "tr_5_7", "tr_5_7", "tr_5_7", "tr_5_7", "tr_5_7", "tr_5_7", 
"tr_5_7", "tr_5_7", "tr_5_7", "tr_5_7", "tr_5_7", "tr_71_3", 
"tr_71_3", "tr_71_3", "tr_71_3", "tr_71_3", "tr_71_3", "tr_71_3", 
"tr_71_3", "tr_71_3", "tr_71_3", "tr_71_3", "tr_71_3", "tr_71_3", 
"tr_71_3", "tr_71_3", "tr_71_3"), time = c(32542, 32542, 32563, 
32584, 32584, 32594, 32594, 32604, 32604, 32613, 32613, 32617, 
32617, 32648, 32648, 32691, 32691, 32706, 32706, 32721, 32721, 
32736, 32736, 32737, 32737, 32744, 32744, 32745, 32745, 32760, 
32760, 32778, 32778, 32793, 32793, 31927, 31927, 31929, 31929, 
31930, 31935, 31936, 31936, 31977, 31977, 31994, 31994, 31996, 
31996, 31997, 32000), link = c(200016105L, NA, NA, 200016105L, 
61056124L, 61056124L, 61246144L, 61246144L, 61446158L, NA, NA, 
61446158L, 61589049L, 61589049L, 90496198L, 90496198L, 61986249L, 
NA, NA, 61986249L, 62496295L, NA, NA, 62496295L, 62956316L, NA, 
NA, 62956316L, 63166350L, NA, NA, 63166350L, 63506404L, 63506404L, 
64046472L, 61176131L, 613120013L, 613120013L, 200136131L, NA, 
NA, 200136131L, 61316194L, 61316194L, 61946230L, 61946230L, 623020014L, 
623020014L, 200146230L, NA, NA), facility = c(NA, 108536L, 108536L, 
NA, NA, NA, NA, NA, NA, 100905L, 100905L, NA, NA, NA, NA, NA, 
NA, 100979L, 100979L, NA, NA, 101017L, 101017L, NA, NA, 101075L, 
101075L, NA, NA, 101098L, 101098L, NA, NA, NA, NA, NA, NA, NA, 
NA, 108557L, 108557L, NA, NA, NA, NA, NA, NA, NA, NA, 108558L, 
108558L), acttype = structure(c(3L, 4L, 5L, 2L, 1L, 2L, 1L, 2L, 
1L, 4L, 5L, 2L, 1L, 2L, 1L, 2L, 1L, 4L, 5L, 2L, 1L, 4L, 5L, 2L, 
1L, 4L, 5L, 2L, 1L, 4L, 5L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 4L, 
5L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 4L, 5L), .Label = c("entered link", 
"left link", "vehicle enters traffic", "VehicleArrivesAtFacility", 
"VehicleDepartsAtFacility"), class = "factor")), .Names = c("id", 
"vehicle", "time", "link", "facility", "acttype"), row.names = c(NA, 
-51L), class = "data.frame")

我需要根据transit.service1中的某些条件对entext.new取子集。为此,我写了一个小函数,如下所示。但是,这个函数运行得很慢。

#' Pair an identifier with the transit events of one vehicle in a time window
#'
#' @param x Value bound in front of the matching rows via `cbind()`; the
#'   caller in this script passes a person id.
#' @param y Vehicle id compared for equality against the `vehicle` column.
#' @param z Lower bound (inclusive) on the `time` column.
#' @param k Upper bound (inclusive) on the `time` column.
#' @param service Table to filter. It must be a data.table, because the
#'   filter refers to its columns by bare name inside `[`. Defaults to the
#'   global `transit.service1`, so existing four-argument calls keep working.
#' @return A length-one list holding `cbind(x, <matching rows of service>)`.
func_subset1 <- function(x, y, z, k, service = transit.service1) {
  # The table is now an explicit (defaulted) argument rather than a hidden
  # global, so the function can be reused and exercised with other tables.
  list(cbind(x, service[vehicle == y & time >= z & time <= k]))
}
# Run func_subset1 once per row of entext.new: the person id, the vehicle and
# the time1/time2 window are passed positionally as x, y, z and k.
list1 <- mapply(
  func_subset1,
  entext.new$person,
  entext.new$vehicle,
  entext.new$time1,
  entext.new$time2
)
# Stack the per-row pieces in one pass. rbindlist() is data.table's
# purpose-built replacement for do.call(rbind, ...) on a list of tables.
# NOTE(review): the result is a data.table (which is still a data.frame);
# wrap it in as.data.frame() if downstream code needs a plain data.frame.
# NOTE(review): a non-equi join of transit.service1 against entext.new could
# probably replace the per-row mapply() altogether -- verify row order and
# column names before switching.
df.final <- data.table::rbindlist(list1)

我尝试像这样对代码进行性能分析(profiling)(虽然我不完全理解它):

> summaryRprof(tmp)
$by.self
      self.time self.pct total.time total.pct
"cat"      0.02      100       0.02       100

$by.total
                    total.time total.pct self.time self.pct
"cat"                     0.02       100      0.02      100
".rs.valueContents"       0.02       100      0.00        0
".rs.valueFromStr"        0.02       100      0.00        0
".rs.withTimeLimit"       0.02       100      0.00        0
"<Anonymous>"             0.02       100      0.00        0
"capture.output"          0.02       100      0.00        0
"do.call"                 0.02       100      0.00        0
"doTryCatch"              0.02       100      0.00        0
"eval"                    0.02       100      0.00        0
"evalVis"                 0.02       100      0.00        0
"NextMethod"              0.02       100      0.00        0
"str"                     0.02       100      0.00        0
"str.data.frame"          0.02       100      0.00        0
"str.default"             0.02       100      0.00        0
"strSub"                  0.02       100      0.00        0
"try"                     0.02       100      0.00        0
"tryCatch"                0.02       100      0.00        0
"tryCatchList"            0.02       100      0.00        0
"tryCatchOne"             0.02       100      0.00        0
"withVisible"             0.02       100      0.00        0

$sample.interval
[1] 0.02

$sampling.time
[1] 0.02

您对加速这个取子集的函数有什么建议吗?

0 个答案:

没有答案