Question

我需要遍历数据表2 DT2，并且对于每一行检查数据表1 DT1中的某些条件是否匹配。

我目前这样做。这在DT1中查找DT2中的数据。在循环中，您将看到：

  subset = DT1[DT1$time == DT2$time[i] & DT1$a == DT2$a[i] & DT1$b == DT2$b[i] & (DT1$cat == DT2$cat1[i] | DT1$cat == DT2$cat2[i] )  & DT1$Flag ==0]

如果匹配我需要

1 - 标记DT1中已匹配的行，这样该行就不会被再次匹配。在代码中，这是通过以下语句完成的：

  DT1$Flag[match$RowNumber] = 1

2 - 用DT1中相应列的数据填充DT2中的列。在代码中，这是通过以下语句完成的：

  DT2$x[i]  = match$x

这基本上就是它的工作方式。但是DT1可能有10,000行，而DT2可能有100,000行，所以对于DT2的每一行，我都要在DT1的10,000行上取一次子集。子集操作要重复执行上万次！

还有第二个要求：上面的循环要求时间完全匹配。在代码中您会看到：

DT1$time == DT2$time[i]

这一轮匹配完成后，放宽时间条件并运行第二个循环，其时间窗口为+/- 7秒，因此在第二个循环中，可以在DT1中找到与DT2中的时间相差不超过7秒的匹配。

DT1[  DT1$time >= DT2$time[i] -7 & DT1$time <= DT2$time[i]+7

这也可以，但是因为数据表有很多行，子集化需要很长时间。这两个循环能以某种方式变得更快吗？

注意子集中的OR条件使得这个很棘手。

 (DT1$cat == DT2$cat1[i] | DT1$cat == DT2$cat2[i] )

下面是您可以直接运行的示例代码。谢谢。

############# Here is the setup of the datatables
# data.table() is provided by the data.table package, so it has to be loaded
# before the tables can be built.
library(data.table)

# Ten timestamps one second apart (2016-01-01 00:00:01 .. 00:00:10 GMT), each
# repeated twice and then put in ascending order.
times <- rep(as.POSIXct("2016-01-01", tz = "GMT") + seq(1, 10, by = 1), 2)
times <- times[order(times)]

# DT1 is the table that gets searched. Flag starts at 0 and is set to 1 once a
# row has been used by a match; x is the value copied into DT2 on a match.
DT1 <- data.table(
  time = times,
  a = c(1, seq(1, 19, 1)),
  b = c(11, seq(11, 29, 1)),
  cat = c("a", "a", rep(c("a", "b"), each = 9)),
  Flag = rep(0, 20),
  x = seq(201, 220, 1)
)
# RowNumber stores each row's position so a matched row can be flagged later.
DT1$RowNumber <- seq(1, dim(DT1)[1], 1)

# DT2 holds the rows to look up. A DT1 row qualifies when its cat equals
# either cat1 or cat2; x and MatchType are filled in by the matching loops.
DT2 <- data.table(
  time = as.POSIXct(
    c("2016-01-01 00:00:01", "2016-01-01 00:00:10", "2016-01-01 00:00:10"),
    tz = "GMT"
  ),
  a = c(1, 19, 10),
  b = c(11, 29, 20),
  cat1 = c("a", "x", "b"),
  x = c(0, 0, 0),
  MatchType = c("none", "none", "none"),
  cat2 = c("a", "b", "a")
)


 ######### This is the for loop that does the matching
# Exact-match pass. For each row of DT2, select the DT1 rows that are still
# unflagged, agree on time, a and b, and whose cat equals either cat1 or cat2.
# If at least one row qualifies, two things happen:
# 1 - MatchType and x in DT2 are populated (x comes from the first qualifying
#     DT1 row)
# 2 - that DT1 row is flagged so it is not used again; this is why Flag == 0
#     is part of the selection
# seq_len() is used so that an empty DT2 yields zero iterations, and `:=`
# updates the tables by reference instead of copying them on every match.
for (i in seq_len(nrow(DT2))) {
  candidates <- DT1[
    DT1$time == DT2$time[i] &
      DT1$a == DT2$a[i] &
      DT1$b == DT2$b[i] &
      (DT1$cat == DT2$cat1[i] | DT1$cat == DT2$cat2[i]) &
      DT1$Flag == 0
  ]
  if (nrow(candidates) > 0) {
    # if there are multiple matches only use the 1st one
    first_hit <- candidates[1L]
    # flag the row used in DT1 so it is not used again
    DT1[first_hit$RowNumber, Flag := 1]
    # record which pass matched and copy x from DT1 into DT2
    DT2[i, `:=`(MatchType = "First Loop", x = first_hit$x)]
  }
}
##### After this loop some rows in DT2 may still have no match; they keep
##### MatchType "none".
DT2 # NOTE HERE that the last row has MatchType = none because a match could not be found
DT1 # NOTE the Flag column has a 1 in the first and last rows, set in the loop when the match occurred

##### Now a second loop is done, this time trying to match within a time
##### window of +/- 7 seconds instead of matching EXACTLY on time
# Rows ALREADY MATCHED IN THE FIRST LOOP are set aside and re-attached at the end.
firstloop <- DT2[DT2$MatchType != "none", ]
# Only the rows that have NOT been matched yet are looped over below.
DT2 <- DT2[DT2$MatchType == "none", ]
# Drop the DT1 rows that have already been used by a match.
DT1 <- DT1[DT1$Flag == 0, ]
# Row positions changed after the filter, so renumber. seq_len() also works
# when no rows are left (seq(1, 0, 1) would stop with an error).
DT1$RowNumber <- seq_len(nrow(DT1))

# Half-width of the accepted time window, in seconds.
window_secs <- 7

# The loop index is deliberately not reassigned inside the body: overriding it
# with a constant would make every iteration test the first DT2 row only.
# seq_len() gives zero iterations when every DT2 row was matched in pass one.
for (i in seq_len(nrow(DT2))) {
  candidates <- DT1[
    DT1$time >= DT2$time[i] - window_secs &
      DT1$time <= DT2$time[i] + window_secs &
      DT1$a == DT2$a[i] &
      DT1$b == DT2$b[i] &
      (DT1$cat == DT2$cat1[i] | DT1$cat == DT2$cat2[i]) &
      DT1$Flag == 0
  ]
  if (nrow(candidates) > 0) {
    # if there are multiple matches only use the 1st one
    first_hit <- candidates[1L]
    # flag the row used in DT1 so it is not used again
    DT1[first_hit$RowNumber, Flag := 1]
    # record which pass matched and copy x from DT1 into DT2
    DT2[i, `:=`(MatchType = "Second Loop", x = first_hit$x)]
  }
}

# now the process is finished
rbind(firstloop, DT2) # NOTE now you can see the match type of "Second Loop" for the last row
DT1 # NOTE the flag in row 10 because that was the row used in the match

Answer 1

首先我要提一下，应当尽量避免把函数名用作数据集中的变量名：subset和cat都是R中的函数，因此在这个答案中我使用cat0代替cat，用subs代替subset。您的代码有几处可以改进：

创建样本数据：

特别是RowNumber变量，可以使用.I更高效地创建。此外，我还给DT2添加了一个行号变量（rn），因为这在接下来的步骤中很有用：

DT2

第一个循环：

这可以通过把对Flag的更新从for循环中移出，并利用data.table包的按引用更新功能（:=）来实现（写法也可能更简单），这比在for循环中逐行赋值更有效率。上文“创建样本数据”一步对应的代码是：

times <- rep(as.POSIXct("2016-01-01",tz="GMT") + seq(1,10,by = 1), 2)
times <- times[order(times)]
DT1 <- data.table(time = times, a = c(1,1:19), b = c(11,11:29), cat0 = c("a","a", rep(c("a","b"), each=9)), Flag = rep(0,20), x = seq(201,220,1))[, rn := .I]
DT2 <- data.table(time = as.POSIXct(c("2016-01-01 00:00:01","2016-01-01 00:00:10","2016-01-01 00:00:10"), tz="GMT"), a = c(1,19,10), b = c(11,29,20), cat1 = c("a","x","b"), x = c(0,0,0), MatchType = c("none","none","none"), cat2 = c("a","b","a"))[, rn := .I]

:=

然后可以通过创建索引并再次使用

# Exact-match pass. For each row of DT2, select the unflagged DT1 rows that
# agree on time, a and b and whose cat0 equals either cat1 or cat2. When at
# least one row qualifies, MatchType and x of that DT2 row are set by reference
# with `:=`, taking x from the first qualifying DT1 row.
# DT1 itself is not modified inside this loop.
# seq_len() is used instead of 1:nrow(DT2) so that an empty DT2 produces zero
# iterations rather than iterating over c(1, 0).
for (i in seq_len(nrow(DT2))) {
  subs <- DT1[
    time == DT2$time[i] &
      a == DT2$a[i] &
      b == DT2$b[i] &
      (cat0 == DT2$cat1[i] | cat0 == DT2$cat2[i]) &
      Flag == 0
  ]
  if (nrow(subs) > 0) {
    DT2[i, `:=`(MatchType = "First Loop", x = subs$x[1])]
  }
}

对DT1的更新则可以通过创建索引、再用一次 := 按引用更新来完成（即下方的idx1代码）：

第二个循环：

# Build the vector of rn values of the DT1 rows to flag: keep rows with
# Flag == 0 whose time, a and b each occur in the matching DT2 column and whose
# cat0 occurs in DT2$cat1 or DT2$cat2, group them by (time, a, b, cat0, Flag)
# and take the first row of every group with .SD[1].
# NOTE(review): every %in% test is evaluated per column, independently of the
# others, so a DT1 row can qualify without agreeing with any single DT2 row on
# all keys at once - confirm this is acceptable for the real data.
idx1 <- DT1[(time %in% DT2$time) & (a %in% DT2$a) & (b %in% DT2$b) & 
              (cat0 %in% DT2$cat1 | cat0 %in% DT2$cat2) & (Flag == 0), 
            .SD[1], 
            .(time,a,b,cat0,Flag)]$rn
# Mark the selected rows as used; := changes DT1 by reference.
# NOTE(review): idx1 is used as row positions, which presumably relies on rn
# still matching the row order of DT1 - verify.
DT1[idx1, Flag := 1]

第二个循环的代码如下：

for(i in DT2[MatchType == "none"]$rn) # here we need the rownumber variable for DT2
{
  subs <- DT1[time >= DT2$time[i]-7 & time <= DT2$time[i]+7 & a == DT2$a[i] & b == DT2$b[i] & (cat0 == DT2$cat1[i] | cat0 == DT2$cat2[i] ) & Flag == 0]
  if(nrow(subs) > 0)
  {
    DT1[subs$rn[1], Flag := 2]
    DT2[i, `:=` (MatchType = 'Second Loop', x = subs$x[1])]
  }
}

这些改进消除了创建中间子集和rbind步骤的需要。最终结果：

重复的子集可以让您更快地完成此过程

1 个答案: