Question

我正在尝试解决一个问题：有一个很长的列表，其每个索引处都有数量不定的数字。目的是找出每个数字最早出现的索引。例如，如果15出现在列表的索引45和78处，那么我应该返回15首次出现的位置是45。在最初的问题中，列表的长度为10,000，所以能快速完成这一点会很有帮助。

最初，我尝试直接使用现有的列表结构，并执行了类似下面的操作，但在10,000行的情况下速度非常慢。

# Toy data: 100 samples, each holding 1-10 distinct values drawn from 1:100.
set.seed(1)
x <- replicate(100, sample(100, sample(10, 1)))
# Brute force: for each value, test every list element for membership and
# take the position of the first hit. match(TRUE, ...) yields NA for a value
# that never occurs, whereas which.max() on an all-FALSE vector returns 1.
first_seen <- vapply(
  1:100,
  function(i) match(TRUE, vapply(x, function(el) i %in% el, logical(1))),
  integer(1)
)
cbind(value = 1:100, index = first_seen)

最终，我尝试将数据转换为data.table，效果更好，但我一直想知道是否有更好的方法来解决这个问题。比如，默认的列表结构是否本质上就效率低下？还是有更好的方式来使用它？

# Same toy data, flattened to one (index, value) row per stored number.
set.seed(1)
x <- replicate(100, sample(100, sample(10, 1)))
# lengths() is the type-stable replacement for sapply(x, length), and
# seq_along(x) keeps the index column tied to the actual list length.
dt <- data.table(index = rep(seq_along(x), lengths(x)), value = unlist(x))
# Rows are in list order, so the first index per value is its earliest one.
dt[, .(index = first(index)), by = value][order(value)]

如果有帮助，这里是原始问题的完整数据集。

library(RcppAlgos)
library(memoise)
library(data.table)

#' Enumerate the multiplicative factorizations of an integer
#'
#' Pairs every non-trivial divisor of `n` with its cofactor, then recursively
#' expands the first member of each pair, collecting the ways of writing `n`
#' as a product of two or more factors greater than 1.
#'
#' @param n A single positive integer.
#' @return `n` itself when `n` is 1 or prime; otherwise a list of sorted
#'   vectors, each holding the factors of one distinct factorization of `n`.
jgo <- function(n) {
  # The condition is scalar, so use short-circuit `||`, not elementwise `|`.
  if (isPrimeRcpp(n) || n == 1) {
    return(n)
  }
  div <- divisorsRcpp(n)
  # Drop the first and last divisor (assumes divisorsRcpp() returns them in
  # ascending order, i.e. these are 1 and n -- confirm against RcppAlgos).
  div <- div[-c(1, length(div))]
  # Pair the i-th divisor with the i-th from the end (its cofactor under the
  # same ordering assumption).
  div <- Map(function(a, b) c(a, b), div, rev(div))
  # Replace the first member of each pair by each of its own factorizations.
  div2 <- lapply(div, function(x) lapply(jgo(x[1]), c, x[2]))
  # Sorting makes reorderings of one factorization identical for unique().
  unique(lapply(c(div, unlist(div2, recursive = FALSE)), sort))
}

# Rebind jgo to a cached version; the recursive calls inside jgo() look up
# this binding too, so repeated sub-results are reused.
jgo <- memoise(jgo)

# For each n, combine every factorization as n - sum(factors) + length(factors).
x <- lapply(1:12500, function(n) {
  parts <- jgo(n)
  n - sapply(parts, sum) + sapply(parts, length)
})

Answer 1

这是使用match查找第一个索引的另一种方法。这比其他建议的方法稍胜一筹，并产生与OP问题类似的输出：

## dummy data: 100 samples, each with 1-10 distinct values drawn from 1:100;
## the fixed seed makes the example reproducible
set.seed(1)
x <- replicate(100, sample(100, sample(10, 1)))

## use match to find first indices
#' First list position of each value, via a single match() call
#'
#' Only the values in `seq_along(x)` are looked up.
#'
#' @param x A list of vectors.
#' @return A two-column matrix: `value` runs over `seq_along(x)` and `index`
#'   is the position of the first element of `x` containing that value (`NA`
#'   if the value never occurs). An empty list gives a 0-row matrix.
first_indices_match <- function(x) {
  # seq_along() is empty for an empty list; 1:length(x) would be c(1, 0).
  seq_x <- seq_along(x)
  # Element position of every entry of the flattened list, in the same order.
  owner <- rep(seq_x, lengths(x))
  # match() returns the first hit, i.e. the earliest position of each value.
  first_pos <- owner[match(seq_x, unlist(x))]
  matrix(c(seq_x, first_pos),
         ncol = 2, dimnames = list(NULL, c("value", "index")))
}

## preview the first six (value, index) rows
head(first_indices_match(x))
#>      value index
#> [1,]     1     1
#> [2,]     2     7
#> [3,]     3    45
#> [4,]     4    38
#> [5,]     5    31
#> [6,]     6     7

## data.table approach
library(data.table)

#' First list position of each value, via a data.table group-by
#'
#' @param x A list of vectors.
#' @return A data.table with columns `value` and `index`, ordered by `value`,
#'   where `index` is the position of the first element of `x` containing it.
first_indices_dt <- function(x) {
  # lengths() is the type-stable, vectorised form of sapply(x, length).
  dt <- data.table(index = rep(seq_along(x), lengths(x)), value = unlist(x))
  # Rows keep list order, so the first index in each group is the earliest.
  dt[, .(index = first(index)), by = value][order(value)]
}

## preview the first six rows; they should agree with the match() version
head(first_indices_dt(x))
#>    value index
#> 1:     1     1
#> 2:     2     7
#> 3:     3    45
#> 4:     4    38
#> 5:     5    31
#> 6:     6     7

基准测试

## stack + remove duplicate approach
#' First list position of each value, via stack() and duplicated()
#'
#' @param x A list of vectors.
#' @return A data frame with columns `values` and `ind` (the list position as
#'   a factor), keeping only the first row for each distinct value.
first_indices_shree <- function(x) {
  # stack() takes the `ind` column from the element names, so name by position.
  names(x) <- seq_along(x)
  stacked <- stack(x)
  # Rows are in list order; dropping later duplicates keeps the earliest one.
  is_first <- !duplicated(stacked$values)
  stacked[is_first, ]
}

## benchmarks several list sizes
## bench::press() evaluates the braced block once per value of n_size
bnch <- bench::press(
    n_size = c(100, 1E3, 1E4),
    {
      ## n_size samples, each with 1-10 distinct values drawn from 1:n_size
      x <- replicate(n_size, sample(n_size, sample(10, 1)))
      bench::mark(
          match = first_indices_match(x),
          shree = first_indices_shree(x),
          dt = first_indices_dt(x),
          ## the three functions return different container types (matrix,
          ## data.frame, data.table), so the result-equality check is off
          check = FALSE
      )
    }  
)
#> # A tibble: 9 x 7
#>   expression n_size      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr>  <dbl> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 match         100  18.17µs   21.2µs   45639.    637.3KB    27.4 
#> 2 shree         100 361.88µs 411.06µs    2307.   106.68KB    11.1 
#> 3 dt            100 759.17µs 898.26µs     936.   264.58KB     8.51
#> 4 match        1000 158.34µs  169.9µs    5293.   164.15KB    30.8 
#> 5 shree        1000   1.54ms   1.71ms     567.   412.52KB    13.2 
#> 6 dt           1000   1.19ms    1.4ms     695.   372.13KB    10.7 
#> 7 match       10000   3.09ms   3.69ms     255.     1.47MB    15.9 
#> 8 shree       10000  18.06ms  18.95ms      51.5    4.07MB    12.9 
#> 9 dt          10000   5.65ms   6.33ms     149.     2.79MB    20.5

Answer 2

您可以简单地将列表堆叠到数据框中并删除重复的值。这将为您提供列表中所有值的第一个索引。

# Reproducible toy data: 100 samples of 1-10 distinct values from 1:100.
set.seed(1)
x <- replicate(100, sample(100, sample(10, 1)))

# Name each element by its position so stack() records it in the `ind` column.
x <- setNames(x, seq_along(x))
d <- stack(x)
# Rows are in list order, so the first row per value is its earliest position.
first_indices <- d[!duplicated(d$values), ]

# preview the first six rows; row names are the row numbers of the stacked data
head(first_indices)
  values ind
1     38   1
2     57   1
3     90   1
5     94   2
6     65   2
7      7   3

您现在可以使用%in%查找所需任意值的索引：

# Keep only the rows whose value is one of the requested numbers.
first_indices[first_indices$values %in% c(37, 48), ]

   values ind
11     37   3
40     48   8

基准测试：

set.seed(1)
x <- replicate(1000, sample(1000, sample(10, 1)))

# `first_indices` above is a data frame, not a function, so calling
# first_indices(x) fails; wrap the stack + de-duplicate steps in a function.
first_indices_stack <- function(x) {
  names(x) <- seq_along(x)
  stacked <- stack(x)
  stacked[!duplicated(stacked$values), ]
}

microbenchmark::microbenchmark(
  Shree = first_indices_stack(x),
  # The question's brute-force approach, kept as posted for comparison.
  JamesB = cbind(value = 1:1000,
      index = sapply(1:1000, function(i) which.max(sapply(x, function(x) i %in% x))))
)

Unit: milliseconds
  expr       min         lq        mean    median        uq       max neval
 Shree    2.3488    2.74855    4.171323    3.0577    4.7752   17.0743   100
JamesB 1750.4806 1904.79150 2519.912936 1994.9814 3282.5957 5966.1011   100

高效查找值在列表中首次出现的位置

2 个答案: