Question

我想创建一个由固定数字0和1组成的二进制向量的所有可能组合。例如：变暗（V）= 5×1; N1 = 3; N0 = 2; 在这种情况下，我希望有类似的内容：

  1,1,1,0,0
  1,1,0,1,0
  1,1,0,0,1
  1,0,1,1,0
  1,0,1,0,1
  1,0,0,1,1
  0,1,1,1,0
  0,1,1,0,1
  0,1,0,1,1
  0,0,1,1,1

我在阅读这篇文章时找到了一些帮助 Create all possible combiations of 0,1, or 2 "1"s of a binary vector of length n 但我想只生成我需要的组合，避免浪费任何空间（我认为问题会随着n而呈现出来）

Answer 1

Marat答案的稍快版本：

f.roland <- function(n, m) {
  ind <- combn(seq_len(n), m)
  ind <- t(ind) + (seq_len(ncol(ind)) - 1) * n
  res <- rep(0, nrow(ind) * n)
  res[ind] <- 1
  matrix(res, ncol = n, nrow = nrow(ind), byrow = TRUE)
}

all.equal(f.2(16, 8), f.roland(16, 8))
#[1] TRUE
library(rbenchmark)
benchmark(f(16,8),f.2(16,8),f.roland(16,8))

#             test replications elapsed relative user.self sys.self user.child sys.child
#2      f.2(16, 8)          100   5.693    1.931     5.670    0.020          0         0
#3 f.roland(16, 8)          100   2.948    1.000     2.929    0.017          0         0
#1        f(16, 8)          100   8.287    2.811     8.214    0.066          0         0

Answer 2

您可以尝试这种方法：

f <- function(n=5,m=3)
 t(apply(combn(1:n,m=m),2,function(cm) replace(rep(0,n),cm,1)))

f(5,3)
#       [,1] [,2] [,3] [,4] [,5]
#  [1,]    1    1    1    0    0
#  [2,]    1    1    0    1    0
#  [3,]    1    1    0    0    1
#  [4,]    1    0    1    1    0
#  [5,]    1    0    1    0    1
#  [6,]    1    0    0    1    1
#  [7,]    0    1    1    1    0
#  [8,]    0    1    1    0    1
#  [9,]    0    1    0    1    1
# [10,]    0    0    1    1    1

我们的想法是为1生成索引的所有组合，然后使用它们来生成最终结果。

同样方法的另一种风格：

f.2 <- function(n=5,m=3)
  t(combn(1:n,m,FUN=function(cm) replace(rep(0,n),cm,1)))

第二种方法快两倍：

library(rbenchmark)
benchmark(f(16,8),f.2(16,8))
#         test replications elapsed relative user.self sys.self user.child sys.child
# 2 f.2(16, 8)          100   5.706    1.000     5.688    0.017          0         0
# 1   f(16, 8)          100  10.802    1.893    10.715    0.082          0         0

基准

f.akrun <- function(n=5,m=3) {

  indx <- combnPrim(1:n,m)

  DT <- setDT(as.data.frame(matrix(0, ncol(indx),n)))
  for(i in seq_len(nrow(DT))){
    set(DT, i=i, j=indx[,i],value=1) 
  }
  DT  
}

benchmark(f(16,8),f.2(16,8),f.akrun(16,8))
#            test replications elapsed relative user.self sys.self user.child sys.child
# 2     f.2(16, 8)          100   5.464    1.097     5.435    0.028          0         0
# 3 f.akrun(16, 8)          100   4.979    1.000     4.938    0.037          0         0
# 1       f(16, 8)          100  10.854    2.180    10.689    0.129          0         0

@ akrun的解决方案（f.akrun）比f.2快〜10％。

<强> [编辑] 另一种方法，更快更简单：

f.3 <- function(n=5,m=3) t(combn(n,m,tabulate,nbins=n))

Answer 3

您可以combnPrim中的gRbase和set中的data.table（可能是faster）

source("http://bioconductor.org/biocLite.R")
biocLite("gRbase") 
library(gRbase)
library(data.table)
n <-5
indx <- combnPrim(1:n,3)

DT <- setDT(as.data.frame(matrix(0, ncol(indx),n)))
 for(i in seq_len(nrow(DT))){
  set(DT, i=i, j=indx[,i],value=1) 
 }
DT
 #   V1 V2 V3 V4 V5
 #1:  1  1  1  0  0
 #2:  1  1  0  1  0
 #3:  1  0  1  1  0
 #4:  0  1  1  1  0
 #5:  1  1  0  0  1
 #6:  1  0  1  0  1
 #7:  0  1  1  0  1
 #8:  1  0  0  1  1
 #9:  0  1  0  1  1
#10:  0  0  1  1  1

Answer 4

这是另一种方法：

func <- function(n, m) t(combn(n, m, function(a) {z=integer(n);z[a]=1;z}))

func(n = 5, m = 2)

     # [,1] [,2] [,3] [,4] [,5]
 # [1,]    1    1    0    0    0
 # [2,]    1    0    1    0    0
 # [3,]    1    0    0    1    0
 # [4,]    1    0    0    0    1
 # [5,]    0    1    1    0    0
 # [6,]    0    1    0    1    0
 # [7,]    0    1    0    0    1
 # [8,]    0    0    1    1    0
 # [9,]    0    0    1    0    1
# [10,]    0    0    0    1    1

Answer 5

使用二叉树扩展比 f.roland（对于 n/m 约等于 2，对于 m << n f.roland 获胜）的性能略有提升，但代价是更高的内存使用：

f.krassowski = function(n, m) {
    m_minus_n = m - n
    paths = list(
        c(0, rep(NA, n-1)),
        c(1, rep(NA, n-1))
    )
    sums = c(0, 1)
    for (level in 2:n) {
        upper_threshold = level + m_minus_n

        is_worth_adding_0 = (sums <= m) & (upper_threshold <= sums)
        is_worth_adding_1 = (sums <= m - 1) & (upper_threshold - 1 <= sums)

        x = paths[is_worth_adding_0]
        y = paths[is_worth_adding_1]

        for (i in 1:length(x)) {
            x[[i]][[level]] = 0
        }
        for (i in 1:length(y)) {
            y[[i]][[level]] = 1
        }
        paths = c(x, y)
        sums = c(sums[is_worth_adding_0], sums[is_worth_adding_1] + 1)
    }
    matrix(unlist(paths), byrow=TRUE, nrow=length(paths))
}

元素的顺序不同。

n/m = 2 的基准测试：

               expr       min        lq     mean    median        uq      max
           f(16, 8) 47.488731 48.182502 52.04539 48.689082 57.558552 65.26211
         f.2(16, 8) 38.291302 39.533287 43.61786 40.513500 48.673713 54.21076
         f.3(16, 8) 38.289619 39.007766 40.21002 39.273940 39.970907 49.02320
       f.989(16, 8) 35.000941 35.199950 38.09043 35.607685 40.725833 49.61785
    f.roland(16, 8) 14.295560 14.399079 15.02285 14.559891 14.625825 23.54574
f.krassowski(16, 8)  9.343784  9.552871 10.20118  9.614251  9.863443 19.70659

值得注意的是，f.3 的内存占用最小：

<头>

表达	mem_alloc
f(16, 8)	5.7MB
f.2(16, 8)	3.14MB
f.3(16, 8)	1.57MB
f.989(16, 8)	3.14MB
f.roland(16, 8)	5.25MB
f.krassowski(16, 8)	6.37MB

对于n/m = 10：

               expr       min        lq      mean    median        uq      max
           f(30, 3) 14.590784 14.819879 15.061327 14.970385 15.238594 15.74435
         f.2(30, 3) 11.886532 12.164719 14.197877 12.267662 12.450575 32.47237
         f.3(30, 3) 11.458760 11.597360 12.741168 11.706475 11.892549 30.36309
       f.989(30, 3) 10.646286 10.861159 12.922651 10.971200 11.106610 30.86498
    f.roland(30, 3)  3.513980  3.589361  4.559673  3.629923  3.727350 21.58201
f.krassowski(30, 3)  8.861349  8.927388 10.430068  9.022631  9.405705 32.70073

创建二进制向量的组合

5 个答案: