具有指定元素总和的数据帧子集

时间:2014-06-16 15:05:57

标签: r mathematical-optimization

拥有这样的数据框:

   df <- data.frame(a=c(31, 18, 0, 1, 20, 2), 
   b=c(1, 0, 0, 3, 1, 1), 
   c=c(12, 0, 9, 8, 10, 3))

   > df
      a b  c
   1 31 1 12
   2 18 0  0
   3  0 0  9
   4  1 3  8
   5 20 1 10
   6  2 1  3

如何进行随机子集,使行和列的总和等于一个值,即100?

1 个答案:

答案 0 :(得分:2)

据我了解您的问题,您正在尝试对矩阵的行和列的子集进行采样,以便它们总和为目标值。

您可以使用整数优化来完成此任务。您将为每个行,列和单元格提供二元决策变量,并使用约束来强制单元格值等于行值和列值的乘积。我将使用lpSolve包来执行此操作,因为它有一个方便的机制来获得多个最佳解决方案。然后我们可以使用sample函数在它们之间进行选择:

library(lpSolve)
get.subset <- function(dat, target) {
  nr <- nrow(dat)
  nc <- ncol(dat)
  nvar <- nr + nc + nr*nc
  # Cells upper bounded by row and column variable values (r and c) and lower bounded by r+c-1
  mat <- as.matrix(do.call(rbind, apply(expand.grid(seq(nr), seq(nc)), 1, function(x) {
    r <- x[1]
    c <- x[2]
    pos <- nr + nc + (r-1)*nc + c
    ltc <- rep(0, nvar)
    ltc[nr + c] <- 1
    ltc[pos] <- -1
    ltr <- rep(0, nvar)
    ltr[r] <- 1
    ltr[pos] <- -1
    gtrc <- rep(0, nvar)
    gtrc[nr + c] <- 1
    gtrc[r] <- 1
    gtrc[pos] <- -1
    return(as.data.frame(rbind(ltc, ltr, gtrc)))
  })))
  dir <- rep(c(">=", ">=", "<="), nr*nc)
  rhs <- rep(c(0, 0, 1), nr*nc)

  # Sum of selected cells equals target
  mat <- rbind(mat, c(rep(0, nr+nc), as.vector(t(dat))))
  dir <- c(dir, "=")
  rhs <- c(rhs, target)

  res <- lp(objective.in=rep(0, nvar),  # Feasibility problem
            const.mat=mat,
            const.dir=dir,
            const.rhs=rhs,
            all.bin=TRUE,
            num.bin.solns=100  # Number of feasible solutions to get
            )
  if (res$status != 0) {
    return(list(rows=NA, cols=NA, subset=NA, num.sol=0))
  }
  sol.num <- sample(res$num.bin.solns, 1)
  vals <- res$solution[seq((sol.num-1)*nvar+1, sol.num*nvar)]
  rows <- which(vals[seq(nr)] >= 0.999)
  cols <- which(vals[seq(nr+1, nr+nc)] >= 0.999)
  return(list(rows=rows, cols=cols, subset=dat[rows,cols], num.sol=res$num.bin.solns))
}

该函数返回具有该总和的子集数,并返回随机选择的子集:

set.seed(144)
get.subset(df, 1)
# $rows
# [1] 1
# $cols
# [1] 2
# $subset
# [1] 1
# $num.sol
# [1] 14

get.subset(df, 100)
# $rows
# [1] 1 2 4 5
# $cols
# [1] 1 3
# $subset
#    a  c
# 1 31 12
# 2 18  0
# 4  1  8
# 5 20 10
# $num.sol
# [1] 2

get.subset(df, 10000)
# $rows
# [1] NA
# $cols
# [1] NA
# $subset
# [1] NA
# $num.sol
# [1] 0