R - 使用speedglm包中的summary()时出错

时间:2016-01-19 21:30:19

标签: r dataframe summary speedglm

我使用speedglm估算某些数据的逻辑回归模型。我创建了一个可重现的示例,它会生成与使用原始数据相同的错误。

library(speedglm)
n <- 10000
dtf <- data.frame( y = sample(c(0,1), n, 1),
                  x1 = as.factor(sample(c("a","b"), n, 1)),
                  x2 = rnorm(n, 30, 10))
m <- speedglm(y ~ x1 + x2, dtf, family=binomial())
summary(m)

输出如下:

Generalized Linear Model of class 'speedglm':

Call:  speedglm(formula = y ~ x1 + x2, data = dtf, family = binomial()) 

Coefficients:
 ------------------------------------------------------------------ 
Error in data.frame(..., check.names = FALSE) : 
  arguments imply differing number of rows: 3, 0

我已经通过执行summary.speedglm检查了getS3method("summary", "speedglm")的源代码,并找到了产生错误的代码行,但它没有帮助解决问题。

PS:拥有1500+代表的人应该创建speedglm代码。

更新

speedglm的维护者Marco Enea要求对summary.speedglmprint.summary.speedglm发布以下临时修正。

summary.speedglm <- function (object, correlation = FALSE, ...) 
{
  if (!inherits(object, "speedglm")) 
    stop("object is not of class speedglm")
  z <- object
  var_res <- as.numeric(z$RSS/z$df)
  dispersion <- if (z$family$family %in% c("poisson", "binomial")) 1 else var_res
  if (z$method == "qr") {
    z$XTX <- z$XTX[z$ok, z$ok]
  }
  inv <- solve(z$XTX, tol = z$tol.solve)
  covmat <- diag(inv)
  se_coef <- rep(NA, length(z$coefficients))
  se_coef[z$ok] <- sqrt(dispersion * covmat)
  if (z$family$family %in% c("binomial", "poisson")) {
    z1 <- z$coefficients/se_coef
    p <- 2 * pnorm(abs(z1), lower.tail = FALSE)
  } else {
    t1 <- z$coefficients/se_coef
    p <- 2 * pt(abs(t1), df = z$df, lower.tail = FALSE)
  }
  ip <- !is.na(p)
  p[ip] <- as.numeric(format(p[ip], digits = 3))
  dn <- c("Estimate", "Std. Error")
  if (z$family$family %in% c("binomial", "poisson")) {
    format.coef <- if (any(na.omit(abs(z$coef)) < 1e-04)) 
      format(z$coefficients, scientific = TRUE, digits = 4) else 
        round(z$coefficients, digits = 7)
    format.se <- if (any(na.omit(se_coef) < 1e-04)) 
      format(se_coef, scientific = TRUE, digits = 4) else round(se_coef, digits = 7)
    format.pv <- if (any(na.omit(p) < 1e-04)) 
      format(p, scientific = TRUE, digits = 4) else round(p, digits = 4)
    param <- data.frame(format.coef, format.se, round(z1, 
                                                      digits = 4), format.pv)
    dimnames(param) <- list(names(z$coefficients), c(dn, 
                                                     "z value", "Pr(>|z|)"))
  } else {
    format.coef <- if (any(abs(na.omit(z$coefficients)) < 
                             1e-04)) 
      format(z$coefficients, scientific = TRUE, digits = 4) else 
        round(z$coefficients, digits = 7)
    format.se <- if (any(na.omit(se_coef) < 1e-04)) 
      format(se_coef, scientific = TRUE, digits = 4) else 
        round(se_coef, digits = 7)
    format.pv <- if (any(na.omit(p) < 1e-04)) 
      format(p, scientific = TRUE, digits = 4) else round(p, digits = 4)
    param <- data.frame(format.coef, format.se, round(t1, 
                                                      digits = 4), format.pv)
    dimnames(param) <- list(names(z$coefficients), c(dn, 
                                                     "t value", "Pr(>|t|)"))
  }
  eps <- 10 * .Machine$double.eps
  if (z$family$family == "binomial") {
    if (any(z$mu > 1 - eps) || any(z$mu < eps)) 
      warning("fitted probabilities numerically 0 or 1 occurred")
  }
  if (z$family$family == "poisson") {
    if (any(z$mu < eps)) 
      warning("fitted rates numerically 0 occurred")
  }
  keep <- match(c("call", "terms", "family", "deviance", "aic", 
                  "df", "nulldev", "nulldf", "iter", "tol", "n", "convergence", 
                  "ngoodobs", "logLik", "RSS", "rank"), names(object), 
                0)
  ans <- c(object[keep], list(coefficients = param, dispersion = dispersion, 
                              correlation = correlation, cov.unscaled = inv, cov.scaled = inv * 
                                var_res))
  if (correlation) {
    ans$correl <- (inv * var_res)/outer(na.omit(se_coef), 
                                        na.omit(se_coef))
  }
  class(ans) <- "summary.speedglm"
  return(ans)
}

print.summary.speedglm <- function (x, digits = max(3, getOption("digits") - 3), ...) 
{
  cat("Generalized Linear Model of class 'speedglm':\n")
  if (!is.null(x$call)) 
    cat("\nCall: ", deparse(x$call), "\n\n")
  if (length(x$coef)) {
    cat("Coefficients:\n")
    cat(" ------------------------------------------------------------------", 
        "\n")
    sig <- function(z){
      if (!is.na(z)){
        if (z < 0.001) 
          "***"
        else if (z < 0.01) 
          "** "
        else if (z < 0.05) 
          "*  "
        else if (z < 0.1) 
          ".  "
        else "   "
      } else "   "
    }
    options(warn=-1)
    sig.1 <- sapply(as.numeric(as.character(x$coefficients[,4])), 
                    sig)
    options(warn=0)
    est.1 <- cbind(format(x$coefficients, digits = digits), 
                   sig.1)
    colnames(est.1)[ncol(est.1)] <- ""
    print(est.1)
    cat("\n")
    cat("-------------------------------------------------------------------", 
        "\n")
    cat("Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1", 
        "\n")
    cat("\n")
  }
  else cat("No coefficients\n")
  cat("---\n")
  cat("null df: ", x$nulldf, "; null deviance: ", round(x$nulldev, 
                                                        digits = 2), ";\n", "residuals df: ", x$df, "; residuals deviance: ", 
      round(x$deviance, digits = 2), ";\n", "# obs.: ", x$n, 
      "; # non-zero weighted obs.: ", x$ngoodobs, ";\n", "AIC: ", 
      x$aic, "; log Likelihood: ", x$logLik, ";\n", "RSS: ", 
      round(x$RSS, digits = 1), "; dispersion: ", x$dispersion, 
      "; iterations: ", x$iter, ";\n", "rank: ", round(x$rank, 
                                                       digits = 1), "; max tolerance: ", format(x$tol, scientific = TRUE, 
                                                                                                digits = 3), "; convergence: ", x$convergence, ".\n", 
      sep = "")
  invisible(x)
  if (x$correlation) {
    cat("---\n")
    cat("Correlation of Coefficients:\n")
    x$correl[upper.tri(x$correl, diag = TRUE)] <- NA
    print(x$correl[-1, -nrow(x$correl)], na.print = "", digits = 2)
  }
}

关注42&#39;建议,我还要补充以下内容:

environment(summary.speedglm) <- environment(speedglm)
environment(print.summary.speedglm) <- environment(speedglm)

2 个答案:

答案 0 :(得分:6)

print.summary.speedglm函数中有一个小错误。如果您更改此行:

sig.1 <- cbind(sapply(as.numeric(as.character(x$coefficients$"Pr(>|t|)")), sig))

到这一行:

 sig.1 <- cbind(sapply(as.numeric(as.character(x$coefficients$"Pr(>|z|)")), sig))

并且还运行:

environment(print.summary.speedglm) <- environment(speedglm)

您将不再看到错误消息。

报告错误的正确方法是联系维护人员(我将给他发送电子邮件):

maintainer('speedglm')
[1] "Marco Enea <emarco76@libero.it>"

答案 1 :(得分:2)

看来这是一个错误;在speedglm:::print.summary.speedglm中有一行:

        sig.1 <- sapply(as.numeric(as.character(x$coefficients$"Pr(>|t|)")), 
        sig)

但是当你看到这个物体时,你可以看到:

              Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.0546397  0.0655713 -0.8333    0.405
x1b         -0.0618225  0.0400126 -1.5451    0.122
x2           0.0020771  0.0019815  1.0483    0.295

其中Pr(>|z|)代替Pr(>|t|),因此sig明星失败。