Question

我尝试重现Stata在运行具有群集标准错误的模型时产生的95％CI。例如：

    regress api00 acs_k3 acs_46 full enroll, cluster(dnum)

    Regression with robust standard errors                 Number of obs =     395
                                                           F(  4,    36) =   31.18
                                                           Prob > F      =  0.0000
                                                           R-squared     =  0.3849
    Number of clusters (dnum) = 37                         Root MSE      =  112.20

    ------------------------------------------------------------------------------
             |               Robust
       api00 |      Coef.   Std. Err.       t     P>|t|       [95% Conf. Interval]
    ---------+--------------------------------------------------------------------
      acs_k3 |   6.954381   6.901117      1.008   0.320      -7.041734     20.9505
      acs_46 |   5.966015   2.531075      2.357   0.024       .8327565    11.09927
        full |   4.668221   .7034641      6.636   0.000        3.24153    6.094913
      enroll |  -.1059909   .0429478     -2.468   0.018      -.1930931   -.0188888
       _cons |  -5.200407   121.7856     -0.043   0.966       -252.193    241.7922
    ------------------------------------------------------------------------------

我能够重现系数和标准误差：

    library(readstata13)
    library(texreg)
    library(sandwich)
    library(lmtest)

    clustered.se <- function(model_result, data, cluster) {
      model_variables   <-
      intersect(colnames(data), c(colnames(model_result$model), cluster))
      model_rows <- rownames(model_result$model)
      data <- data[model_rows, model_variables]
      cl <- data[[cluster]]
      M <- length(unique(cl))
      N <- nrow(data)
      K <- model_result$rank
      dfc <- (M / (M - 1)) * ((N - 1) / (N - K))
      uj  <-
      apply(estfun(model_result), 2, function(x)
      tapply(x, cl, sum))
      vcovCL <- dfc * sandwich(model_result, meat = crossprod(uj) / N)
      standard.errors <- coeftest(model_result, vcov. = vcovCL)[, 2]
      p.values <- coeftest(model_result, vcov. = vcovCL)[, 4]
      clustered.se <-
      list(vcovCL = vcovCL,
      standard.errors = standard.errors,
      p.values = p.values)
      return(clustered.se)
      }

    elemapi2 <- read.dta13(file = 'elemapi2.dta')

    lm1 <-
      lm(formula = api00 ~ acs_k3 + acs_46 + full + enroll,
      data = elemapi2)

    clustered_se <-
      clustered.se(model_result = lm1,
      data = elemapi2,
      cluster = "dnum")

    htmlreg(
      lm1,
      override.se = clustered_se$standard.errors,
      override.p = clustered_se$p.value,
      star.symbol = "\\*",
      digits = 7
      )   

    =============================
                 Model 1         
    -----------------------------
    (Intercept)    -5.2004067    
                 (121.7855938)   
    acs_k3          6.9543811    
                   (6.9011174)   
    acs_46          5.9660147 *  
                   (2.5310751)   
    full            4.6682211 ***
                   (0.7034641)   
    enroll         -0.1059909 *  
                   (0.0429478)   
    -----------------------------
    R^2             0.3848830    
    Adj. R^2        0.3785741    
    Num. obs.     395            
    RMSE          112.1983218    
    =============================
    *** p < 0.001, ** p < 0.01, * p < 0.05

唉，我无法重现95％的置信区间：

    screenreg(
      lm1,
      override.se = clustered_se$standard.errors,
      override.p = clustered_se$p.value,
      digits = 7,
      ci.force = TRUE
      )     

    ========================================
                 Model 1                    
    ----------------------------------------
    (Intercept)     -5.2004067              
                 [-243.8957845; 233.4949710]
    acs_k3           6.9543811              
                 [  -6.5715605;  20.4803228]
    acs_46           5.9660147 *            
                 [   1.0051987;  10.9268307]
    full             4.6682211 *            
                 [   3.2894567;   6.0469855]
    enroll          -0.1059909 *            
                 [  -0.1901670;  -0.0218148]
    ----------------------------------------
    R^2              0.3848830              
    Adj. R^2         0.3785741              
    Num. obs.      395                      
    RMSE           112.1983218              
    ========================================
    * 0 outside the confidence interval

如果我手工制作＆＃39;，我会得到与texreg相同的内容：

    level <- 0.95
    a <- 1-(1 - level)/2
    coeff <- lm1$coefficients
    se <- clustered_se$standard.errors
    lb <- coeff - qnorm(a)*se
    ub <- coeff + qnorm(a)*se

    > lb
    (Intercept)      acs_k3      acs_46        full      enroll 
    -243.895784   -6.571560    1.005199    3.289457   -0.190167 

    > ub
     (Intercept)       acs_k3       acs_46         full       enroll 
    233.49497100  20.48032276  10.92683074   6.04698550  -0.02181481

Stata在做什么以及如何在R中重现它？

PS：这是follow up question。 PS2：Stata数据可用here。

Answer 1

看起来Stata使用基于t（36）而不是Z（即正常误差）的置信区间。

从Stata输出中获取值

coef=6.954381; rse=  6.901117 ; lwr= -7.041734; upr= 20.9505
(upr-coef)/rse
## [1] 2.028095
(lwr-coef)/rse
## [1] -2.028094

计算/交叉检查t（36）的尾值：

pt(2.028094,36) 
## [1] 0.975
qt(0.975,36)
## [1] 2.028094

我不知道你如何将置信区间传递给texreg。既然你没有给出一个可重复的例子（我没有elemapi2.dta），我不能确切地说你将如何获得df，但看起来你想要tdf <- length(unique(elemapi2$dnum))-1

level <- 0.95
a <- 1- (1 - level)/2
bounds <- coef(lm1) + c(-1,1)*clustered_se*qt(a,tdf)

Answer 2

事实上，Stata正在使用t分布而不是正态分布。现在有一个非常简单的解决方案，可以使用estimatr package中的texreg获取与Stata匹配的置信区间lm_robust，您可以从CRAN install.packages(estimatr)安装。{/ p>

> library(estimatr)
> lmro <- lm_robust(mpg ~ hp, data = mtcars, clusters = cyl, se_type = "stata")
> screenreg(lmro)

===========================
             Model 1       
---------------------------
(Intercept)   30.10 *      
             [13.48; 46.72]
hp            -0.07        
             [-0.15;  0.01]
---------------------------
R^2            0.60        
Adj. R^2       0.59        
Num. obs.     32           
RMSE           3.86        
===========================
* 0 outside the confidence interval

具有聚类标准误差和texreg的置信区间？

2 个答案: