---
title: "Negative Binomial Convergence: fastglm vs MASS"
author: "Jared Huling"
date: "2026-05-27"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Negative Binomial Convergence: fastglm vs MASS}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---


Standard negative-binomial GLM fitting alternates between IRLS for the
regression coefficients $\beta$ (at fixed dispersion $\theta$) and a
one-dimensional MLE update for $\theta$.
Both `MASS::glm.nb()` and `fastglm_nb()` follow this strategy, but their
implementations differ in initialization and numerical handling.  This
vignette compares the two on a deliberately challenging data-generating
process where MASS frequently fails to converge.

## Data-generating process

We use a moderately high-dimensional negative-binomial model
($n = 200$, $p = 9$) with strong alternating effects and true
$\theta = 1$.
The large coefficients produce fitted means spanning many orders of
magnitude, which stresses the IRLS solver when the initial $\theta$
estimate is far from the truth.


``` r
library(fastglm)
library(MASS)

n <- 200
p <- 9
beta_true  <- c(4, 1.5, -1.2, 1.5, -1.2, 1.5, -1.2, 1.5, -1.2)
theta_true <- 1
n_reps     <- 200
```

## Convergence comparison

For each of 200 replications we draw a fresh design matrix and
response, then fit with both `fastglm_nb()` and `MASS::glm.nb()`.  We
record whether each method converges without error or warning, and when
both converge, we compare the maximized log-likelihoods.


``` r
results <- data.frame(
    seed         = integer(n_reps),
    mass_conv    = logical(n_reps),
    fg_conv      = logical(n_reps),
    mass_loglik  = numeric(n_reps),
    fg_loglik    = numeric(n_reps),
    mass_theta   = numeric(n_reps),
    fg_theta     = numeric(n_reps)
)

for (i in seq_len(n_reps)) {
    set.seed(i)
    X  <- cbind(1, matrix(rnorm(n * (p - 1)), n, p - 1))
    mu <- exp(X %*% beta_true)
    y  <- MASS::rnegbin(n, mu = mu, theta = theta_true)

    ## MASS::glm.nb
    mass_ok   <- FALSE
    mass_ll   <- NA_real_
    mass_th   <- NA_real_
    tryCatch(
        withCallingHandlers(
            {
                m <- MASS::glm.nb(y ~ X[, -1])
                if (m$converged) {
                    mass_ok <- TRUE
                    mass_ll <- as.numeric(logLik(m))
                    mass_th <- m$theta
                }
            },
            warning = function(w) invokeRestart("muffleWarning")
        ),
        error = function(e) NULL
    )

    ## fastglm_nb
    fg_ok  <- FALSE
    fg_ll  <- NA_real_
    fg_th  <- NA_real_
    tryCatch({
        f <- fastglm_nb(X, y, link = "log")
        if (f$converged && !any(is.nan(coef(f)))) {
            fg_ok <- TRUE
            fg_ll <- as.numeric(logLik(f))
            fg_th <- f$theta
        }
    }, error = function(e) NULL)

    results$seed[i]        <- i
    results$mass_conv[i]   <- mass_ok
    results$fg_conv[i]     <- fg_ok
    results$mass_loglik[i] <- mass_ll
    results$fg_loglik[i]   <- fg_ll
    results$mass_theta[i]  <- mass_th
    results$fg_theta[i]    <- fg_th
}
```

### Summary


``` r
tab <- with(results, data.frame(
    Method     = c("MASS::glm.nb", "fastglm_nb"),
    Converged  = c(sum(mass_conv), sum(fg_conv)),
    Failed     = c(sum(!mass_conv), sum(!fg_conv)),
    Rate       = sprintf("%.1f%%", 100 * c(mean(mass_conv), mean(fg_conv)))
))
knitr::kable(tab, align = "lrrl",
             caption = sprintf("Convergence over %d replications", n_reps))
```


Table: Convergence over 200 replications

|Method       | Converged| Failed|Rate   |
|:------------|---------:|------:|:------|
|MASS::glm.nb |       165|     35|82.5%  |
|fastglm_nb   |       200|      0|100.0% |


`fastglm_nb()` converges on every replication.  `MASS::glm.nb()` fails
on roughly one in six.

### Log-likelihood comparison

When both methods converge they should find the same MLE, so their
log-likelihoods should agree.  In a small number of replications
`MASS::glm.nb()` nominally converges but lands at a degenerate solution
with $\hat\theta > 10^6$ and very poor log-likelihood.  We exclude
these (identified by `fastglm` achieving a strictly higher
log-likelihood by more than 1 unit) to focus on genuine agreement:


``` r
both <- results$mass_conv & results$fg_conv
ll_diff <- results$fg_loglik[both] - results$mass_loglik[both]

n_degenerate <- sum(ll_diff > 1)
agree <- both
agree[both] <- ll_diff <= 1

cat(sprintf("Replications where both converge:         %d / %d\n",
            sum(both), n_reps))
#> Replications where both converge:         165 / 200
cat(sprintf("  of which MASS lands at degenerate MLE:  %d\n", n_degenerate))
#>   of which MASS lands at degenerate MLE:  3
cat(sprintf("  genuine agreement (|diff| <= 1):        %d\n", sum(agree)))
#>   genuine agreement (|diff| <= 1):        162

ll_good <- results$fg_loglik[agree] - results$mass_loglik[agree]
cat(sprintf("\nAmong agreeing fits:\n"))
#> 
#> Among agreeing fits:
cat(sprintf("  Max |logLik difference|:  %.2e\n", max(abs(ll_good))))
#>   Max |logLik difference|:  1.74e-06
cat(sprintf("  Mean |logLik difference|: %.2e\n", mean(abs(ll_good))))
#>   Mean |logLik difference|: 2.20e-08
```


``` r
plot(results$mass_loglik[agree], results$fg_loglik[agree],
     xlab = "MASS log-likelihood",
     ylab = "fastglm log-likelihood",
     main = "Log-likelihood agreement (both converge, excluding degenerate MASS fits)",
     pch = 16, cex = 0.6, col = "#2166ac80")
abline(0, 1, col = "grey40", lty = 2)
```

![plot of chunk loglik-plot](figure/loglik-plot-1.png)

### Theta estimates


``` r
plot(results$mass_theta[agree], results$fg_theta[agree],
     xlab = expression(hat(theta) ~ "(MASS)"),
     ylab = expression(hat(theta) ~ "(fastglm)"),
     main = expression("Dispersion" ~ hat(theta) ~ "agreement"),
     pch = 16, cex = 0.6, col = "#2166ac80")
abline(0, 1, col = "grey40", lty = 2)
```

![plot of chunk theta-plot](figure/theta-plot-1.png)

``` r

cat(sprintf("Max |theta difference|: %.2e\n",
            max(abs(results$fg_theta[agree] - results$mass_theta[agree]))))
#> Max |theta difference|: 4.27e-07
```

### Cases where only fastglm converges


``` r
fg_only <- results$fg_conv & !results$mass_conv
cat(sprintf("fastglm converges, MASS fails: %d replications\n", sum(fg_only)))
#> fastglm converges, MASS fails: 35 replications
if (any(fg_only)) {
    cat(sprintf("  theta range:  [%.3f, %.3f]\n",
                min(results$fg_theta[fg_only]),
                max(results$fg_theta[fg_only])))
    cat(sprintf("  logLik range: [%.1f, %.1f]\n",
                min(results$fg_loglik[fg_only]),
                max(results$fg_loglik[fg_only])))
}
#>   theta range:  [0.861, 1.242]
#>   logLik range: [-1123.1, -935.4]
```