Firth Bias-Reduced GLMs with ‘fastglm’

Jared Huling

2026-06-05

Firth’s (1993) penalized likelihood removes the leading \(O(1/n)\) bias from maximum likelihood estimates and produces finite estimates even under separation. fastglm implements the general mean-bias reduction of Kosmidis and Firth (2009, 2021), which extends Firth’s original logistic penalty to arbitrary GLM families. The method is activated with a single flag:

fastglm(x, y, family = binomial(), firth = TRUE)

The penalty adds \(\frac{1}{2} \partial \log |X^\top W X| / \partial \beta\) to the score, which is equivalent to modifying the IRLS working response by

\[ \xi_i = \frac{\phi \, h_i \, \mu''(\eta_i) \, V(\mu_i)}{2 \, w_i \, [\mu'(\eta_i)]^3} \]

where \(h_i\) is the \(i\)-th diagonal of the hat matrix \(H = W^{1/2} X (X^\top W X)^{-1} X^\top W^{1/2}\), \(\mu''(\eta)\) is the second derivative of the inverse link, \(V(\mu)\) is the variance function, \(w_i\) is the prior weight, and \(\phi\) is the dispersion (1 for binomial and Poisson families; estimated iteratively for Gaussian, Gamma, and inverse Gaussian). This is the AS_mean adjustment of Kosmidis and Firth (2009), the same method used by brglm2::brglmFit(type = "AS_mean").

The adjustment is computed once per IRLS iteration using the leverages from the weighted least-squares decomposition that fastglm already performs, so the overhead relative to an unpenalized fit is modest.

library(fastglm)

Logistic regression under separation

The canonical application is logistic regression with (quasi-) separation, where the standard MLE diverges. The sex2 dataset from logistf (Heinze and Schemper, 2002) demonstrates:

data(sex2, package = "logistf")
X <- model.matrix(~ age + oc + vic + vicl + vis + dia, data = sex2)
y <- sex2$case

fit_f <- fastglm(X, y, family = binomial(), firth = TRUE)
fit_l <- logistf::logistf(case ~ age + oc + vic + vicl + vis + dia,
                          data = sex2, pl = FALSE, plconf = NULL,
                          control = logistf::logistf.control(
                              lconv = 1e-12, gconv = 1e-12, xconv = 1e-12,
                              maxit = 200L))

cbind(fastglm = unname(coef(fit_f)), logistf = unname(coef(fit_l)))
#>          fastglm     logistf
#> [1,]  0.12025405  0.12025405
#> [2,] -1.10598133 -1.10598133
#> [3,] -0.06881673 -0.06881673
#> [4,]  2.26887465  2.26887465
#> [5,] -2.11140819 -2.11140819
#> [6,] -0.78831695 -0.78831695
#> [7,]  3.09601183  3.09601183
max(abs(unname(coef(fit_f)) - unname(coef(fit_l))))
#> [1] 1.478671e-09

Coefficients agree to roughly 1e-7. The runtime advantage is substantial because fastglm’s Firth solver reuses the C++ IRLS infrastructure:

mb <- microbenchmark::microbenchmark(
    fastglm = fastglm(X, y, family = binomial(), firth = TRUE),
    logistf = logistf::logistf(case ~ age + oc + vic + vicl + vis + dia,
                               data = sex2, pl = FALSE, plconf = NULL),
    times = 10L
)
print(mb, unit = "ms")
#> Unit: milliseconds
#>     expr      min       lq      mean   median       uq      max neval cld
#>  fastglm 0.255717 0.265229 0.2795995 0.273798 0.283351 0.349320    10  a 
#>  logistf 1.263456 1.279077 1.3051735 1.287584 1.313230 1.411876    10   b

General GLM families

firth = TRUE works with all standard R families and link functions. For each family below, we verify that fastglm reproduces the bias-reduced coefficients from brglm2::brglmFit(type = "AS_mean") — both implement the same AS_mean adjustment, so the coefficients should agree to near machine precision.

library(brglm2)
set.seed(123)
n <- 500
x1 <- rnorm(n);  x2 <- rnorm(n)
X <- cbind(1, x1, x2)
eta_true <- 0.5 + 0.3 * x1 - 0.2 * x2

Binomial (logit, probit, cloglog)

y_bin <- rbinom(n, 1, plogis(eta_true))
df <- data.frame(y = y_bin, x1 = x1, x2 = x2)

for (lnk in c("logit", "probit", "cloglog")) {
    fam <- binomial(lnk)
    f <- fastglm(X, y_bin, family = fam, firth = TRUE, tol = 1e-10)
    b <- glm(y ~ x1 + x2, data = df, family = fam, method = "brglmFit",
             type = "AS_mean", control = list(epsilon = 1e-10, maxit = 200))
    cat(sprintf("binomial %-8s max|diff| = %.2e\n", lnk,
                max(abs(unname(coef(f)) - unname(coef(b))))))
}
#> binomial logit    max|diff| = 2.29e-12
#> binomial probit   max|diff| = 1.10e-11
#> binomial cloglog  max|diff| = 3.09e-11

Poisson (log, sqrt)

y_pois <- rpois(n, exp(eta_true))

for (lnk in c("log", "sqrt")) {
    fam <- poisson(lnk)
    f <- fastglm(X, y_pois, family = fam, firth = TRUE, tol = 1e-10)
    b <- glm(y ~ x1 + x2, data = data.frame(y = y_pois, x1 = x1, x2 = x2),
             family = fam, method = "brglmFit", type = "AS_mean",
             control = list(epsilon = 1e-10, maxit = 200))
    cat(sprintf("poisson  %-8s max|diff| = %.2e\n", lnk,
                max(abs(unname(coef(f)) - unname(coef(b))))))
}
#> poisson  log      max|diff| = 2.13e-12
#> poisson  sqrt     max|diff| = 6.30e-11

Gamma (log, inverse)

y_gam <- rgamma(n, shape = 5, rate = 5 / exp(eta_true))

for (lnk in c("log", "inverse")) {
    fam <- Gamma(lnk)
    f <- fastglm(X, y_gam, family = fam, firth = TRUE, tol = 1e-10)
    b <- glm(y ~ x1 + x2, data = data.frame(y = y_gam, x1 = x1, x2 = x2),
             family = fam, method = "brglmFit", type = "AS_mean",
             control = list(epsilon = 1e-10, maxit = 200))
    cat(sprintf("Gamma    %-8s max|diff| = %.2e\n", lnk,
                max(abs(unname(coef(f)) - unname(coef(b))))))
}
#> Gamma    log      max|diff| = 2.10e-05
#> Gamma    inverse  max|diff| = 1.67e-05

Gaussian (identity, log)

y_gauss <- eta_true + rnorm(n, 0, 0.5)

for (lnk in c("identity", "log")) {
    fam <- gaussian(lnk)
    y0 <- if (lnk == "log") pmax(y_gauss, 0.01) else y_gauss
    f <- fastglm(X, y0, family = fam, firth = TRUE, tol = 1e-10)
    b <- glm(y ~ x1 + x2, data = data.frame(y = y0, x1 = x1, x2 = x2),
             family = fam, method = "brglmFit", type = "AS_mean",
             control = list(epsilon = 1e-10, maxit = 200))
    cat(sprintf("gaussian %-8s max|diff| = %.2e\n", lnk,
                max(abs(unname(coef(f)) - unname(coef(b))))))
}
#> gaussian identity max|diff| = 1.67e-16
#> gaussian log      max|diff| = 1.78e-06

Inverse Gaussian (log)

mu_ig <- exp(eta_true)
y_ig <- statmod::rinvgauss(n, mean = mu_ig, shape = 5)
y_ig[y_ig <= 0] <- 0.01

f <- fastglm(X, y_ig, family = inverse.gaussian("log"), firth = TRUE, tol = 1e-10)
b <- glm(y ~ x1 + x2, data = data.frame(y = y_ig, x1 = x1, x2 = x2),
         family = inverse.gaussian("log"), method = "brglmFit", type = "AS_mean",
         control = list(epsilon = 1e-10, maxit = 200))
cat(sprintf("inv.gauss log      max|diff| = %.2e\n",
            max(abs(unname(coef(f)) - unname(coef(b))))))
#> inv.gauss log      max|diff| = 1.16e-06

Standard errors

The standard errors reported by fastglm with firth = TRUE come from the unscaled \((X^\top W X)^{-1}\) at convergence, the same quantity that brglm2 uses. For unit-dispersion families (binomial, Poisson), these are the Firth-penalized standard errors directly; for other families, the dispersion is estimated as \(\hat\phi = D / (n - p)\) and applied to the variance-covariance matrix.

f <- fastglm(X, y_bin, family = binomial(), firth = TRUE, tol = 1e-10)
b <- glm(y ~ x1 + x2, data = df, family = binomial(), method = "brglmFit",
         type = "AS_mean", control = list(epsilon = 1e-10, maxit = 200))

cbind(fastglm_SE = f$se, brglm2_SE = summary(b)$coefficients[, "Std. Error"])
#>             fastglm_SE  brglm2_SE
#> (Intercept) 0.09423309 0.09423309
#> x1          0.09910290 0.09910290
#> x2          0.09527990 0.09527990
max(abs(f$se - summary(b)$coefficients[, "Std. Error"]))
#> [1] 8.0283e-14

Penalized deviance

The Firth-penalized deviance is \(D^* = D - \log |X^\top W X|\), where \(D\) is the standard deviance and the log-determinant comes from the information matrix at the penalized MLE. Both the standard and penalized deviances are reported:

f <- fastglm(X, y_bin, family = binomial(), firth = TRUE)
c(deviance = f$deviance,
  log.det.XtWX = f$log.det.XtWX,
  penalized.deviance = f$penalized.deviance)
#>           deviance       log.det.XtWX penalized.deviance 
#>          644.08511           14.05578          630.02933

Speed comparison across families

A timing comparison of fastglm(firth = TRUE) against brglm2::brglmFit(type = "AS_mean") across families with n = 10000 observations and p = 5 covariates:

set.seed(123)
n <- 10000;  p <- 5
x1 <- rnorm(n);  x2 <- rnorm(n); x3 <- rnorm(n)
x4 <- rnorm(n);  x5 <- rnorm(n)
X <- cbind(1, x1, x2, x3, x4, x5)
eta <- 0.5 + 0.3 * x1 - 0.2 * x2 + 0.1 * x3

families <- list(
    `binomial logit`   = list(fam = binomial("logit"),
                              y = rbinom(n, 1, plogis(eta))),
    `binomial probit`  = list(fam = binomial("probit"),
                              y = rbinom(n, 1, plogis(eta))),
    `binomial cloglog` = list(fam = binomial("cloglog"),
                              y = rbinom(n, 1, plogis(eta))),
    `poisson log`      = list(fam = poisson("log"),
                              y = rpois(n, exp(eta))),
    `Gamma log`        = list(fam = Gamma("log"),
                              y = rgamma(n, shape = 5, rate = 5 / exp(eta))),
    `gaussian identity`= list(fam = gaussian("identity"),
                              y = eta + rnorm(n, 0, 0.5))
)

df <- data.frame(y = 0, x1 = x1, x2 = x2, x3 = x3, x4 = x4, x5 = x5)
results <- list()
for (nm in names(families)) {
    fi <- families[[nm]]
    df$y <- fi$y
    mb <- microbenchmark::microbenchmark(
        fastglm = fastglm(X, fi$y, family = fi$fam, firth = TRUE),
        brglm2  = glm(y ~ x1 + x2 + x3 + x4 + x5, data = df,
                      family = fi$fam, method = "brglmFit", type = "AS_mean"),
        times = 10L
    )
    s <- summary(mb, unit = "ms")
    cat(sprintf("%-20s fastglm=%7.2f ms  brglm2=%7.2f ms  speedup=%.1fx\n",
                nm, s$median[1], s$median[2], s$median[2] / s$median[1]))
}
#> binomial logit       fastglm=   3.22 ms  brglm2=  13.31 ms  speedup=4.1x
#> binomial probit      fastglm=   4.22 ms  brglm2=  15.77 ms  speedup=3.7x
#> binomial cloglog     fastglm=   5.58 ms  brglm2=  16.75 ms  speedup=3.0x
#> poisson log          fastglm=   3.66 ms  brglm2= 121.21 ms  speedup=33.1x
#> Gamma log            fastglm=   3.64 ms  brglm2= 100.23 ms  speedup=27.6x
#> gaussian identity    fastglm=   1.02 ms  brglm2=  21.17 ms  speedup=20.7x

References