
\documentclass{article}
\usepackage{amstext}
\usepackage{amsfonts}
\usepackage{hyperref}
\usepackage[round]{natbib}
\usepackage{hyperref}
\usepackage{graphicx}
\usepackage{rotating}
%%\usepackage[nolists]{endfloat}
%%\usepackage{Sweave}

%%\VignetteIndexEntry{Survival Ensembles}
%%\VignetteDepends{mboost, survival, rpart, TH.data, partykit}

\newcommand{\Rpackage}[1]{{\normalfont\fontseries{b}\selectfont #1}}
\newcommand{\Robject}[1]{\texttt{#1}}
\newcommand{\Rclass}[1]{\textit{#1}}
\newcommand{\Rcmd}[1]{\texttt{#1}}
\newcommand{\Roperator}[1]{\texttt{#1}}
\newcommand{\Rarg}[1]{\texttt{#1}}
\newcommand{\Rlevel}[1]{\texttt{#1}}

\newcommand{\RR}{\textsf{R}}
\renewcommand{\S}{\textsf{S}}

\RequirePackage[T1]{fontenc}
\RequirePackage{graphicx,ae,fancyvrb}
\IfFileExists{upquote.sty}{\RequirePackage{upquote}}{}
\usepackage{relsize}

\DefineVerbatimEnvironment{Sinput}{Verbatim}{baselinestretch=1}
\DefineVerbatimEnvironment{Soutput}{Verbatim}{fontfamily=courier,
                                              baselinestretch=1,
                                              fontshape=it,
                                              fontsize=\relsize{-1}}
\DefineVerbatimEnvironment{Scode}{Verbatim}{}
\newenvironment{Schunk}{}{}

\renewcommand{\baselinestretch}{1}

\hypersetup{%
  pdftitle = {mboost Illustrations},
  pdfsubject = {package vignette},
  pdfauthor = {Torsten Hothorn and Peter Buhlmann},
%% change colorlinks to false for pretty printing
  colorlinks = {true},
  linkcolor = {blue},
  citecolor = {blue},
  urlcolor = {red},
  hyperindex = {true},
  linktocpage = {true},
}

\begin{document}

\setkeys{Gin}{width=\textwidth}

\title{Survival Ensembles}

\author{Torsten Hothorn$^{1,\star}$, Peter B\"uhlmann$^2$, Sandrine Dudoit$^3$, \\
        Annette Molinaro$^4$ and Mark J. van der Laan$^3$}
\date{}
\maketitle

\noindent$^1$Institut f\"ur Statistik \\
           Ludwig-Maximilians-Universit\"at M\"unchen \\
           Ludwigstra{\ss}e 33, D-80539 M\"unchen, Germany \\
     Tel: ++49--9131--8522707 \\
     Fax: ++49--9131--8525740 \\
     \texttt{Torsten.Hothorn@R-project.org}
\newline

\noindent$^2$Seminar f\"ur Statistik, ETH Z\"urich,
             CH-8032 Z\"urich, Switzerland \\
            \texttt{buhlmann@stat.math.ethz.ch}
\newline

\noindent$^3$Division of Biostatistics, University of California, Berkeley \\
     140 Earl Warren Hall, \#7360, Berkeley, CA 94720-7360, USA \\
    \texttt{sandrine@stat.Berkeley.EDU} \\
    \texttt{laan@stat.Berkeley.EDU}
\newline

\noindent$^4$Division of Biostatistics, Epidemiology and Public Health\\
    Yale University School of Medicine, 206 LEPH \\
    60 College Street PO Box 208034, New Haven CT 06520-8034 \\
    \texttt{annette.molinaro@yale.edu}
\newline

\section{Illustrations and Applications}

This document reproduces the data analyses presented in
\cite{hothetal06}. For a description of the theory behind
applications shown here we refer to the original manuscript.
The results differ slightly due to technical changes or bugfixes in \textbf{mboost}
that have been implemented after the paper was printed.

\subsection{Acute myeloid leukemia}

<<setup, echo = FALSE, results = hide>>=
source("setup.R")
if (!require("TH.data"))
    stop("cannot attach package ", sQuote("TH.data"))
if (!require("rpart"))
    stop("cannot attach package ", sQuote("rpart"))
if (!require("survival"))
    stop("cannot attach package ", sQuote("survival"))
if (!require("partykit"))
    stop("cannot attach package ", sQuote("partykit"))

set.seed(290875)
CEX <- 0.85
options(digits = 3)

### mean difference plots
mdplot <- function(obs, pred, main = "", ...) {

    m <- (obs + pred)/2
    d <- obs - pred
    plot(m, d, xlab = "(Observed + Predicted) / 2",
         ylab = "Observed - Predicted", main =
         main, cex.axis = CEX, cex.main = CEX, cex.lab = CEX, ...)
    abline(h = 0, lty = 3)
}
@

<<loaddata, echo = FALSE, results = tex>>=
### load data. See `mboost/inst/readAML_Bullinger.R' for
### how the data were generated from the raw data.
load(file.path(path.package(package = "TH.data"), "rda", "AML_Bullinger.rda"))
@

\paragraph{Data preprocessing}

Compute IPC weights, define risk score and set up learning sample:
<<AML-dpp, echo = TRUE>>=
### compute IPC weights
AMLw <- IPCweights(Surv(clinical$time, clinical$event))

### risk score
risk <- rep(0, nrow(clinical))
rlev <- levels(clinical[, "Cytogenetic.group"])
risk[clinical[, "Cytogenetic.group"] %in% rlev[c(7,8,4)]] <- "low"
risk[clinical[, "Cytogenetic.group"] %in% rlev[c(5, 9)]] <- "intermediate"
risk[clinical[, "Cytogenetic.group"] %in% rlev[-c(4,5, 7,8,9)]] <- "high"
risk <- as.factor(risk)

### set-up learning sample
AMLlearn <- cbind(clinical[, c("time", "Sex", "Age", "LDH", "WBC",
                        "FLT3.aberration.", "MLL.PTD", "Tx.Group.")],
               risk = risk,
               iexpressions[, colnames(iexpressions) %in% selgenes[["Clone.ID"]]])
cc <- complete.cases(AMLlearn)
AMLlearn <- AMLlearn[AMLw > 0 & cc,]
AMLw <- AMLw[AMLw > 0 & cc]
@

\paragraph{Model fitting}

Fit random forest for censored data
<<AML-RF, echo = TRUE>>=
### controls for tree growing
ctrl <- ctree_control(testtype = "Teststatistic", 
                      teststat = "maximum", mincriterion = .1, minsplit = 5)
### was: cforest_control(mincriterion = 0.1, mtry = 5, minsplit = 5, ntree = 250)

### fit random forest for censored data (warnings are OK here)
AMLrf <- cforest(log(time) ~ ., data = AMLlearn, control = ctrl,
                 weights = AMLw, mtry = 5, ntree = 250, 
                 perturb = list(replace = TRUE, fraction = 0.632))
@
and $L_2$Boosting for censored data
<<AML-boost, echo = TRUE>>=
AMLl2b <- glmboost(I(log(time)) ~ ., data = AMLlearn, weights = AMLw,
                    control = boost_control(mstop = 5000))
@

\begin{figure}
\begin{center}
<<AML-AIC, echo = TRUE, fig = TRUE>>=
### AIC criterion
plot(aic <- AIC(AMLl2b))
@
\caption{AIC criterion for AML data.}
\end{center}
\end{figure}

Compute fitted values
<<AML-fitted, echo = TRUE>>=
### restrict number of boosting iterations and inspect selected variables
AMLl2b <- AMLl2b[mstop(aic)]
cAML <- coef(AMLl2b)
cAML[abs(cAML) > 0]

### fitted values
AMLprf <- predict(AMLrf, newdata = AMLlearn)
AMLpb <- predict(AMLl2b, newdata = AMLlearn)
@

\begin{figure}
\begin{center}
<<Figure1, echo = FALSE, fig = TRUE>>=
Mmod <- sum(AMLw * log(AMLlearn$time))/sum(AMLw )
par(mai = par("mai") * c(0.7, 0.8, 0.7, 0.6))
layout(matrix(1:4, ncol = 2))

mdplot(log(AMLlearn$time), AMLprf, main = "Random Forest",
       cex = AMLw / 4, ylim = c(-4, 4), xlim = c(0, 7))

plot(log(AMLlearn$time), AMLprf, cex = AMLw / 4,
       ylim = range(log(AMLlearn$time)), ylab = "Predicted", xlab = "Observed",
       main = "Random Forest",  cex.axis = CEX, cex.main = CEX, cex.lab = CEX)
abline(h = Mmod, lty = 2)

mdplot(log(AMLlearn$time), AMLpb,
        cex = AMLw / 4, main = "Boosting", ylim = c(-4, 4), xlim = c(0, 7))

plot(log(AMLlearn$time), AMLpb, cex = AMLw / 4,
       ylim = range(log(AMLlearn$time)), ylab = "Predicted", xlab = "Observed",
       main = "Boosting",  cex.axis = CEX, cex.main = CEX, cex.lab = CEX)
abline(h = Mmod, lty = 2)
@
\caption{AML data: Reproduction of Figure 1.}
\end{center}
\end{figure}

\subsection{Node-positive breast cancer}

\paragraph{Data preprocessing}

Compute IPC weights and set up learning sample:
<<GBSG2-dpp, echo = TRUE>>=
### attach data
data("GBSG2", package = "TH.data")

### IPC weights
GBSG2w <- IPCweights(Surv(GBSG2$time, GBSG2$cens))

### set-up learning sample
GBSG2learn <- cbind(GBSG2[,-which(names(GBSG2) %in% c("time", "cens"))],
               ltime = log(GBSG2$time))
n <- nrow(GBSG2learn)
@

\paragraph{Model fitting}

<<GBSG2-models, echo = TRUE>>=
### linear model
LMmod <- lm(ltime ~ . , data = GBSG2learn, weights = GBSG2w)
LMerisk <- sum((GBSG2learn$ltime - predict(LMmod))^2*GBSG2w) / n

### regression tree
pos <- GBSG2w > 0
TRmod <- rpart(ltime ~ . , data = GBSG2learn, weights = GBSG2w, 
               subset = pos)
TRerisk <- sum((GBSG2learn$ltime[pos] - predict(TRmod))^2*GBSG2w[pos]) / n

### tree controls
ctrl <- ctree_control(testtype = "Teststatistic", 
                      teststat = "maximum", mincriterion = qnorm(.95), 
                      minsplit = 5)
### was: cforest_control(mincriterion = qnorm(0.95), mtry = 5,
###                      minsplit = 5, ntree = 100)


### fit random forest for censored data (warnings are OK here)
RFmod <- cforest(ltime ~ . , data = GBSG2learn, weights = GBSG2w,
                 control = ctrl, mtry = 5, ntree = 100, 
                 perturb = list(replace = TRUE, 
                     fraction = 0.632 * sum(GBSG2w > 0)))

### fit L2 boosting for censored data
L2Bmod <- glmboost(ltime ~ ., data = GBSG2learn, weights = GBSG2w,
                   control = boost_control(mstop = 250))

### with Huber loss function
L2BHubermod <- glmboost(ltime ~ ., data = GBSG2learn, weights = GBSG2w,
                        family = Huber(d = log(2)))
@

\begin{figure}
\begin{center}
<<GBSG2-AIC, echo = TRUE, fig = TRUE>>=
plot(aic <- AIC(L2Bmod))
@
\caption{AIC criterion for GBSG2 data.}
\end{center}
\end{figure}

Compute fitted values:
<<GBSG2-fitted, echo = TRUE>>=
GBSG2Hp <- predict(L2BHubermod, newdata = GBSG2learn)
L2Berisk <- sum((GBSG2learn$ltime - predict(L2Bmod, newdata = GBSG2learn))^2*GBSG2w) / n
RFerisk <- sum((GBSG2learn$ltime - predict(RFmod, newdata = GBSG2learn))^2*GBSG2w) / n
@

\begin{figure}
\begin{center}
<<Figure3, echo = FALSE, fig = TRUE>>=
lim <- c(4,9)
mylwd <- 0.5
par(mai = par("mai") * c(0.7, 0.8, 0.7, 0.6))

layout(matrix(1:4, ncol = 2))
Mmod <- sum(GBSG2w * GBSG2learn$ltime)/sum(GBSG2w)

mdplot(GBSG2learn$ltime, predict(LMmod), cex = GBSG2w / 4, main = "Linear Model",
       ylim = c(-3, 3), xlim = c(5, 8))
mdplot(GBSG2learn$ltime[pos], predict(TRmod), cex = GBSG2w / 4, main = "Tree",
       ylim = c(-3, 3), xlim = c(5, 8))
mdplot(GBSG2learn$ltime, predict(RFmod, newdata = GBSG2learn), cex = GBSG2w / 4,
       main = "Random Forest", ylim = c(-3, 3), xlim = c(5, 8))
mdplot(GBSG2learn$ltime, predict(L2Bmod, newdata = GBSG2learn), cex = GBSG2w / 4,
       main = "Boosting", ylim = c(-3, 3), xlim = c(5, 8))
@
\caption{GBSG-2 data: Reproduction of Figure 3.}
\end{center}
\end{figure}

\begin{figure}
\begin{center}
<<Figure5, echo = FALSE, fig = TRUE>>=
RFpr <- predict(RFmod, newdata = GBSG2learn)
L2Bpr <- predict(L2Bmod, newdata = GBSG2learn)
ylim <- range(c(RFpr[GBSG2w > 0], L2Bpr[GBSG2w > 0]))
mydf <- 4
par(mai = par("mai") * c(0.7, 0.8, 0.4, 0.6))
layout(matrix(1:4, ncol = 2))
plot(GBSG2learn$pnodes, RFpr, cex = GBSG2w/4, xlim = c(0,40), lwd = mylwd,
     xlab = "Nr. positive lymph nodes", ylim = ylim, ylab =
     expression(hat(Y)), cex.axis = CEX, cex.main = CEX, cex.lab = CEX,)
lines(smooth.spline(GBSG2learn$pnodes, RFpr, GBSG2w/4, df = mydf))
plot(GBSG2learn$age, RFpr, cex = GBSG2w/4, xlab = "Age",
     ylab = expression(hat(Y)), ylim = ylim, lwd = mylwd, cex.axis = CEX,
     cex.main = CEX, cex.lab = CEX)
lines(smooth.spline(GBSG2learn$age, RFpr, GBSG2w/4, df = mydf))
plot(GBSG2learn$estrec, RFpr, cex = GBSG2w/4, xlab = "Estrogen receptor",
     ylab = expression(hat(Y)), ylim = ylim, lwd = mylwd, cex.axis = CEX,
     cex.main = CEX, cex.lab = CEX)
lines(smooth.spline(GBSG2learn$estrec, RFpr, GBSG2w/4, df = mydf))
indx <- which(GBSG2learn$progrec < 100)
plot(GBSG2learn$progrec[indx], RFpr[indx], cex = GBSG2w[indx]/4,
     xlab = "Progesterone receptor (< 100 fmol / l)",
     ylab = expression(hat(Y)), ylim = ylim, lwd = mylwd, cex.axis = CEX,
     cex.main = CEX, cex.lab = CEX)
lines(smooth.spline(GBSG2learn$progrec[indx], RFpr[indx], GBSG2w[indx]/4, df = mydf))
@
\caption{GBSG-2 data: Reproduction of Figure 5.}
\end{center}
\end{figure}


\begin{figure}
\begin{center}
<<Figure6, echo = FALSE, fig = TRUE>>=
par(mai = par("mai") * c(0.7, 0.8, 0.4, 0.6))
layout(matrix(1:4, ncol = 2))
plot(GBSG2learn$pnodes, L2Bpr, cex = GBSG2w/4, xlim = c(0,40),
     ylab = expression(hat(Y)),
     xlab = "Nr. positive lymph nodes", ylim = ylim, lwd = mylwd, cex.axis =
     CEX, cex.main = CEX, cex.lab = CEX)
lines(smooth.spline(GBSG2learn$pnodes, L2Bpr, GBSG2w/4, df = mydf))
plot(GBSG2learn$age, L2Bpr, cex = GBSG2w/4, xlab = "Age",
     ylab = expression(hat(Y)),
     ylim = ylim, lwd = mylwd, cex.axis = CEX, cex.main = CEX, cex.lab =
     CEX)
lines(smooth.spline(GBSG2learn$age, L2Bpr, GBSG2w/4, df = mydf))
plot(GBSG2learn$estrec, L2Bpr, cex = GBSG2w/4, xlab = "Estrogen receptor",
     ylab = expression(hat(Y)), ylim = ylim, lwd = mylwd, cex.axis = CEX,
     cex.main = CEX, cex.lab = CEX)
lines(smooth.spline(GBSG2learn$estrec, L2Bpr, GBSG2w/4, df = mydf))
indx <- which(GBSG2learn$progrec < 100)
plot(GBSG2learn$progrec[indx], L2Bpr[indx], cex = GBSG2w[indx]/4,
     xlab = "Progesterone receptor (< 100 fmol / l)",
     ylab = expression(hat(Y)), ylim = ylim, lwd = mylwd, cex.axis = CEX,
     cex.main = CEX, cex.lab = CEX)
lines(smooth.spline(GBSG2learn$progrec[indx], L2Bpr[indx], GBSG2w[indx]/4, df = mydf))
@
\caption{GBSG-2 data: Reproduction of Figure 6.}
\end{center}
\end{figure}

\begin{figure}
\begin{center}
<<Figure7, echo = FALSE, fig = TRUE>>=
Mmod <- sum(GBSG2w * GBSG2learn$ltime)/sum(GBSG2w)
par(mai = par("mai") * c(0.7, 0.8, 0.7, 0.6))
layout(matrix(1:4, ncol = 2))

yl <- range(c(GBSG2Hp[GBSG2w > 0], L2Bpr[GBSG2w > 0]))
mdplot(GBSG2learn$ltime, GBSG2Hp, main = "Huber Loss",
       cex = GBSG2w / 4, ylim = c(-3, 3), xlim = c(5, 8))

plot(GBSG2learn$ltime, GBSG2Hp, cex = GBSG2w / 4,
       xlim = range(GBSG2learn$ltime[GBSG2w > 0]), ylim = yl, ylab = "Predicted", xlab = "Observed",
       main = "Huber Loss",  cex.axis = CEX, cex.main = CEX, cex.lab = CEX)

mdplot(GBSG2learn$ltime, L2Bpr,
        cex = GBSG2w / 4, main = "Quadratic Loss", ylim = c(-3, 3), xlim = c(5, 8))

plot(GBSG2learn$ltime, L2Bpr, cex = GBSG2w / 4,
       xlim = range(GBSG2learn$ltime[GBSG2w > 0]), ylim = yl, ylab = "Predicted", xlab = "Observed",
       main = "Quadratic Loss",  cex.axis = CEX, cex.main = CEX, cex.lab = CEX)
@
\caption{GBSG-2 data: Reproduction of Figure 7.}
\end{center}
\end{figure}

\clearpage

\bibliographystyle{plainnat}
\bibliography{boost}


\end{document}
