## ----Packages, message=FALSE--------------------------------------------------
library(survey)
library(ggplot2)
library(splines)
library(magrittr)

## ----LoadSurveyCV-------------------------------------------------------------
library(surveyCV)
rerunSims <- FALSE  # when we *do* rerun sims, remember to reinstall package & restart R
knitr::opts_chunk$set(dpi=300, out.extra ='WIDTH="95%"')

## ----ArtifPop-----------------------------------------------------------------
# Generate a cubic-ish artificial population with 1000 units
set.seed(47)

N <- 1000
nrStrata <- 10
nrClusters <- 100
clusSizes <- N/nrClusters
  
x1 = stats::runif(1:(N/2), min = 26, max = 38)
y1 = (x1-29)^3 - 13*(x1-29)^2 + 0*(x1-29) + 900

x2 = stats::runif(1:(N/2), min = 38, max = 50)
y2 = (x2-36)^3 - 10*(x2-36)^2 + 2*(x2-36) + 600

z1 = jitter(y1, 15000)
z2 = jitter(y2, 15000)

ds1 <- data.frame(Response = z1, Predictor = x1)
ds2 <- data.frame(Response = z2, Predictor = x2)
ds12 <- rbind(ds1, ds2)

b12 <- data.frame(ID = c(1:1000))
splinepop.df <- cbind(b12, ds12)
splinepop.df <- splinepop.df %>%
  dplyr::arrange(Predictor) %>%
  # Create Cluster and Stratum variables so that
  # we can (separately) simulate both kinds of sampling
  dplyr::mutate(Stratum = dplyr::row_number(),
                Cluster = dplyr::row_number()) %>% 
  dplyr::mutate(Stratum = cut(Stratum, nrStrata, 1:nrStrata),
                Cluster = cut(Cluster, nrClusters, 1:nrClusters)) %>%
  # Create fpc variables for each sampling type:
  # full pop has 1000 units; 100 units/stratum; 100 clusters (of 10 units each);
  # & if we added a combined Strat+Clus sim, we'd need different fpc's for that
  dplyr::mutate(fpcSRS = N,
                fpcStratum = N/nrStrata,
                fpcCluster = nrClusters) %>%
  dplyr::arrange(ID)

# Create unequal sampling weights that are intentionally 
# NOT well matched to the true shape of the population,
# so we can see how the use of weights impacts
# model-training step vs test-error estimation step
# in a case where naively ignoring weights will confidently pick wrong model
lm_quad <- stats::lm(Response ~ Predictor + I(Predictor^2),
                     data = splinepop.df)
splinepop.df$samp_prob_quad <-
  (1/(abs(lm_quad$residuals))) / sum(1/(abs(lm_quad$residuals)))
splinepop.df$samp_wt_quad <- 1/splinepop.df$samp_prob_quad

stopifnot(all.equal(sum(1/splinepop.df$samp_wt_quad), 1))

## ----PopulationPlot, fig.width=8.3, fig.height=3------------------------------
n <- 100

srs.df <- dplyr::sample_n(splinepop.df, n)

stratcounts <- rep(n/nrStrata, nrStrata)
names(stratcounts) <- 1:nrStrata
s <- stratsample(splinepop.df$Stratum, stratcounts)
strat.df <- splinepop.df[s,]

c <- unique(splinepop.df[["Cluster"]]) 
clus.df <- splinepop.df[splinepop.df[["Cluster"]] %in% sample(c, n/clusSizes),]

a <- ggplot(splinepop.df, aes(x = Predictor, y = Response)) + 
  geom_point(shape = 1) +
  labs(title = "Artificial Pop.", x = "Predictor", y = "Response") + 
  ylim(100,1650) + xlim(25, 51)
b <- ggplot(srs.df, aes(x = Predictor, y = Response)) + 
  geom_point(shape = 8, color = "darkgreen") +
  labs(title = "SRS", x = "Predictor", y = "Response") + 
  ylim(100,1650) + xlim(25, 51)
c <- ggplot(strat.df, aes(x = Predictor, y = Response)) +
  geom_point(shape = 3, color = "darkred") +
  labs(title = "Stratified Sample", x = "Predictor", y = "Response") + 
  ylim(100,1650) + xlim(25, 51)
d <- ggplot(clus.df, aes(x = Predictor, y = Response)) +
  geom_point(shape = 4, color = "steelblue") +
  labs(title = "Cluster Sample", x = "Predictor", y = "Response") + 
  ylim(100,1650) + xlim(25, 51)
gridExtra::grid.arrange(a, b, c, d, ncol = 4,
                        top = "Simulated Data and Examples of How It Was Sampled")

## ----FoldsSimsSetup-----------------------------------------------------------
# TODO: redo with different n's?
n <- 100
loops <- 500

## ----FoldsSimsResults, eval = rerunSims---------------------------------------
#  # Use SRS samples and make SRS folds
#  srssrsds <- data.frame(df = c(), MSE = c())
#  # Use SRS samples and evaluate on full pop
#  srspopds <- data.frame(df = c(), MSE = c())
#  
#  # Use Cluster samples and make Cluster folds
#  clusclusds <- data.frame(df = c(), MSE = c())
#  # Use Cluster samples and make SRS folds
#  clussrsds <- data.frame(df = c(), MSE = c())
#  # Use Cluster samples and evaluate on full pop
#  cluspopds <- data.frame(df = c(), MSE = c())
#  
#  # Use Strat samples and make Strat folds
#  stratstratds <- data.frame(df = c(), MSE = c())
#  # Use Strat samples and make SRS folds
#  stratsrsds <- data.frame(df = c(), MSE = c())
#  # Use Strat samples and evaluate on full pop
#  stratpopds <- data.frame(df = c(), MSE = c())
#  
#  modelsToFit <- c("Response~splines::ns(Predictor, df=1)",
#                   "Response~splines::ns(Predictor, df=2)",
#                   "Response~splines::ns(Predictor, df=3)",
#                   "Response~splines::ns(Predictor, df=4)",
#                   "Response~splines::ns(Predictor, df=5)",
#                   "Response~splines::ns(Predictor, df=6)")
#  
#  # Making as many simple random samples as we specify for 'loops'
#  for (i in 1:loops) {
#    # Take a SRS
#    sim.srs <- dplyr::sample_n(splinepop.df, n)
#  
#    # Using our SRS function on SRS samples
#    srs.CV.out <- cv.svy(sim.srs, modelsToFit,
#                         nfolds = 5, fpcID = "fpcSRS") %>% as.vector()
#    srssrsds2 <- data.frame(df = 1:6, MSE = srs.CV.out)
#    srssrsds <- rbind(srssrsds, srssrsds2)
#  
#    # Eval SRS samples on full pop
#    sub.srs <- dplyr::sample_n(sim.srs, n*.8)
#    srs.des <- svydesign(ids = ~1, fpc = ~fpcSRS, data = sub.srs)
#    srs.pop.out <- sapply(1:6,
#      function(ii) {
#          yhat <- predict(svyglm(modelsToFit[ii], srs.des),
#                          newdata = splinepop.df)
#          return(mean((yhat - splinepop.df$Response)^2))
#        })
#    srspopds2 <- data.frame(df = 1:6, MSE = srs.pop.out)
#    srspopds <- rbind(srspopds, srspopds2)
#  }
#  
#  # Making as many cluster samples as we specify for 'loops'
#  for (i in 1:loops) {
#    # Take a Cluster sample
#    c <- unique(splinepop.df[["Cluster"]])
#    sim.clus <- splinepop.df[splinepop.df[["Cluster"]] %in% sample(c, n/clusSizes),]
#  
#    # Using our Cluster function on Cluster samples
#    clus.CV.out <- cv.svy(sim.clus, modelsToFit,
#                          clusterID = "Cluster", nfolds = 5, fpcID = "fpcCluster") %>% as.vector()
#    clusclusds2 <- data.frame(df = 1:6, MSE = clus.CV.out)
#    clusclusds <- rbind(clusclusds, clusclusds2)
#  
#    # Using our SRS function on Cluster samples
#    srs.CV.out <- cv.svy(sim.clus, modelsToFit,
#                         nfolds = 5, fpcID = "fpcSRS") %>% as.vector()
#    clussrsds2 <- data.frame(df = 1:6, MSE = srs.CV.out)
#    clussrsds <- rbind(clussrsds, clussrsds2)
#  
#    # Eval 80% Cluster samples on full pop
#    sub.c <- unique(sim.clus[["Cluster"]])
#    sub.clus <- sim.clus[sim.clus[["Cluster"]] %in% sample(sub.c, (n/clusSizes)*.8),]
#  
#    clus.des <- svydesign(ids = ~Cluster, fpc = ~fpcCluster, data = sub.clus)
#    clus.pop.out <- sapply(1:6,
#      function(ii) {
#          yhat <- predict(svyglm(modelsToFit[ii], clus.des),
#                          newdata = splinepop.df)
#          return(mean((yhat - splinepop.df$Response)^2))
#        })
#    cluspopds2 <- data.frame(df = 1:6, MSE = clus.pop.out)
#    cluspopds <- rbind(cluspopds, cluspopds2)
#  }
#  
#  # Making as many stratified samples as we specify for 'loops'
#  for (i in 1:loops) {
#    # Take a Strat sample
#    stratcounts <- rep(n/nrStrata, nrStrata)
#    names(stratcounts) <- 1:nrStrata
#    s <- stratsample(splinepop.df$Stratum, stratcounts)
#    sim.strat <- splinepop.df[s,]
#  
#    # Using our Strat function on Strat samples
#    strat.CV.out <- cv.svy(sim.strat, modelsToFit,
#                           strataID = "Stratum", nfolds = 5, fpcID = "fpcStratum") %>% as.vector()
#    stratstratds2 <- data.frame(df = 1:6, MSE = strat.CV.out)
#    stratstratds <- rbind(stratstratds, stratstratds2)
#  
#    # Using our SRS function on Strat samples
#    srs.CV.out <- cv.svy(sim.strat, modelsToFit,
#                         nfolds = 5, fpcID = "fpcSRS") %>% as.vector()
#    stratsrsds2 <- data.frame(df = 1:6, MSE = srs.CV.out)
#    stratsrsds <- rbind(stratsrsds, stratsrsds2)
#  
#    # Eval 80% Strat samples on full pop
#    sub.stratcounts <- rep((n/nrStrata)*.8, nrStrata)
#    names(sub.stratcounts) <- 1:nrStrata
#    sub.s <- survey::stratsample(sim.strat$Stratum, sub.stratcounts)
#    sub.strat <- sim.strat[sub.s,]
#  
#    strat.des <- svydesign(ids = ~1, strata = ~Stratum, fpc = ~fpcStratum, data = sub.strat)
#    strat.pop.out <- sapply(1:6,
#      function(ii) {
#          yhat <- predict(svyglm(modelsToFit[ii], strat.des),
#                          newdata = splinepop.df)
#          return(mean((yhat - splinepop.df$Response)^2))
#        })
#    stratpopds2 <- data.frame(df = 1:6, MSE = strat.pop.out)
#    stratpopds <- rbind(stratpopds, stratpopds2)
#  }
#  
#  # Making the degrees of freedom variable a factor variable
#  srssrsds$df <- as.factor(srssrsds$df)
#  srspopds$df <- as.factor(srspopds$df)
#  clusclusds$df <- as.factor(clusclusds$df)
#  clussrsds$df <- as.factor(clussrsds$df)
#  cluspopds$df <- as.factor(cluspopds$df)
#  stratstratds$df <- as.factor(stratstratds$df)
#  stratsrsds$df <- as.factor(stratsrsds$df)
#  stratpopds$df <- as.factor(stratpopds$df)
#  
#  usethis::use_data(srssrsds, srspopds,
#                    clusclusds, clussrsds, cluspopds,
#                    stratstratds, stratsrsds, stratpopds,
#                    internal = FALSE, overwrite = TRUE)
#  # TODO: for now setting internal=FALSE
#  # so that we don't overwrite the sysdata.Rda for intro.Rmd;
#  # but eventually when we're ready to replace that old vignette,
#  # we can return to internal=TRUE here
#  # if that's indeed better for R packages
#  # ...
#  # OK, now we've written those dataframes,
#  #   AND the 4 below, all into R/sysdata.Rda together

## ----FoldsSimsPlots, fig.width=7, fig.height=4.7------------------------------
if(!rerunSims) {
  srssrsds <- surveyCV:::srssrsds
  clusclusds <- surveyCV:::clusclusds
  clussrsds <- surveyCV:::clussrsds
  stratstratds <- surveyCV:::stratstratds
  stratsrsds <- surveyCV:::stratsrsds
  srspopds <- surveyCV:::srspopds
  cluspopds <- surveyCV:::cluspopds
  stratpopds <- surveyCV:::stratpopds
}

# Find the y-ranges of MSEs
yminMSE <- min(min(srssrsds$MSE),
               min(clusclusds$MSE), min(clussrsds$MSE),
               min(stratstratds$MSE), min(stratsrsds$MSE))
ymaxMSE <- max(max(srssrsds$MSE),
               max(clusclusds$MSE), max(clussrsds$MSE),
               max(stratstratds$MSE), max(stratsrsds$MSE))

srspopds <- dplyr::group_by(srspopds, df) %>%
  dplyr::summarise(MSE = mean(MSE)) %>% 
  dplyr::mutate(df = as.numeric(df))
cluspopds <- dplyr::group_by(cluspopds, df) %>%
  dplyr::summarise(MSE = mean(MSE)) %>% 
  dplyr::mutate(df = as.numeric(df))
stratpopds <- dplyr::group_by(stratpopds, df) %>%
  dplyr::summarise(MSE = mean(MSE)) %>% 
  dplyr::mutate(df = as.numeric(df))


# Making ggplot objects for the MSEs and AICs
plot.srssrs <- ggplot(data = srssrsds, mapping = aes(x = df, y = MSE/1e4)) +
  geom_boxplot() +
  geom_line(data = srspopds, mapping = aes(x = df, y = MSE/1e4)) +
  ggtitle("CV: SRS folds,\nSRS sample") +
  scale_y_log10(limits = c(yminMSE, ymaxMSE)/1e4)
plot.blank <- rectGrob(width = 0, height = 0) # empty spot here -- no corresponding sim
plot.clusclus <- ggplot(data = clusclusds, mapping = aes(x = df, y = MSE/1e4)) +
  geom_boxplot() +
  geom_line(data = cluspopds) +
  ggtitle("CV: Cluster folds,\nCluster sample") +
  scale_y_log10(limits = c(yminMSE, ymaxMSE)/1e4)
plot.clussrs <- ggplot(data = clussrsds, mapping = aes(x = df, y = MSE/1e4)) +
  geom_boxplot() +
  geom_line(data = cluspopds) +
  ggtitle("CV: SRS folds,\nCluster sample") +
  scale_y_log10(limits = c(yminMSE, ymaxMSE)/1e4)
plot.stratstrat <- ggplot(data = stratstratds, mapping = aes(x = df, y = MSE/1e4)) +
  geom_boxplot() +
  geom_line(data = stratpopds) +
  ggtitle("CV: Strat folds,\nStrat sample") +
  scale_y_log10(limits = c(yminMSE, ymaxMSE)/1e4)
plot.stratsrs <- ggplot(data = stratsrsds, mapping = aes(x = df, y = MSE/1e4)) +
  geom_boxplot() +
  geom_line(data = stratpopds) +
  ggtitle("CV: SRS folds,\nStrat sample") +
  scale_y_log10(limits = c(yminMSE, ymaxMSE)/1e4)

gridExtra::grid.arrange(plot.srssrs, plot.clussrs, plot.stratsrs, 
                        plot.blank, plot.clusclus, plot.stratstrat,
             ncol = 3,
             top = paste0("Simulated Data (Sample Sizes = ", n,
                          ", Clusters = ", n/clusSizes,
                          " or Strata = ", nrStrata,
                          ", Loops = ", loops, ", Folds = 5)"))

## ----QuadWeightsPlot, fig.width=7, fig.height=3.5-----------------------------
ggplot(splinepop.df,
  aes(x = Predictor, y = Response)) +
  geom_point(aes(size = samp_prob_quad), alpha = 0.1) +
  stat_smooth(method = "lm", formula = y ~ x + I(x^2), se = FALSE) +
  labs(title = "Artificial Population with Sampling Probabilities",
       subtitle = "Sampling prob. is higher for points nearer the solid curve",
       x = "Predictor", y = "Response",
       size = "Sampling Probability")

## ----WeightsSimsSetup---------------------------------------------------------
# TODO: redo with different n's?
n <- 100
loops <- 500
weights <- "samp_wt_quad"

## ----WeightsSimsResults, eval = rerunSims-------------------------------------
#  # In all the following cases,
#  # we are only checking the effect of using sampling **weights**
#  # during model-fitting on training sets, and testing on test sets;
#  # we did not use cluster or stratified sampling,
#  # so all CV folds will (sensibly) be SRS (regardless of samp weights)
#  
#  # Use weighted models, and calculate MSEs using weighted design
#  AllW <- data.frame(df = c(), MSE = c())
#  # Use SRS models, and calculate MSEs using SRS design
#  NoW <- data.frame(df = c(), MSE = c())
#  # Use weighted models, and calculate MSEs using SRS design
#  ModW <- data.frame(df = c(), MSE = c())
#  # Use SRS models, and calculate MSEs using weighted design
#  MSEW <- data.frame(df = c(), MSE = c())
#  
#  for (i in 1:loops) {
#    # Take a sample of size n, using the sampling probabilities instead of SRS
#    # (using 1/samp_wt as the samp_prob per unit,
#    #  or n/samp_wt as overall samp_prob to get a sample of size n)
#    in.sample <- sampling::UPtille(n / splinepop.df[[weights]])
#    splinesamp.df <- splinepop.df[in.sample > 0, ]
#    modelsToFit <- c("Response~ns(Predictor, df=1)", "Response~ns(Predictor, df=2)",
#                     "Response~ns(Predictor, df=3)", "Response~ns(Predictor, df=4)",
#                     "Response~ns(Predictor, df=5)", "Response~ns(Predictor, df=6)")
#  
#    AllWdat <- cv.svy(splinesamp.df, modelsToFit,
#                      nfolds = 5, weightsID = weights) %>% as.vector()
#    NoWdat <- cv.svy(splinesamp.df, modelsToFit,
#                     nfolds = 5) %>% as.vector()
#    ModWdat <- cv.svy(splinesamp.df, modelsToFit,
#                      nfolds = 5, weightsID = weights,
#                      useSvyForLoss = FALSE) %>% as.vector()
#    MSEWdat <- cv.svy(splinesamp.df, modelsToFit,
#                      nfolds = 5, weightsID = weights,
#                      useSvyForFits = FALSE) %>% as.vector()
#  
#    # compiling one data frame
#    AllW2 <- data.frame(df = 1:6, MSE = AllWdat, sample = rep(i, 6))
#    AllW <- rbind(AllW, AllW2)
#    NoW2 <- data.frame(df = 1:6, MSE = NoWdat, sample = rep(i, 6))
#    NoW <- rbind(NoW, NoW2)
#    ModW2 <- data.frame(df = 1:6, MSE = ModWdat, sample = rep(i, 6))
#    ModW <- rbind(ModW, ModW2)
#    MSEW2 <- data.frame(df = 1:6, MSE = MSEWdat, sample = rep(i, 6))
#    MSEW <- rbind(MSEW, MSEW2)
#  }
#  
#  # Making the degrees of freedom variable a factor variable
#  AllW$df <- as.factor(AllW$df)
#  NoW$df <- as.factor(NoW$df)
#  MSEW$df <- as.factor(MSEW$df)
#  ModW$df <- as.factor(ModW$df)
#  
#  
#  usethis::use_data(AllW, NoW,
#                    MSEW, ModW,
#                    internal = FALSE, overwrite = TRUE)
#  # TODO: as above, for now setting internal=FALSE
#  # so that we don't overwrite the sysdata.Rda for intro.Rmd;
#  # but eventually when we're ready to replace that old vignette,
#  # we can return to internal=TRUE here
#  # if that's indeed better for R packages
#  # ...
#  # OK, now we've written those dataframes,
#  #   AND the 8 above, all into R/sysdata.Rda together

## ----WeightsSimsResultsAndPlots, fig.width=7, fig.height=7--------------------
if(!rerunSims) {
  AllW <- surveyCV:::AllW
  NoW <- surveyCV:::NoW
  MSEW <- surveyCV:::MSEW
  ModW <- surveyCV:::ModW
}

# Find the y-range of MSEs
ymin <- min(min(AllW$MSE), min(NoW$MSE), min(MSEW$MSE), min(ModW$MSE))
ymax <- max(max(AllW$MSE), max(NoW$MSE), max(MSEW$MSE), max(ModW$MSE))
# Making a ggplot object for the MSEs collected when using
# SRS folds, SRS models, and SRS error calculations
pAllW <- ggplot(data = AllW, mapping = aes(x = df, y = MSE/1e5)) +
  ggtitle("Weights for both training and testing") +
  scale_y_log10(limits = c(ymin, ymax)/1e5) + geom_boxplot()
# Making a ggplot object for the MSEs collected when using
# SRS folds, Clus models, and SRS error calculations
pNoW <- ggplot(data = NoW, mapping = aes(x = df, y = MSE/1e5)) +
  ggtitle("No weights") +
  scale_y_log10(limits = c(ymin, ymax)/1e5) + geom_boxplot()
# Making a ggplot object for the MSEs collected when using
# Clus folds, Clus models, and Clus error calculations
pModW <- ggplot(data = ModW, mapping = aes(x = df, y = MSE/1e5)) +
  ggtitle("Weights when training models") +
  scale_y_log10(limits = c(ymin, ymax)/1e5) + geom_boxplot()
# Making a ggplot object for the MSEs collected when using
# Clus folds, SRS models, and Clus error calculations
pMSEW <- ggplot(data = MSEW, mapping = aes(x = df, y = MSE/1e5)) +
  ggtitle("Weights when estimating test MSE") +
  scale_y_log10(limits = c(ymin, ymax)/1e5) + geom_boxplot()

# Making a grid display of the four plot objects above
lay <- matrix(c(NA, 1, 1, NA,
                NA, 1, 1, NA,
                2, 2, 3, 3,
                2, 2, 3, 3,
                NA, 4, 4, NA,
                NA, 4, 4, NA), byrow = TRUE, ncol = 4)
gridExtra::grid.arrange(pAllW, pModW, pMSEW, pNoW, ncol = 2,
             top = paste0("Simulated Data (Sample Sizes = ", n,
                          ", Loops = ", loops, ", 5 Folds, Samp. Wts from Quad. Fit)"),
             layout_matrix = lay)

