---
title: "How to compare CPC indicators between public and private institutions?"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{How to compare CPC indicators between public and private institutions?}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment  = "#>",
  eval     = FALSE,
  message  = FALSE,
  warning  = FALSE
)
suppressPackageStartupMessages(library(systemfonts))
suppressPackageStartupMessages(library(textshaping))
```

This vignette shows how to use educabR to compare course quality (CPC)
between public and private higher education institutions in Brazil.

```{r setup}
library(educabR)
library(dplyr)
library(tidyr)
library(ggplot2)
```

## Downloading CPC data

CPC (Conceito Preliminar de Curso) is a quality indicator for
undergraduate courses, ranging from 1 (lowest) to 5 (highest).
Courses scoring 1 or 2 are flagged for on-site evaluation.

```{r download}
cpc <- get_cpc(year = 2023)
glimpse(cpc)
```

```
#> Rows: 9,812
#> Columns: 39
#> $ ano                      <dbl> 2023, 2023, 2023, 2023, 2023, …
#> $ codigo_da_ies            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, …
#> $ nome_da_ies              <chr> "UNIVERSIDADE FEDERAL DE MATO G…
#> $ sigla_da_ies             <chr> "UFMT", "UFMT", "UFMT", "UFMT"…
#> $ categoria_administrativa <chr> "Pública Federal", "Pública Fed…
#> $ codigo_do_curso          <dbl> 3, 9, 10, 12, 16, 17, 20, 37, …
#> $ area_de_avaliacao        <chr> "ENGENHARIA CIVIL", "AGRONOMIA"…
#> $ sigla_da_uf              <chr> "MT", "MT", "MT", "MT", "MT", "…
#> $ cpc_continuo             <dbl> 3.429, 3.482, 3.064, 2.792, 4.…
#> $ cpc_faixa                <dbl> 4, 4, 4, 3, 5, 4, 4, 4, 4, 5, …
#> # ℹ 29 more variables
```

## Identifying public vs private institutions

The `categoria_administrativa` column classifies institutions. The exact
column name and coding may vary by year -- check `names(cpc)` after
downloading.

```{r classify}
cpc_classified <-
  cpc |>
  mutate(
    sector = case_when(
      categoria_administrativa %in% c(
        "Publica Federal", "Publica Estadual", "Publica Municipal",
        "P\u00fablica Federal", "P\u00fablica Estadual", "P\u00fablica Municipal"
      ) ~ "Public",
      .default = "Private"
    )
  )
```

## CPC score distribution by sector

```{r distribution}
cpc_classified |>
  filter(!is.na(cpc_faixa)) |>
  count(sector, cpc_faixa) |>
  mutate(pct = n / sum(n) * 100, .by = sector) |>
  ggplot(aes(x = factor(cpc_faixa), y = pct, fill = sector)) +
  geom_col(position = "dodge") +
  labs(
    title = "CPC Score Distribution: Public vs Private (2023)",
    x     = "CPC Score (1-5)",
    y     = "Percentage of Courses (%)",
    fill  = "Sector"
  ) +
  theme_minimal()
```

![](../man/figures/vignette-cpc-distribution.png)

## Average CPC by sector and knowledge area

```{r by-area}
cpc_classified |>
  filter(!is.na(cpc_continuo), !is.na(area_de_avaliacao)) |>
  summarise(
    mean_cpc = mean(cpc_continuo, na.rm = TRUE),
    n = n(),
    .by = c(sector, area_de_avaliacao)
  ) |>
  filter(n >= 10) |>
  pivot_wider(
    names_from  = sector,
    values_from = c(mean_cpc, n)
  ) |>
  mutate(gap = mean_cpc_Public - mean_cpc_Private) |>
  slice_max(abs(gap), n = 15) |>
  ggplot(aes(x = reorder(area_de_avaliacao, gap), y = gap)) +
  geom_col(aes(fill = gap > 0)) +
  coord_flip() +
  scale_fill_manual(
    values = c("TRUE" = "#2a9d8f", "FALSE" = "#e76f51"),
    labels = c("TRUE" = "Public higher", "FALSE" = "Private higher")
  ) +
  labs(
    title = "CPC Gap: Public minus Private, by Knowledge Area (2023)",
    x     = NULL,
    y     = "CPC difference (public - private)",
    fill  = NULL
  ) +
  theme_minimal() +
  theme(legend.position = "none")
```

![](../man/figures/vignette-cpc-gap-area.png)

## Combining with IGC for institutional view

IGC (Indice Geral de Cursos) provides an institution-level quality score.
Combining CPC and IGC gives a course-level and institution-level
perspective.

```{r igc-comparison}
igc <- get_igc(year = 2023)

igc |>
  mutate(
    sector = case_when(
      categoria_administrativa %in% c(
        "Publica Federal", "Publica Estadual", "Publica Municipal",
        "P\u00fablica Federal", "P\u00fablica Estadual", "P\u00fablica Municipal"
      ) ~ "Public",
      .default = "Private"
    )
  ) |>
  filter(!is.na(igc_continuo)) |>
  ggplot(aes(x = sector, y = igc_continuo, fill = sector)) +
  geom_boxplot(alpha = 0.7) +
  scale_fill_manual(values = c("Public" = "#2a9d8f", "Private" = "#e76f51")) +
  labs(
    title = "IGC Distribution: Public vs Private (2023)",
    x     = NULL,
    y     = "IGC (Continuous)"
  ) +
  theme_minimal() +
  theme(legend.position = "none")
```

![](../man/figures/vignette-cpc-igc-boxplot.png)
