## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----texts-overview-----------------------------------------------------------
library(tipitaka.critical)

dim(texts)
names(texts)

## ----texts-example------------------------------------------------------------
# The Brahmajala Sutta (DN 1)
dn1 <- texts[texts$id == "dn1", ]
dn1$title

# First 120 characters of surface text
cat(substr(dn1$text, 1, 120), "...\n")

# Same passage, lemmatized
cat(substr(dn1$text_lemmatized, 1, 120), "...\n")

## ----texts-collections--------------------------------------------------------
table(texts$pitaka)
table(texts$collection)

## ----lemmas-overview----------------------------------------------------------
dim(lemmas)
head(lemmas)

## ----lemmas-top---------------------------------------------------------------
totals <- tapply(lemmas$n, lemmas$word, sum)
head(sort(totals, decreasing = TRUE), 15)

## ----lemmas-by-collection-----------------------------------------------------
dn_lemmas <- lemmas[lemmas$collection == "dn", ]
dn_totals <- tapply(dn_lemmas$n, dn_lemmas$word, sum)
head(sort(dn_totals, decreasing = TRUE), 10)

## ----search-------------------------------------------------------------------
# Where does "nibbana" appear most frequently?
nibbana <- search_lemma("nibbana")
head(nibbana[, c("id", "collection", "n", "freq")])

## ----search-dhamma------------------------------------------------------------
# "dhamma" across collections
dhamma <- search_lemma("dhamma")
tapply(dhamma$n, dhamma$collection, sum)

## ----dtm-overview-------------------------------------------------------------
dim(dtm)
class(dtm)

# Sparsity (proportion of zero entries)
1 - length(dtm@x) / prod(dim(dtm))

## ----dn-cluster, fig.width=7, fig.height=4------------------------------------
dn_ids <- texts$id[texts$collection == "dn"]
dn_dtm <- dtm[dn_ids, ]

# Drop empty columns
dn_dtm <- dn_dtm[, colSums(dn_dtm) > 0]

d <- dist(as.matrix(dn_dtm))
hc <- hclust(d, method = "ward.D2")
plot(hc, main = "Digha Nikaya — Hierarchical Clustering",
     xlab = "", sub = "", cex = 0.7)

## ----pca, fig.width=7, fig.height=6-------------------------------------------
# Select top 500 lemmas by total frequency
col_sums <- colSums(dtm)
top_terms <- names(sort(col_sums, decreasing = TRUE))[1:500]
dtm_sub <- as.matrix(dtm[, top_terms])

# PCA
pca <- prcomp(dtm_sub, center = TRUE, scale. = FALSE)
pct_var <- summary(pca)$importance[2, 1:2] * 100

# Color by collection
coll_colors <- c(
  abhidhamma = "#E41A1C", an = "#377EB8", dn = "#4DAF4A",
  kn = "#FF7F00", mn = "#984EA3", sn = "#A65628",
  vinaya = "#F781BF"
)
pt_col <- coll_colors[texts$collection]

plot(pca$x[, 1], pca$x[, 2],
     col = adjustcolor(pt_col, alpha.f = 0.5), pch = 16, cex = 0.6,
     xlab = paste0("PC1 (", round(pct_var[1], 1), "%)"),
     ylab = paste0("PC2 (", round(pct_var[2], 1), "%)"),
     main = "PCA of All Tipitaka Texts")
legend("topright",
       c("Abhidhamma", "AN", "DN", "KN", "MN", "SN", "Vinaya"),
       col = coll_colors, pch = 16, cex = 0.8)

## ----canon-cluster, fig.width=7, fig.height=10--------------------------------
# Create group IDs at an intermediate level
group_id <- texts$id

# SN: sn1.1 -> sn1 (by samyutta)
sn_mask <- texts$collection == "sn"
group_id[sn_mask] <- sub("\\..*", "", group_id[sn_mask])

# AN: an1.1 -> an1 (by nipata)
an_mask <- texts$collection == "an"
group_id[an_mask] <- sub("\\..*", "", group_id[an_mask])

# KN: dhp1-20 -> dhp, snp1.1 -> snp, etc. (by text)
kn_mask <- texts$collection == "kn"
group_id[kn_mask] <- sub("[0-9].*", "", group_id[kn_mask])

# Aggregate DTM by group (mean of member frequencies)
groups <- unique(group_id)
group_dtm <- matrix(0, length(groups), length(top_terms))
group_coll <- character(length(groups))
for (i in seq_along(groups)) {
  rows <- which(group_id == groups[i])
  if (length(rows) == 1) {
    group_dtm[i, ] <- dtm_sub[rows, ]
  } else {
    group_dtm[i, ] <- colMeans(dtm_sub[rows, ])
  }
  group_coll[i] <- texts$collection[rows[1]]
}
rownames(group_dtm) <- groups

# Cluster
d <- dist(group_dtm)
hc <- hclust(d, method = "ward.D2")

# Color labels by collection
label_col <- coll_colors[group_coll[hc$order]]
dend <- as.dendrogram(hc)
# Apply colors to leaf labels
color_labels <- function(n, col_vec) {
  if (is.leaf(n)) {
    i <- match(attr(n, "label"), groups[hc$order])
    attr(n, "nodePar") <- list(pch = NA, lab.col = col_vec[i], lab.cex = 0.45)
  }
  n
}
dend <- dendrapply(dend, color_labels, col_vec = label_col)

oldpar <- par(mar = c(2, 1, 2, 8))
plot(dend, horiz = TRUE, main = "Tipitaka — Hierarchical Clustering",
     xlab = "")
legend("topleft",
       c("Abhidhamma", "AN", "DN", "KN", "MN", "SN", "Vinaya"),
       text.col = coll_colors, cex = 0.7, bty = "n")
par(oldpar)