## ----include = FALSE----------------------------------------------------------
# Vignette-wide chunk defaults: collapse source and output into one block and
# prefix printed output with "#>".
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

## ----data, eval=FALSE---------------------------------------------------------
# 
# library(rvest)
# library(dplyr)
# library(purrr)
# library(tibble)
# library(stringr)
# 
# class_of_interest <- ".mw-content-ltr" ## ids are #id-name, classes are .class-name
# 
# # Finding newest 150 versions of Wikipedia's highlighter article
# editurl <- "https://en.wikipedia.org/w/index.php?title=Highlighter&action=history&offset=&limit=150"
# editclass_of_interest <- ".mw-changeslist-date"
# 
# # Save the urls of the full articles
# url_list1 <- editurl %>%
#   rvest::read_html() %>%
#   rvest::html_nodes(editclass_of_interest) %>%
#   purrr::map(., list()) %>%
#   tibble::tibble(node = .) %>%
#   dplyr::mutate(link = purrr::map_chr(node, html_attr, "href") %>% paste0("https://en.wikipedia.org", .))
# 
# # Finding oldest 150 versions of Wikipedia's highlighter article
# editurl2 <- "https://en.wikipedia.org/w/index.php?title=Highlighter&action=history&dir=prev&limit=150"
# 
# # Save the urls of the full articles
# url_list2 <- editurl2 %>%
#   rvest::read_html() %>%
#   rvest::html_nodes(editclass_of_interest) %>%
#   purrr::map(., list()) %>%
#   tibble::tibble(node = .) %>%
#   dplyr::mutate(link = purrr::map_chr(node, html_attr, "href") %>% paste0("https://en.wikipedia.org", .))
# 
# # Combine url list
# url_list <- rbind(url_list1, url_list2)
# 
# # create a data frame with the text of the documents
# wiki_pages <- data.frame(page_notes = rep(NA, nrow(url_list)))
# 
# for (i in seq_len(nrow(url_list))) {
# 
#   wiki_list <-  url_list$link[i] %>%
#     rvest::read_html() %>%
#     rvest::html_node(class_of_interest) %>%
#     rvest::html_children() %>%
#     purrr::map(., list()) %>%
#     tibble::tibble(node = .) %>%
#     dplyr::mutate(type = purrr::map_chr(node, html_name)) %>%
#     dplyr::filter(type == "p") %>%
#     dplyr::mutate(text = purrr::map_chr(node, html_text)) %>%
#     dplyr::mutate(cleantext = stringr::str_remove_all(text, "\\[.*?\\]") %>% stringr::str_trim()) %>%
#     dplyr::summarise(cleantext = paste(cleantext, collapse = "<br> "))
# 
#   wiki_pages$page_notes[i] <- wiki_list$cleantext[1]
# 
# }
# 

## -----------------------------------------------------------------------------

library(highlightr)

# Frequencies computed against the source document (the first row), with
# fuzzy matching enabled.
merged_frequency <- collocation_frequency(
  highlightr::wiki_pages,
  text_column = "page_notes",
  source_row = 1,
  fuzzy = TRUE
)

head(merged_frequency)

## -----------------------------------------------------------------------------
# Build a ggplot object of the transcript
freq_plot <- collocation_plot(merged_frequency)

# Wrap the source document in HTML highlight tags
page_highlight <- highlighted_text(
  freq_plot,
  labels = c("(fewest articles)", "(most articles)")
)

## -----------------------------------------------------------------------------

# Calculate frequencies with reference to the source document (last row).
# Use highlightr::wiki_pages for the row count as well, so the call is
# consistent with the data argument and does not depend on a bare
# `wiki_pages` binding being found on the search path.
merged_frequency2 <- collocation_frequency(
  highlightr::wiki_pages,
  text_column = "page_notes",
  source_row = nrow(highlightr::wiki_pages),
  fuzzy = TRUE
)

# Create a ggplot object of the transcript
freq_plot2 <- collocation_plot(merged_frequency2)

# Add HTML tags to the source document
page_highlight2 <- highlighted_text(
  freq_plot2,
  labels = c("(fewest articles)", "(most articles)")
)


