## ----include = FALSE----------------------------------------------------------
# Vignette-wide chunk defaults: collapse source and output into one block and
# prefix printed output with "#>".
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

## ----data, eval=FALSE---------------------------------------------------------
# 
# library(rvest)
# library(dplyr)
# library(purrr)
# library(tibble)
# library(stringr)
# 
# class_of_interest <- ".mw-content-ltr" ## ids are #id-name, classes are .class-name
# 
# # Finding newest 150 versions of Wikipedia's highlighter article
# editurl <- "https://en.wikipedia.org/w/index.php?title=Highlighter&action=history&offset=&limit=150"
# editclass_of_interest <- ".mw-changeslist-date"
# 
# # Save the urls of the full articles
# url_list1 <- editurl %>%
#   rvest::read_html() %>%
#   rvest::html_nodes(editclass_of_interest) %>%
#   purrr::map(., list()) %>%
#   tibble::tibble(node = .) %>%
#   dplyr::mutate(link = purrr::map_chr(node, html_attr, "href") %>% paste0("https://en.wikipedia.org", .))
# 
# # Finding oldest 150 versions of Wikipedia's highlighter article
# editurl2 <- "https://en.wikipedia.org/w/index.php?title=Highlighter&action=history&dir=prev&limit=150"
# 
# # Save the urls of the full articles
# url_list2 <- editurl2 %>%
#   rvest::read_html() %>%
#   rvest::html_nodes(editclass_of_interest) %>%
#   purrr::map(., list()) %>%
#   tibble::tibble(node = .) %>%
#   dplyr::mutate(link = purrr::map_chr(node, html_attr, "href") %>% paste0("https://en.wikipedia.org", .))
# 
# # Combine url list
# url_list <- rbind(url_list1, url_list2)
# 
# # create a data frame with the text of the documents
# wiki_pages <- data.frame(page_notes = rep(NA, nrow(url_list)))
# 
# for (i in seq_len(nrow(url_list))) {
# 
#   wiki_list <-  url_list$link[i] %>%
#     rvest::read_html() %>%
#     rvest::html_node(class_of_interest) %>%
#     rvest::html_children() %>%
#     purrr::map(., list()) %>%
#     tibble::tibble(node = .) %>%
#     dplyr::mutate(type = purrr::map_chr(node, html_name)) %>%
#     dplyr::filter(type == "p") %>%
#     dplyr::mutate(text = purrr::map_chr(node, html_text)) %>%
#     dplyr::mutate(cleantext = stringr::str_remove_all(text, "\\[.*?\\]") %>% stringr::str_trim()) %>%
#     dplyr::summarise(cleantext = paste(cleantext, collapse = "<br> "))
# 
#   wiki_pages$page_notes[i] <- wiki_list$cleantext[1]
# 
# }
# 

## -----------------------------------------------------------------------------

library(highlightr)

# Frequencies computed against the source document (the first row), with
# fuzzy matching enabled.
merged_frequency <- collocation_frequency(
  highlightr::wiki_pages,
  text_column = "page_notes",
  source_row = 1,
  fuzzy = TRUE
)

head(merged_frequency)

## -----------------------------------------------------------------------------
# Build a ggplot object of the transcript
freq_plot <- collocation_plot(merged_frequency)

# Wrap the source document in HTML highlight tags
page_highlight <- highlighted_text(
  freq_plot,
  labels = c("(fewest articles)", "(most articles)")
)

## -----------------------------------------------------------------------------

# Calculate frequencies with reference to the source document (last row).
# Use highlightr::wiki_pages for the row count as well, so the call is
# consistent with the data argument and does not depend on a bare
# `wiki_pages` binding being found on the search path.
merged_frequency2 <- collocation_frequency(
  highlightr::wiki_pages,
  text_column = "page_notes",
  source_row = nrow(highlightr::wiki_pages),
  fuzzy = TRUE
)

# Create a ggplot object of the transcript
freq_plot2 <- collocation_plot(merged_frequency2)

# Add HTML tags to the source document
page_highlight2 <- highlighted_text(
  freq_plot2,
  labels = c("(fewest articles)", "(most articles)")
)


