library(tidyverse)
library(rvest)
library(lubridate)
library(robotstxt)
AE 11: Scraping multiple pages of articles from the Cornell Review
Suggested answers
Packages
We will use the following packages in this application exercise.
- tidyverse: For data import, wrangling, and visualization.
- rvest: For scraping HTML files.
- lubridate: For formatting date variables.
- robotstxt: For verifying if we can scrape a website.
Part 1 - Data scraping
See the code below, stored in iterate-cornell-review.R.
# load packages
library(tidyverse)
library(rvest)
library(lubridate)
library(robotstxt)
# check that we are permitted to scrape data from the Cornell Review
paths_allowed("https://www.thecornellreview.org/")

# read the first page
page <- read_html("https://www.thecornellreview.org/")

# extract desired components
titles <- html_elements(x = page, css = "#main .read-title a") |>
  html_text2()
authors <- html_elements(x = page, css = "#main .byline a") |>
  html_text2()
article_dates <- html_elements(x = page, css = "#main .posts-date") |>
  html_text2()
topics <- html_elements(x = page, css = "#main .cat-links") |>
  html_text2()
abstracts <- html_elements(x = page, css = ".post-description") |>
  html_text2()
post_urls <- html_elements(x = page, css = ".aft-readmore") |>
  html_attr(name = "href")

# combine the components into a tibble (one row per article)
review_raw <- tibble(
  title = titles,
  author = authors,
  date = article_dates,
  topic = topics,
  description = abstracts,
  url = post_urls
)

# clean up the data: parse the date and strip the trailing "Read More" text
review <- review_raw |>
  mutate(
    date = mdy(date),
    description = str_remove(string = description, pattern = "\nRead More")
  )
######## write a for loop to scrape the first 10 pages

# preallocate the list output: one element per page.
# NOTE(fix): the original preallocated length = 5, which contradicts the
# stated goal of scraping the first 10 pages (and the 100-row result of the
# map() version below); use 10 so both approaches agree.
scrape_results <- vector(mode = "list", length = 10)

for (page_num in seq_along(scrape_results)) {
  # pause for a couple of seconds to prevent rapid HTTP requests
  Sys.sleep(2)

  # create the url for this page of the archive
  url <- str_glue("https://www.thecornellreview.org/page/{page_num}/")

  # read the page
  page <- read_html(url)

  # extract desired components
  titles <- html_elements(x = page, css = "#main .read-title a") |>
    html_text2()
  authors <- html_elements(x = page, css = "#main .byline a") |>
    html_text2()
  article_dates <- html_elements(x = page, css = "#main .posts-date") |>
    html_text2()
  topics <- html_elements(x = page, css = "#main .cat-links") |>
    html_text2()
  abstracts <- html_elements(x = page, css = ".post-description") |>
    html_text2()
  post_urls <- html_elements(x = page, css = ".aft-readmore") |>
    html_attr(name = "href")

  # combine the components into a tibble (one row per article)
  review_raw <- tibble(
    title = titles,
    author = authors,
    date = article_dates,
    topic = topics,
    description = abstracts,
    url = post_urls
  )

  # clean up the data: parse the date and strip the trailing "Read More" text
  review <- review_raw |>
    mutate(
      date = mdy(date),
      description = str_remove(string = description, pattern = "\nRead More")
    )

  # store in list output
  scrape_results[[page_num]] <- review
}

# collapse list of data frames to a single data frame
scrape_df <- list_rbind(x = scrape_results)
######## write a function to scrape a single page and use a map() function
######## to iterate over the first ten pages

# scrape_review(): download one archive page of The Cornell Review and
# return a cleaned tibble with one row per article.
#
# url: character scalar, e.g. "https://www.thecornellreview.org/page/2/"
# returns: a tibble with columns title, author, date (Date), topic,
#   description, url
scrape_review <- function(url) {
  # pause for a couple of seconds to prevent rapid HTTP requests
  Sys.sleep(2)

  # read the requested page
  page <- read_html(url)

  # extract desired components
  titles <- html_elements(x = page, css = "#main .read-title a") |>
    html_text2()
  authors <- html_elements(x = page, css = "#main .byline a") |>
    html_text2()
  article_dates <- html_elements(x = page, css = "#main .posts-date") |>
    html_text2()
  topics <- html_elements(x = page, css = "#main .cat-links") |>
    html_text2()
  abstracts <- html_elements(x = page, css = ".post-description") |>
    html_text2()
  post_urls <- html_elements(x = page, css = ".aft-readmore") |>
    html_attr(name = "href")

  # combine the components into a tibble (one row per article)
  review_raw <- tibble(
    title = titles,
    author = authors,
    date = article_dates,
    topic = topics,
    description = abstracts,
    url = post_urls
  )

  # clean up the data: parse the date and strip the trailing "Read More" text
  review <- review_raw |>
    mutate(
      date = mdy(date),
      description = str_remove(string = description, pattern = "\nRead More")
    )

  # export the resulting data frame
  return(review)
}
# test the function on a few individual pages
## page 1
scrape_review(url = "https://www.thecornellreview.org/page/1/")
## page 2
scrape_review(url = "https://www.thecornellreview.org/page/2/")
## page 3
scrape_review(url = "https://www.thecornellreview.org/page/3/")

# create a vector of URLs for the first ten pages
page_nums <- 1:10
cr_urls <- str_glue("https://www.thecornellreview.org/page/{page_nums}/")
cr_urls

# map the scraping function over the URLs, then collapse the
# resulting list of data frames into a single data frame
cr_reviews <- map(.x = cr_urls, .f = scrape_review, .progress = TRUE) |>
  list_rbind()

# write data
write_csv(x = cr_reviews, file = "data/cornell-review-all.csv")
Part 2 - Data analysis
Demo: Import the scraped data set.
cr_reviews <- read_csv(file = "data/cornell-review-all.csv")
Rows: 100 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (5): title, author, topic, description, url
date (1): date
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cr_reviews
# A tibble: 100 × 6
title author date topic description url
<chr> <chr> <date> <chr> <chr> <chr>
1 Jamelle Bouie Emphasizes Substance… Casey… 2023-10-10 "Cam… "As part o… http…
2 Noted Experts Discuss Free Express… Corne… 2023-10-09 "Cam… "Cornell h… http…
3 Cornell Review’s Free Expression E… The C… 2023-10-05 "Cam… "The Corne… http…
4 Opinion: Cornell Should Earmark It… Revie… 2023-10-04 "Bey… "President… http…
5 Grad Student Organizing in the Fre… Corne… 2023-10-02 "Cam… "The Corne… http…
6 Breaking: Common Council Postpones… Corne… 2023-09-28 "Bey… "Ithaca Co… http…
7 WHALEN | Cornell’s Promise: Any (l… Rodge… 2023-09-27 "Cam… "With affi… http…
8 FIRE speaker visits Cornell, gives… Revie… 2023-09-26 "Cam… "Murnane c… http…
9 RESCHINI | Free expression and stu… Rodge… 2023-09-20 "Cam… "When one … http…
10 Ho Plaza to remain closed through … Rodge… 2023-09-19 "Cam… "Construct… http…
# ℹ 90 more rows
Demo: Who are the most prolific authors?
cr_reviews |>
  # adjust order of authors so they appear from most to least frequent
  mutate(author = fct_infreq(f = author) |>
    fct_rev()) |>
  # horizontal bar chart
  ggplot(mapping = aes(y = author)) +
  geom_bar()
Demo: What topics does The Cornell Review write about?
# basic bar plot: one bar per unique (possibly multi-topic) string
cr_reviews |>
  ggplot(mapping = aes(y = topic)) +
  geom_bar()
Not super helpful. Each article can have multiple topics. What is the syntax for this column?
# inspect the raw syntax of the topic column
cr_reviews |>
  select(topic)
# A tibble: 100 × 1
topic
<chr>
1 "Campus"
2 "Campus"
3 "Campus"
4 "Beyond Cayuga's Waters"
5 "Campus"
6 "Beyond Cayuga's Waters"
7 "Campus\nOpinion"
8 "Campus"
9 "Campus"
10 "Campus"
# ℹ 90 more rows
Each topic is separated by a "\n". Since the number of topics varies for each article, we cannot use separate() on this column. Instead we can use a stringr function to split them into distinct character strings.
# split each topic string on the newline separator,
# producing a list-column of character vectors
cr_reviews |>
  mutate(topic = str_split(string = topic, pattern = "\\n"))
# A tibble: 100 × 6
title author date topic description url
<chr> <chr> <date> <lis> <chr> <chr>
1 Jamelle Bouie Emphasizes Substance… Casey… 2023-10-10 <chr> "As part o… http…
2 Noted Experts Discuss Free Express… Corne… 2023-10-09 <chr> "Cornell h… http…
3 Cornell Review’s Free Expression E… The C… 2023-10-05 <chr> "The Corne… http…
4 Opinion: Cornell Should Earmark It… Revie… 2023-10-04 <chr> "President… http…
5 Grad Student Organizing in the Fre… Corne… 2023-10-02 <chr> "The Corne… http…
6 Breaking: Common Council Postpones… Corne… 2023-09-28 <chr> "Ithaca Co… http…
7 WHALEN | Cornell’s Promise: Any (l… Rodge… 2023-09-27 <chr> "With affi… http…
8 FIRE speaker visits Cornell, gives… Revie… 2023-09-26 <chr> "Murnane c… http…
9 RESCHINI | Free expression and stu… Rodge… 2023-09-20 <chr> "When one … http…
10 Ho Plaza to remain closed through … Rodge… 2023-09-19 <chr> "Construct… http…
# ℹ 90 more rows
This makes the column a list-column with each element a separate character vector. From here we need to unnest the column so each row contains a single topic value.
# unnest the list-column so each row holds a single topic value
cr_reviews |>
  mutate(topic = str_split(string = topic, pattern = "\\n")) |>
  unnest_longer(col = topic)
# A tibble: 135 × 6
title author date topic description url
<chr> <chr> <date> <chr> <chr> <chr>
1 Jamelle Bouie Emphasizes Substance… Casey… 2023-10-10 Camp… "As part o… http…
2 Noted Experts Discuss Free Express… Corne… 2023-10-09 Camp… "Cornell h… http…
3 Cornell Review’s Free Expression E… The C… 2023-10-05 Camp… "The Corne… http…
4 Opinion: Cornell Should Earmark It… Revie… 2023-10-04 Beyo… "President… http…
5 Grad Student Organizing in the Fre… Corne… 2023-10-02 Camp… "The Corne… http…
6 Breaking: Common Council Postpones… Corne… 2023-09-28 Beyo… "Ithaca Co… http…
7 WHALEN | Cornell’s Promise: Any (l… Rodge… 2023-09-27 Camp… "With affi… http…
8 WHALEN | Cornell’s Promise: Any (l… Rodge… 2023-09-27 Opin… "With affi… http…
9 FIRE speaker visits Cornell, gives… Revie… 2023-09-26 Camp… "Murnane c… http…
10 RESCHINI | Free expression and stu… Rodge… 2023-09-20 Camp… "When one … http…
# ℹ 125 more rows
Notice the data frame now has additional rows. The unit of analysis is now an article-topic combination, rather than one-row-per-article. Not entirely a tidy structure, but necessary to construct a chart to visualize topic frequency.
# bar chart of individual topics (article-topic unit of analysis)
cr_reviews |>
  mutate(topic = str_split(string = topic, pattern = "\\n")) |>
  unnest_longer(col = topic) |>
  ggplot(mapping = aes(y = topic)) +
  geom_bar()
Let’s clean this up like the previous chart.
cr_reviews |>
  mutate(topic = str_split(string = topic, pattern = "\\n")) |>
  unnest_longer(col = topic) |>
  # str_trim() - remove whitespace characters at the beginning
  # and end of character strings; then order topics from most
  # to least frequent for the horizontal bar chart
  mutate(topic = str_trim(string = topic) |>
    fct_infreq() |>
    fct_rev()) |>
  ggplot(mapping = aes(y = topic)) +
  geom_bar()
sessioninfo::session_info()
─ Session info ───────────────────────────────────────────────────────────────
setting value
version R version 4.3.1 (2023-06-16)
os macOS Ventura 13.5.2
system aarch64, darwin20
ui X11
language (EN)
collate en_US.UTF-8
ctype en_US.UTF-8
tz America/New_York
date 2023-10-12
pandoc 3.1.1 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/ (via rmarkdown)
─ Packages ───────────────────────────────────────────────────────────────────
package * version date (UTC) lib source
bit 4.0.5 2022-11-15 [1] CRAN (R 4.3.0)
bit64 4.0.5 2020-08-30 [1] CRAN (R 4.3.0)
cli 3.6.1 2023-03-23 [1] CRAN (R 4.3.0)
colorspace 2.1-0 2023-01-23 [1] CRAN (R 4.3.0)
crayon 1.5.2 2022-09-29 [1] CRAN (R 4.3.0)
digest 0.6.31 2022-12-11 [1] CRAN (R 4.3.0)
dplyr * 1.1.2 2023-04-20 [1] CRAN (R 4.3.0)
evaluate 0.21 2023-05-05 [1] CRAN (R 4.3.0)
fansi 1.0.4 2023-01-22 [1] CRAN (R 4.3.0)
farver 2.1.1 2022-07-06 [1] CRAN (R 4.3.0)
fastmap 1.1.1 2023-02-24 [1] CRAN (R 4.3.0)
forcats * 1.0.0 2023-01-29 [1] CRAN (R 4.3.0)
generics 0.1.3 2022-07-05 [1] CRAN (R 4.3.0)
ggplot2 * 3.4.2 2023-04-03 [1] CRAN (R 4.3.0)
glue 1.6.2 2022-02-24 [1] CRAN (R 4.3.0)
gtable 0.3.3 2023-03-21 [1] CRAN (R 4.3.0)
here 1.0.1 2020-12-13 [1] CRAN (R 4.3.0)
hms 1.1.3 2023-03-21 [1] CRAN (R 4.3.0)
htmltools 0.5.5 2023-03-23 [1] CRAN (R 4.3.0)
htmlwidgets 1.6.2 2023-03-17 [1] CRAN (R 4.3.0)
httr 1.4.6 2023-05-08 [1] CRAN (R 4.3.0)
jsonlite 1.8.5 2023-06-05 [1] CRAN (R 4.3.0)
knitr 1.43 2023-05-25 [1] CRAN (R 4.3.0)
labeling 0.4.2 2020-10-20 [1] CRAN (R 4.3.0)
lifecycle 1.0.3 2022-10-07 [1] CRAN (R 4.3.0)
lubridate * 1.9.2 2023-02-10 [1] CRAN (R 4.3.0)
magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.3.0)
munsell 0.5.0 2018-06-12 [1] CRAN (R 4.3.0)
pillar 1.9.0 2023-03-22 [1] CRAN (R 4.3.0)
pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.3.0)
purrr * 1.0.1 2023-01-10 [1] CRAN (R 4.3.0)
R6 2.5.1 2021-08-19 [1] CRAN (R 4.3.0)
readr * 2.1.4 2023-02-10 [1] CRAN (R 4.3.0)
rlang 1.1.1 2023-04-28 [1] CRAN (R 4.3.0)
rmarkdown 2.22 2023-06-01 [1] CRAN (R 4.3.0)
robotstxt * 0.7.13 2020-09-03 [1] CRAN (R 4.3.0)
rprojroot 2.0.3 2022-04-02 [1] CRAN (R 4.3.0)
rstudioapi 0.14 2022-08-22 [1] CRAN (R 4.3.0)
rvest * 1.0.3 2022-08-19 [1] CRAN (R 4.3.0)
scales 1.2.1 2022-08-20 [1] CRAN (R 4.3.0)
sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.3.0)
stringi 1.7.12 2023-01-11 [1] CRAN (R 4.3.0)
stringr * 1.5.0 2022-12-02 [1] CRAN (R 4.3.0)
tibble * 3.2.1 2023-03-20 [1] CRAN (R 4.3.0)
tidyr * 1.3.0 2023-01-24 [1] CRAN (R 4.3.0)
tidyselect 1.2.0 2022-10-10 [1] CRAN (R 4.3.0)
tidyverse * 2.0.0 2023-02-22 [1] CRAN (R 4.3.0)
timechange 0.2.0 2023-01-11 [1] CRAN (R 4.3.0)
tzdb 0.4.0 2023-05-12 [1] CRAN (R 4.3.0)
utf8 1.2.3 2023-01-31 [1] CRAN (R 4.3.0)
vctrs 0.6.3 2023-06-14 [1] CRAN (R 4.3.0)
vroom 1.6.3 2023-04-28 [1] CRAN (R 4.3.0)
withr 2.5.0 2022-03-03 [1] CRAN (R 4.3.0)
xfun 0.39 2023-04-20 [1] CRAN (R 4.3.0)
xml2 1.3.4 2023-04-27 [1] CRAN (R 4.3.0)
yaml 2.3.7 2023-01-23 [1] CRAN (R 4.3.0)
[1] /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/library
──────────────────────────────────────────────────────────────────────────────