AE 11: Scraping multiple pages of articles from the Cornell Review

Suggested answers

Application exercise


We will use the following packages in this application exercise.

  • tidyverse: For data import, wrangling, and visualization.
  • rvest: For scraping HTML files.
  • lubridate: For formatting date variables.
  • robotstxt: For verifying if we can scrape a website.

Part 1 - Data scraping

See the code below stored in iterate-cornell-review.R.

# load packages
library(tidyverse)
library(rvest)
library(lubridate)
library(robotstxt)

# NOTE(review): the site address was blank in this copy of the script. It is
# restored here as the Cornell Review home page -- confirm before running.

# check that we can scrape data from the cornell review
paths_allowed("https://www.thecornellreview.org/")

# read the first page
page <- read_html("https://www.thecornellreview.org/")

# extract desired components
# each selector returns one node per article; html_text2() converts the nodes
# to plain text, keeping line breaks as "\n"
titles <- html_elements(x = page, css = "#main .read-title a") |>
  html_text2()

authors <- html_elements(x = page, css = "#main .byline a") |>
  html_text2()

article_dates <- html_elements(x = page, css = "#main .posts-date") |>
  html_text2()

topics <- html_elements(x = page, css = "#main .cat-links") |>
  html_text2()

abstracts <- html_elements(x = page, css = ".post-description") |>
  html_text2()

# the link target lives in the href attribute rather than in the node text
post_urls <- html_elements(x = page, css = ".aft-readmore") |>
  html_attr(name = "href")

# create a tibble with this data
review_raw <- tibble(
  title = titles,
  author = authors,
  date = article_dates,
  topic = topics,
  description = abstracts,
  url = post_urls
)

# clean up the data
review <- review_raw |>
  mutate(
    # parse month-day-year text into Date values
    date = mdy(date),
    # drop the trailing "Read More" link text from each abstract
    description = str_remove(string = description, pattern = "\nRead More")
  )

######## write a for loop to scrape the first 10 pages
# preallocate one list slot per page so the output is not grown inside the loop
scrape_results <- vector(mode = "list", length = 10)

for (page_num in seq_along(scrape_results)) {
  # pause for a couple of seconds to prevent rapid HTTP requests
  Sys.sleep(2)

  # create url
  # NOTE(review): the site address was blank in this copy of the script. It is
  # restored here as the Cornell Review paginated archive -- confirm.
  url <- str_glue("https://www.thecornellreview.org/page/{page_num}/")

  # read the current page
  page <- read_html(url)

  # extract desired components
  titles <- html_elements(x = page, css = "#main .read-title a") |>
    html_text2()

  authors <- html_elements(x = page, css = "#main .byline a") |>
    html_text2()

  article_dates <- html_elements(x = page, css = "#main .posts-date") |>
    html_text2()

  topics <- html_elements(x = page, css = "#main .cat-links") |>
    html_text2()

  abstracts <- html_elements(x = page, css = ".post-description") |>
    html_text2()

  post_urls <- html_elements(x = page, css = ".aft-readmore") |>
    html_attr(name = "href")

  # create a tibble with this data
  review_raw <- tibble(
    title = titles,
    author = authors,
    date = article_dates,
    topic = topics,
    description = abstracts,
    url = post_urls
  )

  # clean up the data
  review <- review_raw |>
    mutate(
      date = mdy(date),
      description = str_remove(string = description, pattern = "\nRead More")
    )

  # store in list output
  scrape_results[[page_num]] <- review
}

# collapse list of data frames to a single data frame
scrape_df <- list_rbind(x = scrape_results)

######## write a function to scrape a single page and use a map() function
######## to iterate over the first ten pages
# convert to a function
#
# scrape_review(): scrape one listing page of articles.
#   url - address of the listing page to read.
# Returns a tibble with one row per article and the columns title, author,
# date (parsed with mdy()), topic, description, and url.
scrape_review <- function(url) {
  # pause for a couple of seconds to prevent rapid HTTP requests
  Sys.sleep(2)

  # read the requested page
  page <- read_html(url)

  # extract desired components
  titles <- html_elements(x = page, css = "#main .read-title a") |>
    html_text2()

  authors <- html_elements(x = page, css = "#main .byline a") |>
    html_text2()

  article_dates <- html_elements(x = page, css = "#main .posts-date") |>
    html_text2()

  topics <- html_elements(x = page, css = "#main .cat-links") |>
    html_text2()

  abstracts <- html_elements(x = page, css = ".post-description") |>
    html_text2()

  # the link target lives in the href attribute rather than in the node text
  post_urls <- html_elements(x = page, css = ".aft-readmore") |>
    html_attr(name = "href")

  # create a tibble with this data
  review_raw <- tibble(
    title = titles,
    author = authors,
    date = article_dates,
    topic = topics,
    description = abstracts,
    url = post_urls
  )

  # clean up the data
  review <- review_raw |>
    mutate(
      date = mdy(date),
      description = str_remove(string = description, pattern = "\nRead More")
    )

  # export the resulting data frame
  review
}

# NOTE(review): the site address was blank in this copy of the script. The
# URLs below are restored as the Cornell Review paginated archive -- confirm.

# test function
## page 1
scrape_review(url = "https://www.thecornellreview.org/page/1/")

## page 2
scrape_review(url = "https://www.thecornellreview.org/page/2/")

## page 3
scrape_review(url = "https://www.thecornellreview.org/page/3/")

# create a vector of URLs
page_nums <- 1:10
cr_urls <- str_glue("https://www.thecornellreview.org/page/{page_nums}/")

# map function over URLs
# map() returns a list of tibbles (one per page); list_rbind() stacks them
# into the single data frame that write_csv() expects
cr_reviews <- map(.x = cr_urls, .f = scrape_review, .progress = TRUE) |>
  list_rbind()

# write data
write_csv(x = cr_reviews, file = "data/cornell-review-all.csv")

Part 2 - Data analysis

Demo: Import the scraped data set.

# import the scraped articles saved in Part 1
cr_reviews <- read_csv(file = "data/cornell-review-all.csv")
# print the tibble to inspect the imported data
cr_reviews
Rows: 100 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (5): title, author, topic, description, url
date (1): date

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# A tibble: 100 × 6
   title                               author date       topic description url  
   <chr>                               <chr>  <date>     <chr> <chr>       <chr>
 1 Jamelle Bouie Emphasizes Substance… Casey… 2023-10-10 "Cam… "As part o… http…
 2 Noted Experts Discuss Free Express… Corne… 2023-10-09 "Cam… "Cornell h… http…
 3 Cornell Review’s Free Expression E… The C… 2023-10-05 "Cam… "The Corne… http…
 4 Opinion: Cornell Should Earmark It… Revie… 2023-10-04 "Bey… "President… http…
 5 Grad Student Organizing in the Fre… Corne… 2023-10-02 "Cam… "The Corne… http…
 6 Breaking: Common Council Postpones… Corne… 2023-09-28 "Bey… "Ithaca Co… http…
 7 WHALEN | Cornell’s Promise: Any (l… Rodge… 2023-09-27 "Cam… "With affi… http…
 8 FIRE speaker visits Cornell, gives… Revie… 2023-09-26 "Cam… "Murnane c… http…
 9 RESCHINI | Free expression and stu… Rodge… 2023-09-20 "Cam… "When one … http…
10 Ho Plaza to remain closed through … Rodge… 2023-09-19 "Cam… "Construct… http…
# ℹ 90 more rows

Demo: Who are the most prolific authors?

cr_reviews |>
  # adjust order of authors so they appear from most to least frequent
  mutate(author = fct_infreq(f = author) |>
    fct_rev()) |>
  # horizontal bar chart
  ggplot(mapping = aes(y = author)) +
  # geom_bar() counts the rows for each author
  geom_bar()

Demo: What topics does The Cornell Review write about?

# basic bar plot
ggplot(data = cr_reviews, mapping = aes(y = topic)) +
  geom_bar()

Not super helpful. Each article can have multiple topics. What is the syntax for this column?

# keep only the topic column to inspect how its values are formatted
cr_reviews |>
  select(topic)
# A tibble: 100 × 1
   topic                   
   <chr>                   
 1 "Campus"                
 2 "Campus"                
 3 "Campus"                
 4 "Beyond Cayuga's Waters"
 5 "Campus"                
 6 "Beyond Cayuga's Waters"
 7 "Campus\nOpinion"       
 8 "Campus"                
 9 "Campus"                
10 "Campus"                
# ℹ 90 more rows

Each topic is separated by a "\n". Since the number of topics varies for each article, we cannot separate() this column. Instead we can use a stringr function to split them into distinct character strings.

# split each topic string on newlines; topic becomes a list-column holding
# one character vector per article
cr_reviews |>
  mutate(topic = str_split(string = topic, pattern = "\\n"))
# A tibble: 100 × 6
   title                               author date       topic description url  
   <chr>                               <chr>  <date>     <lis> <chr>       <chr>
 1 Jamelle Bouie Emphasizes Substance… Casey… 2023-10-10 <chr> "As part o… http…
 2 Noted Experts Discuss Free Express… Corne… 2023-10-09 <chr> "Cornell h… http…
 3 Cornell Review’s Free Expression E… The C… 2023-10-05 <chr> "The Corne… http…
 4 Opinion: Cornell Should Earmark It… Revie… 2023-10-04 <chr> "President… http…
 5 Grad Student Organizing in the Fre… Corne… 2023-10-02 <chr> "The Corne… http…
 6 Breaking: Common Council Postpones… Corne… 2023-09-28 <chr> "Ithaca Co… http…
 7 WHALEN | Cornell’s Promise: Any (l… Rodge… 2023-09-27 <chr> "With affi… http…
 8 FIRE speaker visits Cornell, gives… Revie… 2023-09-26 <chr> "Murnane c… http…
 9 RESCHINI | Free expression and stu… Rodge… 2023-09-20 <chr> "When one … http…
10 Ho Plaza to remain closed through … Rodge… 2023-09-19 <chr> "Construct… http…
# ℹ 90 more rows

This makes the column a list-column with each element a separate character vector. From here we need to unnest the column so each row contains a single topic value.

# split topics on newlines, then give every topic its own row
cr_reviews |>
  mutate(topic = str_split(string = topic, pattern = "\\n")) |>
  unnest_longer(col = topic)
# A tibble: 135 × 6
   title                               author date       topic description url  
   <chr>                               <chr>  <date>     <chr> <chr>       <chr>
 1 Jamelle Bouie Emphasizes Substance… Casey… 2023-10-10 Camp… "As part o… http…
 2 Noted Experts Discuss Free Express… Corne… 2023-10-09 Camp… "Cornell h… http…
 3 Cornell Review’s Free Expression E… The C… 2023-10-05 Camp… "The Corne… http…
 4 Opinion: Cornell Should Earmark It… Revie… 2023-10-04 Beyo… "President… http…
 5 Grad Student Organizing in the Fre… Corne… 2023-10-02 Camp… "The Corne… http…
 6 Breaking: Common Council Postpones… Corne… 2023-09-28 Beyo… "Ithaca Co… http…
 7 WHALEN | Cornell’s Promise: Any (l… Rodge… 2023-09-27 Camp… "With affi… http…
 8 WHALEN | Cornell’s Promise: Any (l… Rodge… 2023-09-27 Opin… "With affi… http…
 9 FIRE speaker visits Cornell, gives… Revie… 2023-09-26 Camp… "Murnane c… http…
10 RESCHINI | Free expression and stu… Rodge… 2023-09-20 Camp… "When one … http…
# ℹ 125 more rows

Notice the data frame now has additional rows. The unit of analysis is now an article-topic combination, rather than one-row-per-article. Not entirely a tidy structure, but necessary to construct a chart to visualize topic frequency.

cr_reviews |>
  # one row per article-topic combination
  mutate(topic = str_split(string = topic, pattern = "\\n")) |>
  unnest_longer(col = topic) |>
  # horizontal bar chart of topic frequency
  ggplot(mapping = aes(y = topic)) +
  geom_bar()

Let’s clean this up like the previous chart.

cr_reviews |>
  mutate(topic = str_split(string = topic, pattern = "\\n")) |>
  unnest_longer(col = topic) |>
  # str_trim() - remove whitespace characters at beginning
  # and end of character strings
  # then order topics so they appear from most to least frequent
  mutate(topic = str_trim(string = topic) |>
    fct_infreq() |>
    fct_rev()) |>
  ggplot(mapping = aes(y = topic)) +
  geom_bar()

