library(tidyverse)
library(rvest)
library(lubridate)
library(robotstxt)
AE 11: Scraping multiple pages of articles from the Cornell Review
Suggested answers
Packages
We will use the following packages in this application exercise.
- tidyverse: For data import, wrangling, and visualization.
- rvest: For scraping HTML files.
- lubridate: For formatting date variables.
- robotstxt: For verifying if we can scrape a website.
Part 1 - Data scraping
See the code below, stored in iterate-cornell-review.R.
# load packages
library(tidyverse)
library(rvest)
library(lubridate)
library(robotstxt)
# check that we are permitted to scrape data from the Cornell Review
paths_allowed("https://www.thecornellreview.org/")

# read the first page
page <- read_html("https://www.thecornellreview.org/")

# extract desired components
titles <- html_elements(x = page, css = "#main .read-title a") |>
  html_text2()
authors <- html_elements(x = page, css = "#main .byline a") |>
  html_text2()
article_dates <- html_elements(x = page, css = "#main .posts-date") |>
  html_text2()
topics <- html_elements(x = page, css = "#main .cat-links") |>
  html_text2()
abstracts <- html_elements(x = page, css = ".post-description") |>
  html_text2()
post_urls <- html_elements(x = page, css = ".aft-readmore") |>
  html_attr(name = "href")

# combine the components into a tibble (one row per article)
review_raw <- tibble(
  title = titles,
  author = authors,
  date = article_dates,
  topic = topics,
  description = abstracts,
  url = post_urls
)

# clean up the data: parse the date and strip the trailing "Read More" text
review <- review_raw |>
  mutate(
    date = mdy(date),
    description = str_remove(string = description, pattern = "\nRead More")
  )
######## write a for loop to scrape the first 10 pages

# preallocate the list output: one element per page.
# NOTE(fix): the original preallocated length = 5, which contradicts the
# stated goal of scraping the first 10 pages (and the 100-row result of the
# map() version below); use 10 so both approaches agree.
scrape_results <- vector(mode = "list", length = 10)

for (page_num in seq_along(scrape_results)) {
  # pause for a couple of seconds to prevent rapid HTTP requests
  Sys.sleep(2)

  # create the url for this page of the archive
  url <- str_glue("https://www.thecornellreview.org/page/{page_num}/")

  # read the page
  page <- read_html(url)

  # extract desired components
  titles <- html_elements(x = page, css = "#main .read-title a") |>
    html_text2()
  authors <- html_elements(x = page, css = "#main .byline a") |>
    html_text2()
  article_dates <- html_elements(x = page, css = "#main .posts-date") |>
    html_text2()
  topics <- html_elements(x = page, css = "#main .cat-links") |>
    html_text2()
  abstracts <- html_elements(x = page, css = ".post-description") |>
    html_text2()
  post_urls <- html_elements(x = page, css = ".aft-readmore") |>
    html_attr(name = "href")

  # combine the components into a tibble (one row per article)
  review_raw <- tibble(
    title = titles,
    author = authors,
    date = article_dates,
    topic = topics,
    description = abstracts,
    url = post_urls
  )

  # clean up the data: parse the date and strip the trailing "Read More" text
  review <- review_raw |>
    mutate(
      date = mdy(date),
      description = str_remove(string = description, pattern = "\nRead More")
    )

  # store in list output
  scrape_results[[page_num]] <- review
}

# collapse list of data frames to a single data frame
scrape_df <- list_rbind(x = scrape_results)
######## write a function to scrape a single page and use a map() function
######## to iterate over the first ten pages

# scrape_review(): download one archive page of The Cornell Review and
# return a cleaned tibble with one row per article.
#
# url: character scalar, e.g. "https://www.thecornellreview.org/page/2/"
# returns: a tibble with columns title, author, date (Date), topic,
#   description, url
scrape_review <- function(url) {
  # pause for a couple of seconds to prevent rapid HTTP requests
  Sys.sleep(2)

  # read the requested page
  page <- read_html(url)

  # extract desired components
  titles <- html_elements(x = page, css = "#main .read-title a") |>
    html_text2()
  authors <- html_elements(x = page, css = "#main .byline a") |>
    html_text2()
  article_dates <- html_elements(x = page, css = "#main .posts-date") |>
    html_text2()
  topics <- html_elements(x = page, css = "#main .cat-links") |>
    html_text2()
  abstracts <- html_elements(x = page, css = ".post-description") |>
    html_text2()
  post_urls <- html_elements(x = page, css = ".aft-readmore") |>
    html_attr(name = "href")

  # combine the components into a tibble (one row per article)
  review_raw <- tibble(
    title = titles,
    author = authors,
    date = article_dates,
    topic = topics,
    description = abstracts,
    url = post_urls
  )

  # clean up the data: parse the date and strip the trailing "Read More" text
  review <- review_raw |>
    mutate(
      date = mdy(date),
      description = str_remove(string = description, pattern = "\nRead More")
    )

  # export the resulting data frame
  return(review)
}
# test the function on a few individual pages
## page 1
scrape_review(url = "https://www.thecornellreview.org/page/1/")
## page 2
scrape_review(url = "https://www.thecornellreview.org/page/2/")
## page 3
scrape_review(url = "https://www.thecornellreview.org/page/3/")

# create a vector of URLs for the first ten pages
page_nums <- 1:10
cr_urls <- str_glue("https://www.thecornellreview.org/page/{page_nums}/")
cr_urls

# map the scraping function over the URLs, then collapse the
# resulting list of data frames into a single data frame
cr_reviews <- map(.x = cr_urls, .f = scrape_review, .progress = TRUE) |>
  list_rbind()

# write data
write_csv(x = cr_reviews, file = "data/cornell-review-all.csv")
Part 2 - Data analysis
Demo: Import the scraped data set.
cr_reviews <- read_csv(file = "data/cornell-review-all.csv")
Rows: 100 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (5): title, author, topic, description, url
date (1): date
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
cr_reviews
# A tibble: 100 × 6
title author date topic description url
<chr> <chr> <date> <chr> <chr> <chr>
1 Jamelle Bouie Emphasizes Substance… Casey… 2023-10-10 "Cam… "As part o… http…
2 Noted Experts Discuss Free Express… Corne… 2023-10-09 "Cam… "Cornell h… http…
3 Cornell Review’s Free Expression E… The C… 2023-10-05 "Cam… "The Corne… http…
4 Opinion: Cornell Should Earmark It… Revie… 2023-10-04 "Bey… "President… http…
5 Grad Student Organizing in the Fre… Corne… 2023-10-02 "Cam… "The Corne… http…
6 Breaking: Common Council Postpones… Corne… 2023-09-28 "Bey… "Ithaca Co… http…
7 WHALEN | Cornell’s Promise: Any (l… Rodge… 2023-09-27 "Cam… "With affi… http…
8 FIRE speaker visits Cornell, gives… Revie… 2023-09-26 "Cam… "Murnane c… http…
9 RESCHINI | Free expression and stu… Rodge… 2023-09-20 "Cam… "When one … http…
10 Ho Plaza to remain closed through … Rodge… 2023-09-19 "Cam… "Construct… http…
# ℹ 90 more rows
Demo: Who are the most prolific authors?
cr_reviews |>
  # adjust order of authors so they appear from most to least frequent
  mutate(author = fct_infreq(f = author) |>
    fct_rev()) |>
  # horizontal bar chart
  ggplot(mapping = aes(y = author)) +
  geom_bar()
Demo: What topics does The Cornell Review write about?
# basic bar plot: one bar per unique (possibly multi-topic) string
cr_reviews |>
  ggplot(mapping = aes(y = topic)) +
  geom_bar()
Not super helpful. Each article can have multiple topics. What is the syntax for this column?
# inspect the raw syntax of the topic column
cr_reviews |>
  select(topic)
# A tibble: 100 × 1
topic
<chr>
1 "Campus"
2 "Campus"
3 "Campus"
4 "Beyond Cayuga's Waters"
5 "Campus"
6 "Beyond Cayuga's Waters"
7 "Campus\nOpinion"
8 "Campus"
9 "Campus"
10 "Campus"
# ℹ 90 more rows
Each topic is separated by a "\n". Since the number of topics varies for each article, we cannot use separate() on this column. Instead we can use a stringr function to split them into distinct character strings.
# split each topic string on the newline separator,
# producing a list-column of character vectors
cr_reviews |>
  mutate(topic = str_split(string = topic, pattern = "\\n"))
# A tibble: 100 × 6
title author date topic description url
<chr> <chr> <date> <lis> <chr> <chr>
1 Jamelle Bouie Emphasizes Substance… Casey… 2023-10-10 <chr> "As part o… http…
2 Noted Experts Discuss Free Express… Corne… 2023-10-09 <chr> "Cornell h… http…
3 Cornell Review’s Free Expression E… The C… 2023-10-05 <chr> "The Corne… http…
4 Opinion: Cornell Should Earmark It… Revie… 2023-10-04 <chr> "President… http…
5 Grad Student Organizing in the Fre… Corne… 2023-10-02 <chr> "The Corne… http…
6 Breaking: Common Council Postpones… Corne… 2023-09-28 <chr> "Ithaca Co… http…
7 WHALEN | Cornell’s Promise: Any (l… Rodge… 2023-09-27 <chr> "With affi… http…
8 FIRE speaker visits Cornell, gives… Revie… 2023-09-26 <chr> "Murnane c… http…
9 RESCHINI | Free expression and stu… Rodge… 2023-09-20 <chr> "When one … http…
10 Ho Plaza to remain closed through … Rodge… 2023-09-19 <chr> "Construct… http…
# ℹ 90 more rows
This makes the column a list-column with each element a separate character vector. From here we need to unnest the column so each row contains a single topic value.
# unnest the list-column so each row holds a single topic value
cr_reviews |>
  mutate(topic = str_split(string = topic, pattern = "\\n")) |>
  unnest_longer(col = topic)
# A tibble: 135 × 6
title author date topic description url
<chr> <chr> <date> <chr> <chr> <chr>
1 Jamelle Bouie Emphasizes Substance… Casey… 2023-10-10 Camp… "As part o… http…
2 Noted Experts Discuss Free Express… Corne… 2023-10-09 Camp… "Cornell h… http…
3 Cornell Review’s Free Expression E… The C… 2023-10-05 Camp… "The Corne… http…
4 Opinion: Cornell Should Earmark It… Revie… 2023-10-04 Beyo… "President… http…
5 Grad Student Organizing in the Fre… Corne… 2023-10-02 Camp… "The Corne… http…
6 Breaking: Common Council Postpones… Corne… 2023-09-28 Beyo… "Ithaca Co… http…
7 WHALEN | Cornell’s Promise: Any (l… Rodge… 2023-09-27 Camp… "With affi… http…
8 WHALEN | Cornell’s Promise: Any (l… Rodge… 2023-09-27 Opin… "With affi… http…
9 FIRE speaker visits Cornell, gives… Revie… 2023-09-26 Camp… "Murnane c… http…
10 RESCHINI | Free expression and stu… Rodge… 2023-09-20 Camp… "When one … http…
# ℹ 125 more rows
Notice the data frame now has additional rows. The unit of analysis is now an article-topic combination, rather than one-row-per-article. Not entirely a tidy structure, but necessary to construct a chart to visualize topic frequency.
# bar chart of individual topics (article-topic unit of analysis)
cr_reviews |>
  mutate(topic = str_split(string = topic, pattern = "\\n")) |>
  unnest_longer(col = topic) |>
  ggplot(mapping = aes(y = topic)) +
  geom_bar()
Let’s clean this up like the previous chart.
cr_reviews |>
  mutate(topic = str_split(string = topic, pattern = "\\n")) |>
  unnest_longer(col = topic) |>
  # str_trim() - remove whitespace characters at the beginning
  # and end of character strings; then order topics from most
  # to least frequent for the horizontal bar chart
  mutate(topic = str_trim(string = topic) |>
    fct_infreq() |>
    fct_rev()) |>
  ggplot(mapping = aes(y = topic)) +
  geom_bar()
sessioninfo::session_info()
─ Session info ───────────────────────────────────────────────────────────────
setting value
version R version 4.3.1 (2023-06-16)
os macOS Ventura 13.5.2
system aarch64, darwin20
ui X11
language (EN)
collate en_US.UTF-8
ctype en_US.UTF-8
tz America/New_York
date 2023-10-12
pandoc 3.1.1 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/ (via rmarkdown)
─ Packages ───────────────────────────────────────────────────────────────────
package * version date (UTC) lib source
bit 4.0.5 2022-11-15 [1] CRAN (R 4.3.0)
bit64 4.0.5 2020-08-30 [1] CRAN (R 4.3.0)
cli 3.6.1 2023-03-23 [1] CRAN (R 4.3.0)
colorspace 2.1-0 2023-01-23 [1] CRAN (R 4.3.0)
crayon 1.5.2 2022-09-29 [1] CRAN (R 4.3.0)
digest 0.6.31 2022-12-11 [1] CRAN (R 4.3.0)
dplyr * 1.1.2 2023-04-20 [1] CRAN (R 4.3.0)
evaluate 0.21 2023-05-05 [1] CRAN (R 4.3.0)
fansi 1.0.4 2023-01-22 [1] CRAN (R 4.3.0)
farver 2.1.1 2022-07-06 [1] CRAN (R 4.3.0)
fastmap 1.1.1 2023-02-24 [1] CRAN (R 4.3.0)
forcats * 1.0.0 2023-01-29 [1] CRAN (R 4.3.0)
generics 0.1.3 2022-07-05 [1] CRAN (R 4.3.0)
ggplot2 * 3.4.2 2023-04-03 [1] CRAN (R 4.3.0)
glue 1.6.2 2022-02-24 [1] CRAN (R 4.3.0)
gtable 0.3.3 2023-03-21 [1] CRAN (R 4.3.0)
here 1.0.1 2020-12-13 [1] CRAN (R 4.3.0)
hms 1.1.3 2023-03-21 [1] CRAN (R 4.3.0)
htmltools 0.5.5 2023-03-23 [1] CRAN (R 4.3.0)
htmlwidgets 1.6.2 2023-03-17 [1] CRAN (R 4.3.0)
httr 1.4.6 2023-05-08 [1] CRAN (R 4.3.0)
jsonlite 1.8.5 2023-06-05 [1] CRAN (R 4.3.0)
knitr 1.43 2023-05-25 [1] CRAN (R 4.3.0)
labeling 0.4.2 2020-10-20 [1] CRAN (R 4.3.0)
lifecycle 1.0.3 2022-10-07 [1] CRAN (R 4.3.0)
lubridate * 1.9.2 2023-02-10 [1] CRAN (R 4.3.0)
magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.3.0)
munsell 0.5.0 2018-06-12 [1] CRAN (R 4.3.0)
pillar 1.9.0 2023-03-22 [1] CRAN (R 4.3.0)
pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.3.0)
purrr * 1.0.1 2023-01-10 [1] CRAN (R 4.3.0)
R6 2.5.1 2021-08-19 [1] CRAN (R 4.3.0)
readr * 2.1.4 2023-02-10 [1] CRAN (R 4.3.0)
rlang 1.1.1 2023-04-28 [1] CRAN (R 4.3.0)
rmarkdown 2.22 2023-06-01 [1] CRAN (R 4.3.0)
robotstxt * 0.7.13 2020-09-03 [1] CRAN (R 4.3.0)
rprojroot 2.0.3 2022-04-02 [1] CRAN (R 4.3.0)
rstudioapi 0.14 2022-08-22 [1] CRAN (R 4.3.0)
rvest * 1.0.3 2022-08-19 [1] CRAN (R 4.3.0)
scales 1.2.1 2022-08-20 [1] CRAN (R 4.3.0)
sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.3.0)
stringi 1.7.12 2023-01-11 [1] CRAN (R 4.3.0)
stringr * 1.5.0 2022-12-02 [1] CRAN (R 4.3.0)
tibble * 3.2.1 2023-03-20 [1] CRAN (R 4.3.0)
tidyr * 1.3.0 2023-01-24 [1] CRAN (R 4.3.0)
tidyselect 1.2.0 2022-10-10 [1] CRAN (R 4.3.0)
tidyverse * 2.0.0 2023-02-22 [1] CRAN (R 4.3.0)
timechange 0.2.0 2023-01-11 [1] CRAN (R 4.3.0)
tzdb 0.4.0 2023-05-12 [1] CRAN (R 4.3.0)
utf8 1.2.3 2023-01-31 [1] CRAN (R 4.3.0)
vctrs 0.6.3 2023-06-14 [1] CRAN (R 4.3.0)
vroom 1.6.3 2023-04-28 [1] CRAN (R 4.3.0)
withr 2.5.0 2022-03-03 [1] CRAN (R 4.3.0)
xfun 0.39 2023-04-20 [1] CRAN (R 4.3.0)
xml2 1.3.4 2023-04-27 [1] CRAN (R 4.3.0)
yaml 2.3.7 2023-01-23 [1] CRAN (R 4.3.0)
[1] /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/library
──────────────────────────────────────────────────────────────────────────────