library(tidyverse)
library(tidymodels)
library(stringr)
library(textrecipes)
library(themis)
library(vip)
# set seed for randomization
set.seed(123)
theme_set(theme_minimal(base_size = 13))
Slay: Predicting song artist based on lyrics
Application exercise
Import data
lyrics <- read_csv(file = "data/beyonce-swift-lyrics.csv") |>
  mutate(artist = factor(artist))
lyrics
# A tibble: 309 × 19
album_name track_number track_name artist lyrics danceability energy loudness
<chr> <dbl> <chr> <fct> <chr> <dbl> <dbl> <dbl>
1 RENAISSAN… 1 I'M THAT … Beyon… "Plea… 0.554 0.535 -8.96
2 RENAISSAN… 2 COZY Beyon… "This… 0.556 0.63 -8.15
3 RENAISSAN… 3 ALIEN SUP… Beyon… "Plea… 0.545 0.641 -6.40
4 RENAISSAN… 4 CUFF IT Beyon… "I fe… 0.78 0.689 -5.67
5 RENAISSAN… 5 ENERGY (f… Beyon… "On s… 0.903 0.519 -9.15
6 RENAISSAN… 6 BREAK MY … Beyon… "I'm … 0.693 0.887 -5.04
7 RENAISSAN… 7 CHURCH GI… Beyon… "(Lor… 0.792 0.919 -5.69
8 RENAISSAN… 8 PLASTIC O… Beyon… "Boy,… 0.618 0.712 -8.25
9 RENAISSAN… 9 VIRGO'S G… Beyon… "Baby… 0.683 0.85 -5.04
10 RENAISSAN… 10 MOVE (fea… Beyon… "Move… 0.876 0.628 -6.60
# ℹ 299 more rows
# ℹ 11 more variables: speechiness <dbl>, acousticness <dbl>,
# instrumentalness <dbl>, liveness <dbl>, valence <dbl>, tempo <dbl>,
# time_signature <dbl>, duration_ms <dbl>, explicit <lgl>, key_name <chr>,
# mode_name <chr>
Split the data into analysis/assessment/test sets
Your turn:
- Split the data into training/test sets with 75% allocated for training
- Split the training set into 10 cross-validation folds
# split into training/testing
set.seed(123)
lyrics_split <- initial_split(data = _____, strata = _____, prop = _____)

lyrics_train <- _____(lyrics_split)
lyrics_test <- _____(lyrics_split)

# create cross-validation folds
lyrics_folds <- vfold_cv(data = _____, strata = _____)
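If you want to check your work, here is one possible completion, sketched under the assumption that the split and folds are stratified by artist:

# sketch: 75% training split, stratified by the outcome
lyrics_split <- initial_split(data = lyrics, strata = artist, prop = 0.75)

lyrics_train <- training(lyrics_split)
lyrics_test <- testing(lyrics_split)

# sketch: 10 cross-validation folds from the training set
lyrics_folds <- vfold_cv(data = lyrics_train, strata = artist, v = 10)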
Estimate the null model for a baseline comparison
Your turn: Estimate a null model to determine an appropriate baseline for evaluating a model’s performance.
null_spec <- null_model() |>
  set_engine("parsnip") |>
  set_mode("classification")

null_spec |>
  fit_resamples(
    artist ~ .,
    resamples = lyrics_folds
  ) |>
  collect_metrics()
Fit a random forest model
Define the feature engineering recipe
Demonstration:
- Define a feature engineering recipe to predict the song’s artist as a function of the lyrics + audio features
- Exclude the ID variables from the recipe
- Tokenize the song lyrics
- Remove stop words
- Only keep the 500 most frequently appearing tokens
- Calculate tf-idf scores for the remaining tokens
This will generate one column for every token. Each column will have the standardized name tfidf_lyrics_*, where * is the specific token. Instead we would prefer the column names to simply be *. You can remove the tfidf_lyrics_ prefix using

# Simplify these names
step_rename_at(
  starts_with("tfidf_lyrics_"),
  fn = \(x) str_replace_all(
    string = x,
    pattern = "tfidf_lyrics_",
    replacement = ""
  )
)
This does cause a conflict between the energy audio feature and the token energy. We will add a prefix to the audio features to avoid this conflict.

# Add a prefix to the audio features
step_rename_at(
  all_predictors(),
  -starts_with("tfidf_lyrics_"),
  fn = \(x) str_glue("af_{x}")
)
- Downsample the observations so there are an equal number of songs by Beyoncé and Taylor Swift in the analysis set
# define preprocessing recipe
lyrics_rec <- recipe(artist ~ ., data = lyrics_train) |>
  ...

lyrics_rec
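Here is a sketch of one possible recipe implementing the steps above. It assumes album_name, track_number, and track_name are the ID variables, and the step order shown is one reasonable choice rather than the only one:

lyrics_rec <- recipe(artist ~ ., data = lyrics_train) |>
  # keep the ID variables but exclude them from the model (assumed ID columns)
  update_role(album_name, track_number, track_name, new_role = "id") |>
  # tokenize the lyrics, drop stop words, keep the 500 most frequent tokens
  step_tokenize(lyrics) |>
  step_stopwords(lyrics) |>
  step_tokenfilter(lyrics, max_tokens = 500) |>
  # tf-idf scores for the remaining tokens
  step_tfidf(lyrics) |>
  # prefix the audio features so they cannot collide with token columns
  step_rename_at(
    all_predictors(), -starts_with("tfidf_lyrics_"),
    fn = \(x) str_glue("af_{x}")
  ) |>
  # drop the tfidf_lyrics_ prefix from the token columns
  step_rename_at(
    starts_with("tfidf_lyrics_"),
    fn = \(x) str_replace_all(string = x, pattern = "tfidf_lyrics_", replacement = "")
  ) |>
  # balance Beyoncé and Taylor Swift songs in the analysis set
  step_downsample(artist)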
Fit the model
Demonstration:
- Define a random forest model grown with 1000 trees using the ranger engine.
- Define a workflow using the feature engineering recipe and random forest model specification. Fit the workflow using the cross-validation folds.
- Use control = control_resamples(save_pred = TRUE) to save the assessment set predictions. We need these to assess the model’s performance.
# define the model specification
ranger_spec <- rand_forest(trees = 1000) |>
  set_mode("classification") |>
  # calculate feature importance metrics using the ranger engine
  set_engine("ranger", importance = "permutation")

# define the workflow
ranger_workflow <- workflow() |>
  add_recipe(lyrics_rec) |>
  add_model(ranger_spec)

# fit the model to each of the cross-validation folds
ranger_cv <- ranger_workflow |>
  fit_resamples(
    resamples = lyrics_folds,
    control = control_resamples(save_pred = TRUE, save_workflow = TRUE)
  )
Evaluate model performance
Demonstration:
- Calculate the model’s accuracy and ROC AUC. How did it perform?
- Draw the ROC curve for each validation fold
- Generate the resampled confusion matrix for the model and draw it using a heatmap. How does the model perform when predicting Beyoncé songs relative to Taylor Swift songs?
# extract metrics and predictions
ranger_cv_metrics <- collect_metrics(ranger_cv)
ranger_cv_predictions <- collect_predictions(ranger_cv)
# how well did the model perform?
ranger_cv_metrics
# roc curve
ranger_cv_predictions |>
  ...
# confusion matrix
## your code here
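If you get stuck, one possible approach is sketched below, using roc_curve() grouped by fold and conf_mat_resampled() from tune:

# sketch: ROC curve for each assessment fold
ranger_cv_predictions |>
  group_by(id) |>
  roc_curve(truth = artist, .pred_Beyoncé) |>
  autoplot()

# sketch: resampled confusion matrix drawn as a heatmap
conf_mat_resampled(x = ranger_cv, tidy = FALSE) |>
  autoplot(type = "heatmap")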
Penalized regression
Define the feature engineering recipe
Demonstration:
- Define a feature engineering recipe to predict the song’s artist as a function of the lyrics + audio features
- Exclude the ID variables from the recipe
- Tokenize the song lyrics
- Calculate all possible 1-grams, 2-grams, 3-grams, 4-grams, and 5-grams
- Remove stop words
- Only keep the 2000 most frequently appearing tokens
- Calculate tf-idf scores for the remaining tokens
- Rename the audio features and tf-idf columns as before
- Apply required steps for penalized regression models
  - Convert the explicit variable to a factor
  - Convert nominal predictors to dummy variables
  - Get rid of zero-variance predictors
  - Normalize all predictors to mean of 0 and variance of 1
- Downsample the observations so there are an equal number of songs by Beyoncé and Taylor Swift in the analysis set
glmnet_rec <- recipe(artist ~ ., data = lyrics_train) |>
  ...

glmnet_rec
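A sketch of one possible recipe for the penalized regression model, under the same assumptions about the ID variables; converting explicit to a factor before tokenizing is one way (not the only way) to satisfy the requirements:

glmnet_rec <- recipe(artist ~ ., data = lyrics_train) |>
  update_role(album_name, track_number, track_name, new_role = "id") |>
  # explicit is logical; make it a factor so it becomes a dummy variable later
  step_mutate(explicit = factor(explicit)) |>
  # tokenize, remove stop words, and build 1- through 5-grams
  step_tokenize(lyrics) |>
  step_stopwords(lyrics) |>
  step_ngram(lyrics, num_tokens = 5, min_num_tokens = 1) |>
  # keep the 2000 most frequent tokens and convert to tf-idf scores
  step_tokenfilter(lyrics, max_tokens = 2000) |>
  step_tfidf(lyrics) |>
  # rename the audio features and tf-idf columns as before
  step_rename_at(
    all_predictors(), -starts_with("tfidf_lyrics_"),
    fn = \(x) str_glue("af_{x}")
  ) |>
  step_rename_at(
    starts_with("tfidf_lyrics_"),
    fn = \(x) str_replace_all(string = x, pattern = "tfidf_lyrics_", replacement = "")
  ) |>
  # required steps for penalized regression
  step_dummy(all_nominal_predictors()) |>
  step_zv(all_predictors()) |>
  step_normalize(all_numeric_predictors()) |>
  step_downsample(artist)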
Tune the penalized regression model
Demonstration:
- Define the penalized regression model specification, including tuning placeholders for penalty and mixture
- Create the workflow object
- Define a tuning grid with every combination of:
  - penalty = 10^seq(-6, -1, length.out = 20)
  - mixture = c(0, 0.2, 0.4, 0.6, 0.8, 1)
- Tune the model using the cross-validation folds
- Evaluate the tuning procedure and identify the best performing models based on ROC AUC
# define the penalized regression model specification
glmnet_spec <- logistic_reg(penalty = tune(), mixture = tune()) |>
  set_mode("classification") |>
  set_engine("glmnet")

# define the new workflow
glmnet_workflow <- workflow() |>
  add_recipe(glmnet_rec) |>
  add_model(glmnet_spec)

# create the tuning grid
glmnet_grid <- expand_grid(
  penalty = 10^seq(-6, -1, length.out = 20),
  mixture = c(0, 0.2, 0.4, 0.6, 0.8, 1)
)

# tune over the model hyperparameters
glmnet_tune <- tune_grid(
  object = glmnet_workflow,
  resamples = lyrics_folds,
  grid = glmnet_grid,
  control = control_grid(save_pred = TRUE, save_workflow = TRUE)
)
# evaluate results
collect_metrics(x = glmnet_tune)
autoplot(glmnet_tune)
# identify the five best hyperparameter combinations
show_best(x = glmnet_tune, metric = "roc_auc")
Fit the best model
Your turn:
- Select the model + hyperparameter combinations that achieve the highest ROC AUC
- Fit that model using the best hyperparameters and the full training set. How well does the model perform on the test set?
# select the best model's hyperparameters
lyrics_best <- fit_best(...)
# test set ROC AUC
bind_cols(
  lyrics_test,
  predict(lyrics_best, new_data = lyrics_test, type = "prob")
) |>
  roc_auc(truth = artist, .pred_Beyoncé)
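The exact fit_best() call depends on which model performed best. A sketch, assuming the resampling and tuning results were saved with save_workflow = TRUE (both control calls above do this):

# sketch: refit the winning model on the full training set
# the variable importance code below treats lyrics_best as the random forest
lyrics_best <- fit_best(ranger_cv, metric = "roc_auc")
# or, if the penalized regression achieved the higher ROC AUC:
# lyrics_best <- fit_best(glmnet_tune, metric = "roc_auc")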
Variable importance
We can examine the results of each model to evaluate which tokens were the most important in generating artist predictions. Here we use vip to calculate importance.
# extract parsnip model fit
rf_imp <- extract_fit_parsnip(lyrics_best) |>
  vi(method = "model")

# clean up the data frame for visualization
rf_imp |>
  # extract 20 most important n-grams
  slice_max(order_by = Importance, n = 20) |>
  mutate(Variable = fct_reorder(.f = Variable, .x = Importance)) |>
  ggplot(mapping = aes(
    x = Importance,
    y = Variable
  )) +
  geom_col() +
  labs(
    y = NULL,
    title = "Most relevant features for predicting whether\na song is by Beyoncé or Taylor Swift",
    subtitle = "Random forest model"
  )
# extract parsnip model fit
glmnet_imp <- glmnet_tune |>
  fit_best() |>
  extract_fit_parsnip() |>
  vi(method = "model", lambda = select_best(x = glmnet_tune, metric = "roc_auc")$penalty)

# clean up the data frame for visualization
glmnet_imp |>
  mutate(
    Sign = case_when(
      Sign == "NEG" ~ "More likely from Beyoncé",
      Sign == "POS" ~ "More likely from Taylor Swift"
    ),
    Importance = abs(Importance)
  ) |>
  # importance must be greater than 0
  filter(Importance > 0) |>
  # keep top 20 features for each artist
  slice_max(n = 20, order_by = Importance, by = Sign) |>
  mutate(Variable = fct_reorder(.f = Variable, .x = Importance)) |>
  ggplot(mapping = aes(
    x = Importance,
    y = Variable,
    fill = Sign
  )) +
  geom_col(show.legend = FALSE) +
  scale_fill_brewer(type = "qual") +
  facet_wrap(facets = vars(Sign), scales = "free_y") +
  labs(
    y = NULL,
    title = "Most relevant features for predicting whether\na song is by Beyoncé or Taylor Swift",
    subtitle = "Penalized regression model"
  )