Slay: Predicting song artist based on lyrics

Application exercise
library(tidyverse)
library(tidymodels)
library(stringr)
library(textrecipes)
library(themis)
library(vip)

# set seed for randomization
set.seed(123)

theme_set(theme_minimal(base_size = 13))

Import data

lyrics <- read_csv(file = "data/beyonce-swift-lyrics.csv") |>
  mutate(artist = factor(artist))
lyrics
# A tibble: 309 × 19
   album_name track_number track_name artist lyrics danceability energy loudness
   <chr>             <dbl> <chr>      <fct>  <chr>         <dbl>  <dbl>    <dbl>
 1 RENAISSAN…            1 I'M THAT … Beyon… "Plea…        0.554  0.535    -8.96
 2 RENAISSAN…            2 COZY       Beyon… "This…        0.556  0.63     -8.15
 3 RENAISSAN…            3 ALIEN SUP… Beyon… "Plea…        0.545  0.641    -6.40
 4 RENAISSAN…            4 CUFF IT    Beyon… "I fe…        0.78   0.689    -5.67
 5 RENAISSAN…            5 ENERGY (f… Beyon… "On s…        0.903  0.519    -9.15
 6 RENAISSAN…            6 BREAK MY … Beyon… "I'm …        0.693  0.887    -5.04
 7 RENAISSAN…            7 CHURCH GI… Beyon… "(Lor…        0.792  0.919    -5.69
 8 RENAISSAN…            8 PLASTIC O… Beyon… "Boy,…        0.618  0.712    -8.25
 9 RENAISSAN…            9 VIRGO'S G… Beyon… "Baby…        0.683  0.85     -5.04
10 RENAISSAN…           10 MOVE (fea… Beyon… "Move…        0.876  0.628    -6.60
# ℹ 299 more rows
# ℹ 11 more variables: speechiness <dbl>, acousticness <dbl>,
#   instrumentalness <dbl>, liveness <dbl>, valence <dbl>, tempo <dbl>,
#   time_signature <dbl>, duration_ms <dbl>, explicit <lgl>, key_name <chr>,
#   mode_name <chr>

Split the data into analysis/assessment/test sets

Your turn:

  • Split the data into training/test sets with 75% allocated for training
  • Split the training set into 10 cross-validation folds
# split into training/testing
set.seed(123)
lyrics_split <- initial_split(data = _____, strata = _____, prop = _____)

lyrics_train <- _____(lyrics_split)
lyrics_test <- _____(lyrics_split)

# create cross-validation folds
lyrics_folds <- vfold_cv(data = _____, strata = _____)
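One possible completion (a sketch, not the only valid approach): stratify both the split and the folds on artist so each set preserves the artist mix.

# possible completion: stratified 75/25 split, then 10 folds
set.seed(123)
lyrics_split <- initial_split(data = lyrics, strata = artist, prop = 0.75)

lyrics_train <- training(lyrics_split)
lyrics_test <- testing(lyrics_split)

# create cross-validation folds (v = 10 is the vfold_cv() default)
lyrics_folds <- vfold_cv(data = lyrics_train, strata = artist)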

Estimate the null model for a baseline comparison

Your turn: Estimate a null model to determine an appropriate baseline for evaluating a model’s performance.

# null model: always predicts the most common class in the training data
null_spec <- null_model() |>
  set_engine("parsnip") |>
  set_mode("classification")

null_spec |>
  fit_resamples(
    artist ~ .,
    resamples = lyrics_folds
  ) |>
  collect_metrics()

Fit a random forest model

Define the feature engineering recipe

Demonstration:

  • Define a feature engineering recipe to predict the song’s artist as a function of the lyrics + audio features
  • Exclude the ID variables from the recipe
  • Tokenize the song lyrics
  • Remove stop words
  • Only keep the 500 most frequently appearing tokens
  • Calculate tf-idf scores for the remaining tokens
    • This will generate one column for every token. Each column will have the standardized name tfidf_lyrics_*, where * is the specific token. Instead, we would prefer the column names to be simply *. You can remove the tfidf_lyrics_ prefix using

      # Simplify these names
      step_rename_at(starts_with("tfidf_lyrics_"),
        fn = \(x) str_replace_all(
          string = x,
          pattern = "tfidf_lyrics_",
          replacement = ""
        )
      )
    • This does cause a conflict between the energy audio feature and the token energy. We will add a prefix to the audio features to avoid this conflict.

      # Prefix the audio features to avoid the name conflict
      step_rename_at(
        all_predictors(), -starts_with("tfidf_lyrics_"),
        fn = \(x) str_glue("af_{x}")
      )
  • Downsample the observations so there are an equal number of songs by Beyoncé and Taylor Swift in the analysis set
# define preprocessing recipe
lyrics_rec <- recipe(artist ~ ., data = lyrics_train) |>
  ...
lyrics_rec
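One way to assemble these steps (a sketch; it assumes the ID variables are album_name, track_number, and track_name, and that step_stopwords() can use its default English stop word list from the stopwords package):

# possible recipe: give the ID columns an "id" role so they are kept
# but not used as predictors
lyrics_rec <- recipe(artist ~ ., data = lyrics_train) |>
  update_role(album_name, track_number, track_name, new_role = "id") |>
  # tokenize the lyrics and remove stop words
  step_tokenize(lyrics) |>
  step_stopwords(lyrics) |>
  # keep the 500 most frequent tokens, then compute tf-idf scores
  step_tokenfilter(lyrics, max_tokens = 500) |>
  step_tfidf(lyrics) |>
  # prefix the audio features first, while the token columns still carry
  # the tfidf_lyrics_ prefix that excludes them from this step
  step_rename_at(
    all_predictors(), -starts_with("tfidf_lyrics_"),
    fn = \(x) str_glue("af_{x}")
  ) |>
  # then strip the tfidf_lyrics_ prefix from the token columns
  step_rename_at(
    starts_with("tfidf_lyrics_"),
    fn = \(x) str_replace_all(
      string = x,
      pattern = "tfidf_lyrics_",
      replacement = ""
    )
  ) |>
  # downsample so both artists appear equally often in the analysis set
  step_downsample(artist)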

Fit the model

Demonstration:

  • Define a random forest model grown with 1000 trees using the ranger engine.
  • Define a workflow using the feature engineering recipe and random forest model specification. Fit the workflow using the cross-validation folds.
    • Use control = control_resamples(save_pred = TRUE) to save the assessment set predictions. We need these to assess the model’s performance.
# define the model specification
ranger_spec <- rand_forest(trees = 1000) |>
  set_mode("classification") |>
  # calculate feature importance metrics using the ranger engine
  set_engine("ranger", importance = "permutation")

# define the workflow
ranger_workflow <- workflow() |>
  add_recipe(lyrics_rec) |>
  add_model(ranger_spec)

# fit the model to each of the cross-validation folds
# (save_workflow = TRUE so the best model can be refit later with fit_best())
ranger_cv <- ranger_workflow |>
  fit_resamples(
    resamples = lyrics_folds,
    control = control_resamples(save_pred = TRUE, save_workflow = TRUE)
  )

Evaluate model performance

Demonstration:

  • Calculate the model’s accuracy and ROC AUC. How did it perform?
  • Draw the ROC curve for each validation fold
  • Generate the resampled confusion matrix for the model and draw it using a heatmap. How does the model perform predicting Beyoncé songs relative to Taylor Swift songs?
# extract metrics and predictions
ranger_cv_metrics <- collect_metrics(ranger_cv)
ranger_cv_predictions <- collect_predictions(ranger_cv)

# how well did the model perform?
ranger_cv_metrics

# roc curve
ranger_cv_predictions |>
  ...

# confusion matrix
## your code here
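One possible approach (a sketch): group the assessment-set predictions by fold id to get per-fold ROC curves, and use conf_mat_resampled() from tune for the averaged confusion matrix.

# roc curve for each validation fold
ranger_cv_predictions |>
  group_by(id) |>
  roc_curve(truth = artist, .pred_Beyoncé) |>
  autoplot()

# resampled confusion matrix, drawn as a heatmap
conf_mat_resampled(x = ranger_cv, tidy = FALSE) |>
  autoplot(type = "heatmap")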

Penalized regression

Define the feature engineering recipe

Demonstration:

  • Define a feature engineering recipe to predict the song’s artist as a function of the lyrics + audio features
  • Exclude the ID variables from the recipe
  • Tokenize the song lyrics
  • Calculate all possible 1-grams, 2-grams, 3-grams, 4-grams, and 5-grams
  • Remove stop words
  • Only keep the 2000 most frequently appearing tokens
  • Calculate tf-idf scores for the remaining tokens
  • Rename the audio features and tf-idf columns as before
  • Apply required steps for penalized regression models
    • Convert the explicit variable to a factor
    • Convert nominal predictors to dummy variables
    • Get rid of zero-variance predictors
    • Normalize all predictors to mean of 0 and variance of 1
  • Downsample the observations so there are an equal number of songs by Beyoncé and Taylor Swift in the analysis set
glmnet_rec <- recipe(artist ~ ., data = lyrics_train) |>
  ...
glmnet_rec
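One way to build on the earlier recipe (a sketch; removing stop words before forming the n-grams is a design choice, so that stop words cannot appear inside them):

glmnet_rec <- recipe(artist ~ ., data = lyrics_train) |>
  update_role(album_name, track_number, track_name, new_role = "id") |>
  # explicit is logical; convert it to a factor so step_dummy() can encode it
  step_mutate(explicit = factor(explicit)) |>
  step_tokenize(lyrics) |>
  step_stopwords(lyrics) |>
  # all 1-grams through 5-grams
  step_ngram(lyrics, num_tokens = 5, min_num_tokens = 1) |>
  step_tokenfilter(lyrics, max_tokens = 2000) |>
  step_tfidf(lyrics) |>
  # rename the audio features and tf-idf columns as before
  step_rename_at(
    all_predictors(), -starts_with("tfidf_lyrics_"),
    fn = \(x) str_glue("af_{x}")
  ) |>
  step_rename_at(
    starts_with("tfidf_lyrics_"),
    fn = \(x) str_replace_all(
      string = x,
      pattern = "tfidf_lyrics_",
      replacement = ""
    )
  ) |>
  # required steps for penalized regression
  step_dummy(all_nominal_predictors()) |>
  step_zv(all_predictors()) |>
  step_normalize(all_predictors()) |>
  step_downsample(artist)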

Tune the penalized regression model

Demonstration:

  • Define the penalized regression model specification, including tuning placeholders for penalty and mixture
  • Create the workflow object
  • Define a tuning grid with every combination of:
    • penalty = 10^seq(-6, -1, length.out = 20)
    • mixture = c(0, 0.2, 0.4, 0.6, 0.8, 1)
  • Tune the model using the cross-validation folds
  • Evaluate the tuning procedure and identify the best performing models based on ROC AUC
# define the penalized regression model specification
glmnet_spec <- logistic_reg(penalty = tune(), mixture = tune()) |>
  set_mode("classification") |>
  set_engine("glmnet")

# define the new workflow
glmnet_workflow <- workflow() |>
  add_recipe(glmnet_rec) |>
  add_model(glmnet_spec)

# create the tuning grid
glmnet_grid <- expand_grid(
  penalty = 10^seq(-6, -1, length.out = 20),
  mixture = c(0, 0.2, 0.4, 0.6, 0.8, 1)
)

# tune over the model hyperparameters
glmnet_tune <- tune_grid(
  object = glmnet_workflow,
  resamples = lyrics_folds,
  grid = glmnet_grid,
  control = control_grid(save_pred = TRUE, save_workflow = TRUE)
)
# evaluate results
collect_metrics(x = glmnet_tune)
autoplot(glmnet_tune)

# identify the five best hyperparameter combinations
show_best(x = glmnet_tune, metric = "roc_auc")

Fit the best model

Your turn:

  • Select the model + hyperparameter combination that achieves the highest ROC AUC
  • Fit that model using the best hyperparameters and the full training set. How well does the model perform on the test set?
# select the best model's hyperparameters
lyrics_best <- fit_best(...)

# test set ROC AUC
bind_cols(
  lyrics_test,
  predict(lyrics_best, new_data = lyrics_test, type = "prob")
) |>
  roc_auc(truth = artist, .pred_Beyoncé)
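One possible completion of the fit_best() call above (a sketch): because the tuning results were saved with save_workflow = TRUE, fit_best() can select the best penalty/mixture combination and refit the workflow on the full training set in one step.

# select the best hyperparameters by ROC AUC and refit on the training set
lyrics_best <- fit_best(x = glmnet_tune, metric = "roc_auc")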

Variable importance

We can examine the results of each model to evaluate which tokens were the most important in generating artist predictions. Here we use the vip package to calculate importance scores.

# refit the best random forest and extract the parsnip model fit
rf_imp <- ranger_cv |>
  fit_best() |>
  extract_fit_parsnip() |>
  vi(method = "model")

# clean up the data frame for visualization
rf_imp |>
  # extract 20 most important n-grams
  slice_max(order_by = Importance, n = 20) |>
  mutate(Variable = fct_reorder(.f = Variable, .x = Importance)) |>
  ggplot(mapping = aes(
    x = Importance,
    y = Variable
  )) +
  geom_col() +
  labs(
    y = NULL,
    title = "Most relevant features for predicting whether\na song is by Beyoncé or Taylor Swift",
    subtitle = "Random forest model"
  )

# refit the best glmnet model and extract the parsnip model fit
glmnet_imp <- glmnet_tune |>
  fit_best() |>
  extract_fit_parsnip() |>
  # glmnet stores the full coefficient path, so vi() needs the lambda
  # (penalty) value of the selected model
  vi(
    method = "model",
    lambda = select_best(x = glmnet_tune, metric = "roc_auc")$penalty
  )

# clean up the data frame for visualization
glmnet_imp |>
  mutate(
    Sign = case_when(
      Sign == "NEG" ~ "More likely from Beyoncé",
      Sign == "POS" ~ "More likely from Taylor Swift"
    ),
    Importance = abs(Importance)
  ) |>
  # importance must be greater than 0
  filter(Importance > 0) |>
  # keep top 20 features for each artist
  slice_max(n = 20, order_by = Importance, by = Sign) |>
  mutate(Variable = fct_reorder(.f = Variable, .x = Importance)) |>
  ggplot(mapping = aes(
    x = Importance,
    y = Variable,
    fill = Sign
  )) +
  geom_col(show.legend = FALSE) +
  scale_fill_brewer(type = "qual") +
  facet_wrap(facets = vars(Sign), scales = "free_y") +
  labs(
    y = NULL,
    title = "Most relevant features for predicting whether\na song is by Beyoncé or Taylor Swift",
    subtitle = "Penalized regression model"
  )