recipes

In this example we use tfhub and recipes to obtain pre-trained sentence embeddings. We then fit a logistic regression model.

The dataset comes from the Toxic Comment Classification Challenge on Kaggle and can be downloaded here: https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data

library(tfhub)
library(readr)
library(tidymodels)

# Read data ---------------------------------------------------------------

# Load the Kaggle training file; read_csv() is given the zipped CSV path
# directly.
comments <- read_csv("train.csv.zip")

# Fix the RNG seed so the train/test split (and therefore every metric
# reported at the end of the script) is the same on each run.
set.seed(123)

# Sample 80% of the row indices for training. floor() makes the sample size
# an explicit whole number instead of passing a fractional value.
n_comments <- nrow(comments)
ind_train <- sample.int(n_comments, floor(0.8 * n_comments))
train <- comments[ind_train, ]

# Keep the remaining rows for testing. setdiff() is used rather than
# `-ind_train` because negative indexing with an empty index vector selects
# zero rows instead of all rows.
test <- comments[setdiff(seq_len(n_comments), ind_train), ]

# Create our recipe specification -----------------------------------------

# Preprocessing pipeline, built and estimated in a single chain:
#   1. declare `obscene` as the outcome and `comment_text` as the predictor;
#   2. replace `comment_text` with embeddings from the pre-trained TF Hub
#      module identified by `handle`;
#   3. turn the binary `obscene` indicator into a factor;
#   4. prep() the recipe on the training data.
rec <- recipe(obscene ~ comment_text, data = train) %>%
  step_pretrained_text_embedding(
    comment_text,
    handle = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim-with-oov/1"
  ) %>%
  step_bin2factor(obscene) %>%
  prep()

# Train logistic regression (glm engine) ----------------------------------

# Build the model specification inline and fit it on the preprocessed
# training data returned by juice(rec), using every remaining column as a
# predictor of `obscene`.
logistic_fit <- fit(
  logistic_reg() %>%
    set_mode("classification") %>%
    set_engine("glm"),
  obscene ~ .,
  data = juice(rec)
)

# Show the underlying fitted engine object.
logistic_fit$fit

# Results -----------------------------------------------------------------

# Apply the prepped recipe to the held-out rows.
test_embedded <- bake(rec, new_data = test)

# Collect the true label next to the predicted class and the predicted
# probability of the "yes" level.
test_results <- mutate(
  select(test_embedded, obscene),
  class = pull(
    predict(logistic_fit, new_data = test_embedded),
    .pred_class
  ),
  prob = pull(
    predict(logistic_fit, new_data = test_embedded, type = "prob"),
    .pred_yes
  )
)

# Evaluation on the test set: ROC AUC from the probabilities, then accuracy
# and the confusion matrix from the hard class predictions.
roc_auc(test_results, truth = obscene, prob)
accuracy(test_results, truth = obscene, class)
conf_mat(test_results, truth = obscene, class)