recipes
In this example we use tfhub and recipes to obtain pre-trained sentence embeddings. We then fit a logistic regression model.
The dataset comes from the Toxic Comment Classification Challenge on Kaggle and can be downloaded here: https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data
library(tfhub)
library(readr)
library(tidymodels)
# Read data ---------------------------------------------------------------
# `train.csv.zip` is read relative to the current working directory.
comments <- read_csv("train.csv.zip")

# Randomly assign 80% of the rows to training and the rest to testing.
# The seed is fixed so the split (and everything computed from it) is
# reproducible from run to run; the sample size is floored so it is an
# explicit whole number of rows.
set.seed(42)
ind_train <- sample.int(
  nrow(comments),
  size = floor(0.8 * nrow(comments))
)
train <- comments[ind_train, ]
test <- comments[-ind_train, ]
# Create our recipe specification -----------------------------------------
# Outcome is `obscene`; the only predictor is the raw `comment_text`.
rec <- recipe(obscene ~ comment_text, data = train)

# Replace the text column with embeddings produced by the pre-trained
# TF Hub module identified by `handle`.
rec <- step_pretrained_text_embedding(
  rec,
  comment_text,
  handle = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim-with-oov/1"
)

# Turn the outcome into a factor so it can be used for classification.
# NOTE(review): the results section reads `.pred_yes`, so the factor is
# expected to have a `yes` level -- confirm against step_bin2factor defaults.
rec <- step_bin2factor(rec, obscene)

# Estimate the recipe on the training data.
rec <- prep(rec)
# Train logistic regression (glm engine) ----------------------------------
# Model specification: classification-mode logistic regression fitted with
# the "glm" engine.
logistic_spec <- set_engine(
  set_mode(logistic_reg(), "classification"),
  "glm"
)

# Fit on the processed training data extracted from the prepped recipe,
# using every remaining column as a predictor of `obscene`.
logistic_fit <- fit(logistic_spec, obscene ~ ., data = juice(rec))

# Show the underlying engine fit object.
logistic_fit$fit
# Results -----------------------------------------------------------------
# Apply the prepped recipe to the held-out rows.
test_embedded <- bake(rec, new_data = test)

# Predict once for hard classes and once for class probabilities.
class_pred <- predict(logistic_fit, new_data = test_embedded)
prob_pred <- predict(logistic_fit, new_data = test_embedded, type = "prob")

# Keep the true outcome next to the predicted class and the predicted
# probability taken from the `.pred_yes` column.
test_results <- test_embedded %>%
  select(obscene) %>%
  mutate(
    class = class_pred$.pred_class,
    prob = prob_pred$.pred_yes
  )

# Evaluation metrics on the test set.
roc_auc(test_results, truth = obscene, prob)
accuracy(test_results, truth = obscene, class)
conf_mat(test_results, truth = obscene, class)