### Load standard packages
library(tidyverse) # Collection of all the good stuff like dplyr, ggplot2 etc.
Registered S3 methods overwritten by 'dbplyr':
  method         from
  print.tbl_lazy dplyr
  print.tbl_sql  dplyr
── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──
✓ ggplot2 3.3.5 ✓ purrr 0.3.4
✓ tibble 3.1.5 ✓ dplyr 1.0.7
✓ tidyr 1.1.4 ✓ stringr 1.4.0
✓ readr 2.0.2 ✓ forcats 0.5.1
── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
library(magrittr) # For extra piping operators (e.g. %<>%)
Attaching package: ‘magrittr’

The following object is masked from ‘package:purrr’:

    set_names

The following object is masked from ‘package:tidyr’:

    extract
library(tidytext)
# download and open labeled politician tweets (pol_tweets.gz) from the course's data repository
library(jsonlite)
Attaching package: ‘jsonlite’

The following object is masked from ‘package:purrr’:

    flatten
tmp <- tempfile()
download.file("https://github.com/SDS-AAU/SDS-master/raw/master/M2/data/pol_tweets.gz", tmp)
trying URL 'https://github.com/SDS-AAU/SDS-master/raw/master/M2/data/pol_tweets.gz'
Content type 'application/octet-stream' length 7342085 bytes (7.0 MB)
==================================================
downloaded 7.0 MB
tweets_raw <- stream_in(gzfile(tmp))
Found 1 records...
Imported 1 records. Simplifying...
Registered S3 method overwritten by 'data.table':
  method           from
  print.data.table

Registered S3 methods overwritten by 'themis':
  method                  from
  bake.step_downsample    recipes
  bake.step_upsample      recipes
  prep.step_downsample    recipes
  prep.step_upsample      recipes
  tidy.step_downsample    recipes
  tidy.step_upsample      recipes
  tunable.step_downsample recipes
  tunable.step_upsample   recipes
tweets_raw %>% glimpse()
Rows: 1
Columns: 2
$ text <df[,50000]> <data.frame[1 x 50000]>
$ labels <df[,50000]> <data.frame[1 x 50000]>
tweets <- tibble(ID = colnames(tweets_raw[[1]]),
                 text = tweets_raw[[1]] %>% as.character(),
                 labels = tweets_raw[[2]] %>% as.logical())
# rm(tweets_raw) # optionally free memory once the raw object is no longer needed
tweets %>% glimpse()
Rows: 50,000
Columns: 3
$ ID <chr> "340675", "289492", "371088", "82212", "476047", "220741", "379074", "633731", "103805", "401277", "493433", "578814", "570425"…
$ text <chr> "RT @GreenBeretFound Today we remember Sgt. 1st Class Ryan J. Savard killed in action on this day eight years ago. SFC Savard w…
$ labels <lgl> FALSE, TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, FALSE, TRUE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, FALSE, TRUE, TRUE,…
tweets %<>%
filter(!(text %>% str_detect('^RT'))) # Filter retweets
tweets %>% head()
tweets_tidy <- tweets %>%
unnest_tokens(word, text, token = "tweets")
Using `to_lower = TRUE` with `token = 'tweets'` may not preserve URLs.
tweets_tidy %>% head(50)
tweets_tidy %>% count(word, sort = TRUE)
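Before cleaning, note that the "tweets" tokenizer is Twitter-aware: hashtags and @mentions survive as single tokens instead of being split on punctuation. A minimal toy check (the example string below is made up for illustration):
# Toy example, not from the data: '#' and '@' tokens stay intact
tibble(text = "RT @somebody Vote today! #election2020") %>%
  unnest_tokens(word, text, token = "tweets")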
# preprocessing
tweets_tidy %<>%
  filter(!(word %>% str_detect('@'))) %>% # remove mentions (tokens containing '@')
  filter(!(word %>% str_detect('^amp|^http|^t\\.co'))) %>% # Twitter-specific artifacts
  # mutate(word = word %>% str_remove_all('[^[:alnum:]]')) %>% # optionally remove all special characters
  filter(str_length(word) > 2) %>% # remove words with fewer than 3 characters
  group_by(word) %>%
  filter(n() > 100) %>% # remove words occurring 100 times or less
  ungroup() %>%
  anti_join(stop_words, by = 'word') # remove stopwords
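A quick sanity check (purely for inspection) that the filtering shrank the data as intended:
# How many tokens and distinct words survive the preprocessing?
tweets_tidy %>%
  summarise(n_tokens = n(), n_words = n_distinct(word))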
### TF-IDF weighting
# top words
tweets_tidy %>%
count(word, sort = TRUE) %>%
head(20)
# TF-IDF weights
tweets_tidy %<>%
  add_count(ID, word) %>%
  bind_tf_idf(term = word,
              document = ID,
              n = n)
# TF-IDF top words
tweets_tidy %>%
count(word, wt = tf_idf, sort = TRUE) %>%
head(20)
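As a reminder, bind_tf_idf() computes tf (a word's share within a document) times idf = ln(n_documents / n_documents containing the word). A toy sketch on a made-up mini corpus (not part of the data) makes the weighting visible:
# 'vote' appears in both documents, so idf = ln(2/2) = 0 and its tf-idf is 0;
# the document-specific words get upweighted instead.
toy <- tibble(doc = c(1, 1, 2, 2),
              word = c("vote", "wall", "vote", "healthcare")) %>%
  count(doc, word)
toy %>% bind_tf_idf(term = word, document = doc, n = n)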
labels_words <- tweets_tidy %>%
group_by(labels) %>%
count(word, wt = tf_idf, sort = TRUE, name = "tf_idf") %>%
slice(1:100) %>%
ungroup()
labels_words %>%
mutate(word = reorder_within(word, by = tf_idf, within = labels)) %>%
ggplot(aes(x = word, y = tf_idf, fill = labels)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~labels, ncol = 2, scales = "free") +
coord_flip() +
scale_x_reordered()
tweets_tidy %>% head()
library(tidymodels)
Registered S3 method overwritten by 'tune':
  method                   from
  required_pkgs.model_spec parsnip
── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────── tidymodels 0.1.3 ──
✓ broom 0.7.9 ✓ rsample 0.1.0
✓ dials 0.0.10 ✓ tune 0.1.6
✓ infer 1.0.0 ✓ workflows 0.2.3
✓ modeldata 0.1.1 ✓ workflowsets 0.1.0
✓ parsnip 0.1.7 ✓ yardstick 0.0.8
✓ recipes 0.1.16
── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────── tidymodels_conflicts() ──
x scales::discard() masks purrr::discard()
x magrittr::extract() masks tidyr::extract()
x dplyr::filter() masks stats::filter()
x recipes::fixed() masks stringr::fixed()
x jsonlite::flatten() masks purrr::flatten()
x dplyr::lag() masks stats::lag()
x magrittr::set_names() masks purrr::set_names()
x yardstick::spec() masks readr::spec()
x recipes::step() masks stats::step()
• Use tidymodels_prefer() to resolve common conflicts.
words_classifier <- labels_words %>%
  arrange(desc(tf_idf)) %>%
  distinct(word, .keep_all = TRUE) %>% # words in both top-100 lists are assigned to the label where they score highest
  select(-tf_idf)
tweet_null_model <- tweets_tidy %>%
inner_join(labels_words, by = 'word')
null_res <- tweet_null_model %>%
  group_by(ID) %>%
  summarise(truth = mean(labels.x, na.rm = TRUE) %>% round(0), # the tweet's true label
            pred = mean(labels.y, na.rm = TRUE) %>% round(0)) # majority label among the matched top words
table(null_res$truth, null_res$pred)
        0     1
  0  8842  2609
  1 11327  9235
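That amounts to (8842 + 9235) / 32013 ≈ 56% correctly classified tweets. The same number via yardstick (already attached with tidymodels; a minimal sketch):
# Recode both columns to factors so yardstick accepts them
null_res %>%
  mutate(across(c(truth, pred), as.factor)) %>%
  accuracy(truth = truth, estimate = pred)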
# Notice: we use the initial, untokenized tweets
data <- tweets %>%
select(labels, text) %>%
rename(y = labels) %>%
mutate(y = y %>% as.factor())
data_split <- initial_split(data, prop = 0.75, strata = y)
data_train <- data_split %>% training()
data_test <- data_split %>% testing()
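Note that initial_split() samples at random, so results differ between runs. For a reproducible split you would fix a seed first; a minimal sketch (the seed value is an arbitrary choice):
set.seed(42) # any fixed value works
data_split <- initial_split(data, prop = 0.75, strata = y)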
library(textrecipes) # Additional recipes for working with text data
# This recipe pretty much reconstructs all preprocessing we did so far
data_recipe <- data_train %>%
  recipe(y ~ .) %>%
  themis::step_downsample(y) %>% # downsample to correct class imbalance (optional)
  step_filter(!(text %>% str_detect('^RT'))) %>% # filter retweets upfront
  step_filter(text != "") %>%
  step_tokenize(text, token = "tweets") %>% # tokenize
  step_tokenfilter(text, min_times = 75) %>% # filter out rare words
  step_stopwords(text, keep = FALSE) %>% # filter stopwords
  step_tfidf(text) %>% # TF-IDF weighting
  # step_pca(all_predictors()) %>% # dimensionality reduction via PCA (optional)
  prep() # NOTE: only prep the recipe when not using it in a workflow
data_recipe
Data Recipe

Inputs:

      role #variables
   outcome          1
 predictor          1

Training data contained 26239 data points and no missing data.

Operations:

Down-sampling based on y [trained]
Row filtering [trained]
Row filtering [trained]
Tokenization for text [trained]
Text filtering for text [trained]
Stop word removal for text [trained]
Term frequency-inverse document frequency with text [trained]
Since we will not do hyperparameter tuning, we directly juice/bake the prepped recipe.
data_train_prep <- data_recipe %>% juice()
data_test_prep <- data_recipe %>% bake(data_test)
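It is worth a glance at what the recipe produced: step_tfidf() creates one tfidf_text_* column per retained token, next to the outcome y. Purely for inspection:
data_train_prep %>% dim()
data_train_prep %>% select(1:5) %>% glimpse()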
model_null <- null_model(mode = 'classification') # baseline specification (not fitted below)
model_en <- logistic_reg(mode = 'classification',
                         mixture = 0.5,
                         penalty = 0.5) %>%
  set_engine('glmnet') # penalty/mixture define an elastic net, which requires the glmnet engine; 'glm' would ignore both
We will skip the workflow step this time, since we do not evaluate different models against each other.
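For reference, the workflow version would look roughly like this (a sketch; the recipe would then be defined without the final prep(), since fit() preps and bakes internally):
wf_en <- workflow() %>%
  add_recipe(data_recipe) %>% # a recipe defined without prep()
  add_model(model_en)
fit_wf <- wf_en %>% fit(data = data_train)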
fit_en <- model_en %>% fit(formula = y ~., data = data_train_prep)
pred_collected <- tibble(
  truth = data_train_prep %>% pull(y),
  pred = fit_en %>% predict(new_data = data_train_prep) %>% pull(.pred_class),
  pred_prob = fit_en %>% predict(new_data = data_train_prep, type = "prob") %>% pull(.pred_TRUE)
)
pred_collected %>% conf_mat(truth, pred)
           Truth
Prediction FALSE TRUE
     FALSE  6178 4301
     TRUE   3420 5297
pred_collected %>% conf_mat(truth, pred) %>% summary()
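Since we also collected predicted probabilities, a threshold-free metric is available too. A minimal sketch (event_level = "second" because .pred_TRUE refers to the second factor level):
pred_collected %>% roc_auc(truth, pred_prob, event_level = "second")

The honest check is out-of-sample; a sketch on the held-out test data prepared above:
pred_test <- tibble(
  truth = data_test_prep %>% pull(y),
  pred = fit_en %>% predict(new_data = data_test_prep) %>% pull(.pred_class))
pred_test %>% conf_mat(truth, pred) %>% summary()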
Well… so-so. Note that these are in-sample results on the (downsampled) training data.
# How would the model predict given some tweet text?
pred_own <- tibble(text = 'USA USA WE NEED A WALL TO MAKE AMERICA GREAT AGAIN AND KEEP THE MEXICANS AND ALL REALLY BAD COUNTRIES OUT! AMERICA FIRST')
fit_en %>% predict(new_data = data_recipe %>% bake(pred_own))
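The class probabilities show how confident that call is (a sketch):
fit_en %>% predict(new_data = data_recipe %>% bake(pred_own), type = "prob")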
sessionInfo()
R version 4.1.1 (2021-08-10)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Catalina 10.15.7
Matrix products: default
BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] textrecipes_0.4.1 yardstick_0.0.8 workflowsets_0.1.0 workflows_0.2.3 tune_0.1.6 rsample_0.1.0 recipes_0.1.16
[8] parsnip_0.1.7 modeldata_0.1.1 infer_1.0.0 dials_0.0.10 scales_1.1.1 broom_0.7.9 tidymodels_0.1.3
[15] jsonlite_1.7.2 tidytext_0.3.1 magrittr_2.0.1 forcats_0.5.1 stringr_1.4.0 dplyr_1.0.7 purrr_0.3.4
[22] readr_2.0.2 tidyr_1.1.4 tibble_3.1.5 ggplot2_3.3.5 tidyverse_1.3.1 knitr_1.36
loaded via a namespace (and not attached):
[1] colorspace_2.0-2 ellipsis_0.3.2 class_7.3-19 fs_1.5.0 rstudioapi_0.13 listenv_0.8.0 furrr_0.2.3
[8] farver_2.1.0 ParamHelpers_1.14 SnowballC_0.7.0 prodlim_2019.11.13 fansi_0.5.0 lubridate_1.7.10 xml2_1.3.2
[15] codetools_0.2-18 splines_4.1.1 doParallel_1.0.16 pROC_1.18.0 dbplyr_2.1.1 compiler_4.1.1 httr_1.4.2
[22] backports_1.2.1 assertthat_0.2.1 Matrix_1.3-4 cli_3.0.1 tools_4.1.1 gtable_0.3.0 glue_1.4.2
[29] RANN_2.6.1 fastmatch_1.1-3 Rcpp_1.0.7 parallelMap_1.5.1 cellranger_1.1.0 DiceDesign_1.9 vctrs_0.3.8
[36] iterators_1.0.13 timeDate_3043.102 gower_0.2.2 xfun_0.26 mlr_2.19.0 stopwords_2.2 globals_0.14.0
[43] rvest_1.0.1 lifecycle_1.0.1 future_1.22.1 MASS_7.3-54 ipred_0.9-12 hms_1.1.1 parallel_4.1.1
[50] BBmisc_1.11 rpart_4.1-15 stringi_1.7.4 tokenizers_0.2.1 foreach_1.5.1 checkmate_2.0.0 lhs_1.1.3
[57] hardhat_0.1.6 lava_1.6.10 rlang_0.4.11 pkgconfig_2.0.3 lattice_0.20-44 labeling_0.4.2 tidyselect_1.1.1
[64] parallelly_1.28.1 plyr_1.8.6 R6_2.5.1 themis_0.1.4 generics_0.1.0 DBI_1.1.1 pillar_1.6.3
[71] haven_2.4.3 withr_2.4.2 survival_3.2-13 nnet_7.3-16 future.apply_1.8.1 ROSE_0.0-4 janeaustenr_0.1.5
[78] modelr_0.1.8 crayon_1.4.1 unbalanced_2.0 utf8_1.2.2 tzdb_0.1.2 grid_4.1.1 readxl_1.3.1
[85] data.table_1.14.2 FNN_1.1.3 reprex_2.0.1 digest_0.6.28 munsell_0.5.0 GPfit_1.0-8