### Load standardpackages
library(tidyverse) # Collection of all the good stuff like dplyr, ggplot2 ect.
library(magrittr) # For extra-piping operators (eg. %<>%)
library(tidytext)

Download the data

# download and open some Trump tweets from trump_tweet_data_archive
library(jsonlite)
tmp <- tempfile()
download.file("https://github.com/SDS-AAU/SDS-master/raw/master/M2/data/pol_tweets.gz", tmp)
trying URL 'https://github.com/SDS-AAU/SDS-master/raw/master/M2/data/pol_tweets.gz'
Content type 'application/octet-stream' length 7342085 bytes (7.0 MB)
==================================================
downloaded 7.0 MB
tweets_raw <- stream_in(gzfile(tmp, "pol_tweets"))

 Found 1 records...
 Imported 1 records. Simplifying...
tweets_raw %>% glimpse()
Rows: 1
Columns: 2
$ text   <df[,50000]> <data.frame[1 x 50000]>
$ labels <df[,50000]> <data.frame[1 x 50000]>
tweets <- tibble(ID = colnames(tweets_raw[[1]]), 
                 text = tweets_raw[[1]] %>% as.character(), 
                 labels = tweets_raw[[2]] %>% as.logical())
#rm(tweets_raw)
tweets %>% head()
tweets %<>%
  filter(!(text %>% str_detect('^RT'))) # Filter retweets
tweets %>% head()

Tidying

tweets_tidy <- tweets %>%
  unnest_tokens(word, text, token = "tweets") 
Using `to_lower = TRUE` with `token = 'tweets'` may not preserve URLs.
tweets_tidy %>% head(50)
tweets_tidy %>% count(word, sort = TRUE)

Preprocessing

# preprocessing
tweets_tidy %<>%
  filter(!(word %>% str_detect('@'))) %>% # remove mentions
  filter(!(word %>% str_detect('^amp|^http|^t\\.co'))) %>% # Twitter specific stuff
#  mutate(word = word %>% str_remove_all('[^[:alnum:]]')) %>% ## remove all special characters
  filter(str_length(word) > 2 ) %>% # Remove words with less than  3 characters
  group_by(word) %>%
  filter(n() > 100) %>% # remove words occuring less than 100 times
  ungroup() %>%
  anti_join(stop_words, by = 'word') # remove stopwords

TFIDF weighting

# top words
tweets_tidy %>%
  count(word, sort = TRUE) %>%
  head(20)
# TFIDF topwords
tweets_tidy %>%
  count(word, wt = tf_idf, sort = TRUE) %>%
  head(20)

Inspecting

Words by party affiliation

labels_words <- tweets_tidy %>%
  group_by(labels) %>%
  count(word, wt = tf_idf, sort = TRUE, name = "tf_idf") %>%
  slice(1:100) %>%
  ungroup() 
labels_words %>%
  mutate(word = reorder_within(word, by = tf_idf, within = labels)) %>%
  ggplot(aes(x = word, y = tf_idf, fill = labels)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~labels, ncol = 2, scales = "free") +
  coord_flip() +
  scale_x_reordered()

Sentiments?

sentiment_tweet <- tweets_tidy %>%
  inner_join(get_sentiments("bing"))

… To be continued by you

Towards prediction?

tweets_dtm %<>% mutate(across(everything(), .fns = ~replace_na(.,0))) 
rm(tweets_dtm)

Simple manual baseline

tweet_null_model <- tweets_tidy %>%
  inner_join(labels_words, by = 'word')
table(null_res$pred, null_res$truth)
   
        0     1
  0  8859 11501
  1  2588  9048

Predictive model

library(tidymodels)

Preprocessing

# Notice, we use the initial untokenized tweets
data <- tweets %>%
  select(labels, text) %>%
  rename(y = labels) %>%
  mutate(y = y  %>% as.factor()) 

Training & Test split

data_split <- initial_split(data, prop = 0.75, strata = y)

data_train <- data_split  %>%  training()
data_test <- data_split %>% testing()
data_train %>% count(y)

Preprocessing pipeline

library(textrecipes) # Adittional recipes for working with text data
# This recipe pretty much reconstructs all preprocessing we did so far
data_recipe <- data_train %>%
  recipe(y ~.) %>%
  themis::step_upsample(y) %>% # For up/downsampling class imbalances (optimal)
  step_filter(!(text %>% str_detect('^RT'))) %>% # Upfront filtering retweets
  step_filter(text != "") %>%
  # textreciepes
  step_tokenize(text, token = "tweets") %>% # tokenize
  step_tokenfilter(text, min_times = 75) %>%  # Filter out rare words
  step_stopwords(text, keep = FALSE) %>% # Filter stopwords
  step_tfidf(text) %>% # TFIDF weighting
  #step_pca(all_predictors()) %>% # Dimensionality reduction via PCA (optional)
  prep()
data_recipe
Data Recipe

Inputs:

Training data contained 26239 data points and no missing data.

Operations:

Up-sampling based on y [trained]
Row filtering [trained]
Row filtering [trained]
Tokenization for text [trained]
Text filtering for text [trained]
Stop word removal for text [trained]
Term frequency-inverse document frequency with text [trained]

Since we will not do hyperparameter tuning, we directly bake/juice the recipe

data_train_prep <- data_recipe %>% juice()
data_test_prep <- data_recipe %>% bake(data_test)

Defining the models

model_null <- null_model(mode = 'classification')
model_en <- logistic_reg(mode = 'classification',
                         mixture = 0.5,
                         penalty = 0.5) %>%
  set_engine('glm', family = binomial) 

Define the workflow

We will skip the workflow step this time, since we do not evaluate different models against each others.

fit the model

fit_null <- model_null %>% fit(formula = y ~., data = data_train_prep)
fit_en <- model_en %>% fit(formula = y ~., data = data_train_prep)
pred_collected <- tibble(
  truth = data_test_prep %>% pull(y),
  pred = fit_en %>% predict(new_data = data_test_prep) %>% pull(.pred_class),
  pred_prob = fit_en %>% predict(new_data = data_test_prep, type = "prob") %>% pull(.pred_TRUE),
  ) 
pred_collected %>% conf_mat(truth, pred) %>% autoplot(type = 'heatmap')

pred_collected %>% conf_mat(truth, pred) %>% summary()

Well… soso

Using the model for new prediction

Simple test

# How would the model predict given some tweet text
pred_own = tibble(text = 'trump is really bad. we need more green energy  to save the enviroment and fuuture of our children')
fit_en %>% predict(new_data = data_recipe %>% bake(pred_own))

Prediction on new tweets

New data

  • We could also use the model to predict on new data, such as the just scraped discussion on the presidential debate.
# download and open some Trump tweets from trump_tweet_data_archive
download.file("https://github.com/SDS-AAU/SDS-master/raw/master/M2/data/pres_debate_2020.gz", tmp)
trying URL 'https://github.com/SDS-AAU/SDS-master/raw/master/M2/data/pres_debate_2020.gz'
Content type 'application/octet-stream' length 2095527 bytes (2.0 MB)
==================================================
downloaded 2.0 MB
tweets_raw_new <- stream_in(gzfile(tmp, "pres_debate_2020"))

 Found 1 records...
 Imported 1 records. Simplifying...
tweets_raw_new %>% glimpse()
Rows: 1
Columns: 33
$ id              <df[,8811]> <data.frame[1 x 8811]>
$ conversation_id <df[,8811]> <data.frame[1 x 8811]>
$ created_at      <df[,8811]> <data.frame[1 x 8811]>
$ date            <df[,8811]> <data.frame[1 x 8811]>
$ timezone        <df[,8811]> <data.frame[1 x 8811]>
$ place           <df[,8811]> <data.frame[1 x 8811]>
$ tweet           <df[,8811]> <data.frame[1 x 8811]>
$ language        <df[,8811]> <data.frame[1 x 8811]>
$ hashtags        <df[,8811]> <data.frame[1 x 8811]>
$ cashtags        <df[,8811]> <data.frame[1 x 8811]>
$ user_id         <df[,8811]> <data.frame[1 x 8811]>
$ user_id_str     <df[,8811]> <data.frame[1 x 8811]>
$ username        <df[,8811]> <data.frame[1 x 8811]>
$ name            <df[,8811]> <data.frame[1 x 8811]>
$ day             <df[,8811]> <data.frame[1 x 8811]>
$ hour            <df[,8811]> <data.frame[1 x 8811]>
$ link            <df[,8811]> <data.frame[1 x 8811]>
$ urls            <df[,8811]> <data.frame[1 x 8811]>
$ photos          <df[,8811]> <data.frame[1 x 8811]>
$ video           <df[,8811]> <data.frame[1 x 8811]>
$ thumbnail       <df[,8811]> <data.frame[1 x 8811]>
$ nlikes          <df[,8811]> <data.frame[1 x 8811]>
$ nreplies        <df[,8811]> <data.frame[1 x 8811]>
$ nretweets       <df[,8811]> <data.frame[1 x 8811]>
$ quote_url       <df[,8811]> <data.frame[1 x 8811]>
$ search          <df[,8811]> <data.frame[1 x 8811]>
$ near            <df[,8811]> <data.frame[1 x 8811]>
$ geo             <df[,8811]> <data.frame[1 x 8811]>
$ source          <df[,8811]> <data.frame[1 x 8811]>
$ reply_to        <df[,8811]> <data.frame[1 x 8811]>
$ translate       <df[,8811]> <data.frame[1 x 8811]>
$ trans_src       <df[,8811]> <data.frame[1 x 8811]>
$ trans_dest      <df[,8811]> <data.frame[1 x 8811]>
tweets_new <- tibble(ID = tweets_raw_new$id[1,] %>% t() %>% as.character(), 
                     text = tweets_raw_new$tweet[1,] %>% t() %>% as.character())
#rm(tweets_raw_new)
tweets_new %>% glimpse()
Rows: 8,811
Columns: 2
$ ID   <chr> "1318944772183281664", "1318938583122743296", "1318932554897031169", "1318928783169245185", "…
$ text <chr> "Still time to register: Students can join the @UVADemocracy Student Advisory Council for a s…

Doing a prediction

data_new <- data_recipe %>% bake(tweets_new)
data_new %>% glimpse()
Rows: 8,811
Columns: 42
$ `tfidf_text_#COVID19`  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ tfidf_text_act         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ tfidf_text_american    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ tfidf_text_americans   <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000…
$ tfidf_text_amp         <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000…
$ tfidf_text_bill        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ tfidf_text_can         <dbl> 1.4189092, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000…
$ tfidf_text_care        <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 1.670…
$ tfidf_text_congress    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ tfidf_text_coronavirus <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ tfidf_text_country     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ tfidf_text_covid19     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ tfidf_text_day         <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
$ tfidf_text_families    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ tfidf_text_get         <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
$ tfidf_text_great       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ tfidf_text_health      <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 1.676…
$ tfidf_text_help        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ tfidf_text_house       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ tfidf_text_im          <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
$ tfidf_text_just        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ tfidf_text_make        <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
$ tfidf_text_must        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ tfidf_text_need        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ tfidf_text_new         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ tfidf_text_now         <dbl> 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.00000, 0.0…
$ tfidf_text_one         <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
$ tfidf_text_pandemic    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ tfidf_text_people      <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
$ tfidf_text_president   <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
$ tfidf_text_qt          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ tfidf_text_senate      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ tfidf_text_state       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ tfidf_text_support     <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000…
$ tfidf_text_thank       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ tfidf_text_time        <dbl> 1.4925070, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000…
$ tfidf_text_today       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ tfidf_text_trump       <dbl> 0.0000000, 2.1272525, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000…
$ tfidf_text_us          <dbl> 0.000000, 0.000000, 3.432999, 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
$ tfidf_text_vote        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ tfidf_text_work        <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000…
$ tfidf_text_workers     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …

Exploring the new data & predictions

data_new %<>%
  bind_cols(pred_new) %>%
  rename(pred = .pred_class) %>%
  bind_cols(pred_prob_new) %>%
  rename(pred_prob = .pred_TRUE) 
# preprocessing
tweets_tidy_new %<>%
  filter(!(word %>% str_detect('@|#presidential'))) %>% # remove hashtags and mentions
  filter(!(word %>% str_detect('^amp|^http|^t\\.co'))) %>% # Twitter specific stuff
#  mutate(word = word %>% str_remove_all('[^[:alnum:]]')) %>% ## remove all special characters
  filter(str_length(word) > 2 ) %>% # Remove words with less than  3 characters
  group_by(word) %>%
  filter(n() > 100) %>% # remove words occuring less than 100 times
  ungroup() %>%
  anti_join(stop_words, by = 'word') # remove stopwords
# TFIDF weights
tweets_tidy_new %<>%
  add_count(ID, word) %>%
  distinct(ID, word, .keep_all = TRUE) %>%
  bind_tf_idf(term = word,
              document = ID,
              n = n)
labels_words_new <- tweets_tidy_new %>%
  group_by(pred) %>%
  count(word, wt = tf_idf, sort = TRUE, name = "tf_idf") %>%
  slice(1:20) %>%
  ungroup() 
labels_words_new %>%
  mutate(word = reorder_within(word, by = tf_idf, within = pred)) %>%
  ggplot(aes(x = word, y = tf_idf, fill = pred)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~pred, ncol = 2, scales = "free") +
  coord_flip() +
  scale_x_reordered()

Topic models (LDA) on new data

# for LDA analysis
library(topicmodels)

Preparing the Data

# LDA via the topicmodel package requires a document-term-matrix (dtm)
tweets_dtm <- tweets_tidy_new %>%
  cast_dtm(document = ID, term = word, value = n)

Lets take a look:

tweets_dtm
<<DocumentTermMatrix (documents: 7567, terms: 95)>>
Non-/sparse entries: 22637/696228
Sparsity           : 97%
Maximal term length: 27
Weighting          : term frequency (tf)
  • We see again hat the matrix is still rather sparse, which is an artefact of text data generally, but even more so when using twitter data.
  • Lets try to see if we could reduce that somewhat by deleting less often used terms.
library(tm)
tweets_dtm %>% removeSparseTerms(sparse = .99)
<<DocumentTermMatrix (documents: 7567, terms: 95)>>
Non-/sparse entries: 22637/696228
Sparsity           : 97%
Maximal term length: 27
Weighting          : term frequency (tf)
  • Now we can perform a LDA, using the more accurate Gibbs sampling as method.
tweets_lda <- tweets_dtm %>% 
  LDA(k = 6, method = "Gibbs",
      control = list(seed = 1337))

\(\beta\): Word-Topic Association

  • \(\beta\) is an output of the LDA model, indicating the propability that a word occurs in a certain topic.
  • Therefore, loking at the top probability words of a topic often gives us a good intuition regarding its properties.
# LDA output is defined for tidy(), so we can easily extract it
lda_beta <- tweets_lda %>% 
  tidy(matrix = "beta") %>%
  group_by(topic) %>%
  arrange(topic, desc(beta)) %>%
  slice(1:10) %>%
  ungroup() 
lda_beta %>% head()
# Notice the "reorder_within()"
lda_beta %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  group_by(topic, term) %>%    
  arrange(desc(beta)) %>%  
  ungroup() %>%
  ggplot(aes(term, beta, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  scale_x_reordered() +
  labs(title = "Top 10 terms in each LDA topic",
       x = NULL, y = expression(beta)) +
  facet_wrap(~ topic, ncol = 2, scales = "free")

\(\gamma\): Document-Topic Association

  • In LDA, documents are represented as a mix of topics. This association of a document to a topic is captured by \(\gamma\)
lda_gamma <- tweets_lda %>% 
  tidy(matrix = "gamma")
lda_gamma %>% head()
lda_gamma %>%
  ggplot(aes(gamma)) +
  geom_histogram() +
  scale_y_log10() +
  labs(title = "Distribution of probabilities for all topics",
       y = "Number of documents", x = expression(gamma))
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

lda_gamma %<>%
  left_join(tweets_new %>% select(ID, pred), by = c('document' = 'ID'))
lda_gamma %>%
  group_by(pred, topic) %>%
  summarise(gamma = sum(gamma)) %>%
  arrange(pred, gamma)
`summarise()` has grouped output by 'pred'. You can override using the `.groups` argument.
lda_gamma %>%
  ggplot(aes(gamma, fill = as.factor(topic))) +
  geom_histogram(show.legend = FALSE) +
  facet_wrap(~ topic, ncol = 2) +
  scale_y_log10() +
  labs(title = "Distribution of probability for each topic",
       y = "Number of documents", x = expression(gamma))
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

topicmodels_json_ldavis <- function(fitted, doc_dtm, method = "PCA", doc_in = NULL, topic_in = NULL){
  require(topicmodels); require(dplyr); require(LDAvis)
  
  # Find required quantities
  phi <- posterior(fitted)$terms %>% as.matrix() # Topic-term distribution
  theta <- posterior(fitted)$topics %>% as.matrix() # Document-topic matrix
  
  # Restrict
  if(!is_null(topic_in)){
    phi <- phi[topic_in, ]
    theta <- theta[ , topic_in]
  }
  text_tidy <- doc_dtm %>% tidy()
  vocab <- colnames(phi)
  doc_length <- tibble(document = rownames(theta)) %>% left_join(text_tidy %>% count(document, wt = count), by = 'document')
  tf <- tibble(term = vocab) %>% left_join(text_tidy %>% count(term, wt = count), by = "term") 
  
  if(method == "PCA"){mds <- jsPCA}
  if(method == "TSNE"){library(tsne); mds <- function(x){tsne(svd(x)$u)} }
  
  # Convert to json
  json_lda <- LDAvis::createJSON(phi = phi, theta = theta, vocab = vocab, doc.length = doc_length %>% pull(n), term.frequency = tf %>% pull(n),
                                 reorder.topics = FALSE, mds.method = mds,plot.opts = list(xlab = "Dim.1", ylab = "Dim.2")) 
  return(json_lda)
}
library(LDAvis)
json_lda <- topicmodels_json_ldavis(fitted = tweets_lda, 
                                    doc_dtm = tweets_dtm, 
                                    method = "TSNE")
json_lda %>% serVis()
# json_lda %>% serVis(out.dir = 'LDAviz')

Model explainability

Global

fit_en %>% vip::vip()

Local

library(lime)
explanation <- .load_image_example()
plot_image_explanation(explanation)

lime_tweets <- lime(data_new, fit_en)
recipe_deploy <- function(data){
  data_recipe %>% bake(data)
}
lime_tweets
$model
parsnip model object

Fit time:  561ms 

Call:  stats::glm(formula = y ~ ., family = ~binomial, data = data)

Coefficients:
           (Intercept)   `tfidf_text_#COVID19`          tfidf_text_act     tfidf_text_american  
             -0.405834                0.199698                0.202027               -0.103218  
  tfidf_text_americans          tfidf_text_amp         tfidf_text_bill          tfidf_text_can  
              0.349421                0.101057                0.017583                0.145496  
       tfidf_text_care     tfidf_text_congress  tfidf_text_coronavirus      tfidf_text_country  
              0.407076                0.216416                0.007831                0.116795  
    tfidf_text_covid19          tfidf_text_day     tfidf_text_families          tfidf_text_get  
              0.119973                0.045005                0.252039                0.233111  
      tfidf_text_great       tfidf_text_health         tfidf_text_help        tfidf_text_house  
             -0.540967                0.561288                0.045008                0.240416  
         tfidf_text_im         tfidf_text_just         tfidf_text_make         tfidf_text_must  
              0.312304                0.104538                0.380366                0.509962  
       tfidf_text_need          tfidf_text_new          tfidf_text_now          tfidf_text_one  
              0.431828                0.169590                0.311848                0.073044  
   tfidf_text_pandemic       tfidf_text_people    tfidf_text_president           tfidf_text_qt  
              0.393016                0.511551                0.031177               -0.165128  
     tfidf_text_senate        tfidf_text_state      tfidf_text_support        tfidf_text_thank  
             -0.022831               -0.069334               -0.029238                0.035723  
       tfidf_text_time        tfidf_text_today        tfidf_text_trump           tfidf_text_us  
              0.043689                0.217029                1.126121                0.090043  
       tfidf_text_vote         tfidf_text_work      tfidf_text_workers  
              0.340436               -0.023399                0.388925  

Degrees of Freedom: 33281 Total (i.e. Null);  33239 Residual
Null Deviance:      46140 
Residual Deviance: 44410    AIC: 44490

$preprocess
function (x) 
x
<bytecode: 0x7fbd76c45f68>
<environment: 0x7fbfc195c518>

$bin_continuous
[1] TRUE

$n_bins
[1] 4

$quantile_bins
[1] TRUE

$use_density
[1] TRUE

$feature_type
   tfidf_text_#COVID19         tfidf_text_act    tfidf_text_american   tfidf_text_americans 
             "numeric"              "numeric"              "numeric"              "numeric" 
        tfidf_text_amp        tfidf_text_bill         tfidf_text_can        tfidf_text_care 
             "numeric"              "numeric"              "numeric"              "numeric" 
   tfidf_text_congress tfidf_text_coronavirus     tfidf_text_country     tfidf_text_covid19 
             "numeric"              "numeric"              "numeric"              "numeric" 
        tfidf_text_day    tfidf_text_families         tfidf_text_get       tfidf_text_great 
             "numeric"              "numeric"              "numeric"              "numeric" 
     tfidf_text_health        tfidf_text_help       tfidf_text_house          tfidf_text_im 
             "numeric"              "numeric"              "numeric"              "numeric" 
       tfidf_text_just        tfidf_text_make        tfidf_text_must        tfidf_text_need 
             "numeric"              "numeric"              "numeric"              "numeric" 
        tfidf_text_new         tfidf_text_now         tfidf_text_one    tfidf_text_pandemic 
             "numeric"              "numeric"              "numeric"              "numeric" 
     tfidf_text_people   tfidf_text_president          tfidf_text_qt      tfidf_text_senate 
             "numeric"              "numeric"             "constant"              "numeric" 
      tfidf_text_state     tfidf_text_support       tfidf_text_thank        tfidf_text_time 
             "numeric"              "numeric"              "numeric"              "numeric" 
      tfidf_text_today       tfidf_text_trump          tfidf_text_us        tfidf_text_vote 
             "numeric"              "numeric"              "numeric"              "numeric" 
       tfidf_text_work     tfidf_text_workers                   pred            .pred_FALSE 
             "numeric"              "numeric"               "factor"              "numeric" 
             pred_prob 
             "numeric" 

$bin_cuts
$bin_cuts$`tfidf_text_#COVID19`
[1] 0.0000000 0.7582041 1.5164082 2.2746123 3.0328163

$bin_cuts$tfidf_text_act
[1] 0.000000 1.487717 2.975434 4.463152 5.950869

$bin_cuts$tfidf_text_american
[1] 0.000000 1.069362 2.138725 3.208087 4.277450

$bin_cuts$tfidf_text_americans
[1] 0.000000 1.117573 2.235147 3.352720 4.470293

$bin_cuts$tfidf_text_amp
[1] 0.0000000 0.7427382 1.4854765 2.2282147 2.9709530

$bin_cuts$tfidf_text_bill
[1] 0.000000 1.695576 3.391153 5.086729 6.782305

$bin_cuts$tfidf_text_can
[1] 0.0000000 0.7094546 1.4189092 2.1283638 2.8378184

$bin_cuts$tfidf_text_care
[1] 0.000000 1.253223 2.506446 3.759669 5.012893

$bin_cuts$tfidf_text_congress
[1] 0.000000 1.868721 3.737443 5.606164 7.474886

$bin_cuts$tfidf_text_coronavirus
[1] 0.000000 1.349852 2.699703 4.049555 5.399406

$bin_cuts$tfidf_text_country
[1] 0.000000 1.168573 2.337146 3.505719 4.674292

$bin_cuts$tfidf_text_covid19
[1] 0.000000 1.129999 2.259997 3.389996 4.519994

$bin_cuts$tfidf_text_day
[1] 0.000000 1.122468 2.244936 3.367404 4.489872

$bin_cuts$tfidf_text_families
[1] 0.000000 1.510404 3.020807 4.531211 6.041614

$bin_cuts$tfidf_text_get
[1] 0.0000000 0.7646984 1.5293968 2.2940952 3.0587936

$bin_cuts$tfidf_text_great
[1] 0.000000 1.101176 2.202352 3.303528 4.404703

$bin_cuts$tfidf_text_health
[1] 0.000000 1.257469 2.514937 3.772406 5.029874

$bin_cuts$tfidf_text_help
[1] 0.000000 1.244945 2.489891 3.734836 4.979782

$bin_cuts$tfidf_text_house
[1] 0.000000 1.279865 2.559731 3.839596 5.119461

$bin_cuts$tfidf_text_im
[1] 0.0000000 0.8640819 1.7281638 2.5922458 3.4563277

$bin_cuts$tfidf_text_just
[1] 0.000000 0.685848 1.371696 2.057544 2.743392

$bin_cuts$tfidf_text_make
[1] 0.0000000 0.9671067 1.9342134 2.9013201 3.8684268

$bin_cuts$tfidf_text_must
[1] 0.000000 1.174615 2.349229 3.523844 4.698458

$bin_cuts$tfidf_text_need
[1] 0.0000000 0.9024811 1.8049623 2.7074434 3.6099246

$bin_cuts$tfidf_text_new
[1] 0.000000 1.046605 2.093209 3.139814 4.186419

$bin_cuts$tfidf_text_now
[1] 0.0000000 0.7902302 1.5804604 2.3706905 3.1609207

$bin_cuts$tfidf_text_one
[1] 0.0000000 0.7829797 1.5659594 2.3489390 3.1319187

$bin_cuts$tfidf_text_pandemic
[1] 0.000000 1.447745 2.895490 4.343234 5.790979

$bin_cuts$tfidf_text_people
[1] 0.0000000 0.8210539 1.6421079 2.4631618 3.2842158

$bin_cuts$tfidf_text_president
[1] 0.0000000 0.7250178 1.4500356 2.1750534 2.9000712

$bin_cuts$tfidf_text_qt
NULL

$bin_cuts$tfidf_text_senate
[1] 0.000000 1.721888 3.443776 5.165664 6.887553

$bin_cuts$tfidf_text_state
[1] 0.000000 1.405411 2.810823 4.216234 5.621646

$bin_cuts$tfidf_text_support
[1] 0.000000 1.284599 2.569198 3.853798 5.138397

$bin_cuts$tfidf_text_thank
[1] 0.000000 1.187163 2.374326 3.561489 4.748652

$bin_cuts$tfidf_text_time
[1] 0.0000000 0.7462535 1.4925070 2.2387605 2.9850141

$bin_cuts$tfidf_text_today
[1] 0.000000 1.162676 2.325353 3.488029 4.650706

$bin_cuts$tfidf_text_trump
[1] 0.0000000 0.5318131 1.0636262 1.5954394 2.1272525

$bin_cuts$tfidf_text_us
[1] 0.0000000 0.8582498 1.7164996 2.5747494 3.4329992

$bin_cuts$tfidf_text_vote
[1] 0.0000000 0.9377365 1.8754729 2.8132094 3.7509459

$bin_cuts$tfidf_text_work
[1] 0.000000 1.217984 2.435968 3.653952 4.871937

$bin_cuts$tfidf_text_workers
[1] 0.000000 1.996371 3.992742 5.989113 7.985484

$bin_cuts$pred
NULL

$bin_cuts$.pred_FALSE
        0%        25%        50%        75%       100% 
0.06297783 0.38484544 0.55076602 0.60008848 0.94205857 

$bin_cuts$pred_prob
        0%        25%        50%        75%       100% 
0.05794143 0.39991152 0.44923398 0.61515456 0.93702217 


$feature_distribution
$feature_distribution$`tfidf_text_#COVID19`

           1            2            3            4 
0.9564181137 0.0253092725 0.0003404835 0.0179321303 

$feature_distribution$tfidf_text_act

           1            2            3            4 
0.9980705936 0.0012484395 0.0001134945 0.0005674725 

$feature_distribution$tfidf_text_american

          1           2           3           4 
0.991147429 0.007263648 0.000226989 0.001361934 

$feature_distribution$tfidf_text_americans

          1           2           4 
0.990806946 0.006015208 0.003177846 

$feature_distribution$tfidf_text_amp

         1          2          3          4 
0.95471570 0.02712518 0.00329134 0.01486778 

$feature_distribution$tfidf_text_bill

           1            2            3            4 
0.9989785495 0.0006809670 0.0001134945 0.0002269890 

$feature_distribution$tfidf_text_can

          1           2           3           4 
0.946657587 0.031324481 0.001361934 0.020655998 

$feature_distribution$tfidf_text_care

          1           2           4 
0.995006242 0.003858813 0.001134945 

$feature_distribution$tfidf_text_congress

           1            2            4 
0.9994325275 0.0003404835 0.0002269890 

$feature_distribution$tfidf_text_coronavirus

           1            2            4 
0.9972761321 0.0021563954 0.0005674725 

$feature_distribution$tfidf_text_country

          1           2           4 
0.992622858 0.004880263 0.002496879 

$feature_distribution$tfidf_text_covid19

          1           2           4 
0.992622858 0.004766769 0.002610373 

$feature_distribution$tfidf_text_day

           1            2            3            4 
0.9905799569 0.0049937578 0.0001134945 0.0043127908 

$feature_distribution$tfidf_text_families

           1            2            4 
0.9984110771 0.0014754284 0.0001134945 

$feature_distribution$tfidf_text_get

           1            2            3            4 
0.9582340256 0.0263307230 0.0005674725 0.0148677789 

$feature_distribution$tfidf_text_great

           1            2            3            4 
0.9891045284 0.0060152083 0.0001134945 0.0047667688 

$feature_distribution$tfidf_text_health

           1            2            3            4 
0.9947792532 0.0038588128 0.0003404835 0.0010214505 

$feature_distribution$tfidf_text_help

          1           2           4 
0.994211781 0.003631824 0.002156395 

$feature_distribution$tfidf_text_house

          1           2           3           4 
0.994552264 0.002610373 0.000226989 0.002610373 

$feature_distribution$tfidf_text_im

          1           2           3           4 
0.973328794 0.015208262 0.000680967 0.010781977 

$feature_distribution$tfidf_text_just

           1            2            3            4 
0.9407558733 0.0297355578 0.0007944615 0.0287141074 

$feature_distribution$tfidf_text_make

           1            2            3            4 
0.9820678697 0.0111224606 0.0001134945 0.0066961752 

$feature_distribution$tfidf_text_must

          1           2           3           4 
0.992963341 0.004199296 0.000226989 0.002610373 

$feature_distribution$tfidf_text_need

           1            2            3            4 
0.9779820679 0.0115764385 0.0005674725 0.0098740211 

$feature_distribution$tfidf_text_new

          1           2           3           4 
0.986267166 0.007377142 0.000226989 0.006128703 

$feature_distribution$tfidf_text_now

          1           2           3           4 
0.963227783 0.019975031 0.000453978 0.016343207 

$feature_distribution$tfidf_text_one

         1          2          3          4 
0.96107139 0.02133697 0.00102145 0.01657020 

$feature_distribution$tfidf_text_pandemic

          1           2           4 
0.997616616 0.002156395 0.000226989 

$feature_distribution$tfidf_text_people

           1            2            3            4 
0.9712858926 0.0203155147 0.0005674725 0.0078311202 

$feature_distribution$tfidf_text_president

          1           2           3           4 
0.951083872 0.030076041 0.001134945 0.017705141 

$feature_distribution$tfidf_text_qt
[1] NA

$feature_distribution$tfidf_text_senate

           1            2            4 
0.9990920440 0.0007944615 0.0001134945 

$feature_distribution$tfidf_text_state

          1           2           4 
0.997162638 0.001929406 0.000907956 

$feature_distribution$tfidf_text_support

           1            2            3            4 
0.9950062422 0.0031778459 0.0001134945 0.0017024174 

$feature_distribution$tfidf_text_thank

          1           2           3           4 
0.992736352 0.003404835 0.000226989 0.003631824 

$feature_distribution$tfidf_text_time

           1            2            3            4 
0.9550561798 0.0255362615 0.0005674725 0.0188400863 

$feature_distribution$tfidf_text_today

          1           2           4 
0.991714902 0.003518329 0.004766769 

$feature_distribution$tfidf_text_trump

          1           2           3           4 
0.882306208 0.062308478 0.003518329 0.051866984 

$feature_distribution$tfidf_text_us

           1            2            3            4 
0.9734422880 0.0148677789 0.0007944615 0.0108954716 

$feature_distribution$tfidf_text_vote

           1            2            3            4 
0.9804789468 0.0111224606 0.0007944615 0.0076041312 

$feature_distribution$tfidf_text_work

          1           2           4 
0.994098286 0.003631824 0.002269890 

$feature_distribution$tfidf_text_workers

           1            2            4 
0.9997730110 0.0001134945 0.0001134945 

$feature_distribution$pred

    FALSE      TRUE 
0.6044717 0.3955283 

$feature_distribution$.pred_FALSE

         1          2          3          4 
0.25002837 0.25286574 0.47758484 0.01952105 

$feature_distribution$pred_prob

        1         2         3         4 
0.4436500 0.0577687 0.2485529 0.2500284 


attr(,"class")
[1] "data_frame_explainer" "explainer"            "list"                
explained_tweets <-  explain(x = data_new %>% sample_n(8), 
                         explainer = lime_tweets, 
                         n_permutations = 5000,
                         dist_fun = "gower",
                         kernel_width = 0.75,
                         n_features = 10, 
                         feature_select = "highest_weights",
                         n_labels = 1 # to have the predicted class as baseline
                         # labels = "Yes" # to have te positive class as baseline
                         )
explained_tweets %>% plot_features()

explained_tweets %>% plot_text_explanations()
Registered S3 method overwritten by 'htmlwidgets':
  method           from         
  print.htmlwidget tools:rstudio
Error: original_text is not a string (a length one character vector).

Endnotes

Packages & Ecosystem

Further NLP packages ecosystem

References

  • Julia Silge and David Robinson (2020). Text Mining with R: A Tidy Approach, O’Reilly. Online available here
  • Emil Hvidfeldt and Julia Silge (2020). Supervised Machine Learning for Text Analysis in R, online available here

Further sources

Datacamp

Other online

  • Julia Silge’s Blog: Full of great examples of predictive modeling, NLP, and the combination fo both, using tidy ecosystems

Session Info

sessionInfo()
---
title: 'NLP workshop - Exploring Presidential Debate on twitter'
author: "Daniel S. Hain (dsh@business.aau.dk)"
date: "Updated `r format(Sys.time(), '%B %d, %Y')`"
output:
  html_notebook:
    code_folding: show
    df_print: paged
    toc: true
    toc_depth: 2
    toc_float:
      collapsed: false
    theme: flatly
---

```{r setup, include=FALSE}
### Generic preamble
rm(list=ls())
Sys.setenv(LANG = "en") # For english language
options(scipen = 5) # To deactivate annoying scientific number notation

### Knitr options
library(knitr) # For display of the markdown
knitr::opts_chunk$set(warning=FALSE,
                     message=FALSE,
                     comment=FALSE, 
                     fig.align="center"
                     )
```

```{r}
### Load standardpackages
library(tidyverse) # Collection of all the good stuff like dplyr, ggplot2 ect.
library(magrittr) # For extra-piping operators (eg. %<>%)
```

```{r}
library(tidytext)
```


# Download the data

```{r}
# download and open some Trump tweets from trump_tweet_data_archive
library(jsonlite)
tmp <- tempfile()
download.file("https://github.com/SDS-AAU/SDS-master/raw/master/M2/data/pol_tweets.gz", tmp)

tweets_raw <- stream_in(gzfile(tmp, "pol_tweets"))
```

```{r}
tweets_raw %>% glimpse()
```

```{r}
tweets <- tibble(ID = colnames(tweets_raw[[1]]), 
                 text = tweets_raw[[1]] %>% as.character(), 
                 labels = tweets_raw[[2]] %>% as.logical())
#rm(tweets_raw)
```

```{r}
tweets %>% head()
```


```{r}
tweets %<>%
  filter(!(text %>% str_detect('^RT'))) # Filter retweets
```

```{r}
tweets %>% head()
```

# Tidying

```{r}
tweets_tidy <- tweets %>%
  unnest_tokens(word, text, token = "tweets") 
```

```{r}
tweets_tidy %>% head(50)
```


```{r}
tweets_tidy %>% count(word, sort = TRUE)
```


# Preprocessing

```{r}
# preprocessing
tweets_tidy %<>%
  filter(!(word %>% str_detect('@'))) %>% # remove mentions
  filter(!(word %>% str_detect('^amp|^http|^t\\.co'))) %>% # Twitter specific stuff
#  mutate(word = word %>% str_remove_all('[^[:alnum:]]')) %>% ## remove all special characters
  filter(str_length(word) > 2 ) %>% # Remove words with less than  3 characters
  group_by(word) %>%
  filter(n() > 100) %>% # remove words occuring less than 100 times
  ungroup() %>%
  anti_join(stop_words, by = 'word') # remove stopwords
```

# TFIDF weighting

```{r}
# top words
tweets_tidy %>%
  count(word, sort = TRUE) %>%
  head(20)
```

```{r}
# TFIDF weights
tweets_tidy %<>%
  add_count(ID, word) %>%
  distinct(ID, word, .keep_all = TRUE) %>%
  bind_tf_idf(term = word,
              document = ID,
              n = n)
```


```{r}
# TFIDF topwords
tweets_tidy %>%
  count(word, wt = tf_idf, sort = TRUE) %>%
  head(20)
```

# Inspecting

## Words by party affiliation

```{r}
labels_words <- tweets_tidy %>%
  group_by(labels) %>%
  count(word, wt = tf_idf, sort = TRUE, name = "tf_idf") %>%
  slice(1:100) %>%
  ungroup() 
```

```{r, fig.width=10}
labels_words %>%
  mutate(word = reorder_within(word, by = tf_idf, within = labels)) %>%
  ggplot(aes(x = word, y = tf_idf, fill = labels)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~labels, ncol = 2, scales = "free") +
  coord_flip() +
  scale_x_reordered()
```

## Sentiments?

```{r}
sentiment_tweet <- tweets_tidy %>%
  inner_join(get_sentiments("bing"))
```

... To be continued by you

## Towards prediction?

```{r}
tweets_dtm <- tweets_tidy %>%
  pivot_wider(names_from = word, values_from = tf_idf)
```

```{r}
tweets_dtm %<>% mutate(across(everything(), .fns = ~replace_na(.,0))) 
```

```{r}
rm(tweets_dtm)
```

## Simple manual baseline

```{r}
words_classifier <- labels_words %>%
  arrange(desc(tf_idf)) %>%
  distinct(word, .keep_all = TRUE) %>%
  select(-tf_idf)
```

```{r}
tweet_null_model <- tweets_tidy %>%
  inner_join(labels_words, by = 'word')
```

```{r}
null_res <- tweet_null_model %>%
  group_by(ID) %>%
  summarise(truth = mean(labels.x, na.rm = TRUE) %>% round(0),
         pred = mean(labels.y, na.rm = TRUE) %>% round(0))
```

```{r}
table(null_res$pred, null_res$truth)
```

# Predictive model

```{r}
library(tidymodels)
```

## Preprocessing

```{r}
# Notice, we use the initial untokenized tweets
data <- tweets %>%
  select(labels, text) %>%
  rename(y = labels) %>%
  mutate(y = y  %>% as.factor()) 
```


## Training & Test split

```{r}
data_split <- initial_split(data, prop = 0.75, strata = y)

data_train <- data_split  %>%  training()
data_test <- data_split %>% testing()
```

```{r}
data_train %>% count(y)
```


## Preprocessing pipeline

```{r}
library(textrecipes) # Adittional recipes for working with text data
```

```{r}
# This recipe pretty much reconstructs all preprocessing we did so far
data_recipe <- data_train %>%
  recipe(y ~.) %>%
  themis::step_upsample(y) %>% # For up/downsampling class imbalances (optimal)
  step_filter(!(text %>% str_detect('^RT'))) %>% # Upfront filtering retweets
  step_filter(text != "") %>%
  # textreciepes
  step_tokenize(text, token = "tweets") %>% # tokenize
  step_tokenfilter(text, min_times = 75) %>%  # Filter out rare words
  step_stopwords(text, keep = FALSE) %>% # Filter stopwords
  step_tfidf(text) %>% # TFIDF weighting
  #step_pca(all_predictors()) %>% # Dimensionality reduction via PCA (optional)
  prep()
```


```{r}
data_recipe
```

Since we will not do hyperparameter tuning, we directly bake/juice the recipe

```{r}
data_train_prep <- data_recipe %>% juice()
data_test_prep <- data_recipe %>% bake(data_test)
```


## Defining the models

```{r}
model_null <- null_model(mode = 'classification')
```

```{r}
model_en <- logistic_reg(mode = 'classification',
                         mixture = 0.5,
                         penalty = 0.5) %>%
  set_engine('glm', family = binomial) 
```


## Define the workflow

We will skip the workflow step this time, since we do not evaluate different models against each others.

## fit the model

```{r}
fit_null <- model_null %>% fit(formula = y ~., data = data_train_prep)
```

```{r}
fit_en <- model_en %>% fit(formula = y ~., data = data_train_prep)
```


```{r}
pred_collected <- tibble(
  truth = data_test_prep %>% pull(y),
  pred = fit_en %>% predict(new_data = data_test_prep) %>% pull(.pred_class),
  pred_prob = fit_en %>% predict(new_data = data_test_prep, type = "prob") %>% pull(.pred_TRUE),
  ) 
```

```{r}
pred_collected %>% conf_mat(truth, pred) %>% autoplot(type = 'heatmap')
```

```{r}
pred_collected %>% conf_mat(truth, pred) %>% summary()
```
Well... soso

# Using the model for new prediction

## Simple test

```{r}
# How would the model predict given some tweet text
pred_own = tibble(text = 'trump is really bad. we need more green energy  to save the enviroment and fuuture of our children')
```

```{r}
fit_en %>% predict(new_data = data_recipe %>% bake(pred_own))
```

# Prediction on new tweets

## New data

* We could also use the model to predict on new data, such as the just scraped discussion on the presidential debate.

```{r}
# download and open some Trump tweets from trump_tweet_data_archive
download.file("https://github.com/SDS-AAU/SDS-master/raw/master/M2/data/pres_debate_2020.gz", tmp)

tweets_raw_new <- stream_in(gzfile(tmp, "pres_debate_2020"))
```

```{r}
tweets_raw_new %>% glimpse()
```

```{r}
tweets_new <- tibble(ID = tweets_raw_new$id[1,] %>% t() %>% as.character(), 
                     text = tweets_raw_new$tweet[1,] %>% t() %>% as.character())
#rm(tweets_raw_new)
```

```{r}
tweets_new %>% glimpse()
```

## Doing a prediction
```{r}
data_new <- data_recipe %>% bake(tweets_new)
```

```{r}
data_new %>% glimpse()
```

```{r}
pred_new <- fit_en %>% predict(new_data = data_new)
pred_prob_new <- fit_en %>% predict(new_data = data_new, type = "prob")
```

## Exploring the new data & predictions

```{r}
data_new %<>%
  bind_cols(pred_new) %>%
  rename(pred = .pred_class) %>%
  bind_cols(pred_prob_new) %>%
  rename(pred_prob = .pred_TRUE) 
```

```{r}
tweets_new %<>%
  bind_cols(pred_new) %>%
  rename(pred = .pred_class) 
```


```{r}
tweets_tidy_new <- tweets_new %>%
  unnest_tokens(word, text, token = "tweets") 
```

```{r}
# preprocessing
tweets_tidy_new %<>%
  filter(!(word %>% str_detect('@|#presidential'))) %>% # remove hashtags and mentions
  filter(!(word %>% str_detect('^amp|^http|^t\\.co'))) %>% # Twitter specific stuff
#  mutate(word = word %>% str_remove_all('[^[:alnum:]]')) %>% ## remove all special characters
  filter(str_length(word) > 2 ) %>% # Remove words with less than  3 characters
  group_by(word) %>%
  filter(n() > 100) %>% # remove words occuring less than 100 times
  ungroup() %>%
  anti_join(stop_words, by = 'word') # remove stopwords
```


```{r}
# TFIDF weights
tweets_tidy_new %<>%
  add_count(ID, word) %>%
  distinct(ID, word, .keep_all = TRUE) %>%
  bind_tf_idf(term = word,
              document = ID,
              n = n)
```

```{r}
labels_words_new <- tweets_tidy_new %>%
  group_by(pred) %>%
  count(word, wt = tf_idf, sort = TRUE, name = "tf_idf") %>%
  slice(1:20) %>%
  ungroup() 
```

```{r, fig.width=10}
labels_words_new %>%
  mutate(word = reorder_within(word, by = tf_idf, within = pred)) %>%
  ggplot(aes(x = word, y = tf_idf, fill = pred)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~pred, ncol = 2, scales = "free") +
  coord_flip() +
  scale_x_reordered()
```


# Topic models (LDA) on new data

```{r}
# for LDA analysis
library(topicmodels)
```

### Preparing the Data

```{r}
# LDA via the topicmodel package requires a document-term-matrix (dtm)
tweets_dtm <- tweets_tidy_new %>%
  cast_dtm(document = ID, term = word, value = n)
```

Lets take a look:

```{r}
tweets_dtm
```

* We see again hat the matrix is still rather sparse, which is an artefact of text data generally, but even more so when using twitter data. 
* Lets try to see if we could reduce that somewhat by deleting less often used terms.

```{r}
library(tm)
tweets_dtm %>% removeSparseTerms(sparse = .99)
```

* Now we can perform a LDA, using the more accurate Gibbs sampling as `method`.

```{r}
tweets_lda <- tweets_dtm %>% 
  LDA(k = 6, method = "Gibbs",
      control = list(seed = 1337))
```

### $\beta$: Word-Topic Association

* $\beta$ is an output of the LDA model, indicating the propability that a word occurs in a certain topic.
* Therefore, loking at the top probability words of a topic often gives us a good intuition regarding its properties.

```{r}
# LDA output is defined for tidy(), so we can easily extract it
lda_beta <- tweets_lda %>% 
  tidy(matrix = "beta") %>%
  group_by(topic) %>%
  arrange(topic, desc(beta)) %>%
  slice(1:10) %>%
  ungroup() 
```

```{r}
lda_beta %>% head()
```

```{r}
# Notice the "reorder_within()"
lda_beta %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  group_by(topic, term) %>%    
  arrange(desc(beta)) %>%  
  ungroup() %>%
  ggplot(aes(term, beta, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  scale_x_reordered() +
  labs(title = "Top 10 terms in each LDA topic",
       x = NULL, y = expression(beta)) +
  facet_wrap(~ topic, ncol = 2, scales = "free")
```

### $\gamma$: Document-Topic Association

* In LDA, documents are represented as a mix of topics. This association of a document to a topic is captured by $\gamma$

```{r}
lda_gamma <- tweets_lda %>% 
  tidy(matrix = "gamma")
```

```{r}
lda_gamma %>% head()
```

```{r}
lda_gamma %>%
  ggplot(aes(gamma)) +
  geom_histogram() +
  scale_y_log10() +
  labs(title = "Distribution of probabilities for all topics",
       y = "Number of documents", x = expression(gamma))
```

```{r}
lda_gamma %<>%
  left_join(tweets_new %>% select(ID, pred), by = c('document' = 'ID'))
```

```{r}
lda_gamma %>%
  group_by(pred, topic) %>%
  summarise(gamma = sum(gamma)) %>%
  arrange(pred, gamma)
```

```{r}
lda_gamma %>%
  ggplot(aes(gamma, fill = as.factor(topic))) +
  geom_histogram(show.legend = FALSE) +
  facet_wrap(~ topic, ncol = 2) +
  scale_y_log10() +
  labs(title = "Distribution of probability for each topic",
       y = "Number of documents", x = expression(gamma))
```

```{r}
topicmodels_json_ldavis <- function(fitted, doc_dtm, method = "PCA", doc_in = NULL, topic_in = NULL){
  require(topicmodels); require(dplyr); require(LDAvis)
  
  # Find required quantities
  phi <- posterior(fitted)$terms %>% as.matrix() # Topic-term distribution
  theta <- posterior(fitted)$topics %>% as.matrix() # Document-topic matrix
  
  # Restrict
  if(!is_null(topic_in)){
    phi <- phi[topic_in, ]
    theta <- theta[ , topic_in]
  }
  text_tidy <- doc_dtm %>% tidy()
  vocab <- colnames(phi)
  doc_length <- tibble(document = rownames(theta)) %>% left_join(text_tidy %>% count(document, wt = count), by = 'document')
  tf <- tibble(term = vocab) %>% left_join(text_tidy %>% count(term, wt = count), by = "term") 
  
  if(method == "PCA"){mds <- jsPCA}
  if(method == "TSNE"){library(tsne); mds <- function(x){tsne(svd(x)$u)} }
  
  # Convert to json
  json_lda <- LDAvis::createJSON(phi = phi, theta = theta, vocab = vocab, doc.length = doc_length %>% pull(n), term.frequency = tf %>% pull(n),
                                 reorder.topics = FALSE, mds.method = mds,plot.opts = list(xlab = "Dim.1", ylab = "Dim.2")) 
  return(json_lda)
}
```


```{r}
library(LDAvis)
json_lda <- topicmodels_json_ldavis(fitted = tweets_lda, 
                                    doc_dtm = tweets_dtm, 
                                    method = "TSNE")
json_lda %>% serVis()
# json_lda %>% serVis(out.dir = 'LDAviz')
```

# Model explainability

## Global

```{r}
fit_en %>% vip::vip()
```

## Local

```{r}
library(lime)
```

```{r}
explanation <- .load_image_example()
plot_image_explanation(explanation)
```

```{r}
lime_tweets <- lime(data_new, fit_en)
```

```{r}
recipe_deploy <- function(data){
  data_recipe %>% bake(data)
}
```


```{r}
lime_tweets <- lime(data_new, fit_en,
                    preprocess = recipe_deploy)
```


```{r}
explained_tweets <-  explain(x = data_new %>% sample_n(8), 
                         explainer = lime_tweets, 
                         n_permutations = 5000,
                         dist_fun = "gower",
                         kernel_width = 0.75,
                         n_features = 10, 
                         feature_select = "highest_weights",
                         n_labels = 1 # to have the predicted class as baseline
                         # labels = "Yes" # to have te positive class as baseline
                         )
```


```{r fig.width=12, fig.height=12}
explained_tweets %>% plot_features()
```

```{r}
explained_tweets %>% plot_text_explanations()
```


# Endnotes

### Packages & Ecosystem

* [`tidytext`](https://github.com/juliasilge/tidytext)
* [`textrecipes`](https://textrecipes.tidymodels.org/)
* [`topicmodels`](https://cran.r-project.org/web/packages/topicmodels/vignettes/topicmodels.pdf)

Further NLP packages ecosystem

* `tm` [here](https://cran.r-project.org/web/packages/tm/)
* `quanteda` [here](https://quanteda.io/), and many many great tutorials [here](https://tutorials.quanteda.io/)


### References 

* Julia Silge and David Robinson (2020). Text Mining with R: A Tidy Approach, O’Reilly. Online available [here](https://www.tidytextmining.com/)
   * [Chapter 6](https://www.tidytextmining.com/topicmodeling.html): Introduction topic models
* Emil Hvidfeldt and Julia Silge (2020). Supervised Machine Learning for Text Analysis in R, online available [here](https://smltar.com/)
   * [Chapter 7](https://smltar.com/mlclassification.html): Classification

### Further sources

Datacamp

*  [Topic Modeling in R](https://learn.datacamp.com/courses/topic-modeling-in-r) 

Other online

* [Julia Silge's Blog](https://juliasilge.com/): Full of great examples of predictive modeling, NLP, and the combination fo both, using tidy ecosystems

### Session Info

```{r}
sessionInfo()
```



