### Load standard packages
library(tidyverse) # Collection of all the good stuff like dplyr, ggplot2 etc.
library(magrittr) # For extra piping operators (e.g. %<>%)
library(tidytext)
# download and open a labelled sample of political tweets (pol_tweets)
library(jsonlite)
tmp <- tempfile()
download.file("https://github.com/SDS-AAU/SDS-master/raw/master/M2/data/pol_tweets.gz", tmp)
trying URL 'https://github.com/SDS-AAU/SDS-master/raw/master/M2/data/pol_tweets.gz'
Content type 'application/octet-stream' length 7342085 bytes (7.0 MB)
downloaded 7.0 MB
tweets_raw <- stream_in(gzfile(tmp, "pol_tweets"))
Found 1 records...
Imported 1 records. Simplifying...
tweets_raw %>% glimpse()
Rows: 1
Columns: 2
$ text <df[,50000]> <data.frame[1 x 50000]>
$ labels <df[,50000]> <data.frame[1 x 50000]>
tweets <- tibble(ID = colnames(tweets_raw[[1]]),
text = tweets_raw[[1]] %>% as.character(),
labels = tweets_raw[[2]] %>% as.logical())
#rm(tweets_raw)
tweets %>% glimpse()
Rows: 50,000
Columns: 3
$ ID <chr> "340675", "289492", "371088", "82212", "476047", "220741", "379074", "633731", "103805", "401277", "493433", "578814", "570425", "307632", "634542", "276734…
$ text <chr> "RT @GreenBeretFound Today we remember Sgt. 1st Class Ryan J. Savard killed in action on this day eight years ago. SFC Savard was assigned to U.S. Army Spec…
$ labels <lgl> FALSE, TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, FALSE, TRUE, FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE, FALSE, FALSE, TRUE, TR…
tweets %<>%
filter(!(text %>% str_detect('^RT'))) # Filter retweets
tweets %>% head()
tweets_tidy <- tweets %>%
unnest_tokens(word, text, token = "tweets")
tweets_tidy %>% head(50)
tweets_tidy %>% count(word, sort = TRUE)
# preprocessing
tweets_tidy %<>%
filter(!(word %>% str_detect('@'))) %>% # remove mentions
filter(!(word %>% str_detect('^amp|^http|^t\\.co'))) %>% # Twitter-specific artifacts
# mutate(word = word %>% str_remove_all('[^[:alnum:]]')) %>% ## remove all special characters
filter(str_length(word) > 2 ) %>% # remove words with fewer than 3 characters
group_by(word) %>%
filter(n() > 100) %>% # keep only words occurring more than 100 times
ungroup() %>%
anti_join(stop_words, by = 'word') # remove stopwords
### TFIDF weighting
# top words
tweets_tidy %>%
count(word, sort = TRUE) %>%
head(20)
# TFIDF weights
tweets_tidy %<>%
add_count(ID, word) %>%
bind_tf_idf(term = word,
document = ID,
n = n)
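As a reminder of what these weights capture, here is a minimal toy example (the toy tibble and its contents are made up purely for illustration): tf is a term's share of its document, idf is the log of the inverse document frequency, so a term occurring in every document gets a tf-idf of zero.
# toy illustration of the definitions bind_tf_idf() uses
toy <- tibble(doc = c("a", "a", "b"),
word = c("wall", "vote", "vote"),
n = c(2, 1, 3))
toy %>% bind_tf_idf(term = word, document = doc, n = n)
# "wall" only occurs in doc a, so idf = ln(2/1) > 0; "vote" occurs in both docs, so idf = 0 and tf_idf = 0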
# TFIDF topwords
tweets_tidy %>%
count(word, wt = tf_idf, sort = TRUE) %>%
head(20)
labels_words <- tweets_tidy %>%
group_by(labels) %>%
count(word, wt = tf_idf, sort = TRUE, name = "tf_idf") %>%
slice(1:100) %>%
ungroup()
labels_words %>%
mutate(word = reorder_within(word, by = tf_idf, within = labels)) %>%
ggplot(aes(x = word, y = tf_idf, fill = labels)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~labels, ncol = 2, scales = "free") +
coord_flip() +
scale_x_reordered()
tweets_tidy %>% head()
library(tidymodels)
words_classifier <- labels_words %>%
arrange(desc(tf_idf)) %>%
distinct(word, .keep_all = TRUE) %>%
select(-tf_idf)
tweet_null_model <- tweets_tidy %>%
inner_join(labels_words, by = 'word')
null_res <- tweet_null_model %>%
group_by(ID) %>%
summarise(truth = mean(labels.x, na.rm = TRUE) %>% round(0), # true label of the tweet
pred = mean(labels.y, na.rm = TRUE) %>% round(0)) # majority label among the matched top words
table(null_res$truth, null_res$pred)
         0     1
   0  8842  2609
   1 11327  9235
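To put a single number on this baseline, the overall accuracy can be computed directly from null_res (a quick sketch; the arithmetic uses the counts printed above):
# share of tweets where the word-matching null model recovers the true label
mean(null_res$truth == null_res$pred, na.rm = TRUE)
# from the table above: (8842 + 9235) / (8842 + 2609 + 11327 + 9235) ≈ 0.56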
# Notice, we use the initial untokenized tweets
data <- tweets %>%
select(labels, text) %>%
rename(y = labels) %>%
mutate(y = y %>% as.factor())
data_split <- initial_split(data, prop = 0.75, strata = y)
data_train <- data_split %>% training()
data_test <- data_split %>% testing()
library(textrecipes) # Additional recipes for working with text data
# This recipe pretty much reconstructs all preprocessing we did so far
data_recipe <- data_train %>%
recipe(y ~.) %>%
themis::step_downsample(y) %>% # For downsampling class imbalances (optional)
step_filter(!(text %>% str_detect('^RT'))) %>% # Filter retweets up front
step_filter(text != "") %>%
step_tokenize(text, token = "tweets") %>% # tokenize
step_tokenfilter(text, min_times = 75) %>% # Filter out rare words
step_stopwords(text, keep = FALSE) %>% # Filter stopwords
step_tfidf(text) %>% # TFIDF weighting
#step_pca(all_predictors()) %>% # Dimensionality reduction via PCA (optional)
prep() # NOTE: Only prep the recipe when not using in a workflow
data_recipe
Data Recipe
Inputs:
      role #variables
   outcome          1
 predictor          1
Training data contained 26241 data points and no missing data.
Operations:
Down-sampling based on y [trained]
Row filtering [trained]
Row filtering [trained]
Tokenization for text [trained]
Text filtering for text [trained]
Stop word removal for text [trained]
Term frequency-inverse document frequency with text [trained]
Since we will not do hyperparameter tuning, we directly bake/juice the recipe
data_train_prep <- data_recipe %>% juice()
data_test_prep <- data_recipe %>% bake(data_test)
model_en <- logistic_reg(mode = 'classification',
mixture = 0.5,
penalty = 0.5) %>%
set_engine('glm', family = binomial) # NOTE: the 'glm' engine ignores penalty and mixture; use set_engine('glmnet') for an actual elastic net
We will skip the workflow step this time, since we do not evaluate different models against each other.
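For reference, the workflow route we skip would look roughly like this (a sketch only, left commented out; it assumes the recipe is built without the final prep() step, since a workflow preps and bakes internally):
# data_recipe_raw <- data_train %>% recipe(y ~ .) %>% ... # same steps as above, but without prep()
# wf_en <- workflow() %>%
#   add_recipe(data_recipe_raw) %>%
#   add_model(model_en)
# fit_wf <- wf_en %>% fit(data = data_train)
# fit_wf %>% predict(new_data = data_test)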
fit_en <- model_en %>% fit(formula = y ~., data = data_train_prep)
pred_collected <- tibble(
truth = data_train_prep %>% pull(y),
pred = fit_en %>% predict(new_data = data_train_prep) %>% pull(.pred_class),
pred_prob = fit_en %>% predict(new_data = data_train_prep, type = "prob") %>% pull(.pred_TRUE),
)
pred_collected %>% conf_mat(truth, pred)
Truth
Prediction FALSE TRUE
FALSE 6243 4276
TRUE 3356 5323
pred_collected %>% conf_mat(truth, pred) %>% summary()
Well… so-so.
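Note also that these metrics are computed on the training data. Since the test split is already baked into data_test_prep, an out-of-sample check takes only a few lines (a sketch using the objects defined above):
pred_test <- tibble(
truth = data_test_prep %>% pull(y),
pred = fit_en %>% predict(new_data = data_test_prep) %>% pull(.pred_class))
pred_test %>% conf_mat(truth, pred) %>% summary()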
# How would the model predict given some tweet text
pred_own = tibble(text = 'USA USA WE NEED A WALL TO MAKE AMERICA GREAT AGAIN AND KEEP THE MEXICANS AND ALL REALLY BAD COUNTRIES OUT! AMNERICA FIRST')
fit_en %>% predict(new_data = data_recipe %>% bake(pred_own))
# download and open tweets around the 2020 presidential debate
library(jsonlite)
tmp <- tempfile()
download.file("https://github.com/SDS-AAU/SDS-master/raw/master/M2/data/pres_debate_2020.gz", tmp)
trying URL 'https://github.com/SDS-AAU/SDS-master/raw/master/M2/data/pres_debate_2020.gz'
Content type 'application/octet-stream' length 2095527 bytes (2.0 MB)
downloaded 2.0 MB
tweets_raw_new <- stream_in(gzfile(tmp, "pres_debate_2020"))
Found 1 records...
Imported 1 records. Simplifying...
tweets_raw_new %>% glimpse()
Rows: 1
Columns: 33
$ id <df[,8811]> <data.frame[1 x 8811]>
$ conversation_id <df[,8811]> <data.frame[1 x 8811]>
$ created_at <df[,8811]> <data.frame[1 x 8811]>
$ date <df[,8811]> <data.frame[1 x 8811]>
$ timezone <df[,8811]> <data.frame[1 x 8811]>
$ place <df[,8811]> <data.frame[1 x 8811]>
$ tweet <df[,8811]> <data.frame[1 x 8811]>
$ language <df[,8811]> <data.frame[1 x 8811]>
$ hashtags <df[,8811]> <data.frame[1 x 8811]>
$ cashtags <df[,8811]> <data.frame[1 x 8811]>
$ user_id <df[,8811]> <data.frame[1 x 8811]>
$ user_id_str <df[,8811]> <data.frame[1 x 8811]>
$ username <df[,8811]> <data.frame[1 x 8811]>
$ name <df[,8811]> <data.frame[1 x 8811]>
$ day <df[,8811]> <data.frame[1 x 8811]>
$ hour <df[,8811]> <data.frame[1 x 8811]>
$ link <df[,8811]> <data.frame[1 x 8811]>
$ urls <df[,8811]> <data.frame[1 x 8811]>
$ photos <df[,8811]> <data.frame[1 x 8811]>
$ video <df[,8811]> <data.frame[1 x 8811]>
$ thumbnail <df[,8811]> <data.frame[1 x 8811]>
$ nlikes <df[,8811]> <data.frame[1 x 8811]>
$ nreplies <df[,8811]> <data.frame[1 x 8811]>
$ nretweets <df[,8811]> <data.frame[1 x 8811]>
$ quote_url <df[,8811]> <data.frame[1 x 8811]>
$ search <df[,8811]> <data.frame[1 x 8811]>
$ near <df[,8811]> <data.frame[1 x 8811]>
$ geo <df[,8811]> <data.frame[1 x 8811]>
$ source <df[,8811]> <data.frame[1 x 8811]>
$ reply_to <df[,8811]> <data.frame[1 x 8811]>
$ translate <df[,8811]> <data.frame[1 x 8811]>
$ trans_src <df[,8811]> <data.frame[1 x 8811]>
$ trans_dest <df[,8811]> <data.frame[1 x 8811]>
tweets_new <- tibble(ID = tweets_raw_new$id[1,] %>% t() %>% as.character(),
text = tweets_raw_new$tweet[1,] %>% t() %>% as.character())
#rm(tweets_raw_new)
tweets_new %>% glimpse()
Rows: 8,811
Columns: 2
$ ID <chr> "1318944772183281664", "1318938583122743296", "1318932554897031169", "1318928783169245185", "1318927150247018498", "1318926624327454721", "1318926578647207939…
$ text <chr> "Still time to register: Students can join the @UVADemocracy Student Advisory Council for a socially-distanced Presidential Debate Watch Party on Thursday, Oc…
data_new <- data_recipe %>% bake(tweets_new)
data_new %>% glimpse()
Rows: 8,811
Columns: 43
$ `tfidf_text_#COVID19` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ tfidf_text_act <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ tfidf_text_american <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, …
$ tfidf_text_americans <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.000000…
$ tfidf_text_amp <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 1.9806353, 0.0000000, 0.000000…
$ tfidf_text_bill <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ tfidf_text_businesses <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ tfidf_text_can <dbl> 1.4189092, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 1.4189092, 0.0000000, 0.0000000, 1.418909…
$ tfidf_text_care <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 1.670964, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, …
$ tfidf_text_congress <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ tfidf_text_coronavirus <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ tfidf_text_country <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ tfidf_text_covid19 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ tfidf_text_day <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 2.244936, 0.000000, …
$ tfidf_text_get <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, …
$ tfidf_text_great <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.000000…
$ tfidf_text_health <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 1.676625, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, …
$ tfidf_text_help <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ tfidf_text_house <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ tfidf_text_im <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, …
$ tfidf_text_just <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ tfidf_text_make <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 1.289476, 0.000000, 0.000000, 0.000000, …
$ tfidf_text_must <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ tfidf_text_need <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ tfidf_text_new <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ tfidf_text_now <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, …
$ tfidf_text_one <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, …
$ tfidf_text_pandemic <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ tfidf_text_people <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, …
$ tfidf_text_president <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 1.4500356, 0.0000000, 0.0000000, 0.000000…
$ tfidf_text_qt <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ tfidf_text_senate <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ tfidf_text_small <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, …
$ tfidf_text_state <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ tfidf_text_support <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.000000…
$ tfidf_text_thank <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ tfidf_text_time <dbl> 1.4925070, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.000000…
$ tfidf_text_today <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ tfidf_text_trump <dbl> 0.0000000, 2.1272525, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.7090842, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.000000…
$ tfidf_text_us <dbl> 0.000000, 0.000000, 3.432999, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 3.432999, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, …
$ tfidf_text_vote <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ tfidf_text_work <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, …
$ tfidf_text_workers <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
pred_new <- fit_en %>% predict(new_data = data_new)
tweets_new %<>%
mutate(pred = pred_new %>% pull(.pred_class))
tweets_new %>% count(pred)
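To get a feeling for what the classifier picks up, we can eyeball a few raw tweets per predicted class (a small sketch):
# peek at three example tweets for each predicted class
tweets_new %>%
group_by(pred) %>%
slice_head(n = 3) %>%
select(pred, text)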
tweets_tidy_new <- tweets_new %>%
unnest_tokens(word, text, token = "tweets")
# preprocessing
tweets_tidy_new %<>%
filter(!(word %>% str_detect('@|#presidential'))) %>% # remove mentions and hashtags starting with #presidential
filter(!(word %>% str_detect('^amp|^http|^t\\.co'))) %>% # Twitter-specific artifacts
# mutate(word = word %>% str_remove_all('[^[:alnum:]]')) %>% ## remove all special characters
filter(str_length(word) > 2 ) %>% # remove words with fewer than 3 characters
group_by(word) %>%
filter(n() > 100) %>% # keep only words occurring more than 100 times
ungroup() %>%
anti_join(stop_words, by = 'word') # remove stopwords
# TFIDF weights
tweets_tidy_new %<>%
add_count(ID, word) %>%
bind_tf_idf(term = word,
document = ID,
n = n)
labels_words_new <- tweets_tidy_new %>%
group_by(pred) %>%
count(word, wt = tf_idf, sort = TRUE, name = "tf_idf") %>%
slice(1:20) %>%
ungroup()
hashtags_words_new <- tweets_tidy_new %>%
filter(word %>% str_detect('#')) %>%
group_by(pred) %>%
count(word, wt = tf_idf, sort = TRUE, name = "tf_idf") %>%
slice(1:20) %>%
ungroup()
labels_words_new %>%
mutate(word = reorder_within(word, by = tf_idf, within = pred)) %>%
ggplot(aes(x = word, y = tf_idf, fill = pred)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~pred, ncol = 2, scales = "free") +
coord_flip() +
scale_x_reordered()
hashtags_words_new %>%
mutate(word = reorder_within(word, by = tf_idf, within = pred)) %>%
ggplot(aes(x = word, y = tf_idf, fill = pred)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~pred, ncol = 2, scales = "free") +
coord_flip() +
scale_x_reordered()
To be continued by you :)
# for LDA analysis
library(topicmodels)
# LDA via the topicmodels package requires a document-term matrix (dtm)
tweets_dtm <- tweets_tidy_new %>%
cast_dtm(document = ID, term = word, value = n)
Let's take a look:
tweets_dtm
<<DocumentTermMatrix (documents: 7567, terms: 95)>>
Non-/sparse entries: 22637/696228
Sparsity : 97%
Maximal term length: 27
Weighting : term frequency (tf)
library(tm)
tweets_dtm %>% removeSparseTerms(sparse = .99) # note: only printed here for illustration, the result is not assigned
<<DocumentTermMatrix (documents: 7567, terms: 95)>>
Non-/sparse entries: 22637/696228
Sparsity : 97%
Maximal term length: 27
Weighting : term frequency (tf)
We fit the LDA with k = 6 topics, using Gibbs sampling as the estimation method.
tweets_lda <- tweets_dtm %>%
LDA(k = 6, method = "Gibbs",
control = list(seed = 1337))
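Before tidying the output, the terms() helper from topicmodels gives a quick first look at the fitted model by listing the highest-probability terms per topic:
tweets_lda %>% terms(10)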
# LDA output is defined for tidy(), so we can easily extract it
lda_beta <- tweets_lda %>%
tidy(matrix = "beta") %>%
group_by(topic) %>%
arrange(topic, desc(beta)) %>%
slice(1:10) %>%
ungroup()
lda_beta %>% head()
# Notice the "reorder_within()"
lda_beta %>%
mutate(term = reorder_within(term, beta, topic)) %>%
group_by(topic, term) %>%
arrange(desc(beta)) %>%
ungroup() %>%
ggplot(aes(term, beta, fill = as.factor(topic))) +
geom_col(show.legend = FALSE) +
coord_flip() +
scale_x_reordered() +
labs(title = "Top 10 terms in each LDA topic",
x = NULL, y = expression(beta)) +
facet_wrap(~ topic, ncol = 2, scales = "free")
lda_gamma <- tweets_lda %>%
tidy(matrix = "gamma")
lda_gamma %>% head()
lda_gamma %>%
ggplot(aes(gamma)) +
geom_histogram() +
scale_y_log10() +
labs(title = "Distribution of probabilities for all topics",
y = "Number of documents", x = expression(gamma))
lda_gamma %<>%
left_join(tweets_new %>% select(ID, pred), by = c('document' = 'ID'))
lda_gamma %>%
group_by(pred, topic) %>%
summarise(gamma = sum(gamma)) %>%
arrange(pred, gamma)
lda_gamma %>%
ggplot(aes(gamma, fill = as.factor(topic))) +
geom_histogram(show.legend = FALSE) +
facet_wrap(~ topic, ncol = 2) +
scale_y_log10() +
labs(title = "Distribution of probability for each topic",
y = "Number of documents", x = expression(gamma))
top_topics <- tweets_lda %>%
tidy(matrix = "gamma") %>%
group_by(document) %>%
top_n(1, wt = gamma) %>%
ungroup()
top_topics %>%
count(topic)
topicmodels_json_ldavis <- function(fitted, doc_dtm, method = "PCA", doc_in = NULL, topic_in = NULL){
  require(topicmodels); require(dplyr); require(LDAvis)
  # Find required quantities
  phi <- posterior(fitted)$terms %>% as.matrix()    # topic-term distribution
  theta <- posterior(fitted)$topics %>% as.matrix() # document-topic matrix
  # # Restrict to selected documents (not working atm)
  # if(!is_null(doc_in)){theta <- theta[rownames(theta) %in% doc_in,]; doc_fm %<>% dfm_subset(dimnames(doc_fm)$docs %in% doc_in)}
  # Restrict to selected topics
  if(!is_null(topic_in)){
    phi <- phi[topic_in, ]
    theta <- theta[, topic_in]
  }
  text_tidy <- doc_dtm %>% tidy()
  vocab <- colnames(phi)
  doc_length <- tibble(document = rownames(theta)) %>%
    left_join(text_tidy %>% count(document, wt = count), by = 'document')
  tf <- tibble(term = vocab) %>%
    left_join(text_tidy %>% count(term, wt = count), by = "term")
  if(method == "PCA"){mds <- jsPCA}
  if(method == "TSNE"){library(tsne); mds <- function(x){tsne(svd(x)$u)}}
  # Convert to json
  json_lda <- LDAvis::createJSON(phi = phi, theta = theta, vocab = vocab,
                                 doc.length = doc_length %>% pull(n),
                                 term.frequency = tf %>% pull(n),
                                 reorder.topics = FALSE, mds.method = mds,
                                 plot.opts = list(xlab = "Dim.1", ylab = "Dim.2"))
  return(json_lda)
}
library(LDAvis)
json_lda <- topicmodels_json_ldavis(fitted = tweets_lda,
doc_dtm = tweets_dtm,
method = "TSNE")
json_lda %>% serVis()
# json_lda %>% serVis(out.dir = 'LDAviz')
### Further NLP package ecosystem
sessionInfo()