### Load standard packages
library(tidyverse) # Collection of all the good stuff like dplyr, ggplot2 ect.
library(magrittr) # For extra-piping operators (eg. %<>%)
library(tidytext)
# download and open some Trump tweets from trump_tweet_data_archive
library(jsonlite)
# Download the gzipped politician-tweets archive to a temp file and stream
# the JSON content into a data frame.
tmp <- tempfile()
download.file("https://github.com/SDS-AAU/SDS-master/raw/master/M2/data/pol_tweets.gz", tmp)
# Console output from the original session (kept as a comment — it is not code):
# trying URL 'https://github.com/SDS-AAU/SDS-master/raw/master/M2/data/pol_tweets.gz'
# Content type 'application/octet-stream' length 7342085 bytes (7.0 MB)
# ==================================================
# downloaded 7.0 MB

# NOTE(review): gzfile()'s second argument is the `open` mode ("r", "rb", ...),
# not a dataset name — passing "pol_tweets" there is invalid. Let stream_in()
# open the connection itself.
tweets_raw <- stream_in(gzfile(tmp))
# Found 1 records...
# Imported 1 records. Simplifying...
# Inspect the imported structure: one row, two wide data-frame columns
# (console output kept as comments — it is not code and breaks parsing).
tweets_raw %>% glimpse()
# Rows: 1
# Columns: 2
# $ text <df[,50000]> <data.frame[1 x 50000]>
# $ labels <df[,50000]> <data.frame[1 x 50000]>
# Reshape the 1 x 50000 wide frame into a tidy tibble: one row per tweet,
# with its column name as ID, the text, and the logical label.
tweets <- tibble(
  ID     = tweets_raw[[1]] %>% colnames(),
  text   = tweets_raw[[1]] %>% as.character(),
  labels = tweets_raw[[2]] %>% as.logical()
)
#rm(tweets_raw)
tweets %>% head()
# Drop retweets, i.e. any tweet whose text starts with "RT".
tweets <- tweets %>%
  filter(!str_detect(text, '^RT'))
tweets %>% head()
# Tokenize to one word per row using the Twitter-aware tokenizer
# (token = "tweets" keeps #hashtags and @mentions intact).
tweets_tidy <- tweets %>%
  unnest_tokens(word, text, token = "tweets")
# Console warning from the original session (kept as a comment — not code):
# Using `to_lower = TRUE` with `token = 'tweets'` may not preserve URLs.
tweets_tidy %>% head(50)
tweets_tidy %>% count(word, sort = TRUE)
# Preprocessing: strip Twitter artifacts, short tokens, rare tokens,
# and stopwords. Filter order is kept: rare-word removal happens BEFORE
# stopword removal, so stopword frequencies still count toward n().
tweets_tidy <- tweets_tidy %>%
  filter(!str_detect(word, '@')) %>%                 # drop @mentions
  filter(!str_detect(word, '^amp|^http|^t\\.co')) %>% # drop &amp; fragments and URL bits
  # optionally strip non-alphanumerics:
  # mutate(word = str_remove_all(word, '[^[:alnum:]]')) %>%
  filter(str_length(word) > 2) %>%                   # keep words of 3+ characters
  group_by(word) %>%
  filter(n() > 100) %>%                              # keep words occurring > 100 times
  ungroup() %>%
  anti_join(stop_words, by = 'word')                 # drop stopwords
# Twenty most frequent tokens after cleaning
tweets_tidy %>%
  count(word, sort = TRUE) %>%
  head(20)
# TF-IDF top words.
# FIX: tweets_tidy has no tf_idf column, so the original
# count(word, wt = tf_idf) errors. Compute tf-idf first with
# bind_tf_idf(), treating each tweet (ID) as a document, then sum
# tf-idf per word.
tweets_tidy %>%
  count(ID, word) %>%
  bind_tf_idf(word, ID, n) %>%
  count(word, wt = tf_idf, sort = TRUE, name = "tf_idf") %>%
  head(20)
# Top-100 words per label, weighted by tf-idf.
# FIX: as above, tf_idf does not exist yet — compute it per document (ID)
# with bind_tf_idf() before aggregating. Each ID carries a single label,
# so carrying `labels` through count() is safe.
labels_words <- tweets_tidy %>%
  count(ID, labels, word) %>%
  bind_tf_idf(word, ID, n) %>%
  group_by(labels) %>%
  count(word, wt = tf_idf, sort = TRUE, name = "tf_idf") %>%
  slice(1:100) %>%   # rows are sorted by tf_idf, so this is the per-label top 100
  ungroup()
# Faceted bar chart: per-label top words ranked by tf-idf.
# reorder_within() + scale_x_reordered() order bars independently inside
# each facet; coord_flip() makes them horizontal for readability.
labels_words %>%
mutate(word = reorder_within(word, by = tf_idf, within = labels)) %>%
ggplot(aes(x = word, y = tf_idf, fill = labels)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~labels, ncol = 2, scales = "free") +
coord_flip() +
scale_x_reordered()
# Attach the Bing sentiment lexicon (positive/negative) to each token;
# inner_join keeps only words present in the lexicon.
# Explicit `by = 'word'` matches the anti_join style above and avoids the
# implicit-join-column message.
sentiment_tweet <- tweets_tidy %>%
  inner_join(get_sentiments("bing"), by = 'word')
# ... To be continued by you