### Load standardpackages
library(tidyverse) # Collection of all the good stuff like dplyr, ggplot2 ect.
library(magrittr) # For extra-piping operators (eg. %<>%)
We will use tidytext for tidy text-mining functionality, and spaCy, a high-level deep-learning-based NLP library that will help us do complex things without too much code. Let's raise the bar with some Fyodor Dostoevsky.
library(tidytext)
# We first need to get the book text. It can be conveniently retrieved via the gutenbergr library, linking r to the Gutenberg project
library(gutenbergr)
# Look up the Gutenberg id of "Crime and Punishment"
gutenberg_metadata %>%
  filter(title == "Crime and Punishment")

# Download the raw book text (Gutenberg id 2554)
text_raw <- gutenberg_download(gutenberg_id = 2554)
glimpse(text_raw)
Rows: 22,061
Columns: 2
$ gutenberg_id <int> 2554, 2554, 2554, 2554, 2554, 2554, 2554, 2554, 2554, 2554, 2554, 255…
$ text <chr> "CRIME AND PUNISHMENT", "", "By Fyodor Dostoevsky", "", "", "", "Tran…
# Let's take a look at the first 200 rows
head(text_raw, 200)
# Clean the raw text and add structural metadata:
# - drop the Gutenberg front matter (first 102 lines)
# - number the remaining lines
# - derive part/chapter counters from the "PART ..." / "CHAPTER ..." headings
# - drop empty lines and the heading lines themselves
text <- text_raw %>%
  select(-gutenberg_id) %>%
  slice(-c(1:102)) %>% # remove the front matter before the book body starts
  mutate(
    line_num = row_number(), # line number within the book body
    # Crime and Punishment has 7 parts; cumsum turns each heading hit into a counter
    part = cumsum(str_detect(text, regex("^PART [\\divxlc]", ignore_case = TRUE)))
  ) %>%
  group_by(part) %>%
  # chapter numbering restarts within each part
  mutate(chapter = cumsum(str_detect(text, regex("^CHAPTER [\\divxlc]", ignore_case = TRUE)))) %>%
  ungroup() %>%
  # BUG FIX: the original pattern "^[PART|CHAPTER]" is a character CLASS that
  # matches any line starting with one of the letters P, A, R, T, |, C, H, E,
  # so it silently dropped many ordinary text lines. "^(PART|CHAPTER)" matches
  # only the actual heading lines.
  filter(text != "" & !str_detect(text, regex("^(PART|CHAPTER)"))) %>%
  mutate(index = 1:n()) %>% # running index over the kept lines
  relocate(index, line_num, part, chapter, text)

text %>% glimpse()
Rows: 16,548
Columns: 5
$ index <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21…
$ line_num <int> 7, 8, 9, 12, 13, 14, 15, 16, 17, 18, 21, 22, 23, 24, 25, 26, 27, 28, 29, …
$ part <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ chapter <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ text <chr> "On an exceptionally hot evening early in July a young man came out of", …
Cool!
# Tokenize the cleaned text into single words and remove common stopwords
text_tidy <- text %>%
  unnest_tokens(output = word, input = text, token = 'words') %>%
  anti_join(stop_words, by = 'word')

head(text_tidy, 10)
library(SnowballC) # Includes stemming algos

# Preview stems next to the original words
text_tidy %>%
  mutate(stem = wordStem(word)) %>%
  head(10)

# Most frequent stems across the whole book
text_tidy %>%
  mutate(stem = wordStem(word)) %>%
  count(stem, sort = TRUE) %>%
  head(50)
# Top 10 words used in Crime and Punishment, as a horizontal bar chart
text_tidy %>%
  count(word, sort = TRUE) %>%
  head(10) %>%
  ggplot(aes(x = fct_reorder(word, n), y = n, fill = word)) +
  geom_col(show.legend = FALSE) +
  coord_flip() + # flip so word labels read horizontally
  labs(title = "Crime and Punishment: Top 10 words used", x = NULL)
# People love wordclouds
library(wordcloud)

# Word cloud of the 50 most frequent (non-stopword) words
text_tidy %>%
  count(word) %>%
  with(wordcloud(words = word, freq = n, max.words = 50, color = "blue"))
While interesting, word frequency does not tell us much about the emotions/states of mind present in the novel.
For this reason, we will go ahead with a sentiment analysis of “Crime and Punishment”.
While sounding very complex, sentiment analysis is usually done in a rather simple way.
There are already predefined sentiment lexica around, linking words with certain sentiments.
So we only have to join our word tokens with the corresponding sentiments.
The most popular dictionaries available are:
# You might need to first install the 'textdata' package for some of the lexica
# Preview the Bing sentiment lexicon
head(get_sentiments("bing"), 20)
Let's calculate them all.
# Bing lexicon: positive/negative word counts per chapter and 100-line window.
# Explicit `by = "word"` makes the join key intentional (and silences the
# "Joining with..." message the implicit join prints).
sentiment_bing <- text_tidy %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(chapter, index = index %/% 100, sentiment) %>% # bucket into 100-line windows
  mutate(lexicon = 'Bing')

# NRC lexicon: one word can map to several emotions, so this join fans rows
# out many-to-many — that is expected here.
sentiment_nrc <- text_tidy %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  count(chapter, index = index %/% 100, sentiment) %>% # bucket into 100-line windows
  mutate(lexicon = 'NRC')

# AFINN lexicon: numeric scores, summed per 100-line window
sentiment_afinn <- text_tidy %>%
  select(-line_num, -part) %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  group_by(index = index %/% 100) %>%
  summarise(sentiment = sum(value, na.rm = TRUE)) %>%
  mutate(lexicon = 'AFINN')
# Join all three lexica into one long table for plotting. Bing and NRC carry
# per-category counts, so they are first collapsed into a single net score
# (positive - negative) to match AFINN's numeric sentiment.
sentiment_all <- bind_rows(
  sentiment_afinn,
  sentiment_bing %>%
    pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
    mutate(sentiment = positive - negative) %>%
    select(index, sentiment, lexicon),
  sentiment_nrc %>%
    pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
    mutate(sentiment = positive - negative) %>%
    select(index, sentiment, lexicon)
)
# Net sentiment over the course of the novel, one panel per lexicon.
# FIX: the original title had an unbalanced opening curly quote
# ("Sentiment Analysis: “Crime and Punishment") — now properly quoted.
sentiment_all %>%
  ggplot(aes(x = index, y = sentiment, fill = lexicon)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ lexicon) +
  labs(title = 'Sentiment Analysis: "Crime and Punishment"',
       subtitle = 'Using the Bing, NRC, AFINN lexicon')
# Comparison cloud: which words drive "joy" vs "sadness" under the NRC lexicon.
# comparison.cloud() needs a matrix-like object with words as rownames, hence
# the data.frame/rownames detour at the end.
text_tidy %>%
  inner_join(get_sentiments("nrc")) %>%
  count(word, sentiment, sort = TRUE) %>%
  filter(sentiment %in% c("joy", "sadness")) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  as.data.frame() %>%
  remove_rownames() %>%
  column_to_rownames(var = "word") %>%
  comparison.cloud(
    colors = c("darkgreen", "grey75"),
    max.words = 100,
    title.size = 1.5
  )
# NRC sentiment per chapter.
# BUG FIX: the original mapped y = sentiment, but `sentiment` in sentiment_nrc
# is the categorical NRC emotion label (e.g. "joy"); the numeric count column
# is `n`. Map y = n and color by the emotion instead. Also fixed the
# unbalanced curly quote in the title.
sentiment_nrc %>%
  ggplot(aes(x = index, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ chapter) +
  labs(title = 'Sentiment Analysis: "Crime and Punishment"',
       subtitle = 'By chapter: Using NRC lexicon')
tidytext used to have an amazing function, pairwise_count, for that. However, since it is of more general use, the developers moved it into the not-text-specific widyr package, which they also maintain.
library(widyr)
# Edge list of word co-occurrences: count pairs of words sharing the same
# line index, heaviest pairs first
el_words <- text_tidy %>%
  pairwise_count(item = word, feature = index, sort = TRUE) %>%
  rename(from = item1, to = item2, weight = n)

head(el_words)
library(tidygraph)
library(ggraph)

# Build an undirected co-word graph, keeping only frequent co-occurrences
g <- el_words %>%
  filter(weight >= 9) %>%
  as_tbl_graph(directed = FALSE) %>%
  igraph::simplify() %>% # collapse multi-edges and drop self-loops
  as_tbl_graph()

set.seed(1337) # fixed seed so the force-directed layout is reproducible
g %N>%
  # filter(centrality_degree(weight = weight) > 100) %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(width = weight, edge_alpha = weight)) +
  geom_node_point(aes(size = centrality_degree(weight = weight)), color = "plum4") +
  # FIX: removed the stray trailing comma inside aes(label = name,)
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_graph() +
  theme(legend.position = 'none') +
  labs(title = 'Co-Word Network Crime and Punishment')
So far we have used the unnest_tokens function to tokenize by word (or sometimes by sentence), which is useful for the kinds of sentiment and frequency analyses we have been doing. To tokenize by n-grams instead, we use the token = 'ngrams' argument and specify how many subsequent words we want to consider.
text_tidy_ngrams <- text %>%
unnest_tokens(bigram, text, token = 'ngrams', n = 2) %>%
na.omit()
head(text_tidy_ngrams)

# Most common bigrams (still including stopwords at this point)
text_tidy_ngrams %>%
  count(bigram, sort = TRUE) %>%
  head(100)
# Separate each bigram into its two component words
text_tidy_ngrams <- text_tidy_ngrams %>%
  separate(bigram, into = c("word1", "word2"), sep = " ")

head(text_tidy_ngrams)

# Drop bigrams where either word is a stopword, re-unite the remaining pairs,
# and count occurrences of each bigram per line index
text_tidy_ngrams <- text_tidy_ngrams %>%
  anti_join(stop_words, by = c('word1' = 'word')) %>%
  anti_join(stop_words, by = c('word2' = 'word')) %>%
  unite(bigram, word1, word2, sep = " ") %>%
  count(index, bigram)

# Top bigrams over the whole book
text_tidy_ngrams %>%
  count(bigram, wt = n, sort = TRUE) %>%
  head(50)
Just for fun, a little gender analysis now…
# Define the pronouns of interest
pronouns <- c("he", "she")

# Bigrams whose first word is a gendered pronoun, counted by the word that
# follows each pronoun
gender_bigrams <- text %>%
  unnest_tokens(bigram, text, token = 'ngrams', n = 2) %>%
  na.omit() %>%
  count(bigram) %>%
  separate(bigram, into = c("word1", "word2"), sep = " ") %>%
  filter(word1 %in% pronouns) %>%
  count(word1, word2, wt = n, sort = TRUE)
{width=600}
To access spaCy from R we use the spacyr wrapper.
library(spacyr)
# spacy_install() # creates a new conda environment called spacy_condaenv, as long as some version of conda is installed
# Start the spaCy backend with the small English model
# NOTE(review): requires the "en_core_web_sm" model to be installed — confirm
spacy_initialize(model = "en_core_web_sm")
NULL
# Two toy documents to demonstrate spacyr (d1 contains a literal line break)
text_example <- c(d1 = "spaCy is great at fast natural language processing. Everybody loves it!
In Denmark and elsewhere.",
d2 = "We can also use it in R via the great spacyR wrapper. Daniel Michels does that sometimes")
# process documents and obtain a data.table
text_parsed <- spacy_parse(text_example)
text_parsed
# Parse again, this time with dependency relations but without lemma/POS columns
text_example %>% spacy_parse(dependency = TRUE, lemma = FALSE, pos = FALSE)
# Collapse the book into one long string per chapter
# (pivot_wider yields a single row with one column per chapter)
text_chapter <- text %>%
  group_by(chapter) %>%
  summarise(text = paste(text, collapse = ' ')) %>%
  pivot_wider(names_from = chapter, values_from = text)

# Run spaCy over each chapter string with named-entity recognition enabled
text_entities <- text_chapter %>%
  as.character() %>%
  spacy_parse(entity = TRUE)

head(text_entities, 1000)
head(text_entities, 1000)
Cool or cool?
We can now again create a network, right? This time a real character network!
# Character co-occurrence edge list: PERSON entities appearing within the
# same sentence (keyed by chapter + sentence id)
el_persons <- text_entities %>%
  entity_consolidate() %>% # merge multi-token entities into single tokens
  filter(str_detect(entity_type, 'PERSON')) %>%
  unite(chap_sent_id, doc_id, sentence_id, sep = '_') %>%
  pairwise_count(token, chap_sent_id, sort = TRUE) %>%
  rename(from = item1, to = item2, weight = n)

head(el_persons, 50)
head(el_persons, 50)
# Build the undirected character graph from the PERSON edge list
g <- el_persons %>%
  as_tbl_graph(directed = FALSE) %>%
  igraph::simplify() %>% # collapse multi-edges and drop self-loops
  as_tbl_graph()

set.seed(1337) # fixed seed so the force-directed layout is reproducible
g %N>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(width = weight, edge_alpha = weight)) +
  geom_node_point(aes(size = centrality_degree(weight = weight)), color = "plum4") +
  # FIX: removed the stray trailing comma inside aes(label = name,)
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_graph() +
  theme(legend.position = 'none') +
  labs(title = 'Character Network Crime and Punishment')
# NOTE(review): this repeats the plot above verbatim — likely a paste
# artifact; consider removing one of the two copies.
set.seed(1337)
g %N>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(width = weight, edge_alpha = weight)) +
  geom_node_point(aes(size = centrality_degree(weight = weight)), color = "plum4") +
  # FIX: removed the stray trailing comma inside aes(label = name,)
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_graph() +
  theme(legend.position = 'none') +
  labs(title = 'Character Network Crime and Punishment')
DataCamp (NB: most courses have somewhat outdated ecosystems)
sessionInfo()