### Load standardpackages
library(tidyverse) # Collection of all the good stuff like dplyr, ggplot2 ect.
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──
✓ ggplot2 3.3.5 ✓ purrr 0.3.4
✓ tibble 3.1.4 ✓ dplyr 1.0.7
✓ tidyr 1.1.3 ✓ stringr 1.4.0
✓ readr 2.0.1 ✓ forcats 0.5.1
── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
library(magrittr) # For extra-piping operators (eg. %<>%)
Attaching package: ‘magrittr’
The following object is masked from ‘package:purrr’:
set_names
The following object is masked from ‘package:tidyr’:
extract
Back to the teens. You will work on Pokemon data. No data munging needed.
data <- read_csv('https://sds-aau.github.io/SDS-master/00_data/pokemon.csv')
Rows: 800 Columns: 13
── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (3): Name, Type1, Type2
dbl (9): Number, Total, HitPoints, Attack, Defense, SpecialAttack, SpecialDefense, Speed, Generation
lgl (1): Legendary
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data %>% head()
data %>% glimpse()
Rows: 800
Columns: 13
$ Number <dbl> 1, 2, 3, 3, 4, 5, 6, 6, 6, 7, 8, 9, 9, 10, 11, 12, 13, 14, 15, 15, 16, 17, 18, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40…
$ Name <chr> "Bulbasaur", "Ivysaur", "Venusaur", "VenusaurMega Venusaur", "Charmander", "Charmeleon", "Charizard", "CharizardMega Charizard X", "CharizardMega Charizard Y", "Squirtle…
$ Type1 <chr> "Grass", "Grass", "Grass", "Grass", "Fire", "Fire", "Fire", "Fire", "Fire", "Water", "Water", "Water", "Water", "Bug", "Bug", "Bug", "Bug", "Bug", "Bug", "Bug", "Normal"…
$ Type2 <chr> "Poison", "Poison", "Poison", "Poison", NA, NA, "Flying", "Dragon", "Flying", NA, NA, NA, NA, NA, NA, "Flying", "Poison", "Poison", "Poison", "Poison", "Flying", "Flying…
$ Total <dbl> 318, 405, 525, 625, 309, 405, 534, 634, 634, 314, 405, 530, 630, 195, 205, 395, 195, 205, 395, 495, 251, 349, 479, 579, 253, 413, 262, 442, 288, 438, 320, 485, 300, 450,…
$ HitPoints <dbl> 45, 60, 80, 80, 39, 58, 78, 78, 78, 44, 59, 79, 79, 45, 50, 60, 40, 45, 65, 65, 40, 63, 83, 83, 30, 55, 40, 65, 35, 60, 35, 60, 50, 75, 55, 70, 90, 46, 61, 81, 70, 95, 3…
$ Attack <dbl> 49, 62, 82, 100, 52, 64, 84, 130, 104, 48, 63, 83, 103, 30, 20, 45, 35, 25, 90, 150, 45, 60, 80, 80, 56, 81, 60, 90, 60, 85, 55, 90, 75, 100, 47, 62, 92, 57, 72, 102, 45…
$ Defense <dbl> 49, 63, 83, 123, 43, 58, 78, 111, 78, 65, 80, 100, 120, 35, 55, 50, 30, 50, 40, 40, 40, 55, 75, 80, 35, 60, 30, 65, 44, 69, 40, 55, 85, 110, 52, 67, 87, 40, 57, 77, 48, …
$ SpecialAttack <dbl> 65, 80, 100, 122, 60, 80, 109, 130, 159, 50, 65, 85, 135, 20, 25, 90, 20, 25, 45, 15, 35, 50, 70, 135, 25, 50, 31, 61, 40, 65, 50, 90, 20, 45, 40, 55, 75, 40, 55, 85, 60…
$ SpecialDefense <dbl> 65, 80, 100, 120, 50, 65, 85, 85, 115, 64, 80, 105, 115, 20, 25, 80, 20, 25, 80, 80, 35, 50, 70, 80, 35, 70, 31, 61, 54, 79, 50, 80, 30, 55, 40, 55, 85, 40, 55, 75, 65, …
$ Speed <dbl> 45, 60, 80, 80, 65, 80, 100, 100, 100, 43, 58, 78, 78, 45, 30, 70, 50, 35, 75, 145, 56, 71, 101, 121, 72, 97, 70, 100, 55, 80, 90, 110, 40, 65, 41, 56, 76, 50, 65, 85, 3…
$ Generation <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ Legendary <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, F…
library(skimr)
Registered S3 methods overwritten by 'htmltools':
method from
print.html tools:rstudio
print.shiny.tag tools:rstudio
print.shiny.tag.list tools:rstudio
data %>% skim()
── Data Summary ────────────────────────
Values
Name Piped data
Number of rows 800
Number of columns 13
_______________________
Column type frequency:
character 3
logical 1
numeric 9
________________________
Group variables None
── Variable type: character ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate min max empty n_unique whitespace
1 Name 0 1 3 25 0 800 0
2 Type1 0 1 3 8 0 18 0
3 Type2 386 0.518 3 8 0 18 0
── Variable type: logical ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate mean count
1 Legendary 0 1 0.0812 FAL: 735, TRU: 65
── Variable type: numeric ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
1 Number 0 1 363. 208. 1 185. 364. 539. 721 ▇▇▇▇▇
2 Total 0 1 435. 120. 180 330 450 515 780 ▃▆▇▂▁
3 HitPoints 0 1 69.3 25.5 1 50 65 80 255 ▃▇▁▁▁
4 Attack 0 1 79.0 32.5 5 55 75 100 190 ▂▇▆▂▁
5 Defense 0 1 73.8 31.2 5 50 70 90 230 ▃▇▂▁▁
6 SpecialAttack 0 1 72.8 32.7 10 49.8 65 95 194 ▅▇▅▂▁
7 SpecialDefense 0 1 71.9 27.8 20 50 70 90 230 ▇▇▂▁▁
8 Speed 0 1 68.3 29.1 5 45 65 90 180 ▃▇▆▁▁
9 Generation 0 1 3.32 1.66 1 2 3 5 6 ▇▅▃▅▂
We have 3 string variables, where one is the uniqu pokemon name, and two others it’s type. Our outcome variable legendary
is categorical, TRUE
if it is an legendary pokemon, FALSE
otherwise. All other variables are numerical.
Lets briefly look at th categorical variables:
data %>% count(Type1, sort = TRUE)
We see most pokemon are water-pokemon. However, there is no super-sparse class, so we can work with that.
data %>% count(Type2, sort = TRUE)
We see most pokemons have an NA
as Type2
, probably meening they have no second type.
We directly see some things should be changed:
Number
variable, which is just their unique id, and not of value for the analysis, so we drop it.Type2
probably indicate that the pokemon has no second type, should therefore not be treated as missing datageneration
variable is numeric, but probably should be interpreted as categorical.Lets do the necessary changes upfront before investigating further.
data %<>%
select(-Number) %>%
replace_na(list(Type2 = 'None')) %>%
mutate(Generation = Generation %>% as.character()) %>%
relocate(Name, Legendary)
We now can do some visualization. First my favorite standard one. Since we already know we later will be interested in legendary pokemon, we can already use the olor aestetic on this variable.
library(GGally)
Registered S3 method overwritten by 'GGally':
method from
+.gg ggplot2
data %>%
select(-Name, -Type1, -Type2) %>%
ggpairs(legend = 1,
mapping = ggplot2::aes(colour=Legendary, alpha = 0.5),
lower = list(continuous = wrap("smooth", alpha = 0.3, size=0.1))) +
theme(legend.position = "bottom")
plot: [1,1] [=>------------------------------------------------------------------------------------------------------------------------------------------------------------------] 1% est: 0s
plot: [1,2] [===>----------------------------------------------------------------------------------------------------------------------------------------------------------------] 2% est: 6s
plot: [1,3] [=====>--------------------------------------------------------------------------------------------------------------------------------------------------------------] 4% est: 8s
plot: [1,4] [=======>------------------------------------------------------------------------------------------------------------------------------------------------------------] 5% est: 7s
plot: [1,5] [=========>----------------------------------------------------------------------------------------------------------------------------------------------------------] 6% est: 7s
plot: [1,6] [===========>--------------------------------------------------------------------------------------------------------------------------------------------------------] 7% est: 7s
plot: [1,7] [=============>------------------------------------------------------------------------------------------------------------------------------------------------------] 9% est: 7s
plot: [1,8] [===============>----------------------------------------------------------------------------------------------------------------------------------------------------] 10% est: 7s
plot: [1,9] [=================>--------------------------------------------------------------------------------------------------------------------------------------------------] 11% est: 7s
plot: [2,1] [===================>------------------------------------------------------------------------------------------------------------------------------------------------] 12% est: 7s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [2,2] [=====================>----------------------------------------------------------------------------------------------------------------------------------------------] 14% est: 7s
plot: [2,3] [=======================>--------------------------------------------------------------------------------------------------------------------------------------------] 15% est: 7s
plot: [2,4] [=========================>------------------------------------------------------------------------------------------------------------------------------------------] 16% est: 7s
plot: [2,5] [===========================>----------------------------------------------------------------------------------------------------------------------------------------] 17% est: 7s
plot: [2,6] [=============================>--------------------------------------------------------------------------------------------------------------------------------------] 19% est: 7s
plot: [2,7] [===============================>------------------------------------------------------------------------------------------------------------------------------------] 20% est: 6s
plot: [2,8] [=================================>----------------------------------------------------------------------------------------------------------------------------------] 21% est: 6s
plot: [2,9] [===================================>--------------------------------------------------------------------------------------------------------------------------------] 22% est: 6s
plot: [3,1] [=====================================>------------------------------------------------------------------------------------------------------------------------------] 23% est: 7s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [3,2] [=======================================>----------------------------------------------------------------------------------------------------------------------------] 25% est: 7s
plot: [3,3] [==========================================>-------------------------------------------------------------------------------------------------------------------------] 26% est: 9s
plot: [3,4] [============================================>-----------------------------------------------------------------------------------------------------------------------] 27% est: 9s
plot: [3,5] [==============================================>---------------------------------------------------------------------------------------------------------------------] 28% est: 9s
plot: [3,6] [================================================>-------------------------------------------------------------------------------------------------------------------] 30% est: 8s
plot: [3,7] [==================================================>-----------------------------------------------------------------------------------------------------------------] 31% est: 8s
plot: [3,8] [====================================================>---------------------------------------------------------------------------------------------------------------] 32% est: 8s
plot: [3,9] [======================================================>-------------------------------------------------------------------------------------------------------------] 33% est: 7s
plot: [4,1] [========================================================>-----------------------------------------------------------------------------------------------------------] 35% est: 7s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [4,2] [==========================================================>---------------------------------------------------------------------------------------------------------] 36% est: 7s
plot: [4,3] [============================================================>-------------------------------------------------------------------------------------------------------] 37% est: 7s
plot: [4,4] [==============================================================>-----------------------------------------------------------------------------------------------------] 38% est: 7s
plot: [4,5] [================================================================>---------------------------------------------------------------------------------------------------] 40% est: 6s
plot: [4,6] [==================================================================>-------------------------------------------------------------------------------------------------] 41% est: 6s
plot: [4,7] [====================================================================>-----------------------------------------------------------------------------------------------] 42% est: 6s
plot: [4,8] [======================================================================>---------------------------------------------------------------------------------------------] 43% est: 6s
plot: [4,9] [========================================================================>-------------------------------------------------------------------------------------------] 44% est: 6s
plot: [5,1] [==========================================================================>-----------------------------------------------------------------------------------------] 46% est: 6s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [5,2] [============================================================================>---------------------------------------------------------------------------------------] 47% est: 5s
plot: [5,3] [==============================================================================>-------------------------------------------------------------------------------------] 48% est: 5s
plot: [5,4] [================================================================================>-----------------------------------------------------------------------------------] 49% est: 5s
plot: [5,5] [==================================================================================>---------------------------------------------------------------------------------] 51% est: 5s
plot: [5,6] [====================================================================================>-------------------------------------------------------------------------------] 52% est: 5s
plot: [5,7] [======================================================================================>-----------------------------------------------------------------------------] 53% est: 5s
plot: [5,8] [========================================================================================>---------------------------------------------------------------------------] 54% est: 4s
plot: [5,9] [==========================================================================================>-------------------------------------------------------------------------] 56% est: 4s
plot: [6,1] [============================================================================================>-----------------------------------------------------------------------] 57% est: 4s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [6,2] [==============================================================================================>---------------------------------------------------------------------] 58% est: 4s
plot: [6,3] [================================================================================================>-------------------------------------------------------------------] 59% est: 4s
plot: [6,4] [==================================================================================================>-----------------------------------------------------------------] 60% est: 4s
plot: [6,5] [====================================================================================================>---------------------------------------------------------------] 62% est: 4s
plot: [6,6] [======================================================================================================>-------------------------------------------------------------] 63% est: 4s
plot: [6,7] [========================================================================================================>-----------------------------------------------------------] 64% est: 3s
plot: [6,8] [==========================================================================================================>---------------------------------------------------------] 65% est: 3s
plot: [6,9] [============================================================================================================>-------------------------------------------------------] 67% est: 3s
plot: [7,1] [==============================================================================================================>-----------------------------------------------------] 68% est: 3s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [7,2] [================================================================================================================>---------------------------------------------------] 69% est: 3s
plot: [7,3] [==================================================================================================================>-------------------------------------------------] 70% est: 3s
plot: [7,4] [====================================================================================================================>-----------------------------------------------] 72% est: 3s
plot: [7,5] [======================================================================================================================>---------------------------------------------] 73% est: 3s
plot: [7,6] [========================================================================================================================>-------------------------------------------] 74% est: 2s
plot: [7,7] [===========================================================================================================================>----------------------------------------] 75% est: 2s
plot: [7,8] [=============================================================================================================================>--------------------------------------] 77% est: 2s
plot: [7,9] [===============================================================================================================================>------------------------------------] 78% est: 2s
plot: [8,1] [=================================================================================================================================>----------------------------------] 79% est: 2s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [8,2] [===================================================================================================================================>--------------------------------] 80% est: 2s
plot: [8,3] [=====================================================================================================================================>------------------------------] 81% est: 2s
plot: [8,4] [=======================================================================================================================================>----------------------------] 83% est: 2s
plot: [8,5] [=========================================================================================================================================>--------------------------] 84% est: 1s
plot: [8,6] [===========================================================================================================================================>------------------------] 85% est: 1s
plot: [8,7] [=============================================================================================================================================>----------------------] 86% est: 1s
plot: [8,8] [===============================================================================================================================================>--------------------] 88% est: 1s
plot: [8,9] [=================================================================================================================================================>------------------] 89% est: 1s
plot: [9,1] [===================================================================================================================================================>----------------] 90% est: 1s
plot: [9,2] [=====================================================================================================================================================>--------------] 91% est: 1s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [9,3] [=======================================================================================================================================================>------------] 93% est: 1s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [9,4] [=========================================================================================================================================================>----------] 94% est: 1s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [9,5] [===========================================================================================================================================================>--------] 95% est: 0s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [9,6] [=============================================================================================================================================================>------] 96% est: 0s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [9,7] [===============================================================================================================================================================>----] 98% est: 0s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [9,8] [=================================================================================================================================================================>--] 99% est: 0s `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
plot: [9,9] [====================================================================================================================================================================]100% est: 0s
Long story short, we already see that legendary pokemon appear to be pretty much better in everything.
We can also zoom into the conditional distributions with a joyplot
library(ggridges) # install if necessary
data %>%
gather(variable, value, -Legendary) %>% # Note: At one point do pivot_longer instead
ggplot(aes(y = as.factor(variable),
fill = as.factor(Legendary),
x = percent_rank(value)) ) +
ggridges::geom_density_ridges(alpha = 0.75)
Picking joint bandwidth of 0.0477
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
res_pca <- data %>%
column_to_rownames('Name') %>%
select_if(is_numeric) %>%
PCA(scale.unit = TRUE,
graph = FALSE,
ncp = 4)
res_pca %>%
fviz_screeplot(addlabels = TRUE,
ncp = 4)
Registered S3 method overwritten by 'data.table':
method from
print.data.table
res_pca$eig %>% as_tibble()
4 Components sem to capture about 90% of the variance.
BONUS: Some visualization:
res_pca %>%
fviz_pca_var(alpha.var = "cos2",
col.var = "contrib",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE,
ggtheme = theme_gray())
All variables pretty much point into the same direction, meaning that we mainly capture a scaling phenomenon, where variables tend to move jointly. However, we do see in the secomd component, variables seem to split between defensive and offensive attributes.
res_pca %>%
fviz_pca_biplot(alpha.ind = "cos2",
geom = "point",
habillage = data %>% pull(Legendary) %>% factor(),
addEllipses = TRUE,
ggtheme = theme_gray())
We generally see again legendary pokemon to be more in the right quadrants indicating high values in their characteristics. However, there is a large grey-zone, where the 2d projection cannot really distinguist between legendary and non-legendary pokemons.
res_pca %>%
fviz_pca_biplot(alpha.ind = "cos2",
geom = "point",
habillage = data %>% pull(Type1) %>% factor(),
addEllipses = TRUE,
ggtheme = theme_gray())
For Type1
, we get a mess…
We ill run a simple UMAP
library(uwot)
Loading required package: Matrix
Attaching package: ‘Matrix’
The following objects are masked from ‘package:tidyr’:
expand, pack, unpack
res_umap <-data%>%
select_if(is_numeric) %>%
umap(n_neighbors = 15,
metric = "cosine",
min_dist = 0.01,
scale = TRUE)
res_umap %>%
as_tibble() %>%
bind_cols(data %>% select(Legendary)) %>%
ggplot(aes(x = V1, y = V2, col = Legendary)) +
geom_point(shape = 21, alpha = 0.5)
Well, UMAP seems to do a better job in sepperation high and low performing pokemons in spacw. However, we also see that only somewhat helps us to distinguish between legendary and non-legendary ones.
res_umap %>%
as_tibble() %>%
bind_cols(data %>% select(Type1)) %>%
ggplot(aes(x = V1, y = V2, col = Type1)) +
geom_point(shape = 21, alpha = 0.5)
Same goes for types…
# We use the viz_nbclust() function of the factorextra package
data %>%
select_if(is_numeric) %>%
scale() %>%
fviz_nbclust(kmeans, method = "wss")
Ok,we here settle for 2 (executive decision, since we want to identify 2 distinct classes).
res_km <- data %>%
column_to_rownames('Name') %>%
select_if(is_numeric) %>%
scale() %>%
kmeans(centers = 2, nstart = 20)
res_km %>%
fviz_cluster(data = data %>% select_if(is_numeric))
table(data$Type1, res_km$cluster)
1 2
Bug 24 45
Dark 17 14
Dragon 23 9
Electric 25 19
Fairy 8 9
Fighting 15 12
Fire 29 23
Flying 3 1
Ghost 19 13
Grass 35 35
Ground 16 16
Ice 14 10
Normal 43 55
Poison 14 14
Psychic 35 22
Rock 24 20
Steel 19 8
Water 59 53
Well, not really. However, in case I would have selected more, for instance 4, clusters, I might have had a different picture.
data %>%
bind_cols(cluster = res_km$cluster) %>%
select_if(is_numeric) %>%
group_by(cluster) %>%
mutate(n = n()) %>%
summarise_all(funs(mean)) %>%
pivot_longer(-cluster) %>%
pivot_wider(names_from = cluster, values_from = value)
As we already guessed, the 2 clusters are mainly formed by overall high or low attributes.
Since we didnt specify which type of clustering, we cann also do hirarchical clustering now and use the HCPCA function for convenience.
res_hcpc <- res_pca %>%
HCPC(nb.clust = -1, # self determined: higher relative loss of inertia
graph = FALSE)
res_hcpc %>%
plot(choice = "3D.map")
Interestingly, here we would have 3 clusters, with one in the middle.
table(data$Type1, res_hcpc$data.clust$clust)
1 2 3
Bug 39 21 9
Dark 11 12 8
Dragon 5 7 20
Electric 16 9 19
Fairy 8 6 3
Fighting 9 13 5
Fire 19 11 22
Flying 1 0 3
Ghost 13 12 7
Grass 28 31 11
Ground 14 13 5
Ice 9 9 6
Normal 43 46 9
Poison 14 13 1
Psychic 19 15 23
Rock 16 20 8
Steel 6 15 6
Water 44 50 18
Well, not reall in most cases…