### Load packages
library(tidyverse) # Collection of all the good stuff like dplyr, ggplot2, etc.
── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──
✓ ggplot2 3.3.5 ✓ purrr 0.3.4
✓ tibble 3.1.4 ✓ dplyr 1.0.7
✓ tidyr 1.1.3 ✓ stringr 1.4.0
✓ readr 2.0.1 ✓ forcats 0.5.1
── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
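These conflicts are harmless here; the masked stats functions stay available through explicit namespacing, e.g.:
stats::filter(1:10, rep(1/3, 3)) # 3-point moving average: the stats time-series filter, not dplyr's row filter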
library(magrittr) # For extra-piping operators (eg. %<>%)
Attaching package: ‘magrittr’
The following object is masked from ‘package:purrr’:
set_names
The following object is masked from ‘package:tidyr’:
extract
data <- read_csv('https://sds-aau.github.io/SDS-master/M1/data/cities.csv')
Rows: 780 Columns: 25
── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (4): place, alpha-2, region, sub-region
dbl (21): cost_nomad, cost_coworking, cost_expat, coffee_in_cafe, cost_beer, places_to_work, free_wifi_available, internet_speed, freedom_...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
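As the message suggests, the column-type note can be silenced on future reads:
# Optional: quiet the column specification message
data <- read_csv('https://sds-aau.github.io/SDS-master/M1/data/cities.csv', show_col_types = FALSE)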
data %>% glimpse()
Rows: 780
Columns: 25
$ place <chr> "Budapest", "Chiang Mai", "Phuket", "Bangkok", "Ko Samui", "Ko Lanta", "Chiang Rai", "Pattaya", "Ao Nang", "H…
$ `alpha-2` <chr> "HU", "TH", "TH", "TH", "TH", "TH", "TH", "TH", "TH", "TH", "TH", "TH", "TH", "TH", "CZ", "CZ", "TW", "TW", "…
$ region <chr> "Europe", "Asia", "Asia", "Asia", "Asia", "Asia", "Asia", "Asia", "Asia", "Asia", "Asia", "Asia", "Asia", "As…
$ `sub-region` <chr> "Eastern Europe", "South-eastern Asia", "South-eastern Asia", "South-eastern Asia", "South-eastern Asia", "So…
$ cost_nomad <dbl> 1364, 777, 1012, 1197, 1352, 812, 1134, 1134, 866, 1232, 777, 1247, 1285, 1297, 1639, 1699, 1545, 2695, 1456,…
$ cost_coworking <dbl> 152.41, 98.88, 155.43, 131.41, 169.56, 135.65, 195.38, 113.04, 172.39, 172.39, 110.21, 28.26, 110.21, 423.90,…
$ cost_expat <dbl> 1273, 780, 1714, 1158, 1347, 1016, 1119, 1100, 1483, 2173, 1152, 1092, 730, 971, 1653, 912, 1640, 909, 1010, …
$ coffee_in_cafe <dbl> 1.73, 0.85, 1.41, 2.12, 1.41, 1.41, 1.41, 2.12, 1.84, 1.55, 1.41, 1.41, 1.41, 1.41, 1.99, 0.97, 1.88, 0.00, 1…
$ cost_beer <dbl> 1.73, 0.85, 1.41, 2.12, 1.41, 1.41, 1.41, 2.12, 1.84, 1.55, 1.41, 1.41, 1.41, 1.41, 1.99, 0.97, 1.88, 0.00, 1…
$ places_to_work <dbl> 1.0, 0.8, 0.8, 1.0, 0.8, 0.4, 0.6, 0.4, 0.4, 0.4, 0.4, 0.2, 0.2, 0.6, 1.0, 0.4, 1.0, 0.8, 0.6, 0.4, 0.6, 1.0,…
$ free_wifi_available <dbl> 0.40, 0.60, 0.40, 1.00, 0.40, 0.20, 0.60, 0.60, 0.40, 0.40, 0.20, 0.40, 0.20, 0.40, 0.60, 0.40, 1.00, 0.80, 1…
$ internet_speed <dbl> 31, 14, 14, 24, 15, 15, 12, 9, 0, 13, 2, 2, 3, 4, 15, 26, 16, 25, 7, 9, 5, 118, 81, 23, 55, 99, 17, 20, 15, 5…
$ freedom_score <dbl> 0.6, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.8, 0.8, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6,…
$ peace_score <dbl> 8.000000e-01, 4.000000e-01, 4.000000e-01, 4.000000e-01, 4.000000e-01, 4.000000e-01, 4.000000e-01, 4.000000e-0…
$ safety <dbl> 0.60, 0.80, 0.80, 0.77, 0.80, 0.80, 0.60, 0.56, 0.60, 0.40, 0.60, 0.60, 0.80, 0.80, 0.80, 0.80, 1.00, 0.80, 0…
$ fragile_states_index <dbl> 5.270000e+01, 7.880000e+01, 7.880000e+01, 7.880000e+01, 7.880000e+01, 7.880000e+01, 7.880000e+01, 7.880000e+0…
$ press_freedom_index <dbl> 28.17, 44.53, 44.53, 44.53, 44.53, 44.53, 44.53, 44.53, 44.53, 44.53, 44.53, 44.53, 44.53, 44.53, 16.66, 16.6…
$ female_friendly <dbl> 1.0, 0.8, 0.6, 0.8, 0.8, 1.0, 0.6, 0.2, 0.6, 0.6, 0.6, 1.0, 0.6, 0.6, 1.0, 0.8, 1.0, 0.8, 0.8, 1.0, 0.8, 0.8,…
$ lgbt_friendly <dbl> 0.27, 0.60, 0.80, 0.80, 0.80, 0.80, 0.40, 1.00, 0.40, 0.40, 0.80, 0.80, 0.60, 0.60, 0.60, 0.60, 0.80, 0.60, 0…
$ friendly_to_foreigners <dbl> 0.60, 0.60, 0.60, 1.00, 1.00, 0.80, 1.00, 1.00, 0.80, 0.60, 0.80, 0.80, 0.80, 1.00, 0.80, 0.80, 0.80, 0.60, 0…
$ racism <dbl> 0.40, 0.40, 0.42, 0.42, 0.40, 0.40, 0.60, 0.40, 0.40, 0.60, 0.40, 0.40, 0.40, 0.40, 0.42, 0.40, 0.00, 0.60, 0…
$ leisure <dbl> 0.80, 0.62, 0.60, 0.82, 0.80, 0.62, 0.80, 0.80, 0.60, 0.60, 0.60, 0.60, 0.60, 0.40, 1.00, 0.40, 1.00, 0.80, 0…
$ life_score <dbl> 0.86, 0.75, 0.75, 0.72, 0.80, 0.73, 0.76, 0.66, 0.70, 0.66, 0.65, 0.67, 0.66, 0.66, 0.83, 0.74, 0.93, 0.85, 0…
$ nightlife <dbl> 1.00, 0.40, 0.82, 1.00, 0.80, 0.43, 0.80, 0.80, 0.40, 0.60, 1.00, 0.40, 1.00, 0.20, 1.00, 0.60, 0.60, 0.80, 0…
$ weed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0…
# Variables for descriptives
vars.desc <- c("cost_nomad", "places_to_work", "freedom_score", "friendly_to_foreigners", "life_score")
library(GGally) # For the ggcorr() correlation plot below
First, let's look at a classical correlation matrix.
ggcorr(data[,vars.desc], label = TRUE, label_size = 3, label_round = 2, label_alpha = TRUE)
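If you want the raw numbers behind the plot, the same matrix can be computed directly; a quick sketch with base cor():
# Correlations as a plain numeric matrix, rounded for readability
data %>%
  select(all_of(vars.desc)) %>%
  cor(use = 'pairwise.complete.obs') %>%
  round(2)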
library(FactoMineR)
library(factoextra)
Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
Do the PCA
res_pca <- data %>%
  column_to_rownames('place') %>%
  select(where(is.numeric)) %>% # keep only the numeric indicators
  PCA(scale.unit = TRUE, graph = FALSE)
Do a screeplot
res_pca %>%
  fviz_screeplot(addlabels = TRUE,
                 ncp = 10,
                 ggtheme = theme_gray())
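The same information is available as a table via factoextra's get_eigenvalue(), handy when deciding how many components to keep:
# Eigenvalues plus (cumulative) explained variance per component
res_pca %>%
  get_eigenvalue() %>%
  head(10)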
Plot the variable loadings
res_pca %>%
  fviz_pca_var(alpha.var = "cos2",
               col.var = "contrib",
               gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
               repel = TRUE)
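To single out which variables drive one particular component, fviz_contrib() gives a bar chart of contributions, here for the first component:
# Top 10 variable contributions to component 1
res_pca %>%
  fviz_contrib(choice = "var", axes = 1, top = 10)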
Bonus: Plot variables plus observations
res_pca %>%
  fviz_pca_biplot(alpha.ind = "cos2",
                  col.ind = "contrib",
                  gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
                  geom = "point")
Determine the number of clusters
data %>%
  drop_na() %>%
  column_to_rownames('place') %>%
  select(where(is.numeric)) %>%
  scale() %>%
  fviz_nbclust(hcut, method = "wss")
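The elbow in the WSS curve can be ambiguous; the average silhouette width is a common complementary criterion and uses the same call:
# Alternative criterion: average silhouette width per number of clusters
data %>%
  drop_na() %>%
  column_to_rownames('place') %>%
  select(where(is.numeric)) %>%
  scale() %>%
  fviz_nbclust(hcut, method = "silhouette")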
Do the clustering
hc <- data %>%
  select(where(is.numeric)) %>%
  hcut(hc_func = "hclust",
       k = 3,
       stand = TRUE)
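Since the clustering is hierarchical, we can also inspect the dendrogram; a quick sketch with the k = 3 cut marked (labels suppressed, since 780 cities would be unreadable):
# Dendrogram with the 3-cluster cut highlighted
hc %>%
  fviz_dend(k = 3, rect = TRUE, show_labels = FALSE)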
Visualize clusters
hc %>%
  fviz_cluster(data = data %>% select(where(is.numeric)))
Where do we find the clusters?
hc %>%
  glimpse()
List of 12
$ merge : int [1:779, 1:2] -460 -493 -547 -463 -459 -487 -475 -578 -453 -539 ...
$ height : num [1:779] 0.157 0.161 0.169 0.201 0.22 ...
$ order : int [1:780] 111 122 28 95 128 140 141 145 146 73 ...
$ labels : NULL
$ method : chr "ward.D2"
$ call : language stats::hclust(d = x, method = hc_method)
$ dist.method: chr "euclidean"
$ cluster : int [1:780] 1 1 1 1 1 1 1 1 2 2 ...
$ nbclust : num 3
$ silinfo :List of 3
..$ widths :'data.frame': 780 obs. of 3 variables:
.. ..$ cluster : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 1 1 ...
.. ..$ neighbor : num [1:780] 2 3 2 3 2 2 2 2 2 2 ...
.. ..$ sil_width: num [1:780] 0.228 0.216 0.215 0.213 0.212 ...
..$ clus.avg.widths: num [1:3] 0.0542 0.2006 0.1811
..$ avg.width : num 0.144
$ size : int [1:3] 257 200 323
$ data : num [1:780, 1:21] -0.866 -1.39 -1.18 -1.015 -0.876 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : NULL
.. ..$ : chr [1:21] "cost_nomad" "cost_coworking" "cost_expat" "coffee_in_cafe" ...
..- attr(*, "scaled:center")= Named num [1:21] 2331.7 210.3 1880.9 3.3 3.3 ...
.. ..- attr(*, "names")= chr [1:21] "cost_nomad" "cost_coworking" "cost_expat" "coffee_in_cafe" ...
..- attr(*, "scaled:scale")= Named num [1:21] 1118.12 174.07 1266.15 1.98 1.98 ...
.. ..- attr(*, "names")= chr [1:21] "cost_nomad" "cost_coworking" "cost_expat" "coffee_in_cafe" ...
- attr(*, "class")= chr [1:2] "hclust" "hcut"
hc$cluster
[1] 1 1 1 1 1 1 1 1 2 2 1 1 1 1 3 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
[69] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 3 3 3 3 3 3 3 3 3 3
[137] 3 3 3 3 3 1 3 1 3 3 1 3 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 2 2 2 2 1 1 2 2 2 2 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
[205] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 3 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
[273] 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 3 3 3 1 3 3 3 3 3 3 3 3 3 3 1 1 3 3 1 1 1 3 1 3 1 3 1 3 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
[341] 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 2 3 1 1 1 3 3 1 1 1 3 3 1 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 3 3 3 3 3 1 1 1 1 1 1 1 1 1 3 2 2 2 2 1 2
[409] 2 2 2 3 1 1 1 1 1 1 1 1 1 1 1 1 3 3 1 1 2 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 1 2 3 2 2 2 2 2 2 2 2 2
[477] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 1 2 1 1 2 1 2 1 2 2 2 2 2 2 2 2 2 2 1 2 1 1 2 2 2 2 1 1 2 2 2 1 1 1 2 2 2 2
[545] 2 1 2 1 1 1 2 2 2 2 2 1 1 1 1 1 2 2 2 2 2 1 2 2 2 1 2 2 2 2 2 2 2 2 1 2 2 2 1 1 1 1 3 3 3 1 1 1 1 1 1 1 1 2 3 1 1 1 3 3 3 3 3 3 3 3 3 1
[613] 1 1 1 1 1 1 3 1 1 3 1 3 1 1 1 1 1 1 2 1 2 2 1 1 1 1 1 3 3 3 3 1 3 1 1 1 1 3 1 1 1 2 1 3 1 2 1 1 1 1 1 1 1 1 1 2 1 1 3 1 1 2 1 1 1 1 1 1
[681] 1 1 1 1 1 3 1 1 1 1 1 3 2 1 2 1 3 1 1 1 1 1 1 1 1 1 1 2 2 2 3 2 3 1 1 1 2 2 2 2 2 2 1 1 1 2 1 2 2 2 2 2 2 2 2 1 1 2 2 2 1 1 2 1 1 2 1 1
[749] 1 1 2 2 1 2 1 2 2 2 2 1 2 2 2 1 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2
Add them to the dataset
data[,"cluster"] <- hc$cluster
Inspect clusters per region
table(data$cluster, data$region)
    Africa Americas Asia Europe Oceania
  1     17       67   90     83       0
  2     23       15  145     16       1
  3      5      139   27    136      16
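Row proportions make the regional skew easier to read; a one-liner with base prop.table():
# Share of each region within a cluster (rows sum to 1)
table(data$cluster, data$region) %>%
  prop.table(margin = 1) %>%
  round(2)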
Also add the PCA coordinates to the original data
data[,"pca1"] <- res_pca$ind$coord[,1]
data[,"pca2"] <- res_pca$ind$coord[,2]
Component means per cluster
data %>%
  group_by(cluster) %>%
  summarise(pca1 = pca1 %>% mean(),
            pca2 = pca2 %>% mean())
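With the coordinates in the data frame, the clusters can also be drawn directly in PCA space:
# Observations on the first two components, colored by cluster
data %>%
  ggplot(aes(x = pca1, y = pca2, col = factor(cluster))) +
  geom_point(alpha = 0.5)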
Bonus: add trips data
trips <- read_csv('https://sds-aau.github.io/SDS-master/M1/data/trips.csv')
New names:
* `` -> ...1
Rows: 46510 Columns: 11
── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (6): username, country, country_code, country_slug, place, place_slug
dbl (3): ...1, latitude, longitude
date (2): date_end, date_start
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
trips %>% glimpse()
Rows: 46,510
Columns: 11
$ ...1 <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 3…
$ username <chr> "@lewellenmichael", "@lewellenmichael", "@lewellenmichael", "@lewellenmichael", "@waylandchin", "@waylandchin", "@wayla…
$ country <chr> "Mexico", "Mexico", "Mexico", "Jordan", "China", "Vietnam", "Hong Kong", "China", "China", "China", "Thailand", "Malays…
$ country_code <chr> "MX", "MX", "MX", "JO", "CN", "VN", "HK", "CN", "CN", "CN", "TH", "MY", "KH", "VN", "IN", "IN", "IN", "IN", "IN", "IN",…
$ country_slug <chr> "mexico", "mexico", "mexico", "jordan", "china", "vietnam", "hong-kong", "china", "china", "china", "thailand", "malays…
$ date_end <date> 2018-06-15, 2018-06-03, 2017-11-05, 2017-08-07, 2017-03-18, 2017-02-16, 2016-09-01, 2016-08-02, 2016-07-31, 2016-07-03…
$ date_start <date> 2018-06-04, 2018-05-31, 2017-11-01, 2017-07-24, 2017-02-17, 2016-09-02, 2016-08-02, 2016-07-31, 2016-07-03, 2016-06-03…
$ latitude <dbl> 21, 19, 21, 31, 40, 10, 22, 22, 22, 18, 7, 3, 11, 10, 13, 26, 27, 27, 28, 28, 19, 11, 22, 22, 38, 43, 45, 42, 25, 1, 34…
$ longitude <dbl> -101, -99, -86, 35, 122, 106, 114, 114, 113, 109, 98, 101, 104, 106, 80, 75, 78, 78, 77, 77, 72, 79, 114, 114, -77, -89…
$ place <chr> "Guanajuato", "Mexico City", "Cancun", "Amman", "Yingkou", "Ho Chi Minh City", "Shenzhen", "Hong Kong", "Zhuhai", "Sany…
$ place_slug <chr> "mexico", "mexico-city-mexico", "cancun-mexico", "amman-jordan", "china", "ho-chi-minh-city-vietnam", "hong-kong", "hon…
Add the number of trips per city
data %<>%
  left_join(trips %>% count(place, sort = TRUE, name = 'n_city'), by = 'place')
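Cities that never appear in trips end up with NA in n_city; treating them as zero keeps the weighted counts below well defined. A small sketch with tidyr::replace_na():
# Assumption: a city absent from trips means zero recorded trips
data %<>%
  replace_na(list(n_city = 0))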
Check the most popular cities per cluster
data %>%
  select(place, cluster, n_city) %>%
  group_by(cluster) %>%
  arrange(desc(n_city)) %>%
  slice(1:5) %>%
  ungroup()
Count cluster popularity
data %>%
  count(cluster, wt = n_city)
To finish up, let's plot it on a map, the simplest way possible.
geo_merge <- trips %>%
  select(place, longitude, latitude) %>%
  distinct(place, .keep_all = TRUE)
data %<>%
  left_join(geo_merge, by = 'place')
Load a worldmap geom
library(ggmap)
Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
Please cite ggmap if you use it! See citation("ggmap") for details.
Attaching package: ‘ggmap’
The following object is masked from ‘package:magrittr’:
inset
mapWorld <- borders("world", colour = "gray50", fill = "gray50")
That's how it looks
mapWorld
mapping: group = ~group, x = ~long, y = ~lat
geom_polygon: na.rm = FALSE, rule = evenodd
stat_identity: na.rm = FALSE
position_identity
Add it to an empty ggplot surface
mp <- ggplot() +
  mapWorld
That’s how it looks so far
mp
Add a geom with the cities as points
nomad_map <- mp +
  geom_point(data = data, aes(x = longitude, y = latitude, col = factor(cluster)))
nomad_map
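A couple of optional cosmetic tweaks, e.g. dropping the axes and renaming the legend:
# Cleaner map: no axes or grid, clearer legend title
nomad_map +
  theme_void() +
  labs(col = 'cluster')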
Or do a density plot of popular nomad cities
mp +
  stat_density2d(data = trips,
                 aes(x = longitude, y = latitude, fill = stat(nlevel), col = stat(nlevel)),
                 alpha = 0.2, size = 0.2, bins = 10, geom = "polygon") +
  scale_fill_gradient(low = "skyblue", high = "red") +
  scale_color_gradient(low = "skyblue", high = "red")
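If you want to keep the result, ggsave() writes the last plot to disk (file name and dimensions here are just examples):
# Export the density map; size is a rough guess for a world map
ggsave('nomad_density_map.png', width = 10, height = 6, dpi = 300)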