### Load packages
library(tidyverse) # Collection of all the good stuff like dplyr, ggplot2 etc.
Registered S3 methods overwritten by 'dbplyr':
  method from
  print.tbl_lazy
  print.tbl_sql
── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──
✓ ggplot2 3.3.5 ✓ purrr 0.3.4
✓ tibble 3.1.4 ✓ dplyr 1.0.7
✓ tidyr 1.1.3 ✓ stringr 1.4.0
✓ readr 2.0.1 ✓ forcats 0.5.1
── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
library(magrittr) # For extra piping operators (e.g. %<>%)
Attaching package: ‘magrittr’
The following object is masked from ‘package:purrr’:
set_names
The following object is masked from ‘package:tidyr’:
extract
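A quick illustration of the compound assignment pipe `%<>%` used below: it pipes an object into a function and assigns the result back to the object.
# x %<>% f() is shorthand for x <- x %>% f()
x <- c(1, 4, 9)
x %<>% sqrt()
x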
### Trips
trips <- read_csv('https://sds-aau.github.io/SDS-master/M1/data/trips.csv')
New names:
* `` -> ...1
Rows: 46510 Columns: 11
── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (6): username, country, country_code, country_slug, place, place_slug
dbl (3): ...1, latitude, longitude
date (2): date_end, date_start
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
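As the message suggests, we could also silence it (or pin down the column spec) at read time; a minimal sketch:
# Optional: skip the column report on future reads
trips <- read_csv('https://sds-aau.github.io/SDS-master/M1/data/trips.csv',
                  show_col_types = FALSE)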
trips %>% glimpse()
Rows: 46,510
Columns: 11
$ ...1 <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 4…
$ username <chr> "@lewellenmichael", "@lewellenmichael", "@lewellenmichael", "@lewellenmichael", "@waylandchin", "@waylandchin", "@waylandchin", "@waylandchin", "@waylandchin", "@waylandch…
$ country <chr> "Mexico", "Mexico", "Mexico", "Jordan", "China", "Vietnam", "Hong Kong", "China", "China", "China", "Thailand", "Malaysia", "Cambodia", "Vietnam", "India", "India", "India…
$ country_code <chr> "MX", "MX", "MX", "JO", "CN", "VN", "HK", "CN", "CN", "CN", "TH", "MY", "KH", "VN", "IN", "IN", "IN", "IN", "IN", "IN", "IN", "IN", "CN", "CN", "US", "US", "US", "US", "AE…
$ country_slug <chr> "mexico", "mexico", "mexico", "jordan", "china", "vietnam", "hong-kong", "china", "china", "china", "thailand", "malaysia", "cambodia", "vietnam", "india", "india", "india…
$ date_end <date> 2018-06-15, 2018-06-03, 2017-11-05, 2017-08-07, 2017-03-18, 2017-02-16, 2016-09-01, 2016-08-02, 2016-07-31, 2016-07-03, 2016-06-03, 2016-05-22, 2016-04-14, 2016-02-15, 20…
$ date_start <date> 2018-06-04, 2018-05-31, 2017-11-01, 2017-07-24, 2017-02-17, 2016-09-02, 2016-08-02, 2016-07-31, 2016-07-03, 2016-06-03, 2016-05-22, 2016-04-19, 2016-02-14, 2015-11-15, 20…
$ latitude <dbl> 21, 19, 21, 31, 40, 10, 22, 22, 22, 18, 7, 3, 11, 10, 13, 26, 27, 27, 28, 28, 19, 11, 22, 22, 38, 43, 45, 42, 25, 1, 34, 55, 57, 57, 56, 55, 52, 50, 52, 15, 52, 53, 50, 53…
$ longitude <dbl> -101, -99, -86, 35, 122, 106, 114, 114, 113, 109, 98, 101, 104, 106, 80, 75, 78, 78, 77, 77, 72, 79, 114, 114, -77, -89, -69, -71, 55, 103, -119, -4, -5, -4, -5, -3, 5, 4,…
$ place <chr> "Guanajuato", "Mexico City", "Cancun", "Amman", "Yingkou", "Ho Chi Minh City", "Shenzhen", "Hong Kong", "Zhuhai", "Sanya", "Phuket", "Kuala Lumpur", "Phnom Penh", "Ho Chi …
$ place_slug <chr> "mexico", "mexico-city-mexico", "cancun-mexico", "amman-jordan", "china", "ho-chi-minh-city-vietnam", "hong-kong", "hong-kong-china", "zhuhai-china", "china", "phuket-thai…
### People
people <- read_csv('https://sds-aau.github.io/SDS-master/M1/data/people.csv')
New names:
* `` -> ...1
Rows: 4016 Columns: 6
── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (3): username, work_raw, education_raw
dbl (3): ...1, followers, following
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
people %>% glimpse()
Rows: 4,016
Columns: 6
$ ...1 <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, …
$ username <chr> "@lewellenmichael", "@waylandchin", "@karan", "@skaboss217", "@apwn", "@samcalma", "@paulbremer", "@jtompl", "@jezfx", "@markcaggiano", "@ndbroadbent", "@tedavery", "@wal…
$ followers <dbl> 1, 0, 2, 0, 17, 3, 4, 2, 17, 2, 11, 11, 5, 8, 0, 9, 3, 5, 25, 1, 1, 1, 61, 2, 11, 0, 1, 2, 93, 3, 2, 6, 9, 14, 27, 0, 9, 0, 4, 5, 5, 8, 6, 5, 4, 12, 6, 10, 4, 1, 5, 1, 3,…
$ following <dbl> 2, 2, 1, 1, 426, 3, 9, 3, 23, 2, 17, 6, 9, 7, 1, 6, 3, 34, 23, 4, 4, 4, 120, 2, 10, 2, 2, 5, 10, 4, 2, 1, 16, 14, 33, 1, 14, 2, 25, 15, 2, 7, 5, 11, 2, 9, 6, 11, 7, 2, 7,…
$ work_raw <chr> "Software Dev, Startup Founder, Finance, Crypto, Product Manager, Education, Data, Ecommerce", NA, NA, NA, "Web Dev", NA, NA, "Web Dev, Software Dev, Startup Founder, Pro…
$ education_raw <chr> "High School, Bachelor's Degree", NA, NA, NA, NA, NA, NA, "High School, Bachelor's Degree, Master's Degree", NA, NA, NA, NA, NA, NA, "Master's Degree", "High School, Bach…
### Countries
countries <- read_csv('https://sds-aau.github.io/SDS-master/M1/data/countrylist.csv')
Rows: 249 Columns: 3
── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (3): alpha_2, region, sub_region
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
countries %>% glimpse()
Rows: 249
Columns: 3
$ alpha_2 <chr> "AF", "AX", "AL", "DZ", "AS", "AD", "AO", "AI", "AQ", "AG", "AR", "AM", "AW", "AU", "AT", "AZ", "BS", "BH", "BD", "BB", "BY", "BE", "BZ", "BJ", "BM", "BT", "BO", "BQ", "BA",…
$ region <chr> "Asia", "Europe", "Europe", "Africa", "Oceania", "Europe", "Africa", "Americas", NA, "Americas", "Americas", "Asia", "Americas", "Oceania", "Europe", "Asia", "Americas", "As…
$ sub_region <chr> "Southern Asia", "Northern Europe", "Southern Europe", "Northern Africa", "Polynesia", "Southern Europe", "Sub-Saharan Africa", "Latin America and the Caribbean", NA, "Latin…
trips %>% select(date_start, date_end) %>%
glimpse()
Rows: 46,510
Columns: 2
$ date_start <date> 2018-06-04, 2018-05-31, 2017-11-01, 2017-07-24, 2017-02-17, 2016-09-02, 2016-08-02, 2016-07-31, 2016-07-03, 2016-06-03, 2016-05-22, 2016-04-19, 2016-02-14, 2015-11-15, 2017…
$ date_end <date> 2018-06-15, 2018-06-03, 2017-11-05, 2017-08-07, 2017-03-18, 2017-02-16, 2016-09-01, 2016-08-02, 2016-07-31, 2016-07-03, 2016-06-03, 2016-05-22, 2016-04-14, 2016-02-15, 2017…
`readr` is smart, so if you loaded the data with `read_csv()`, this is already taken care of. Otherwise:
# To demonstrate, I transform it back to a string.
trips %<>% mutate(date_start = date_start %>% as.character(),
date_end = date_end %>% as.character())
trips %>% select(date_start, date_end) %>%
glimpse()
Rows: 46,510
Columns: 2
$ date_start <chr> "2018-06-04", "2018-05-31", "2017-11-01", "2017-07-24", "2017-02-17", "2016-09-02", "2016-08-02", "2016-07-31", "2016-07-03", "2016-06-03", "2016-05-22", "2016-04-19", "2016…
$ date_end <chr> "2018-06-15", "2018-06-03", "2017-11-05", "2017-08-07", "2017-03-18", "2017-02-16", "2016-09-01", "2016-08-02", "2016-07-31", "2016-07-03", "2016-06-03", "2016-05-22", "2016…
In case it is a string but well formatted, we can use the `lubridate` package.
library(lubridate) # This is tidyverse's datetime package
Attaching package: ‘lubridate’
The following objects are masked from ‘package:base’:
date, intersect, setdiff, union
trips %<>% mutate(date_start = date_start %>% as_date(),
date_end = date_end %>% as_date())
trips %>% select(date_start, date_end) %>%
glimpse()
Rows: 46,510
Columns: 2
$ date_start <date> 2018-06-04, 2018-05-31, 2017-11-01, 2017-07-24, 2017-02-17, 2016-09-02, 2016-08-02, 2016-07-31, 2016-07-03, 2016-06-03, 2016-05-22, 2016-04-19, 2016-02-14, 2015-11-15, 2017…
$ date_end <date> 2018-06-15, 2018-06-03, 2017-11-05, 2017-08-07, 2017-03-18, 2017-02-16, 2016-09-01, 2016-08-02, 2016-07-31, 2016-07-03, 2016-06-03, 2016-05-22, 2016-04-14, 2016-02-15, 2017…
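`as_date()` handled our ISO-formatted strings. Had they come in another order, lubridate's order-specific parsers would help; a quick illustration:
# Match the parser to the order of the date components
dmy('04/06/2018') # day-month-year -> 2018-06-04
mdy('June 4, 2018') # month-day-year -> 2018-06-04
Now we can compute the trip duration by simply subtracting the two dates: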
trips %<>% mutate(trip_duration = date_end - date_start)
# Test if it works
trips %>%
select(trip_duration, date_start, date_end) %>%
head()
Seems to work fine :)
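Note that subtracting two dates yields a `difftime` column. If a plain numeric is preferred, one option (a minimal sketch) is converting it explicitly:
# Optional: convert the difftime to a plain numeric number of days
trips %>%
  mutate(trip_duration_days = trip_duration %>% as.numeric(units = 'days')) %>%
  select(trip_duration, trip_duration_days) %>%
  head()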
Let's inspect:
trips %>%
select(trip_duration, date_start, date_end) %>%
summary()
trip_duration date_start date_end
Length:46510 Min. :0003-11-12 Min. :0012-07-12
Class :difftime 1st Qu.:2015-08-29 1st Qu.:2015-09-29
Mode :numeric Median :2016-09-23 Median :2016-10-15
Mean :2015-09-28 Mean :2014-07-10
3rd Qu.:2017-08-26 3rd Qu.:2017-09-14
Max. :2106-06-19 Max. :2222-01-01
NA's :14 NA's :189
We clearly see that some observations are unrealistic (trips in Jesus's times or in the future, etc.). Let's look at the distribution:
trips %>%
ggplot(aes(x = date_start)) +
geom_histogram()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
trips %>%
ggplot(aes(x = date_end)) +
geom_histogram()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
There are many ways to deal with outliers. To keep it simple:

1. We could filter by some minimum/maximum date set manually (see the sketch below).
2. We could just delete extreme values using `percent_rank()`, deleting the 1 percent of observations with the highest/lowest values.
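A minimal sketch of option 1, assuming hypothetical cutoff dates (the window is an illustration, not derived from the data):
# Option 1: keep only trips within a manually chosen (hypothetical) window
trips %>%
  filter(date_start >= as_date('2014-01-01'),
         date_end <= as_date('2018-12-31'))
We will demonstrate option 2 here: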
trips %<>%
mutate(date_start_pct = date_start %>% as.numeric() %>% percent_rank(),
date_end_pct = date_end %>% as.numeric() %>% percent_rank()) %>%
filter(date_start_pct >= 0.01 & date_start_pct <= 0.99) %>%
filter(date_end_pct >= 0.01 & date_end_pct <= 0.99)
Let's check how it looks now:
trips %>%
select(trip_duration, date_start, date_end) %>%
summary()
trip_duration date_start date_end
Length:45176 Min. :2002-09-11 Min. :2003-06-08
Class :difftime 1st Qu.:2015-09-08 1st Qu.:2015-10-10
Mode :numeric Median :2016-09-20 Median :2016-10-14
Mean :2016-03-09 Mean :2016-04-10
3rd Qu.:2017-08-16 3rd Qu.:2017-09-06
Max. :2018-09-01 Max. :2018-10-12
Let's look at the distributions again:
trips %>%
ggplot(aes(x = date_start)) +
geom_histogram()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
trips %>%
ggplot(aes(x = date_end)) +
geom_histogram()
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Way more realistic, right?
A simple left join. Just be aware that the key variables are named differently in the two dataframes.
trips %<>%
left_join(countries, by = c("country_code" = "alpha_2"))
trips %>% head()
The new variables are in; seems to work. Let's check if some trips could not be matched:
trips %>% filter(is.na(region) | is.na(sub_region))
trips %>% filter(is.na(region) | is.na(sub_region)) %>%
count(country_code, sort = TRUE)
Ok, we see some country codes did not match. We won't bother with most of them, since the numbers are small, but one thing is worth a look: UK did not match, since it is coded GB in the countries dataframe (just inspect it). Let's delete the newly joined variables and start over.
trips %<>%
select(-region, -sub_region)
Let's replace UK with GB:
trips %<>%
mutate(country_code = country_code %>% str_replace(pattern = 'UK', replacement = 'GB'))
trips %>%
filter(country_code == 'UK' |country_code == 'GB') %>%
count(country_code)
Ok, no more UK present… let's join again.
trips %<>%
left_join(countries, by = c("country_code" = "alpha_2"))
trips %>% filter(is.na(region) | is.na(sub_region)) %>%
count(country_code, sort = TRUE)
Ok, the rest seems negligible… we just delete these observations…
trips %<>% drop_na(region)
people %>% glimpse()
Rows: 4,016
Columns: 6
$ ...1 <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, …
$ username <chr> "@lewellenmichael", "@waylandchin", "@karan", "@skaboss217", "@apwn", "@samcalma", "@paulbremer", "@jtompl", "@jezfx", "@markcaggiano", "@ndbroadbent", "@tedavery", "@wal…
$ followers <dbl> 1, 0, 2, 0, 17, 3, 4, 2, 17, 2, 11, 11, 5, 8, 0, 9, 3, 5, 25, 1, 1, 1, 61, 2, 11, 0, 1, 2, 93, 3, 2, 6, 9, 14, 27, 0, 9, 0, 4, 5, 5, 8, 6, 5, 4, 12, 6, 10, 4, 1, 5, 1, 3,…
$ following <dbl> 2, 2, 1, 1, 426, 3, 9, 3, 23, 2, 17, 6, 9, 7, 1, 6, 3, 34, 23, 4, 4, 4, 120, 2, 10, 2, 2, 5, 10, 4, 2, 1, 16, 14, 33, 1, 14, 2, 25, 15, 2, 7, 5, 11, 2, 9, 6, 11, 7, 2, 7,…
$ work_raw <chr> "Software Dev, Startup Founder, Finance, Crypto, Product Manager, Education, Data, Ecommerce", NA, NA, NA, "Web Dev", NA, NA, "Web Dev, Software Dev, Startup Founder, Pro…
$ education_raw <chr> "High School, Bachelor's Degree", NA, NA, NA, NA, NA, NA, "High School, Bachelor's Degree, Master's Degree", NA, NA, NA, NA, NA, NA, "Master's Degree", "High School, Bach…
Let's see what educations we have in the data:
people %>% count(education_raw)
Ok, that seems easy. Since all listed educations are High School or higher, we can just assume that all people with a non-missing education field have at least a high school degree.
sum(!is.na(people$education_raw))
[1] 451
However, for the rest we just don't know…
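To make the assumption explicit, a minimal sketch (the `has_highschool` flag is our own illustrative name):
# Everybody with a non-missing education field has at least high school;
# for the rest we keep NA, since we simply don't know
people %>%
  mutate(has_highschool = if_else(!is.na(education_raw), TRUE, NA)) %>%
  count(has_highschool)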
people %>% count(work_raw, sort = TRUE)
We cannot just filter for the exact string, since it is contained in multiple categories. Instead, we have to detect all strings in which "Startup Founder" appears. Here we need the `stringr` package.
people %>%
filter(work_raw %>% str_detect('Startup Founder')) %>%
head()
people %>%
filter(education_raw %>% str_detect('Master\'s Degree')) %>% head()
# Notice the needed escape sign \ before the '
Putting it together and counting
people %>%
filter(work_raw %>% str_detect('Startup Founder') & education_raw %>% str_detect('Master\'s Degree') ) %>%
summarise(n = n())
It's 53…
trips %>% glimpse()
Rows: 45,124
Columns: 16
$ ...1 <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,…
$ username <chr> "@lewellenmichael", "@lewellenmichael", "@lewellenmichael", "@lewellenmichael", "@waylandchin", "@waylandchin", "@waylandchin", "@waylandchin", "@waylandchin", "@wayland…
$ country <chr> "Mexico", "Mexico", "Mexico", "Jordan", "China", "Vietnam", "Hong Kong", "China", "China", "China", "Thailand", "Malaysia", "Cambodia", "Vietnam", "India", "India", "Ind…
$ country_code <chr> "MX", "MX", "MX", "JO", "CN", "VN", "HK", "CN", "CN", "CN", "TH", "MY", "KH", "VN", "IN", "IN", "IN", "IN", "IN", "IN", "IN", "IN", "CN", "CN", "US", "US", "US", "US", "…
$ country_slug <chr> "mexico", "mexico", "mexico", "jordan", "china", "vietnam", "hong-kong", "china", "china", "china", "thailand", "malaysia", "cambodia", "vietnam", "india", "india", "ind…
$ date_end <date> 2018-06-15, 2018-06-03, 2017-11-05, 2017-08-07, 2017-03-18, 2017-02-16, 2016-09-01, 2016-08-02, 2016-07-31, 2016-07-03, 2016-06-03, 2016-05-22, 2016-04-14, 2016-02-15, …
$ date_start <date> 2018-06-04, 2018-05-31, 2017-11-01, 2017-07-24, 2017-02-17, 2016-09-02, 2016-08-02, 2016-07-31, 2016-07-03, 2016-06-03, 2016-05-22, 2016-04-19, 2016-02-14, 2015-11-15, …
$ latitude <dbl> 21, 19, 21, 31, 40, 10, 22, 22, 22, 18, 7, 3, 11, 10, 13, 26, 27, 27, 28, 28, 19, 11, 22, 22, 38, 43, 45, 42, 25, 1, 34, 55, 57, 57, 56, 55, 52, 50, 52, 15, 52, 53, 50, …
$ longitude <dbl> -101, -99, -86, 35, 122, 106, 114, 114, 113, 109, 98, 101, 104, 106, 80, 75, 78, 78, 77, 77, 72, 79, 114, 114, -77, -89, -69, -71, 55, 103, -119, -4, -5, -4, -5, -3, 5, …
$ place <chr> "Guanajuato", "Mexico City", "Cancun", "Amman", "Yingkou", "Ho Chi Minh City", "Shenzhen", "Hong Kong", "Zhuhai", "Sanya", "Phuket", "Kuala Lumpur", "Phnom Penh", "Ho Ch…
$ place_slug <chr> "mexico", "mexico-city-mexico", "cancun-mexico", "amman-jordan", "china", "ho-chi-minh-city-vietnam", "hong-kong", "hong-kong-china", "zhuhai-china", "china", "phuket-th…
$ trip_duration <drtn> 11 days, 3 days, 4 days, 14 days, 29 days, 167 days, 30 days, 2 days, 28 days, 30 days, 12 days, 33 days, 60 days, 92 days, 29 days, 2 days, 1 days, 1 days, 4 days, 4 d…
$ date_start_pct <dbl> 0.95692010, 0.95414561, 0.80311861, 0.72446500, 0.60333369, 0.48489085, 0.46589956, 0.46402839, 0.44695129, 0.42716421, 0.41881923, 0.39642972, 0.35259705, 0.28917088, 0…
$ date_end_pct <dbl> 0.9515976, 0.9441062, 0.7909758, 0.7200777, 0.6090242, 0.5862047, 0.4690415, 0.4508636, 0.4485967, 0.4315415, 0.4113774, 0.4026339, 0.3758636, 0.3352332, 0.5750000, 0.55…
$ region <chr> "Americas", "Americas", "Americas", "Asia", "Asia", "Asia", "Asia", "Asia", "Asia", "Asia", "Asia", "Asia", "Asia", "Asia", "Asia", "Asia", "Asia", "Asia", "Asia", "Asia…
$ sub_region <chr> "Latin America and the Caribbean", "Latin America and the Caribbean", "Latin America and the Caribbean", "Western Asia", "Eastern Asia", "South-eastern Asia", "Eastern A…
That's easy…
trips %>%
count(country_code) %>%
arrange(desc(n)) %>%
head()
The US receives the most trips.
trips %>%
count(country_code) %>%
arrange(n) %>%
head()
Hmm, these are all somewhat weird country codes… we could now filter them, keeping only official ones… but let's leave it like that for now…
Since the dates are already parsed as dates, we can just extract the year with the `year()` function from `lubridate`.
trips %>%
filter(year(date_start) == 2017) %>%
count(country_code, sort = TRUE)
It's again the US.
trips %>%
filter(sub_region == 'Western Europe') %>%
# count(country_code, sort = TRUE) %>%
ggplot(aes(x = country_code)) +
geom_bar()
Could be done prettier, though.
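For instance, a sketch counting first and reordering the bars by frequency:
trips %>%
  filter(sub_region == 'Western Europe') %>%
  count(country_code) %>%
  ggplot(aes(x = fct_reorder(country_code, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = 'Country code', y = 'Number of trips')
Finally, let's check whether startup founders take shorter trips, joining in the work info from `people`: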
trips %>%
left_join(people %>% select(username, work_raw), by = 'username') %>%
mutate(founder = work_raw %>% str_detect('Startup Founder')) %>%
group_by(founder) %>%
summarize(duration_mean = trip_duration %>% mean(na.rm = TRUE))
Indeed, it seems founders on average have the shortest trips… busy people…