Preamble

Standard packages

### Load packages
library(tidyverse) # Collection of all the good stuff like dplyr, ggplot2 etc.
Registered S3 methods overwritten by 'dbplyr':
  method         from
  print.tbl_lazy     
  print.tbl_sql      
── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──
✓ ggplot2 3.3.5     ✓ purrr   0.3.4
✓ tibble  3.1.5     ✓ dplyr   1.0.7
✓ tidyr   1.1.4     ✓ stringr 1.4.0
✓ readr   2.0.2     ✓ forcats 0.5.1
── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
library(magrittr) # For extra-piping operators (eg. %<>%)

Attaching package: ‘magrittr’

The following object is masked from ‘package:purrr’:

    set_names

The following object is masked from ‘package:tidyr’:

    extract

Load data

# Read the combined job postings CSV; column types are guessed by readr.
data <- readr::read_csv(file = "data/Combined_Jobs_Final.csv")
Rows: 84090 Columns: 23
── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (17): Status, Slug, Title, Position, Company, City, State.Name, State.Code, Address, Industry, Job.Description, Listing.Start, Listing.End,...
dbl  (5): Job.ID, Provider, Latitude, Longitude, Salary
lgl  (1): Requirements

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Compact overview of every column: name, type and first values.
glimpse(data)
Rows: 84,090
Columns: 23
$ Job.ID             <dbl> 111, 113, 117, 121, 127, 129, 131, 133, 134273, 134274, 134275, 134276, 134277, 134278, 134279, 134280, 134281, 134282…
$ Provider           <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2…
$ Status             <chr> "open", "open", "open", "open", "open", "open", "open", "open", "open", "open", "open", "open", "open", "open", "open"…
$ Slug               <chr> "palo-alto-ca-tacolicious-server", "san-francisco-ca-claude-lane-kitchen-staff-chef", "san-francisco-ca-machka-restaur…
$ Title              <chr> "Server @ Tacolicious", "Kitchen Staff/Chef @ Claude Lane", "Bartender @ Machka Restaurants Corp.", "Server @ Teriyaki…
$ Position           <chr> "Server", "Kitchen Staff/Chef", "Bartender", "Server", "Kitchen Staff/Chef", "Receptionist", "Server", "Driver", "Assi…
$ Company            <chr> "Tacolicious", "Claude Lane", "Machka Restaurants Corp.", "Teriyaki House", "Rosa Mexicano - Sunset", "Mind of Beauty …
$ City               <chr> "Palo Alto", "San Francisco", "San Francisco", "Brisbane", "Los Angeles", "Los Altos", "Los Angeles", "Berkeley", "Men…
$ State.Name         <chr> "California", "California", "California", "California", "California", "California", "California", "California", "New J…
$ State.Code         <chr> "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "NJ", "WI", "IL", "KY", "SC", "PA", "KY", "NC", "NJ", "OH", "MN", "KY"…
$ Address            <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Latitude           <dbl> 37.44335, 37.78983, 37.79560, 37.68507, 34.07338, 37.38522, 34.18630, 37.86721, 40.77600, 43.90650, 42.03450, 38.25486…
$ Longitude          <dbl> -122.16117, -122.40427, -122.40296, -122.40028, -118.46044, -122.11413, -118.60637, -122.25861, -74.60100, -91.23340, …
$ Industry           <chr> "Food and Beverages", "Food and Beverages", "Food and Beverages", "Food and Beverages", "Food and Beverages", "Retail"…
$ Job.Description    <chr> "Tacolicious' first Palo Alto store just opened recently, and we are hiring! If you love tacos, you will love working …
$ Requirements       <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Salary             <dbl> 8.00, 0.00, 11.00, 10.55, 10.55, 0.00, 8.00, 11.00, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Listing.Start      <chr> NA, NA, NA, NA, NA, NA, NA, NA, "12-05-2014", "12-05-2014", "12-05-2014", "12-05-2014", "12-05-2014", "12-05-2014", "1…
$ Listing.End        <chr> NA, NA, NA, NA, NA, NA, NA, NA, "01-04-2015", "01-04-2015", "01-04-2015", "01-04-2015", "01-04-2015", "01-04-2015", "0…
$ Employment.Type    <chr> "Part-Time", "Part-Time", "Part-Time", "Part-Time", "Part-Time", "Part-Time", "Part-Time", "Part-Time", "Part-Time", "…
$ Education.Required <chr> NA, NA, NA, NA, NA, NA, NA, NA, "Not Specified", "High School Diploma", "Not Specified", "Not Specified", "Not Specifi…
$ Created.At         <chr> "2013-03-12 02:08:28 UTC", "2013-04-12 08:36:36 UTC", "2013-07-16 09:34:10 UTC", "2013-09-04 15:40:30 UTC", "2013-07-1…
$ Updated.At         <chr> "2014-08-16 15:35:36 UTC", "2014-08-16 15:35:36 UTC", "2014-08-16 15:35:37 UTC", "2014-08-16 15:35:38 UTC", "2014-08-1…
# Peek at the first job description before the columns are renamed.
data[1, 'Job.Description']

# Normalise column names: lower-case them and turn every dot into an
# underscore. `str_replace_all()` is used because `str_replace()` only
# rewrites the first match, which would leave names containing more than
# one dot half-converted.
colnames(data) <- colnames(data) %>%
  tolower() %>%
  str_replace_all('\\.', '_')

Text preprocessing

library(tidytext)

# Build one free-text field per posting from the title and the description,
# then keep only the identifier and that text.
# `unite(..., na.rm = TRUE)` is used instead of `paste()` because `paste()`
# turns a missing title/description into the literal string "NA", which
# would later be tokenised as if it were a word.
data %<>%
  unite('text', title, job_description, sep = ' ', na.rm = TRUE) %>%
  select(job_id, text)
# Most frequent tokens once the entries of `stop_words` have been removed.
# The join key is spelled out so the result does not depend on which column
# names the two tables happen to share.
text_tidy %>%
  anti_join(stop_words, by = 'word') %>%
  count(word, sort = TRUE)
Joining, by = "word"
library(textdata)

# Work on a small sample: at most the first 100,000 token rows.
# `head()` never reaches past the end of the table, unlike `[1:100000, ]`
# when fewer rows are available.
# NOTE(review): cutting by row may split the last posting mid-text — confirm
# this is acceptable for the exercise.
# `n` = number of times each word occurs within its posting.
text_tidy_embed_small <- text_tidy_embed %>%
  head(100000) %>%
  add_count(job_id, word)
# Weight every embedding dimension of each token by that token's tf-idf.
# NOTE(review): assumes `tf`, `idf` and `tf_idf` were added beforehand (e.g.
# with tidytext::bind_tf_idf(word, job_id, n)) and that the embedding columns
# are the ones whose names start with "d" — confirm against the earlier steps.
text_tidy_embed_small %>%
  select(job_id, word, tf_idf, everything()) %>%
  # Drop the helper columns; `-c(...)` negates all three, whereas
  # `-n, tf, idf` only removed `n`.
  select(-c(n, tf, idf)) %>%
  # `starts_with()` is already a selection helper, so it is passed straight to
  # `across()` (`where()` expects a predicate function instead).
  mutate(across(starts_with('d'), ~ .x * tf_idf))
Error: unexpected symbol in:
"  select(-n, tf, idf) %>%
  mutate(across(where(starts_with('d')), ~ fun(.x) .x"
library(widyr)

# Cosine similarity between postings based on their averaged word embeddings.
# `pairwise_similarity()` needs an item, a feature and a value column, so the
# embeddings are first averaged per posting and reshaped to long format
# (one row per posting x dimension).
# NOTE(review): assumes the embedding columns are those whose names start
# with "d" — confirm against how `text_tidy_embed` was built.
text_tidy_embed_small %>%
  group_by(job_id) %>%
  summarise(across(starts_with('d'), ~ mean(.x, na.rm = TRUE))) %>%
  pivot_longer(-job_id, names_to = 'dimension', values_to = 'value') %>%
  pairwise_similarity(job_id, dimension, value)

Doc2Vec

library(doc2vec)

# Prepare the model input: lower-cased text with everything except letters
# replaced by spaces, runs of whitespace collapsed and the ends trimmed;
# `job_id` is renamed to `doc_id`.
x <- data %>% # [1:10000,]
  mutate(
    text = tolower(text),
    text = gsub("[^[:alpha:]]", " ", text),
    text = gsub("[[:space:]]+", " ", text),
    text = trimws(text)
  ) %>%
  rename(doc_id = job_id)
## More realistic model
# Train a PV-DM paragraph2vec model on `x`, then pull out the document and
# word embedding matrices.
model <- paragraph2vec(
  x = x,
  type = "PV-DM",
  dim = 10,
  iter = 3,
  min_count = 5,
  lr = 0.05,
  threads = 4
)
embedding_doc <- as.matrix(model, which = "docs")
embedding_word <- as.matrix(model, which = "words")
# First rows of the word embedding matrix.
head(embedding_word)
          [,1]        [,2]        [,3]       [,4]        [,5]
</s> 0.5049461  0.55751580 -0.48319447 -0.4135315  0.17239621
and  0.4475586 -0.70460820  0.02434461 -0.4640722 -0.29540303
to   0.4501234 -0.64722580 -0.57381165  0.1133763  0.19071901
the  0.9781786  0.02705558  0.05119156 -0.1305319  0.15091500
a    0.4065473 -0.27062786 -0.24862848  0.5535178  0.62712181
of   0.5894881 -0.75248194  0.17775758 -0.2169727  0.08717752
# First rows of the document embedding matrix.
head(embedding_doc)
          [,1]         [,2]        [,3]        [,4]        [,5]
111  0.7196287  0.431667209  0.10125642  0.53245306  0.04515361
113  0.4762859 -0.667058170 -0.03218742 -0.35846087  0.44570717
117  0.8882785 -0.286137760 -0.32409638  0.09825002 -0.11997923
121  0.8938912  0.005031405 -0.20705684  0.32504669 -0.22892231
127  0.8835905 -0.189624771  0.33544683  0.24478789 -0.10423350
129 -0.4069880  0.164157242  0.35578504  0.82381403 -0.04648277
# Five nearest words for each query word.
nn <- predict(
  model,
  newdata = c("server", "computer", "pizza"),
  type = "nearest",
  which = "word2word",
  top_n = 5
)
nn
[[1]]

[[2]]

[[3]]
NA
# Five postings closest to each query word.
# The queries are words, not document ids, so the lookup has to be
# "word2doc"; "doc2doc" expects `newdata` to contain doc_id values.
nn <- predict(
  model,
  newdata = c('sushi', 'waiter', 'chef'),
  type = "nearest",
  which = "word2doc",
  top_n = 5
)
nn
[[1]]

[[2]]

[[3]]
NA
LS0tCnRpdGxlOiAiSW4tQ2xhc3MgRXhlcmNpc2U6IFVuc3RydWN0dXJlZCBhZHZhbmNlZCkiCmF1dGhvcjogIkRhbmllbCBTLiBIYWluIChkc2hAYnVzaW5lc3MuYWF1LmRrKSIKZGF0ZTogIlVwZGF0ZWQgYHIgZm9ybWF0KFN5cy50aW1lKCksICclQiAlZCwgJVknKWAiCm91dHB1dDoKICBodG1sX25vdGVib29rOgogICAgY29kZV9mb2xkaW5nOiBzaG93CiAgICBkZl9wcmludDogcGFnZWQKICAgIHRvYzogdHJ1ZQogICAgdG9jX2RlcHRoOiAyCiAgICB0b2NfZmxvYXQ6CiAgICAgIGNvbGxhcHNlZDogZmFsc2UKICAgIHRoZW1lOiBmbGF0bHkKLS0tCgpgYGB7ciBzZXR1cCwgaW5jbHVkZT1GQUxTRX0KIyBLbml0ciBvcHRpb25zCiMjIyBHZW5lcmljIHByZWFtYmxlCnJtKGxpc3Q9bHMoKSk7IGdyYXBoaWNzLm9mZigpIApTeXMuc2V0ZW52KExBTkcgPSAiZW4iKSAjIEZvciBlbmdsaXNoIGxhbmd1YWdlCm9wdGlvbnMoc2NpcGVuID0gNSkgIyBUbyBkZWFjdGl2YXRlIGFubm95aW5nIHNjaWVudGlmaWMgbnVtYmVyIG5vdGF0aW9uCgojIHJtKGxpc3Q9bHMoKSk7IGdyYXBoaWNzLm9mZigpICMgZ2V0IHJpZCBvZiBldmVyeXRoaW5nIGluIHRoZSB3b3Jrc3BhY2UKaWYgKCFyZXF1aXJlKCJrbml0ciIpKSBpbnN0YWxsLnBhY2thZ2VzKCJrbml0ciIpOyBsaWJyYXJ5KGtuaXRyKSAjIEZvciBkaXNwbGF5IG9mIHRoZSBtYXJrZG93bgoKIyMjIEtuaXRyIG9wdGlvbnMKa25pdHI6Om9wdHNfY2h1bmskc2V0KHdhcm5pbmc9RkFMU0UsCiAgICAgICAgICAgICAgICAgICAgIG1lc3NhZ2U9RkFMU0UsCiAgICAgICAgICAgICAgICAgICAgIGZpZy5hbGlnbj0iY2VudGVyIgogICAgICAgICAgICAgICAgICAgICApCmBgYAoKIyBQcmVhbWJsZQoKIyMgU3RhbmRhcmQgcGFja2FnZXMKCmBgYHtyfQojIyMgTG9hZCBwYWNrYWdlcwpsaWJyYXJ5KHRpZHl2ZXJzZSkgIyBDb2xsZWN0aW9uIG9mIGFsbCB0aGUgZ29vZCBzdHVmZiBsaWtlIGRwbHlyLCBnZ3Bsb3QyIGVjdC4KbGlicmFyeShtYWdyaXR0cikgIyBGb3IgZXh0cmEtcGlwaW5nIG9wZXJhdG9ycyAoZWcuICU8PiUpCmBgYAoKIyBMb2FkIGRhdGEKCmBgYHtyfQpkYXRhIDwtIHJlYWRfY3N2KCdkYXRhL0NvbWJpbmVkX0pvYnNfRmluYWwuY3N2JykKYGBgCmBgYHtyfQpkYXRhICU+JSBnbGltcHNlKCkKYGBgCgpgYGB7cn0KZGF0YVsxLCdKb2IuRGVzY3JpcHRpb24nXQpgYGAKCmBgYHtyfQpjb2xuYW1lcyhkYXRhKSA8LSBjb2xuYW1lcyhkYXRhKSAlPiUgdG9sb3dlcigpICU+JSBzdHJfcmVwbGFjZSgnXFwuJywgJ18nKQpgYGAKCiMgVGV4dCBwcmVwZW9jc3NpbmcKCmBgYHtyfQpsaWJyYXJ5KHRpZHl0ZXh0KQpgYGAKCmBgYHtyfQpkYXRhICU8PiUgCiAgbXV0YXRlKHRleHQgPSBwYXN0ZSh0aXRsZSwgam9iX2Rlc2NyaXB0aW9uLCBzZXAgPSAnICcpKSAlPiUKICBzZWxlY3Qoam9iX2lkLCB0ZXh0KQpgYGAKCmBgYHtyfQp0ZXh0X3RpZHkgPC0gZGF0YSAlPiUgdW5uZXN0X3Rva2Vucyh3b3JkLCB0ZXh0
LCB0b2tlbiA9ICd3b3JkcycpIApgYGAKCmBgYHtyfQp0ZXh0X3RpZHkgJT4lIAogIGFudGlfam9pbihzdG9wX3dvcmRzKSAlPiUKICBjb3VudCh3b3JkLCBzb3J0ID0gVFJVRSkKYGBgCgoKYGBge3J9CmxpYnJhcnkodGV4dGRhdGEpCmBgYAoKYGBge3J9Cmdsb3ZlNmIgPC0gZW1iZWRkaW5nX2dsb3ZlNmIoZGltZW5zaW9ucyA9IDEwMCkKYGBgCgpgYGB7cn0KdGV4dF90aWR5X2VtYmVkIDwtIHRleHRfdGlkeSAlPiUKICBpbm5lcl9qb2luKGdsb3ZlNmIsIGJ5ID0gYygnd29yZCcgPSAndG9rZW4nKSkKYGBgCgpgYGB7cn0KdGV4dF90aWR5X2VtYmVkX3NtYWxsIDwtIHRleHRfdGlkeV9lbWJlZFsxOjEwMDAwMCxdCmBgYAoKYGBge3J9CiPigqx0ZXh0X3RpZHlfZW1iZWRfc21hbGwgJTw+JQojICBncm91cF9ieShqb2JfaWQpICU+JQojICBhZGRfY291bnQod29yZCkgJT4lCiMgIHVuZ3JvdXAoKQpgYGAKCmBgYHtyfQojdGV4dF90aWR5X2VtYmVkX3NtYWxsICU8PiUKIyAgYmluZF90Zl9pZGYod29yZCwgam9iX2lkLCBuKQpgYGAKCmBgYHtyfQojdGV4dF90aWR5X2VtYmVkX3NtYWxsICU+JQojICBzZWxlY3Qoam9iX2lkLCB3b3JkLCB0Zl9pZGYsIGV2ZXJ5dGhpbmcoKSkgJT4lCiMgIHNlbGVjdCgtbiwgdGYsIGlkZikgJT4lCiMgIG11dGF0ZShhY3Jvc3Mod2hlcmUoc3RhcnRzX3dpdGgoJ2QnKSksIH4gZnVuKC54KSAueCAqIHRmX2lkZiwgLCAubmFtZXMgPSAiLmNvbCIpKQpgYGAKCgoKCgpgYGB7cn0KdGV4dF90aWR5X2VtYmVkX3NtYWxsICU8PiUKICBncm91cF9ieShqb2JfaWQpICU+JQogIHN1bW1hcmlzZShhY3Jvc3Mod2hlcmUoaXMubnVtZXJpYyksIH4gbWVhbigueCwgbmEucm0gPSBUUlVFKSkpCmBgYAoKYGBge3J9CmxpYnJhcnkod2lkeXIpCmBgYAoKYGBge3J9CnRleHRfdGlkeV9lbWJlZF9zbWFsbCAlPiUKICBwYWlyd2lzZV9zaW1pbGFyaXR5KCkKYGBgCgojIERvYzJWZWMKCmBgYHtyfQpsaWJyYXJ5KGRvYzJ2ZWMpCmBgYAoKYGBge3J9CnggPC0gZGF0YSAjWzE6MTAwMDAsXQp4JHRleHQgICA8LSB0b2xvd2VyKHgkdGV4dCkKeCR0ZXh0ICAgPC0gZ3N1YigiW15bOmFscGhhOl1dIiwgIiAiLCB4JHRleHQpCngkdGV4dCAgIDwtIGdzdWIoIltbOnNwYWNlOl1dKyIsICIgIiwgeCR0ZXh0KQp4JHRleHQgICA8LSB0cmltd3MoeCR0ZXh0KQpgYGAKCmBgYHtyfQp4ICU8PiUKICByZW5hbWUoZG9jX2lkID0gam9iX2lkKQpgYGAKCgpgYGB7cn0KIyMgTW9yZSByZWFsaXN0aWMgbW9kZWwKbW9kZWwgPC0gcGFyYWdyYXBoMnZlYyh4ID0geCwgdHlwZSA9ICJQVi1ETSIsIGRpbSA9IDEwLCBpdGVyID0gMywgIAogICAgICAgICAgICAgICAgICAgICAgIG1pbl9jb3VudCA9IDUsIGxyID0gMC4wNSwgdGhyZWFkcyA9IDQpCmBgYAoKYGBge3J9CmVtYmVkZGluZ193b3JkIDwtIGFzLm1hdHJpeChtb2RlbCwgd2hpY2ggPSAid29yZHMiKQplbWJlZGRpbmdfZG9jIDwtIGFzLm1hdHJpeChtb2RlbCwgd2hpY2ggPSAiZG9jcyIpCmBgYAoKYGBg
e3J9CmVtYmVkZGluZ193b3JkICU+JSBoZWFkKCkKYGBgCmBgYHtyfQplbWJlZGRpbmdfZG9jICU+JSBoZWFkKCkKYGBgCmBgYHtyfQpubiA8LSBwcmVkaWN0KG1vZGVsLCBuZXdkYXRhID0gYygic2VydmVyIiwgJ2NvbXB1dGVyJywgJ3BpenphJyksIHR5cGUgPSAibmVhcmVzdCIsIHdoaWNoID0gIndvcmQyd29yZCIsIHRvcF9uID0gNSkKbm4KYGBgCmBgYHtyfQpubiA8LSBwcmVkaWN0KG1vZGVsLCBuZXdkYXRhID0gYygnc3VzaGknLCAnd2FpdGVyJywgJ2NoZWYnKSwgdHlwZSA9ICJuZWFyZXN0Iiwgd2hpY2ggPSAiZG9jMmRvYyIsICAgdG9wX24gPSA1KQpubgpgYGAKCg==