Load data
# Read the combined job postings CSV into a tibble named `data`.
data <- read_csv(file = "data/Combined_Jobs_Final.csv")
Rows: 84090 Columns: 23
── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (17): Status, Slug, Title, Position, Company, City, State.Name, State.Code, Address, Industry, Job.Description, Listing.Start, Listing.End,...
dbl (5): Job.ID, Provider, Latitude, Longitude, Salary
lgl (1): Requirements
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Compact overview of every column: name, type and first values.
glimpse(data)
Rows: 84,090
Columns: 23
$ Job.ID <dbl> 111, 113, 117, 121, 127, 129, 131, 133, 134273, 134274, 134275, 134276, 134277, 134278, 134279, 134280, 134281, 134282…
$ Provider <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2…
$ Status <chr> "open", "open", "open", "open", "open", "open", "open", "open", "open", "open", "open", "open", "open", "open", "open"…
$ Slug <chr> "palo-alto-ca-tacolicious-server", "san-francisco-ca-claude-lane-kitchen-staff-chef", "san-francisco-ca-machka-restaur…
$ Title <chr> "Server @ Tacolicious", "Kitchen Staff/Chef @ Claude Lane", "Bartender @ Machka Restaurants Corp.", "Server @ Teriyaki…
$ Position <chr> "Server", "Kitchen Staff/Chef", "Bartender", "Server", "Kitchen Staff/Chef", "Receptionist", "Server", "Driver", "Assi…
$ Company <chr> "Tacolicious", "Claude Lane", "Machka Restaurants Corp.", "Teriyaki House", "Rosa Mexicano - Sunset", "Mind of Beauty …
$ City <chr> "Palo Alto", "San Francisco", "San Francisco", "Brisbane", "Los Angeles", "Los Altos", "Los Angeles", "Berkeley", "Men…
$ State.Name <chr> "California", "California", "California", "California", "California", "California", "California", "California", "New J…
$ State.Code <chr> "CA", "CA", "CA", "CA", "CA", "CA", "CA", "CA", "NJ", "WI", "IL", "KY", "SC", "PA", "KY", "NC", "NJ", "OH", "MN", "KY"…
$ Address <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Latitude <dbl> 37.44335, 37.78983, 37.79560, 37.68507, 34.07338, 37.38522, 34.18630, 37.86721, 40.77600, 43.90650, 42.03450, 38.25486…
$ Longitude <dbl> -122.16117, -122.40427, -122.40296, -122.40028, -118.46044, -122.11413, -118.60637, -122.25861, -74.60100, -91.23340, …
$ Industry <chr> "Food and Beverages", "Food and Beverages", "Food and Beverages", "Food and Beverages", "Food and Beverages", "Retail"…
$ Job.Description <chr> "Tacolicious' first Palo Alto store just opened recently, and we are hiring! If you love tacos, you will love working …
$ Requirements <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Salary <dbl> 8.00, 0.00, 11.00, 10.55, 10.55, 0.00, 8.00, 11.00, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Listing.Start <chr> NA, NA, NA, NA, NA, NA, NA, NA, "12-05-2014", "12-05-2014", "12-05-2014", "12-05-2014", "12-05-2014", "12-05-2014", "1…
$ Listing.End <chr> NA, NA, NA, NA, NA, NA, NA, NA, "01-04-2015", "01-04-2015", "01-04-2015", "01-04-2015", "01-04-2015", "01-04-2015", "0…
$ Employment.Type <chr> "Part-Time", "Part-Time", "Part-Time", "Part-Time", "Part-Time", "Part-Time", "Part-Time", "Part-Time", "Part-Time", "…
$ Education.Required <chr> NA, NA, NA, NA, NA, NA, NA, NA, "Not Specified", "High School Diploma", "Not Specified", "Not Specified", "Not Specifi…
$ Created.At <chr> "2013-03-12 02:08:28 UTC", "2013-04-12 08:36:36 UTC", "2013-07-16 09:34:10 UTC", "2013-09-04 15:40:30 UTC", "2013-07-1…
$ Updated.At <chr> "2014-08-16 15:35:36 UTC", "2014-08-16 15:35:36 UTC", "2014-08-16 15:35:37 UTC", "2014-08-16 15:35:38 UTC", "2014-08-1…
# Show the job description of the first posting (kept as a 1 x 1 tibble).
data[1, "Job.Description"]
# Normalise column names to lower snake_case, e.g. `Job.ID` -> `job_id`.
# `str_replace_all()` with a fixed "." replaces every dot; the previous
# `str_replace()` only replaced the first dot in each name, so a name with
# two or more dots would have stayed partly un-normalised.
colnames(data) <- colnames(data) %>%
  tolower() %>%
  str_replace_all(fixed("."), "_")
Doc2Vec
library(doc2vec)
# Working copy of the postings used as doc2vec training input.
# NOTE(review): assumes `data` already carries a `text` column; it is not
# created in the code visible here -- confirm against the preprocessing step.
x <- data # [1:10000, ]

# Clean the text in one pass: lower-case, turn every non-letter into a
# space, collapse runs of whitespace into one space, then trim both ends.
x$text <- x$text %>%
  tolower() %>%
  gsub(pattern = "[^[:alpha:]]", replacement = " ") %>%
  gsub(pattern = "[[:space:]]+", replacement = " ") %>%
  trimws()
# The model is trained on `x` below; expose the job id under the name
# `doc_id` so each posting is identified by it.
x <- rename(x, doc_id = job_id)
## More realistic model
# Fit a paragraph-vector model on the cleaned postings in `x`.
# NOTE(review): the doc2vec documentation mentions a per-document word limit
# for training text -- confirm long postings are handled as intended.
model <- paragraph2vec(
  x = x,
  type = "PV-DM",
  dim = 10,
  iter = 3,
  min_count = 5,
  lr = 0.05,
  threads = 4
)
# Pull the learned embedding matrices out of the fitted model:
# one row per document and one row per vocabulary word.
embedding_doc <- as.matrix(model, which = "docs")
embedding_word <- as.matrix(model, which = "words")

# Peek at the first few word vectors.
head(embedding_word)
[,1] [,2] [,3] [,4] [,5]
</s> 0.5049461 0.55751580 -0.48319447 -0.4135315 0.17239621
and 0.4475586 -0.70460820 0.02434461 -0.4640722 -0.29540303
to 0.4501234 -0.64722580 -0.57381165 0.1133763 0.19071901
the 0.9781786 0.02705558 0.05119156 -0.1305319 0.15091500
a 0.4065473 -0.27062786 -0.24862848 0.5535178 0.62712181
of 0.5894881 -0.75248194 0.17775758 -0.2169727 0.08717752
# Peek at the first few document vectors (rows are named by doc_id).
head(embedding_doc)
[,1] [,2] [,3] [,4] [,5]
111 0.7196287 0.431667209 0.10125642 0.53245306 0.04515361
113 0.4762859 -0.667058170 -0.03218742 -0.35846087 0.44570717
117 0.8882785 -0.286137760 -0.32409638 0.09825002 -0.11997923
121 0.8938912 0.005031405 -0.20705684 0.32504669 -0.22892231
127 0.8835905 -0.189624771 0.33544683 0.24478789 -0.10423350
129 -0.4069880 0.164157242 0.35578504 0.82381403 -0.04648277
# For each query word, look up the 5 nearest words in the embedding space.
nn <- predict(
  model,
  newdata = c("server", "computer", "pizza"),
  type = "nearest",
  which = "word2word",
  top_n = 5
)
nn
[[1]]
[[2]]
[[3]]
NA
# For each query word, look up the 5 nearest documents (job postings).
# The queries are words, not doc_ids, so the lookup must be "word2doc".
# "doc2doc" expects `newdata` to hold doc_ids (job ids here), which means
# 'sushi', 'waiter' and 'chef' could never match a document.
nn <- predict(
  model,
  newdata = c("sushi", "waiter", "chef"),
  type = "nearest",
  which = "word2doc",
  top_n = 5
)
nn
[[1]]
[[2]]
[[3]]
NA
LS0tCnRpdGxlOiAiSW4tQ2xhc3MgRXhlcmNpc2U6IFVuc3RydWN0dXJlZCBhZHZhbmNlZCkiCmF1dGhvcjogIkRhbmllbCBTLiBIYWluIChkc2hAYnVzaW5lc3MuYWF1LmRrKSIKZGF0ZTogIlVwZGF0ZWQgYHIgZm9ybWF0KFN5cy50aW1lKCksICclQiAlZCwgJVknKWAiCm91dHB1dDoKICBodG1sX25vdGVib29rOgogICAgY29kZV9mb2xkaW5nOiBzaG93CiAgICBkZl9wcmludDogcGFnZWQKICAgIHRvYzogdHJ1ZQogICAgdG9jX2RlcHRoOiAyCiAgICB0b2NfZmxvYXQ6CiAgICAgIGNvbGxhcHNlZDogZmFsc2UKICAgIHRoZW1lOiBmbGF0bHkKLS0tCgpgYGB7ciBzZXR1cCwgaW5jbHVkZT1GQUxTRX0KIyBLbml0ciBvcHRpb25zCiMjIyBHZW5lcmljIHByZWFtYmxlCnJtKGxpc3Q9bHMoKSk7IGdyYXBoaWNzLm9mZigpIApTeXMuc2V0ZW52KExBTkcgPSAiZW4iKSAjIEZvciBlbmdsaXNoIGxhbmd1YWdlCm9wdGlvbnMoc2NpcGVuID0gNSkgIyBUbyBkZWFjdGl2YXRlIGFubm95aW5nIHNjaWVudGlmaWMgbnVtYmVyIG5vdGF0aW9uCgojIHJtKGxpc3Q9bHMoKSk7IGdyYXBoaWNzLm9mZigpICMgZ2V0IHJpZCBvZiBldmVyeXRoaW5nIGluIHRoZSB3b3Jrc3BhY2UKaWYgKCFyZXF1aXJlKCJrbml0ciIpKSBpbnN0YWxsLnBhY2thZ2VzKCJrbml0ciIpOyBsaWJyYXJ5KGtuaXRyKSAjIEZvciBkaXNwbGF5IG9mIHRoZSBtYXJrZG93bgoKIyMjIEtuaXRyIG9wdGlvbnMKa25pdHI6Om9wdHNfY2h1bmskc2V0KHdhcm5pbmc9RkFMU0UsCiAgICAgICAgICAgICAgICAgICAgIG1lc3NhZ2U9RkFMU0UsCiAgICAgICAgICAgICAgICAgICAgIGZpZy5hbGlnbj0iY2VudGVyIgogICAgICAgICAgICAgICAgICAgICApCmBgYAoKIyBQcmVhbWJsZQoKIyMgU3RhbmRhcmQgcGFja2FnZXMKCmBgYHtyfQojIyMgTG9hZCBwYWNrYWdlcwpsaWJyYXJ5KHRpZHl2ZXJzZSkgIyBDb2xsZWN0aW9uIG9mIGFsbCB0aGUgZ29vZCBzdHVmZiBsaWtlIGRwbHlyLCBnZ3Bsb3QyIGVjdC4KbGlicmFyeShtYWdyaXR0cikgIyBGb3IgZXh0cmEtcGlwaW5nIG9wZXJhdG9ycyAoZWcuICU8PiUpCmBgYAoKIyBMb2FkIGRhdGEKCmBgYHtyfQpkYXRhIDwtIHJlYWRfY3N2KCdkYXRhL0NvbWJpbmVkX0pvYnNfRmluYWwuY3N2JykKYGBgCmBgYHtyfQpkYXRhICU+JSBnbGltcHNlKCkKYGBgCgpgYGB7cn0KZGF0YVsxLCdKb2IuRGVzY3JpcHRpb24nXQpgYGAKCmBgYHtyfQpjb2xuYW1lcyhkYXRhKSA8LSBjb2xuYW1lcyhkYXRhKSAlPiUgdG9sb3dlcigpICU+JSBzdHJfcmVwbGFjZSgnXFwuJywgJ18nKQpgYGAKCiMgVGV4dCBwcmVwZW9jc3NpbmcKCmBgYHtyfQpsaWJyYXJ5KHRpZHl0ZXh0KQpgYGAKCmBgYHtyfQpkYXRhICU8PiUgCiAgbXV0YXRlKHRleHQgPSBwYXN0ZSh0aXRsZSwgam9iX2Rlc2NyaXB0aW9uLCBzZXAgPSAnICcpKSAlPiUKICBzZWxlY3Qoam9iX2lkLCB0ZXh0KQpgYGAKCmBgYHtyfQp0ZXh0X3RpZHkgPC0gZGF0YSAlPiUgdW5uZXN0X3Rva2Vucyh3b3JkLCB0ZXh0
LCB0b2tlbiA9ICd3b3JkcycpIApgYGAKCmBgYHtyfQp0ZXh0X3RpZHkgJT4lIAogIGFudGlfam9pbihzdG9wX3dvcmRzKSAlPiUKICBjb3VudCh3b3JkLCBzb3J0ID0gVFJVRSkKYGBgCgoKYGBge3J9CmxpYnJhcnkodGV4dGRhdGEpCmBgYAoKYGBge3J9Cmdsb3ZlNmIgPC0gZW1iZWRkaW5nX2dsb3ZlNmIoZGltZW5zaW9ucyA9IDEwMCkKYGBgCgpgYGB7cn0KdGV4dF90aWR5X2VtYmVkIDwtIHRleHRfdGlkeSAlPiUKICBpbm5lcl9qb2luKGdsb3ZlNmIsIGJ5ID0gYygnd29yZCcgPSAndG9rZW4nKSkKYGBgCgpgYGB7cn0KdGV4dF90aWR5X2VtYmVkX3NtYWxsIDwtIHRleHRfdGlkeV9lbWJlZFsxOjEwMDAwMCxdCmBgYAoKYGBge3J9CiPigqx0ZXh0X3RpZHlfZW1iZWRfc21hbGwgJTw+JQojICBncm91cF9ieShqb2JfaWQpICU+JQojICBhZGRfY291bnQod29yZCkgJT4lCiMgIHVuZ3JvdXAoKQpgYGAKCmBgYHtyfQojdGV4dF90aWR5X2VtYmVkX3NtYWxsICU8PiUKIyAgYmluZF90Zl9pZGYod29yZCwgam9iX2lkLCBuKQpgYGAKCmBgYHtyfQojdGV4dF90aWR5X2VtYmVkX3NtYWxsICU+JQojICBzZWxlY3Qoam9iX2lkLCB3b3JkLCB0Zl9pZGYsIGV2ZXJ5dGhpbmcoKSkgJT4lCiMgIHNlbGVjdCgtbiwgdGYsIGlkZikgJT4lCiMgIG11dGF0ZShhY3Jvc3Mod2hlcmUoc3RhcnRzX3dpdGgoJ2QnKSksIH4gZnVuKC54KSAueCAqIHRmX2lkZiwgLCAubmFtZXMgPSAiLmNvbCIpKQpgYGAKCgoKCgpgYGB7cn0KdGV4dF90aWR5X2VtYmVkX3NtYWxsICU8PiUKICBncm91cF9ieShqb2JfaWQpICU+JQogIHN1bW1hcmlzZShhY3Jvc3Mod2hlcmUoaXMubnVtZXJpYyksIH4gbWVhbigueCwgbmEucm0gPSBUUlVFKSkpCmBgYAoKYGBge3J9CmxpYnJhcnkod2lkeXIpCmBgYAoKYGBge3J9CnRleHRfdGlkeV9lbWJlZF9zbWFsbCAlPiUKICBwYWlyd2lzZV9zaW1pbGFyaXR5KCkKYGBgCgojIERvYzJWZWMKCmBgYHtyfQpsaWJyYXJ5KGRvYzJ2ZWMpCmBgYAoKYGBge3J9CnggPC0gZGF0YSAjWzE6MTAwMDAsXQp4JHRleHQgICA8LSB0b2xvd2VyKHgkdGV4dCkKeCR0ZXh0ICAgPC0gZ3N1YigiW15bOmFscGhhOl1dIiwgIiAiLCB4JHRleHQpCngkdGV4dCAgIDwtIGdzdWIoIltbOnNwYWNlOl1dKyIsICIgIiwgeCR0ZXh0KQp4JHRleHQgICA8LSB0cmltd3MoeCR0ZXh0KQpgYGAKCmBgYHtyfQp4ICU8PiUKICByZW5hbWUoZG9jX2lkID0gam9iX2lkKQpgYGAKCgpgYGB7cn0KIyMgTW9yZSByZWFsaXN0aWMgbW9kZWwKbW9kZWwgPC0gcGFyYWdyYXBoMnZlYyh4ID0geCwgdHlwZSA9ICJQVi1ETSIsIGRpbSA9IDEwLCBpdGVyID0gMywgIAogICAgICAgICAgICAgICAgICAgICAgIG1pbl9jb3VudCA9IDUsIGxyID0gMC4wNSwgdGhyZWFkcyA9IDQpCmBgYAoKYGBge3J9CmVtYmVkZGluZ193b3JkIDwtIGFzLm1hdHJpeChtb2RlbCwgd2hpY2ggPSAid29yZHMiKQplbWJlZGRpbmdfZG9jIDwtIGFzLm1hdHJpeChtb2RlbCwgd2hpY2ggPSAiZG9jcyIpCmBgYAoKYGBg
e3J9CmVtYmVkZGluZ193b3JkICU+JSBoZWFkKCkKYGBgCmBgYHtyfQplbWJlZGRpbmdfZG9jICU+JSBoZWFkKCkKYGBgCmBgYHtyfQpubiA8LSBwcmVkaWN0KG1vZGVsLCBuZXdkYXRhID0gYygic2VydmVyIiwgJ2NvbXB1dGVyJywgJ3BpenphJyksIHR5cGUgPSAibmVhcmVzdCIsIHdoaWNoID0gIndvcmQyd29yZCIsIHRvcF9uID0gNSkKbm4KYGBgCmBgYHtyfQpubiA8LSBwcmVkaWN0KG1vZGVsLCBuZXdkYXRhID0gYygnc3VzaGknLCAnd2FpdGVyJywgJ2NoZWYnKSwgdHlwZSA9ICJuZWFyZXN0Iiwgd2hpY2ggPSAiZG9jMmRvYyIsICAgdG9wX24gPSA1KQpubgpgYGAKCg==