Fuzzy Matcher Stage 1

The fuzzy matcher predicts the likelihood that two toponyms are the same place even though their spellings might be different. It has two stages.

This file develops Stage 1 of the fuzzy toponym matcher. Its job is to screen out the vast majority of suggestions that are too dissimilar to ever be a plausible match. Its output will have a high false positive rate; the surviving suggestions are then further refined in Stage 2.

# Clear the workspace and trigger garbage collection before the analysis runs.
rm(list = ls())
gc()
library(MeasuringLandscape)
library(tidyverse)
#devtools::load_all()
# Output directory for figures, built from the project root reported by here::here().
dir_figures <- glue::glue(here::here(), "/paper/figures/")
knitr::opts_knit$set(progress = TRUE, verbose = TRUE)
# A percentage width belongs in `out.width`; `fig.width` is a numeric size in inches
# and must not be given a string such as '100%'.
knitr::opts_chunk$set(out.width = "100%", warning = FALSE, message = FALSE, cache = TRUE)
options(width = 160)

Load Hand Labeled Place Matches

# Load the hand-labeled examples shipped in the package's extdata folder and
# drop fully duplicated rows.
handlabeled <- data.table::fread(
  system.file(
    "extdata",
    "event_flatfile_matches_for_hand_labeling - event_flatfile_matches_for_hand_labeling.csv",
    package = "MeasuringLandscape"
  ),
  data.table = TRUE # spelled out: `T` is an ordinary variable that can be reassigned
) %>%
  distinct()
# Report rows x columns of the loaded table.
dim(handlabeled)
[1] 20243     8
# Stem the first name of every pair; pairs whose two stems are identical are removed further down.
# strip_postfixes() is an internal (:::) helper of MeasuringLandscape; only the first element of
# its return value is stored.
# Original note: rewrote with foreach to run in parallel on Windows, but it's still relatively slow code with 8 cores.
handlabeled$stemmed_a <- MeasuringLandscape:::strip_postfixes(to_be_striped=handlabeled$name_cleaner_a)[[1]]
[1] "Loading Preexisting"
# Stem the second name of every pair the same way (first element of the helper's return value).
handlabeled$stemmed_b <- MeasuringLandscape:::strip_postfixes(to_be_striped=handlabeled$name_cleaner_b)[[1]]
[1] "Loading Preexisting"
# Keep only the pairs whose two stems differ.
handlabeled_unique <- subset(handlabeled, stemmed_a!=stemmed_b) # very important we're dropping any with identical stems for evaluation
# Count the values of the rex_match label among the remaining pairs.
table(handlabeled_unique$rex_match) #1090 matches, 16978 nonmatches

    0     1 
16978  1090 
# Stem them
# NOTE(review): handlabeled_unique was subset from handlabeled after stemmed_a had been added,
# so this looks like a recomputation of a column that already exists - confirm whether it is needed.
handlabeled_unique$stemmed_a <- MeasuringLandscape:::strip_postfixes(handlabeled_unique$name_cleaner_a)[[1]]
[1] "Loading Preexisting"
# NOTE(review): stemmed_b also already exists on handlabeled_unique via the earlier subset() - confirm whether this recomputation is needed.
handlabeled_unique$stemmed_b <- MeasuringLandscape:::strip_postfixes(handlabeled_unique$name_cleaner_b)[[1]]
[1] "Loading Preexisting"
# Order-independent key for each pair: the two stems are joined with "_", split
# again on "_", sorted and re-joined, so (a, b) and (b, a) produce the same key.
# Note that a stem which itself contains "_" is split into several pieces before sorting.
# vapply() pins the result to one string per row, so the column stays character
# even when there are zero rows (sapply() would return an empty list there).
handlabeled_unique$stemmed_ab <- vapply(
  strsplit(
    paste(handlabeled_unique$stemmed_a, handlabeled_unique$stemmed_b, sep = "_"),
    "_"
  ),
  function(parts) paste(sort(parts), collapse = "_"),
  character(1)
)
# Dimensions of the full table (handlabeled, not handlabeled_unique).
dim(handlabeled)
[1] 20243    10
# Short aliases for the two cleaned names.
handlabeled$a <- handlabeled$name_cleaner_a
handlabeled$b <- handlabeled$name_cleaner_b
# Pair keys in both orders, added by reference in a single data.table call.
handlabeled[, `:=`(
  ab = paste(a, b, sep = "_"),
  ba = paste(b, a, sep = "_")
)]
# All distinct stemmed strings appearing on either side of a pair.
stemmed_ab <- unique(c(handlabeled$stemmed_a, handlabeled$stemmed_b))
length(stemmed_ab) #where ab is the unique toponym strings found in the data
[1] 5954

Grid search optimal parameters for locality sensitive hashing.

Settling on 20 bands, 5 rows, and qgram of 1 letter

# Switch: TRUE reruns the grid search and overwrites the cached .Rds in the source
# tree; FALSE (default) only loads the cached results.
fromsrcatch <- FALSE
if (fromsrcatch) {
  grid_search_lhs <- list()
  # `q` is the number of bands passed to lhs_textreuse(); the minhash count is fixed at 100.
  for (q in c(2, 5, 10, 20, 25, 50)) {
    print(q)
    grid_search_lhs[[as.character(q)]] <- lhs_textreuse(minhash_count = 100, bands = q) #good trade off
    print(grid_search_lhs[[as.character(q)]])
  }
  # Namespace-qualified because data.table is never attached in this file.
  grid_search_lhs_dt <- data.table::rbindlist(grid_search_lhs)

  # Written relative to the current working directory.
  saveRDS(grid_search_lhs_dt,
          glue::glue(getwd(), "/../inst/extdata/grid_search_lhs_dt.Rds"))
}
# Always read the results back from the installed package's extdata folder.
# NOTE(review): the save above targets the source tree while this reads via system.file();
# presumably the package must be reinstalled/reloaded for a fresh run to show up here - confirm.
grid_search_lhs_dt <- readRDS(system.file("extdata", "grid_search_lhs_dt.Rds", package = "MeasuringLandscape"))

There's a big discontinuity between 50 bands and 25, followed by diminishing returns and increasingly higher false negative rates thereafter. We choose 25 bands as a compromise between a low false negative rate and fewer suggestions per case (Appendix Figure 4).

# Grid-search results: false_negative on x, suggestions_per on y, each point
# drawn as a label showing its number of bands.
p_lhs_gridsearch <- ggplot(
  data = grid_search_lhs_dt,
  mapping = aes(x = false_negative, y = suggestions_per, label = bands)
) +
  geom_label() +
  labs(title = "")
p_lhs_gridsearch

# Write the figure as a PDF into dir_figures; only the width is set explicitly.
ggsave(
  plot = p_lhs_gridsearch,
  filename = glue::glue(dir_figures, "p_lhs_gridsearch.pdf"),
  device = cairo_pdf, #have to use cairo to correctly embed the fonts
  #height = 8,
  width = 5.5
)
Saving 5.5 x 8 in image
LS0tCnRpdGxlOiAiMDMgRnV6enkgTWF0Y2hlciBTdGFnZSAxIChMb2NhbGl0eSBTZW5zaXRpdmUgSGFzaGluZykgIgphdXRob3I6ICJSZXggVy4gRG91Z2xhc3MgYW5kIEtyaXN0ZW4gSGFya25lc3MiCmRhdGU6ICJNYXJjaCA5LCAyMDE4IgpvdXRwdXQ6IAogIGh0bWxfbm90ZWJvb2s6CiAgICB0b2M6IHRydWUKICAgIHRvY19mbG9hdDogdHJ1ZQplZGl0b3Jfb3B0aW9uczogCiAgY2h1bmtfb3V0cHV0X3R5cGU6IGlubGluZQotLS0KPHN0eWxlPgogICAgYm9keSAubWFpbi1jb250YWluZXIgewogICAgICAgIG1heC13aWR0aDogMTAwJTsKICAgIH0KPC9zdHlsZT4KCiMgRnV6enkgTWF0Y2hlciBTdGFnZSAxCgpUaGUgZnV6enkgbWF0Y2hlciBwcmVkaWN0cyB0aGUgbGlrZWxpaG9vZCB0aGF0IHR3byB0b3BvbnltcyBhcmUgdGhlIHNhbWUgcGxhY2UgZXZlbiB0aG91Z2ggdGhlaXIgc3BlbGxpbmdzIG1pZ2h0IGJlIGRpZmZlcmVudC4gSXQgaGFzIHR3byBzdGFnZXMuCgpUaGlzIGZpbGUgZGV2ZWxvcHMgU3RhZ2UgMSBvZiB0aGUgZnV6enkgdG9wb255bSBtYXRjaGVyLiBJdHMgam9iIGlzIHRvIHNjcmVlbiBvdXQgdGhlIHZhc3QgbWFqb3JpdHkgb2Ygc3VnZ2VzdGlvbnMgdGhhdCBhcmUgdG9vIGRpc3NpbWlsYXIgdG8gZXZlciBiZSBhIHBsYXVzaWJsZSBtYXRjaC4gSXQgd2lsbCBoYXZlIGEgaGlnaCBmYWxzZSBwb3NpdGl2ZSByYXRlLCB3aGljaCBjYW4gdGhlbiBiZSBmdXJ0aGVyIHJlZmluZWQgaW4gc3RhZ2UgMi4KCiAKYGBge3IgLCByZXN1bHRzPSdoaWRlJywgbWVzc2FnZT1GQUxTRSwgd2FybmluZz1GQUxTRX0Kcm0obGlzdD1scygpKTsgZ2MoKQpsaWJyYXJ5KE1lYXN1cmluZ0xhbmRzY2FwZSkKbGlicmFyeSh0aWR5dmVyc2UpCiNkZXZ0b29sczo6bG9hZF9hbGwoKQoKZGlyX2ZpZ3VyZXMgPC0gZ2x1ZTo6Z2x1ZShoZXJlOjpoZXJlKCksICIvcGFwZXIvZmlndXJlcy8iKQoKa25pdHI6Om9wdHNfa25pdCRzZXQocHJvZ3Jlc3MgPSBUUlVFLCB2ZXJib3NlID0gVFJVRSkKa25pdHI6Om9wdHNfY2h1bmskc2V0KGZpZy53aWR0aD0nMTAwJScsICB3YXJuaW5nPUZBTFNFLCBtZXNzYWdlPUZBTFNFLCBjYWNoZT1UUlVFKQpvcHRpb25zKHdpZHRoID0gMTYwKQoKYGBgCgojIExvYWQgSGFuZCBMYWJlbGVkIFBsYWNlIE1hdGNoZXMKCgpgYGB7cn0KCiNMb2FkIEhhbmQgTGFiZWxlZCBFeGFtcGxlcwpoYW5kbGFiZWxlZCA8LSBkYXRhLnRhYmxlOjpmcmVhZChzeXN0ZW0uZmlsZSgiZXh0ZGF0YSIsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICJldmVudF9mbGF0ZmlsZV9tYXRjaGVzX2Zvcl9oYW5kX2xhYmVsaW5nIC0gZXZlbnRfZmxhdGZpbGVfbWF0Y2hlc19mb3JfaGFuZF9sYWJlbGluZy5jc3YiLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBwYWNrYWdlID0gIk1lYXN1cmluZ0xhbmRzY2FwZSIpLCBkYXRhLnRhYmxlPVQpICU+JSBkaXN0aW5jdCgpIApkaW0oaGFuZGxhYmVsZWQpCgojIFJlbW92ZSBleGFj
dCBtYXRjaGVzIAojIFJld3JvdGUgIHdpdGggZm9yZWFjaCB0byBydW4gaW4gcGFyYWxsZWwgb24gd2luZG93cywgYnV0IGl0J3Mgc3RpbGwgcmVsYXRpdmVseSBzbG93IGNvZGUgd2l0aCA4IGNvcmVzLgpoYW5kbGFiZWxlZCRzdGVtbWVkX2EgPC0gTWVhc3VyaW5nTGFuZHNjYXBlOjo6c3RyaXBfcG9zdGZpeGVzKHRvX2JlX3N0cmlwZWQ9aGFuZGxhYmVsZWQkbmFtZV9jbGVhbmVyX2EpW1sxXV0KaGFuZGxhYmVsZWQkc3RlbW1lZF9iIDwtIE1lYXN1cmluZ0xhbmRzY2FwZTo6OnN0cmlwX3Bvc3RmaXhlcyh0b19iZV9zdHJpcGVkPWhhbmRsYWJlbGVkJG5hbWVfY2xlYW5lcl9iKVtbMV1dCgpoYW5kbGFiZWxlZF91bmlxdWUgPC0gc3Vic2V0KGhhbmRsYWJlbGVkLCBzdGVtbWVkX2EhPXN0ZW1tZWRfYikgIyB2ZXJ5IGltcG9ydGFudCB3ZSdyZSBkcm9wcGluZyBhbnkgd2l0aCBpZGVudGljYWwgc3RlbXMgZm9yIGV2YWx1YXRpb24KdGFibGUoaGFuZGxhYmVsZWRfdW5pcXVlJHJleF9tYXRjaCkgIzEwOTAgbWF0Y2hlcywgMTY5Nzggbm9ubWF0Y2hlcwoKI1N0ZW0gdGhlbQpoYW5kbGFiZWxlZF91bmlxdWUkc3RlbW1lZF9hIDwtIE1lYXN1cmluZ0xhbmRzY2FwZTo6OnN0cmlwX3Bvc3RmaXhlcyhoYW5kbGFiZWxlZF91bmlxdWUkbmFtZV9jbGVhbmVyX2EpW1sxXV0KaGFuZGxhYmVsZWRfdW5pcXVlJHN0ZW1tZWRfYiA8LSBNZWFzdXJpbmdMYW5kc2NhcGU6OjpzdHJpcF9wb3N0Zml4ZXMoaGFuZGxhYmVsZWRfdW5pcXVlJG5hbWVfY2xlYW5lcl9iKVtbMV1dCmhhbmRsYWJlbGVkX3VuaXF1ZSRzdGVtbWVkX2FiIDwtIHNhcHBseShsYXBwbHkoc3Ryc3BsaXQocGFzdGUoaGFuZGxhYmVsZWRfdW5pcXVlJHN0ZW1tZWRfYSwKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBoYW5kbGFiZWxlZF91bmlxdWUkc3RlbW1lZF9iLCBzZXA9Il8iKSwKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAiXyIpLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIHNvcnQpLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgcGFzdGUsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBjb2xsYXBzZT0iXyIpCgoKZGltKGhhbmRsYWJlbGVkKQpoYW5kbGFiZWxlZCRhIDwtIGhhbmRsYWJlbGVkJG5hbWVfY2xlYW5lcl9hCmhhbmRsYWJlbGVkJGIgPC0gaGFuZGxhYmVsZWQkbmFtZV9jbGVhbmVyX2IKaGFuZGxhYmVsZWRbLGFiOj1wYXN0ZShhLGIsc2VwPSJfIildCmhhbmRsYWJlbGVkWyxiYTo9cGFzdGUoYixhLHNlcD0iXyIpXQoKc3RlbW1lZF9hYiA8LSB1bmlxdWUoYyhoYW5kbGFiZWxlZCRzdGVtbWVkX2EsIGhhbmRsYWJlbGVkJHN0ZW1tZWRfYikpIDsgbGVuZ3RoKHN0ZW1tZWRfYWIpICN3aGVyZSBhYiBpcyB0aGUgdW5pcXVlIHRvcG9ueW0gc3RyaW5ncyBmb3VuZCBp
biB0aGUgZGF0YQoKYGBgCgpHcmlkIHNlYXJjaCBvcHRpbWFsIHBhcmFtZXRlcnMgZm9yIGxvY2FsaXR5IHNlbnNpdGl2ZSBoYXNoaW5nLgoKU2V0dGxpbmcgb24gMjAgYmFuZHMsIDUgcm93cywgYW5kIHFncmFtIG9mIDEgbGV0dGVyCgpgYGB7cn0KCmZyb21zcmNhdGNoPUYKaWYoZnJvbXNyY2F0Y2gpewogIGdyaWRfc2VhcmNoX2xocyA8LSBsaXN0KCkKICBmb3IocSBpbiBjKDIsNSwxMCwyMCwyNSw1MCkpewogICAgcHJpbnQocSkKICAgIGdyaWRfc2VhcmNoX2xoc1tbYXMuY2hhcmFjdGVyKHEpXV0gPC0gbGhzX3RleHRyZXVzZShtaW5oYXNoX2NvdW50PTEwMCwgIGJhbmRzPXEpICNnb29kIHRyYWRlIG9mZgogICAgcHJpbnQoZ3JpZF9zZWFyY2hfbGhzW1thcy5jaGFyYWN0ZXIocSldXSkKICB9CiAgZ3JpZF9zZWFyY2hfbGhzX2R0IDwtIHJiaW5kbGlzdChncmlkX3NlYXJjaF9saHMpCiAgCiAgc2F2ZVJEUyhncmlkX3NlYXJjaF9saHNfZHQsCiAgICAgICAgICBnbHVlOjpnbHVlKGdldHdkKCksICIvLi4vaW5zdC9leHRkYXRhL2dyaWRfc2VhcmNoX2xoc19kdC5SZHMiKSkKfQoKZ3JpZF9zZWFyY2hfbGhzX2R0IDwtIHJlYWRSRFMoc3lzdGVtLmZpbGUoImV4dGRhdGEiLCAiZ3JpZF9zZWFyY2hfbGhzX2R0LlJkcyIsIHBhY2thZ2UgPSAiTWVhc3VyaW5nTGFuZHNjYXBlIikpCgpgYGAKClRoZXJlJ3MgYSBiaWcgZGlzY29udGludWl0eSBiZXR3ZWVuIDUwIGJhbmRzIGFuZCAyNSwgYW5kIHRoZW4gZGltaW5pc2hpbmcgcmV0dXJucyB3aXRoIGluY3JlYXNpbmdseSBoaWdoZXIgZmFsc2UgbmVnYXRpdmUgcmF0ZXMgdGhlcmVhZnRlci4gQ2hvb3NpbmcgMjUgYXMgYSBjb21wcm9taXNlIGJldHdlZW4gbG93IGZhbHNlIG5lZ2F0aXZlIHJhdGUgYW5kIGZld2VyIHN1Z2dlc3Rpb25zIHBlciBjYXNlLiAoQXBwZW5kaXggRmlndXJlIDQpCgpgYGB7ciwgZmlnLndpZHRoPTEyLCBmaWcuaGVpZ2h0PTh9CgpwX2xoc19ncmlkc2VhcmNoIDwtIGdncGxvdChncmlkX3NlYXJjaF9saHNfZHQsIGFlcyh4PWZhbHNlX25lZ2F0aXZlLHk9c3VnZ2VzdGlvbnNfcGVyLCBsYWJlbD1iYW5kcykpICsgZ2VvbV9sYWJlbCgpICsgZ2d0aXRsZSgiIikKcF9saHNfZ3JpZHNlYXJjaAoKZ2dzYXZlKAogIGZpbGVuYW1lID0gZ2x1ZTo6Z2x1ZShkaXJfZmlndXJlcywgInBfbGhzX2dyaWRzZWFyY2gucGRmIiksCiAgcGxvdCA9IHBfbGhzX2dyaWRzZWFyY2gsCiAgd2lkdGggPSA1LjUsCiAgI2hlaWdodCA9IDgsCiAgZGV2aWNlID0gY2Fpcm9fcGRmICNoYXZlIHRvIHVzZSBjYWlybyB0byBjb3JyZWN0bHkgZW1iZWQgdGhlIGZvbnRzCikKCmBgYAo=