Takes in locations of events described as text, and returns all possible matches across different gazetteers.

Dependencies: events_sf.Rdata, flatfiles_sf_roi.Rdata Products: georef_all_dt.Rds

rm(list=ls()); gc()
# Hiding output and warnings
# !diagnostics off
library(MeasuringLandscape)
library(tidyverse)
dir_figures <- paste0(here::here(), "/paper/figures/")
gc()
knitr::opts_knit$set(progress = TRUE, verbose = TRUE)
knitr::opts_chunk$set(fig.width=12, fig.height=8,  warning=FALSE, message=FALSE, cache=TRUE)
options(width = 160)

Load Files

We load the events file to be geolocated.

#Load Events
events_sf <- readRDS(system.file("extdata", "events_sf.Rdata", package = "MeasuringLandscape")) 
events_sf_text_coord_unique <- plyr::ddply(events_sf[,c('location_text','name_clean','name_cleaner','document_district_clean','map_coordinate_clean_latitude','map_coordinate_clean_longitude')],
                                     "location_text", transform,
      map_coordinate_has =sum(!is.na(map_coordinate_clean_latitude))
      )

We load the combined gazetteer files.

#Load Gazetteers
flatfiles_sf_roi <- readRDS(system.file("extdata", "flatfiles_sf_roi.Rdata", package = "MeasuringLandscape")) 
Warning message:
Unknown or uninitialised column: 'name_cleaner_stem'. 
dim(flatfiles_sf_roi)
[1] 55966    22
flatfiles_dt <- data.table::as.data.table(flatfiles_sf_roi)
data.table::setkey(flatfiles_dt,place_hash)

We load a pretrained toponym matcher based on XGBoost and many string distance features.

#Load toponym model
#p_load(xgboost)
toponym_xb_model <- xgboost::xgb.load(system.file("extdata", "toponym_xb_onlystring.bin", package = "MeasuringLandscape"))

Summary Statistics

When we start, we have 2939 unique location strings in the events data.

rr #events_sf$name_cleaner %>% tabyl(sort=T)

When we start, we have 22735 unique location strings in the gazetteer data.

rr #flatfiles_sf_roi$name_cleaner %>% tabyl(sort=T)

If we considered every single pair of possible matches it would require about 62 million comparisons (66818165). And that is just for our relatively small number of event location strings and restricting our gazetteers to a relatively small region of interest. Obviously, any approach we implement is going to have to scale sub-quadratic in the number of locations or it won’t be applicable to any problem of interesting size.

Stem

Our first move is to stem each location string of prefixes and common suffixes so that we can match slight variations of the same place to a single identifier.

#Step 1  Get Suggestions for Stems
#Stem the gazetteer names
temp <- MeasuringLandscape:::strip_postfixes(flatfiles_sf_roi$name_cleaner)
[1] "Loading Preexisting"
flatfiles_sf_roi$name_cleaner_stem <- temp[[1]]
flatfiles_sf_roi$name_cleaner_postfix <- temp[[2]]
#flatfiles_sf_roi$name_cleaner_stem %>% tabyl(sort=T)

There are 17231 unique location stems in the flatfiles data.

#Stem the event names
temp <- MeasuringLandscape:::strip_postfixes(events_sf$name_cleaner)
[1] "Loading Preexisting"
events_sf$name_cleaner_stem <- temp[[1]]
events_sf$name_cleaner_postfix <- temp[[2]]
#events_sf$name_cleaner %>% tabyl(sort=T)

There are 2465 unique location stems in the events data.

#Unique stems
stemmed_ab <- unique(c(flatfiles_sf_roi$name_cleaner_stem, events_sf$name_cleaner_stem))
length(stemmed_ab) #17,880 unique stems
[1] 17876
#Unique name-stem pairs
names_and_stems <- rbind(
                          as.data.frame(flatfiles_sf_roi)[,c('name_cleaner','name_cleaner_stem')],
                          as.data.frame(events_sf)[,c('name_cleaner','name_cleaner_stem')]
                        ) %>% unique() ; dim(names_and_stems)
[1] 23590     2

LSH

Our next move is hash each stemmed location in a way that locations that are somewhat similar to one another are returned as possible matches and the vast majority of pairs that are too dissimilar to ever possibly match are excluded. This requires only a single pass through the data, and so scales linearly in the number of locations. We select parameters which perform well on our hand-labeled training dataset of toponym matches and mismatches. Where performing well at this stage means a low false negative rate (missing few genuine matches) and a moderate false positive rate (returning many irrelevant matches, but not an overwhelming number). Here we are only trying to reject as many obvious nonmatches as possible.

minhash_count=100
bands=25
fromscratch=F # For some reason these parallel functions hang in rmarkdown. Copy and paste them into console to run.
if(fromscratch){
  #Optimal settings derived in 04_fuzzy_matcher_stage_1
  
  library(textreuse)
  stemmed_ab_spaced <- sapply(strsplit(stemmed_ab, split="") , paste, collapse=" ")
  minhash <- minhash_generator(n = minhash_count, seed = 1)
  options("mc.cores" = parallel::detectCores()) #Much faster when paralized
  #options("mc.cores" = 1) #Much faster when paralized
  #This doesn't like to be ran in rmarkdown, it'll spawn processes and hang. Maybe see if it'll work as a function?
  
  corpus_ab_spaced <- TextReuseCorpus(text = stemmed_ab_spaced,
                                      tokenizer = tokenize_ngrams,
                                      n = 2, #wow if you pass this as a variable instead of a number it crashes. Wtf?
                                      minhash_func = minhash,
                                      keep_tokens = TRUE,
                                      progress = T)
  
  #buckets <- lsh(corpus_ab_spaced, bands = 80, progress = T) #Single threaded but should be parallizable
  
  n.cores <- parallel::detectCores()
  cuts <- cut(1:length(corpus_ab_spaced), n.cores)
  #buckets_list <- mclapply(levels(cuts),
  #               function(q) lsh(corpus_ab_spaced[cuts==q], bands = bands, progress = T) ,
  #               mc.cores = n.cores)
  #buckets <- do.call(rbind, buckets_list)
  
  library(doParallel)
  library(foreach)
  cl<- makeCluster(parallel::detectCores()) #change the 2 to your number of CPU cores
  registerDoParallel(cl)
  buckets <- foreach(q=levels(cuts), .combine='rbind', .packages=c('textreuse')) %dopar%  
                lsh(corpus_ab_spaced[cuts==q], bands = bands, progress =F)
  stopCluster(cl)
  
  candidates <- lsh_candidates(buckets)
  dim(candidates) #842,288 
  candidates$a_numeric <- as.numeric( gsub("doc-","",candidates$a) )
  candidates$b_numeric <- as.numeric( gsub("doc-","",candidates$b) )
  candidates$stemmed_a <- stemmed_ab[candidates$a_numeric]
  candidates$stemmed_b <- stemmed_ab[candidates$b_numeric]
  candidates$stemmed_ab <- paste(candidates$stemmed_a,candidates$stemmed_b, sep="_")
  candidates$stemmed_ba <- paste(candidates$stemmed_b,candidates$stemmed_a, sep="_")
  
  saveRDS(candidates, paste0(here::here(), "inst/extdata/candidates.Rdata")) #Leaving off the .. because you should only be running this by hand
  
}
candidates <- readRDS(system.file("extdata",
                                 "candidates.Rdata",
                                 package = "MeasuringLandscape"))
#head(candidates) %>% DT::datatable()

There are 1004273 candidate stem matches return by locality sensitive hashing with these parameters. This is a tiny 0.0031 fraction of all the possible stem to stem matches we could have considered.

Expand Stem matches into full matches

We consider any two strings to be a possible match if we consider their stems to be a possible match. This bit of code expands and merges to get the suggestions in terms of event string- gazetteer string possible matches. Throughout we use the example of the stem “gura” to show possible matches and how they’re reduced over various steps.

ab_suggestions_events <- subset(ab_suggestions_events, name_cleaner_a %in% unique( events_sf$name_cleaner)  ) ; #dim(ab_suggestions_events)  #
glue::glue(stemmed_ab_suggestions_events %>% nrow() , " string matches")
152864 string matches
#DT::datatable(ab_suggestions_events[stemmed_a=="gura"])

Screen pairs that are too dissimilar

ab_suggestions_events[,temp_q_cos:= stringdist::stringsim(name_cleaner_a,name_cleaner_b,"cos", nthread=parallel::detectCores(),q=2),] #both fast and very good at sorting
ab_suggestions_events <- ab_suggestions_events[temp_q_cos>.3] ; #dim(ab_suggestions_events) #less than .3 is never match
glue::glue(ab_suggestions_events %>% nrow() , " string matches after removing those with too small a ngram2 cosine distance")
103311 string matches after removing those with too small a ngram2 cosine distance

XGBoost Toponym Matching

Next, we refine the remaining matches using additional string distance features and an XGBoost model trained earlier on the labeled match/no match toponym training examples.

ab_suggestions_events$a <- ab_suggestions_events$stemmed_a
ab_suggestions_events$b <- ab_suggestions_events$stemmed_b
ab_suggestions_events_features <- MeasuringLandscape:::toponym_add_features(ab_suggestions_events)
[1] "1 of 24"
[1] "2 of 24"
[1] "3 of 24"
[1] "4 of 24"
[1] "5 of 24"
[1] "6 of 24"
[1] "7 of 24"
[1] "8 of 24"
[1] "9 of 24"
[1] "10 of 24"
[1] "11 of 24"
[1] "12 of 24"
[1] "13 of 24"
[1] "14 of 24"
[1] "15 of 24"
[1] "16 of 24"
[1] "17 of 24"
[1] "18 of 24"
[1] "19 of 24"
[1] "20 of 24"
[1] "21 of 24"
[1] "22 of 24"
[1] "23 of 24"
[1] "24 of 24"
vars_x_string <- c(
    "Jaro",
    "Optimal_String_Alignment"    ,
    "Levenshtein",
    "Damerau_Levenshtein"    ,
     "Longest_Common_Substring"     ,
    "q_gram_1",
    "q_gram_2",
    "q_gram_3",
    "q_gram_4",
    "q_gram_5",
    'Cosine_1',
    'Cosine_2',
    'Cosine_3',
    'Cosine_4',
    'Cosine_5',
    "Jaccard"              ,
     "First_Mistmatch"         ,
    "a_nchar"     ,
    "b_nchar"   ,
    "ab_nchar_diff"       ,             
    "dJaro",
    "dOptimal_String_Alignment"      ,
    "dLevenshtein"     ,
    "dDamerau_Levenshtein"  ,           
    "dLongest_Common_Substring",
    "dq_gram",
    "dCosine",
    "dJaccard"
) 
dpredict<-xgboost::xgb.DMatrix(data= as.matrix(ab_suggestions_events_features[,vars_x_string, with=F]), missing = NA)
#Step 4 Predict the likelihood that two strings are same
ab_suggestions_events_features$toponym_xb_model_prediction <- predict( toponym_xb_model, dpredict )
ab_suggestions_events_features$toponym_xb_model_prediction  <- 1/(1 + exp(-ab_suggestions_events_features$toponym_xb_model_prediction ))
# why does this work now but not in 06
hist(ab_suggestions_events_features$toponym_xb_model_prediction)

ab_suggestions_small <- ab_suggestions_events_features[,c('a','b','name_cleaner_a','name_cleaner_b','toponym_xb_model_prediction',"N")]
ab_suggestions_directed <- ab_suggestions_small
ab_suggestions_directed$a_event <- ab_suggestions_directed$name_cleaner_a %in% events_sf$name_cleaner
ab_suggestions_directed <- subset(ab_suggestions_directed, a_event) #just to double check, should be no change
ab_suggestions_directed$b_event <- ab_suggestions_directed$name_cleaner_b %in% events_sf$name_cleaner
ab_suggestions_directed$b_event_coord <- ab_suggestions_directed$name_cleaner_b %in% flatfiles_sf_roi$name_cleaner[flatfiles_sf_roi$source_dataset %in% c('events') & !is.na(flatfiles_sf_roi$latitude)]
ab_suggestions_directed$b_gaz_coord   <- ab_suggestions_directed$name_cleaner_b %in% flatfiles_sf_roi$name_cleaner[!flatfiles_sf_roi$source_dataset %in% c('events')  & !is.na(flatfiles_sf_roi$latitude) ]
hist(ab_suggestions_directed$toponym_xb_model_prediction) #before subsetting

ab_suggestions_directed <- ab_suggestions_directed[a_event & (b_event_coord|b_gaz_coord)]
ab_suggestions_directed <- ab_suggestions_directed[toponym_xb_model_prediction>.5 | name_cleaner_a==name_cleaner_b] #Only keep those with greater .5 chance of match
hist(ab_suggestions_directed$toponym_xb_model_prediction) #after subsetting

ab_suggestions_directed[,toponym_xb_model_prediction_max:=max(toponym_xb_model_prediction),
                        by=c('name_cleaner_a','b_gaz_coord')]
ab_suggestions_directed_max_gaz_coord <- subset(ab_suggestions_directed,
                                                b_gaz_coord &
                                                  toponym_xb_model_prediction==toponym_xb_model_prediction_max
                                                )
ab_suggestions_directed_max_event_coord <- subset(ab_suggestions_directed,
                                                  b_event_coord &
                                                    toponym_xb_model_prediction==toponym_xb_model_prediction_max
                                                  )
#A non missing coordinate
condition <- !is.na(sf::st_coordinates(events_sf)[,1])
table(condition)
condition
FALSE  TRUE 
 4877  5592 
#An exact match to the gaz
condition2 <- events_sf$name_cleaner %in% flatfiles_sf_roi$name_cleaner[!flatfiles_sf_roi$source_dataset %in% c('events','events_poly')]
table(condition2)
condition2
FALSE  TRUE 
 6120  4349 
table(condition | condition2)

FALSE  TRUE 
 2881  7588 
#Fuzzy match to a gaz
condition3 <- events_sf$name_cleaner %in% ab_suggestions_directed_max_gaz_coord$name_cleaner_a 
table(condition3)
condition3
FALSE  TRUE 
 4090  6379 
table(condition | condition2 | condition3)

FALSE  TRUE 
 1959  8510 
table(condition | condition2 | condition3) / length(condition)

    FALSE      TRUE 
0.1871239 0.8128761 
#Exact match to another event
condition4 <- events_sf$name_cleaner %in% flatfiles_sf_roi$name_cleaner[flatfiles_sf_roi$source_dataset %in% c('events') & !is.na(flatfiles_sf_roi$latitude) ]
table(condition4)
condition4
FALSE  TRUE 
 3682  6787 
table(condition4) / length(condition)
condition4
   FALSE     TRUE 
0.351705 0.648295 
table(condition | condition2 | condition3 | condition4)

FALSE  TRUE 
 1772  8697 
table(condition | condition2 | condition3 | condition4) / length(condition)

    FALSE      TRUE 
0.1692616 0.8307384 
#Fuzzy match to event with coordinate
condition5 <- events_sf$name_cleaner %in% ab_suggestions_directed_max_event_coord$name_cleaner_a 
table(condition5)
condition5
FALSE  TRUE 
 3631  6838 
table(condition5) / length(condition)
condition5
    FALSE      TRUE 
0.3468335 0.6531665 
table(condition | condition2 | condition3 | condition4 | condition5)

FALSE  TRUE 
 1690  8779 
table(condition | condition2 | condition3 | condition4 | condition5) / length(condition)

   FALSE     TRUE 
0.161429 0.838571 
condition6 <- tolower(events_sf$document_district_clean) %in% flatfiles_sf_roi$name_cleaner[!flatfiles_sf_roi$source_dataset %in% c('events')]
table(condition6)
condition6
FALSE  TRUE 
   11 10458 
saveRDS(ab_suggestions_directed,
        file=glue::glue(getwd(), "/../inst/extdata/ab_suggestions_directed.Rds") 
        )

Given all of the above we want to produce a matching dataset broken out across

exact or fuzzy match place description or district/province

#For when this inevitably crashes
ab_suggestions_directed <-  readRDS(system.file("extdata", "ab_suggestions_directed.Rds", package = "MeasuringLandscape")) 
flatfiles_sf_roi_centroids <- flatfiles_sf_roi %>% filter(!is.na(latitude) | geometry_type != "POINT") %>% sf::st_centroid()
st_centroid assumes attributes are constant over geometries of xst_centroid does not give correct centroids for longitude/latitude data
#Do some quick accounting
#Create new dataset that includes
#1) Diads chosen by fuzzy ab_suggestions_directed 
#2) Diads of identical
georef <- data.table::setnames( ab_suggestions_directed[,c('name_cleaner_a','name_cleaner_b','toponym_xb_model_prediction')] , c('georef_a','georef_b','toponym_xb_model_prediction') ) #So this is now a list of names that are either identical or fuzzy matches for one another
georef$georef_a <- trimws(georef$georef_a)
georef$georef_b <- trimws(georef$georef_b)
georef <- unique(georef)
#georef$georef_ab_identical <- georef$georef_a==georef$georef_b 
table(events_sf$name_cleaner %in% georef$georef_a) #7,742 events with at least one gazetteer suggestion

FALSE  TRUE 
 2527  7942 
#This is mapping of events to identical and fuzzy
#georef2 is a combination of events, and all the gazetteer names they might match to
georef2 <- as.data.frame(events_sf)[,c('event_hash', 'name_cleaner','document_district_clean','document_unit_type','document_date_best_year')] %>% 
  dplyr::left_join(georef, by = c("name_cleaner" = "georef_a")) %>% unique()  
dim(georef2)
[1] 92002     7
table(events_sf$name_cleaner %in% georef2$name_cleaner) #All events are in here

 TRUE 
10469 
table(events_sf$name_cleaner %in% georef2$name_cleaner[!is.na(georef2$georef_b)]) #

FALSE  TRUE 
 2527  7942 
table(events_sf$event_hash %in% georef2$event_hash) #All event hashes show up in here

 TRUE 
10469 
#This is mapping of gaz places to identical and fuzzy possible
#georef3 is a combination of gazetteers and all gazetteer names they might match to
#georef3 <- as.data.frame(flatfiles_sf_roi_centroids)[,c('place_hash','source_dataset','name_cleaner','geometry_type')] %>% left_join(georef, by = c("name_cleaner" = "georef_b")) %>% unique() 
#dim(georef3)
#This is a mapping of events to gaz where either their exact or fuzzy counterparts matched each other
#georef all is a mapping of events, to gazetteer names, to 
georef_all <- georef2 %>% 
             dplyr::left_join(as.data.frame(flatfiles_sf_roi_centroids)[,c('place_hash','source_dataset','name_cleaner','geometry_type','feature_code')],
                       by = c("georef_b" = "name_cleaner")) %>% unique()
dim(georef_all)   #303,174
[1] 532711     11
  
table(events_sf$name_cleaner %in% georef_all$name_cleaner) #All events are in here

 TRUE 
10469 
table(events_sf$name_cleaner %in% georef_all$name_cleaner[!is.na(georef_all$georef_b)]) #

FALSE  TRUE 
 2527  7942 
table(events_sf$event_hash %in% georef_all$event_hash) #All event hashes show up in here

 TRUE 
10469 
table(georef_all$source_dataset)

                    bing                   events                     gadm                 geonames                   google               historical          kenya_cadastral kenya_cadastral_district 
                    9436                   254162                    10729                    60988                    10591                    49962                      412                     1033 
      kenya_district1962     livestock_boundaries         livestock_points                      nga            openstreetmap                      tgn                 wikidata 
                     676                     8738                    35978                    57180                    16040                     2830                    11429 
length(unique(georef_all$event_hash))
[1] 10469
georef_all$georef_a <- NULL

rr #Ok next up we add distance #crs_m <- +proj=utm +zone=27 +datum=NAD83 +units=m +no_defs  #this is toally wrong, stop using it #https://epsg.io/21037 #EPSG:21037 #Projected coordinate system #Arc 1960 / UTM zone 37S rownames(events_sf) <- events_sf$event_hash

Setting row names on a tibble is deprecated.

rr #events_sf_utm <- st_transform(events_sf[,c(‘event_hash’,)], crs=21037) # #events_sf_utm <- st_sf(as.data.frame(events_sf_utm)) #rownames(events_sf_utm) <- events_sf_utm\(event_hash #It was slow as hell as a tbl, covert to just sf #flatfiles_sf_roi_centroids_utm <- st_transform(flatfiles_sf_roi_centroids[,c('place_hash',\geometry\)], crs=21037) #rownames(flatfiles_sf_roi_centroids_utm) <- flatfiles_sf_roi_centroids_utm\)place_hash #Try some smaller experiments and verify if and how long it takes to calculate distance since we’ve got a million of them #This takes a long time to calculate because there’s a million of them #Instead, add a column for whether dimensions are zero and then queue that column with the hash, much much better events_sf\(isvalid <- !is.na(st_dimension(events_sf) ) flatfiles_sf_roi_centroids\)isvalid <- !is.na(st_dimension(flatfiles_sf_roi_centroids) ) cords_events <- as.data.frame( st_coordinates(events_sf) ) rownames(cords_events) <- rownames(events_sf) cords_flatfiles <- as.data.frame( st_coordinates(flatfiles_sf_roi_centroids) ) rownames(flatfiles_sf_roi_centroids) <- flatfiles_sf_roi_centroids\(place_hash rownames(cords_flatfiles) <- rownames(flatfiles_sf_roi_centroids) coords_dt <- as.data.table( cbind(cords_events[georef_all\)event_hash,], cords_flatfiles[georef_all$place_hash,])) names(coords_dt) <- c(1,1, 2, 2) coords_dt[,distance_km:=sqrt((X2-X1)2+(Y2-Y1)2) * 111] hist(coords_dt$distance)

rr georef_all$distance <- NULL georef_all_dt <- as.data.table(cbind(georef_all, coords_dt)) georef_all_dt <- unique(georef_all_dt); dim(georef_all_dt)

[1] 532711     16

rr #georef_all_dt <- subset(georef_all_dt, !is.na(name_cleaner) & !is.na(georef_b)) #maybe don’t drop unmatched ones? If we want events as a source? table(events_sf\(name_cleaner %in% georef_all_dt\)name_cleaner) #All events are in here


 TRUE 
10469 

rr table(events_sf\(name_cleaner %in% georef_all_dt\)name_cleaner[!is.na(georef_all_dt$georef_b)]) #7,742 events with at least one gazetteer suggestion


FALSE  TRUE 
 2527  7942 

rr table(events_sf\(event_hash %in% georef_all_dt\)event_hash) #All event hashes show up in here


 TRUE 
10469 

rr saveRDS(georef_all_dt, file=paste0(here::here(), /inst/extdata/georef_all_dt.Rds) )

---
title: "05 Georeferencer"
author: "Rex W. Douglass and Kristen Harkness"
date: "March 9, 2018"
output: 
  html_notebook:
    toc: true
    toc_float: true
editor_options: 
  chunk_output_type: inline
---
<style>
    body .main-container {
        max-width: 100%;
    }
</style>


Takes in locations of events described as text, and returns all possible matches across different gazetteers.

Dependencies: events_sf.Rdata, flatfiles_sf_roi.Rdata
Products: georef_all_dt.Rds

```{r, results='hide', message=FALSE, warning=FALSE}
rm(list=ls()); gc()
# Hiding output and warnings
# !diagnostics off
library(MeasuringLandscape)
library(tidyverse)

dir_figures <- paste0(here::here(), "/paper/figures/")

gc()

knitr::opts_knit$set(progress = TRUE, verbose = TRUE)
knitr::opts_chunk$set(fig.width=12, fig.height=8,  warning=FALSE, message=FALSE, cache=TRUE)
options(width = 160)

```

# Load Files

We load the events file to be geolocated.

```{r}

#Load Events
events_sf <- readRDS(system.file("extdata", "events_sf.Rdata", package = "MeasuringLandscape")) 

events_sf_text_coord_unique <- plyr::ddply(events_sf[,c('location_text','name_clean','name_cleaner','document_district_clean','map_coordinate_clean_latitude','map_coordinate_clean_longitude')],
                                     "location_text", transform,
      map_coordinate_has =sum(!is.na(map_coordinate_clean_latitude))
      )


```

We load the combined gazetteer files.

```{r}
#Load Gazetteers
flatfiles_sf_roi <- readRDS(system.file("extdata", "flatfiles_sf_roi.Rdata", package = "MeasuringLandscape")) 
dim(flatfiles_sf_roi)
flatfiles_dt <- data.table::as.data.table(flatfiles_sf_roi)
data.table::setkey(flatfiles_dt,place_hash)

```

We load a pretrained toponym matcher based on XGBoost and many string distance features.

```{r}

#Load toponym model
#p_load(xgboost)
toponym_xb_model <- xgboost::xgb.load(system.file("extdata", "toponym_xb_onlystring.bin", package = "MeasuringLandscape"))

```


# Summary Statistics

When we start, we have `r length(unique(events_sf$name_cleaner))` unique location strings in the events data.

```{r}
#events_sf$name_cleaner %>% tabyl(sort=T)
```

When we start, we have `r length(unique(flatfiles_sf_roi$name_cleaner))` unique location strings in the gazetteer data.

```{r}
#flatfiles_sf_roi$name_cleaner %>% tabyl(sort=T)

```

If we considered every single pair of possible matches it would require about 62 million comparisons (`r length(unique(events_sf$name_cleaner)) * length(unique(flatfiles_sf_roi$name_cleaner))`). And that is just for our relatively small number of event location strings and restricting our gazetteers to a relatively small region of interest. Obviously, any approach we implement is going to have to scale sub-quadratic in the number of locations or it won't be applicable to any problem of interesting size.

# Stem

Our first move is to stem each location string of prefixes and common suffixes so that we can match slight variations of the same place to a single identifier. 

```{r}

#Step 1  Get Suggestions for Stems

#Stem the gazetteer names
temp <- MeasuringLandscape:::strip_postfixes(flatfiles_sf_roi$name_cleaner)
flatfiles_sf_roi$name_cleaner_stem <- temp[[1]]
flatfiles_sf_roi$name_cleaner_postfix <- temp[[2]]

#flatfiles_sf_roi$name_cleaner_stem %>% tabyl(sort=T)

```

There are `r length(unique(flatfiles_sf_roi$name_cleaner_stem))` unique location stems in the flatfiles data.

```{r}
#Stem the event names
temp <- MeasuringLandscape:::strip_postfixes(events_sf$name_cleaner)
events_sf$name_cleaner_stem <- temp[[1]]
events_sf$name_cleaner_postfix <- temp[[2]]

#events_sf$name_cleaner %>% tabyl(sort=T)

```

There are `r length(unique(events_sf$name_cleaner_stem))` unique location stems in the events data.
 

```{r}
#Unique stems
stemmed_ab <- unique(c(flatfiles_sf_roi$name_cleaner_stem, events_sf$name_cleaner_stem))
length(stemmed_ab) #17,880 unique stems

#Unique name-stem pairs
names_and_stems <- rbind(
                          as.data.frame(flatfiles_sf_roi)[,c('name_cleaner','name_cleaner_stem')],
                          as.data.frame(events_sf)[,c('name_cleaner','name_cleaner_stem')]
                        ) %>% unique() ; dim(names_and_stems)

```

# LSH 

Our next move is hash each stemmed location in a way that locations that are somewhat similar to one another are returned as possible matches and the vast majority of pairs that are too dissimilar to ever possibly match are excluded. This requires only a single pass through the data, and so scales linearly in the number of locations. We select parameters which perform well on our hand-labeled training dataset of toponym matches and mismatches. Where performing well at this stage means a low false negative rate (missing few genuine matches) and a moderate false positive rate (returning many irrelevant matches, but not an overwhelming number). Here we are only trying to reject as many obvious nonmatches as possible.

```{r}

minhash_count=100
bands=25

fromscratch=F # For some reason these parallel functions hang in rmarkdown. Copy and paste them into console to run.
if(fromscratch){
  #Optimal settings derived in 04_fuzzy_matcher_stage_1
  
  library(textreuse)
  stemmed_ab_spaced <- sapply(strsplit(stemmed_ab, split="") , paste, collapse=" ")
  minhash <- minhash_generator(n = minhash_count, seed = 1)
  options("mc.cores" = parallel::detectCores()) #Much faster when paralized
  #options("mc.cores" = 1) #Much faster when paralized
  #This doesn't like to be ran in rmarkdown, it'll spawn processes and hang. Maybe see if it'll work as a function?
  
  corpus_ab_spaced <- TextReuseCorpus(text = stemmed_ab_spaced,
                                      tokenizer = tokenize_ngrams,
                                      n = 2, #wow if you pass this as a variable instead of a number it crashes. Wtf?
                                      minhash_func = minhash,
                                      keep_tokens = TRUE,
                                      progress = T)
  
  #buckets <- lsh(corpus_ab_spaced, bands = 80, progress = T) #Single threaded but should be parallizable
  
  n.cores <- parallel::detectCores()
  cuts <- cut(1:length(corpus_ab_spaced), n.cores)
  #buckets_list <- mclapply(levels(cuts),
  #               function(q) lsh(corpus_ab_spaced[cuts==q], bands = bands, progress = T) ,
  #               mc.cores = n.cores)
  #buckets <- do.call(rbind, buckets_list)
  
  library(doParallel)
  library(foreach)
  cl<- makeCluster(parallel::detectCores()) #change the 2 to your number of CPU cores
  registerDoParallel(cl)
  buckets <- foreach(q=levels(cuts), .combine='rbind', .packages=c('textreuse')) %dopar%  
                lsh(corpus_ab_spaced[cuts==q], bands = bands, progress =F)
  stopCluster(cl)
  
  candidates <- lsh_candidates(buckets)
  dim(candidates) #842,288 
  candidates$a_numeric <- as.numeric( gsub("doc-","",candidates$a) )
  candidates$b_numeric <- as.numeric( gsub("doc-","",candidates$b) )
  candidates$stemmed_a <- stemmed_ab[candidates$a_numeric]
  candidates$stemmed_b <- stemmed_ab[candidates$b_numeric]
  candidates$stemmed_ab <- paste(candidates$stemmed_a,candidates$stemmed_b, sep="_")
  candidates$stemmed_ba <- paste(candidates$stemmed_b,candidates$stemmed_a, sep="_")
  
  saveRDS(candidates, paste0(here::here(), "inst/extdata/candidates.Rdata")) #Leaving off the .. because you should only be running this by hand
  
}

```


```{r}

candidates <- readRDS(system.file("extdata",
                                 "candidates.Rdata",
                                 package = "MeasuringLandscape"))



#head(candidates) %>% DT::datatable()

```

There are `r nrow(candidates)` candidate stem matches return by locality sensitive hashing with these parameters. This is a tiny `r round(nrow(candidates)/length(stemmed_ab)^2,4)` fraction of all the possible stem to stem matches we could have considered.


## Expand Stem matches into full matches

We consider any two strings to be a possible match if we consider their stems to be a possible match. This bit of code expands and merges to get the suggestions in terms of event string- gazetteer string possible matches. Throughout we use the example of the stem "gura" to show possible matches and how they're reduced over various steps.

```{r}

stemmed_ab_suggestions <- subset(candidates , stemmed_a!="" &  stemmed_b!="") ; #dim(stemmed_ab_suggestions) #drop empty stems. If not it'll crash first match later on
stemmed_ab_suggestions$N <- 1
glue::glue(stemmed_ab_suggestions %>% nrow() , " string matches")

#Link a stemmed_a to stemmed b
stemmed_ab_suggestions_events <- data.table::rbindlist(list(
  stemmed_ab_suggestions[,c('stemmed_a','stemmed_b','N')],
  data.table::setnames(stemmed_ab_suggestions[,c('stemmed_a','stemmed_b','N')], c('stemmed_b','stemmed_a','N')  ), 
  data.table::setnames(stemmed_ab_suggestions[,c('stemmed_a','stemmed_a','N')], c('stemmed_b','stemmed_b','N')  ) #make sure it's a suggestion for itself
)) ; dim(stemmed_ab_suggestions_events)

stemmed_ab_suggestions_events <- unique(stemmed_ab_suggestions_events) ; #dim(stemmed_ab_suggestions_events) #859,118


#Only keep if the a is a stem found in the events data
stemmed_ab_suggestions_events <- subset(stemmed_ab_suggestions_events, stemmed_a %in% unique(events_sf$name_cleaner_stem)) ; #dim(stemmed_ab_suggestions_events)  #148,416
glue::glue(stemmed_ab_suggestions_events %>% nrow() , " string matches")

#Step 2 Pull all full
ab_suggestions_events <- merge(stemmed_ab_suggestions_events,
                               data.table::setnames(names_and_stems, c("name_cleaner_a","name_cleaner_stem")) ,
                               by.x="stemmed_a",
                               by.y="name_cleaner_stem",
                               all.x=T,
                               allow.cartesian=TRUE) ; #dim(ab_suggestions_events)
#DT::datatable(ab_suggestions_events[stemmed_a=="gura"])

#Make sure the full name appears in the event data
ab_suggestions_events <- subset(ab_suggestions_events, name_cleaner_a %in% unique( events_sf$name_cleaner)  ) ; #dim(ab_suggestions_events)  #
glue::glue(stemmed_ab_suggestions_events %>% nrow() , " string matches")
#DT::datatable(ab_suggestions_events[stemmed_a=="gura"])

ab_suggestions_events <- merge(ab_suggestions_events,
                                     data.table::setnames(names_and_stems, c("name_cleaner_b","name_cleaner_stem")) ,
                                     by.x="stemmed_b",
                                     by.y="name_cleaner_stem",
                                     all.x=T,
                                     allow.cartesian=TRUE)  ; #dim(ab_suggestions_events) #4,846,054

ab_suggestions_events <- subset(ab_suggestions_events, name_cleaner_a %in% unique( events_sf$name_cleaner)  ) ; #dim(ab_suggestions_events)  #
glue::glue(stemmed_ab_suggestions_events %>% nrow() , " string matches")
#DT::datatable(ab_suggestions_events[stemmed_a=="gura"])

```


# Screen pairs that are too dissimilar

```{r}
ab_suggestions_events[,temp_q_cos:= stringdist::stringsim(name_cleaner_a,name_cleaner_b,"cos", nthread=parallel::detectCores(),q=2),] #both fast and very good at sorting
ab_suggestions_events <- ab_suggestions_events[temp_q_cos>.3] ; #dim(ab_suggestions_events) #less than .3 is never match
glue::glue(ab_suggestions_events %>% nrow() , " string matches after removing those with too small a ngram2 cosine distance")

#DT::datatable(ab_suggestions_events[stemmed_a=="gura"])

#library(DT)
#DT::datatable(ab_suggestions_events[stemmed_a=="gura"])

```


# XGBoost Toponym Matching

Next, we refine the remaining matches using additional string distance features and an XGBoost model trained earlier on the labeled match/no match toponym training examples.

```{r, fig.width=12, fig.height=8}
ab_suggestions_events$a <- ab_suggestions_events$stemmed_a
ab_suggestions_events$b <- ab_suggestions_events$stemmed_b

ab_suggestions_events_features <- MeasuringLandscape:::toponym_add_features(ab_suggestions_events)

vars_x_string <- c(
    "Jaro",
    "Optimal_String_Alignment"    ,
    "Levenshtein",
    "Damerau_Levenshtein"    ,
     "Longest_Common_Substring"     ,
    "q_gram_1",
    "q_gram_2",
    "q_gram_3",
    "q_gram_4",
    "q_gram_5",
    'Cosine_1',
    'Cosine_2',
    'Cosine_3',
    'Cosine_4',
    'Cosine_5',
    "Jaccard"              ,
     "First_Mistmatch"         ,
    "a_nchar"     ,
    "b_nchar"   ,
    "ab_nchar_diff"       ,             
    "dJaro",
    "dOptimal_String_Alignment"      ,
    "dLevenshtein"     ,
    "dDamerau_Levenshtein"  ,           
    "dLongest_Common_Substring",
    "dq_gram",
    "dCosine",
    "dJaccard"
) 

dpredict<-xgboost::xgb.DMatrix(data= as.matrix(ab_suggestions_events_features[,vars_x_string, with=F]), missing = NA)

#Step 4 Predict the likelihood that two strings are same
ab_suggestions_events_features$toponym_xb_model_prediction <- predict( toponym_xb_model, dpredict )
ab_suggestions_events_features$toponym_xb_model_prediction  <- 1/(1 + exp(-ab_suggestions_events_features$toponym_xb_model_prediction ))

# why does this work now but not in 06
hist(ab_suggestions_events_features$toponym_xb_model_prediction)


ab_suggestions_small <- ab_suggestions_events_features[,c('a','b','name_cleaner_a','name_cleaner_b','toponym_xb_model_prediction',"N")]
ab_suggestions_directed <- ab_suggestions_small
ab_suggestions_directed$a_event <- ab_suggestions_directed$name_cleaner_a %in% events_sf$name_cleaner
ab_suggestions_directed <- subset(ab_suggestions_directed, a_event) #just to double check, should be no change
ab_suggestions_directed$b_event <- ab_suggestions_directed$name_cleaner_b %in% events_sf$name_cleaner
ab_suggestions_directed$b_event_coord <- ab_suggestions_directed$name_cleaner_b %in% flatfiles_sf_roi$name_cleaner[flatfiles_sf_roi$source_dataset %in% c('events') & !is.na(flatfiles_sf_roi$latitude)]
ab_suggestions_directed$b_gaz_coord   <- ab_suggestions_directed$name_cleaner_b %in% flatfiles_sf_roi$name_cleaner[!flatfiles_sf_roi$source_dataset %in% c('events')  & !is.na(flatfiles_sf_roi$latitude) ]

```

```{r}

hist(ab_suggestions_directed$toponym_xb_model_prediction) #before subsetting
ab_suggestions_directed <- ab_suggestions_directed[a_event & (b_event_coord|b_gaz_coord)]
ab_suggestions_directed <- ab_suggestions_directed[toponym_xb_model_prediction>.5 | name_cleaner_a==name_cleaner_b] #Only keep those with greater .5 chance of match
hist(ab_suggestions_directed$toponym_xb_model_prediction) #after subsetting



```

```{r}

ab_suggestions_directed[,toponym_xb_model_prediction_max:=max(toponym_xb_model_prediction),
                        by=c('name_cleaner_a','b_gaz_coord')]

ab_suggestions_directed_max_gaz_coord <- subset(ab_suggestions_directed,
                                                b_gaz_coord &
                                                  toponym_xb_model_prediction==toponym_xb_model_prediction_max
                                                )

ab_suggestions_directed_max_event_coord <- subset(ab_suggestions_directed,
                                                  b_event_coord &
                                                    toponym_xb_model_prediction==toponym_xb_model_prediction_max
                                                  )

#A non missing coordinate
condition <- !is.na(sf::st_coordinates(events_sf)[,1])
table(condition)

#An exact match to the gaz
condition2 <- events_sf$name_cleaner %in% flatfiles_sf_roi$name_cleaner[!flatfiles_sf_roi$source_dataset %in% c('events','events_poly')]
table(condition2)

table(condition | condition2)

#Fuzzy match to a gaz
condition3 <- events_sf$name_cleaner %in% ab_suggestions_directed_max_gaz_coord$name_cleaner_a 
table(condition3)

table(condition | condition2 | condition3)
table(condition | condition2 | condition3) / length(condition)

#Exact match to another event
condition4 <- events_sf$name_cleaner %in% flatfiles_sf_roi$name_cleaner[flatfiles_sf_roi$source_dataset %in% c('events') & !is.na(flatfiles_sf_roi$latitude) ]
table(condition4)
table(condition4) / length(condition)

table(condition | condition2 | condition3 | condition4)
table(condition | condition2 | condition3 | condition4) / length(condition)

#Fuzzy match to event with coordinate
condition5 <- events_sf$name_cleaner %in% ab_suggestions_directed_max_event_coord$name_cleaner_a 
table(condition5)
table(condition5) / length(condition)

table(condition | condition2 | condition3 | condition4 | condition5)
table(condition | condition2 | condition3 | condition4 | condition5) / length(condition)

condition6 <- tolower(events_sf$document_district_clean) %in% flatfiles_sf_roi$name_cleaner[!flatfiles_sf_roi$source_dataset %in% c('events')]
table(condition6)

saveRDS(ab_suggestions_directed,
        file=glue::glue(getwd(), "/../inst/extdata/ab_suggestions_directed.Rds") 
        )

```

Given all of the above we want to produce a matching dataset broken out across 

exact or fuzzy match
place description or district/province


```{r}

#For when this inevitably crashes
ab_suggestions_directed <-  readRDS(system.file("extdata", "ab_suggestions_directed.Rds", package = "MeasuringLandscape")) 
flatfiles_sf_roi_centroids <- flatfiles_sf_roi %>% filter(!is.na(latitude) | geometry_type != "POINT") %>% sf::st_centroid()

#Do some quick accounting

#Create new dataset that includes
#1) Diads chosen by fuzzy ab_suggestions_directed 
#2) Diads of identical

georef <- data.table::setnames( ab_suggestions_directed[,c('name_cleaner_a','name_cleaner_b','toponym_xb_model_prediction')] , c('georef_a','georef_b','toponym_xb_model_prediction') ) #So this is now a list of names that are either identical or fuzzy matches for one another
georef$georef_a <- trimws(georef$georef_a)
georef$georef_b <- trimws(georef$georef_b)
georef <- unique(georef)
#georef$georef_ab_identical <- georef$georef_a==georef$georef_b 

table(events_sf$name_cleaner %in% georef$georef_a) #7,742 events with at least one gazetteer suggestion

#This is mapping of events to identical and fuzzy
#georef2 is a combination of events, and all the gazetteer names they might match to
georef2 <- as.data.frame(events_sf)[,c('event_hash', 'name_cleaner','document_district_clean','document_unit_type','document_date_best_year')] %>% 
  dplyr::left_join(georef, by = c("name_cleaner" = "georef_a")) %>% unique()  
dim(georef2)

table(events_sf$name_cleaner %in% georef2$name_cleaner) #All events are in here
table(events_sf$name_cleaner %in% georef2$name_cleaner[!is.na(georef2$georef_b)]) #
table(events_sf$event_hash %in% georef2$event_hash) #All event hashes show up in here

#This is mapping of gaz places to identical and fuzzy possible
#georef3 is a combination of gazetteers and all gazetteer names they might match to
#georef3 <- as.data.frame(flatfiles_sf_roi_centroids)[,c('place_hash','source_dataset','name_cleaner','geometry_type')] %>% left_join(georef, by = c("name_cleaner" = "georef_b")) %>% unique() 
#dim(georef3)

#This is a mapping of events to gaz where either their exact or fuzzy counterparts matched each other
#georef all is a mapping of events, to gazetteer names, to 
georef_all <- georef2 %>% 
             dplyr::left_join(as.data.frame(flatfiles_sf_roi_centroids)[,c('place_hash','source_dataset','name_cleaner','geometry_type','feature_code')],
                       by = c("georef_b" = "name_cleaner")) %>% unique()
dim(georef_all)   #303,174

  
table(events_sf$name_cleaner %in% georef_all$name_cleaner) #All events are in here
table(events_sf$name_cleaner %in% georef_all$name_cleaner[!is.na(georef_all$georef_b)]) #
table(events_sf$event_hash %in% georef_all$event_hash) #All event hashes show up in here

table(georef_all$source_dataset)
length(unique(georef_all$event_hash))
georef_all$georef_a <- NULL

```




```{r}

#Ok next up we add distance
#crs_m <- "+proj=utm +zone=27 +datum=NAD83 +units=m +no_defs"  #this is toally wrong, stop using it
#https://epsg.io/21037
#EPSG:21037
#Projected coordinate system
#Arc 1960 / UTM zone 37S
rownames(events_sf) <- events_sf$event_hash
#events_sf_utm <- st_transform(events_sf[,c('event_hash',"geometry")], crs=21037) #
#events_sf_utm <- st_sf(as.data.frame(events_sf_utm))
#rownames(events_sf_utm) <- events_sf_utm$event_hash
#It was slow as hell as a tbl, covert to just sf


#flatfiles_sf_roi_centroids_utm <- st_transform(flatfiles_sf_roi_centroids[,c('place_hash',"geometry")], crs=21037)
#rownames(flatfiles_sf_roi_centroids_utm) <- flatfiles_sf_roi_centroids_utm$place_hash

#Try some smaller experiments and verify if and how long it takes to calculate distance since we've got a million of them
#This takes a long time to calculate because there's a million of them

#Instead, add a column for whether dimensions are zero and then queue that column with the hash, much much better
events_sf$isvalid <- !is.na(st_dimension(events_sf) )
flatfiles_sf_roi_centroids$isvalid <- !is.na(st_dimension(flatfiles_sf_roi_centroids) )

cords_events <- as.data.frame( st_coordinates(events_sf) )
rownames(cords_events) <- rownames(events_sf)
cords_flatfiles <- as.data.frame( st_coordinates(flatfiles_sf_roi_centroids) )
rownames(flatfiles_sf_roi_centroids) <- flatfiles_sf_roi_centroids$place_hash
rownames(cords_flatfiles) <- rownames(flatfiles_sf_roi_centroids)

coords_dt <- as.data.table( cbind(cords_events[georef_all$event_hash,], cords_flatfiles[georef_all$place_hash,]))
names(coords_dt) <- c("X1","Y1", "X2", "Y2")
coords_dt[,distance_km:=sqrt((X2-X1)^2+(Y2-Y1)^2) * 111]

hist(coords_dt$distance)
georef_all$distance <- NULL
georef_all_dt <- as.data.table(cbind(georef_all, coords_dt))

georef_all_dt <- unique(georef_all_dt); dim(georef_all_dt)
#georef_all_dt <- subset(georef_all_dt, !is.na(name_cleaner) & !is.na(georef_b)) #maybe don't drop unmatched ones? If we want events as a source?

table(events_sf$name_cleaner %in% georef_all_dt$name_cleaner) #All events are in here
table(events_sf$name_cleaner %in% georef_all_dt$name_cleaner[!is.na(georef_all_dt$georef_b)]) #7,742 events with at least one gazetteer suggestion
table(events_sf$event_hash %in% georef_all_dt$event_hash) #All event hashes show up in here

saveRDS(georef_all_dt,
        file=paste0(here::here(), "/inst/extdata/georef_all_dt.Rds")
)

```






