Fuzzy Matcher Stage 2

The fuzzy matcher predicts the likelihood that two toponyms are the same place even though their spellings might be different. It has two stages.

This file develops Stage 2 of the fuzzy toponym matcher. Its job is to refine the prediction for the subset of plausible matches and provide an actual probability of a match.

rm(list=ls()); gc()
library(MeasuringLandscape)
library(tidyverse)
dir_figures <- glue::glue(getwd(), "/../paper/figures/")
knitr::opts_knit$set(progress = TRUE, verbose = TRUE)
knitr::opts_chunk$set(fig.width=12, fig.height=8,  warning=FALSE, message=FALSE, cache=TRUE)
options(width = 160)

It is based on an events dataset built and cleaned in the file “01 Prep Events Counts”.

#Load Events
events_sf <- readRDS(system.file("extdata", "events_sf.Rdata", package = "MeasuringLandscape")) 
events_dt <- data.table::as.data.table(events_sf) #%>% unique() 
dim(events_sf)
[1] 10469   104
unlist(strsplit(events_sf$name_cleaner,"")) %>% janitor::tabyl() %>% janitor::adorn_crosstab(., denom = "col", show_n = T, digits = 1, show_totals = T)
'janitor::adorn_crosstab' is deprecated.
Use 'use the various adorn_ functions instead.  See the "tabyl" vignette for examples.' instead.
See help("Deprecated")
events_sf_text_coord_unique <- plyr::ddply(events_sf[,c('location_text','name_clean','name_cleaner','document_district_clean',
                                                  'map_coordinate_clean_latitude','map_coordinate_clean_longitude')],
                                     "location_text", transform,
      map_coordinate_has =sum(!is.na(map_coordinate_clean_latitude))
      )
#Load Gazeteers
flatfiles_sf_roi <- readRDS(system.file("extdata", "flatfiles_sf_roi.Rdata", package = "MeasuringLandscape")) 
dim(flatfiles_sf_roi)
[1] 55966    22
flatfiles_dt <- data.table::as.data.table(flatfiles_sf_roi)
data.table::setkey(flatfiles_dt,place_hash)

Summary statistics

table( !is.na(sf::st_coordinates(events_sf)[,1] ) ,
       !is.na(events_sf$name_cleaner) )
       
        FALSE TRUE
  FALSE  1377 3500
  TRUE    454 5138

Produce a dataset for hand labeling

This takes every event with a coordinate and looks up the 10 nearest points in the gazetteers. It saves this as a csv file that a human can label as a toponym match or mismatch.

rr #Should only need to do this once #create_toponym_dataset_forlabeling()

Choose Features

vars_id <- c("name_cleaner_a","name_cleaner_b",'test','extranegative')
vars_weights <- c("weights")
vars_y <- c("rex_match")
vars_x_string <- c(#"exact_match",               
"Jaro",
"Optimal_String_Alignment"    ,
"Levenshtein",
"Damerau_Levenshtein"    ,
"Longest_Common_Substring"     ,
"q_gram_1",
"q_gram_2",
"q_gram_3",
"q_gram_4",
"q_gram_5",
'Cosine_1',
'Cosine_2',
'Cosine_3',
'Cosine_4',
'Cosine_5',
"Jaccard"              ,
 "First_Mistmatch"         ,
"a_nchar"     ,
"b_nchar"   ,
"ab_nchar_diff"       ,             
"dJaro",
"dOptimal_String_Alignment"      ,
"dLevenshtein"     ,
"dDamerau_Levenshtein"  ,           
"dLongest_Common_Substring",
"dq_gram",
"dCosine",
"dJaccard"
# "OM",
# "OMloc",
# "OMslen"
# ,"OMspell",
# "TWED",
# "LCS",                 
# "LCP",
# "RLCP",
# "NMS",
# "NMSMST",                 
# "SVRspell",
# "CHI2"
) 
vars_x_stem <- paste0(vars_x_string, "_stemmed")
vars_x_string_corpus <- c(
"corpus_mention_count_a",
"corpus_mention_year_min_a",
"corpus_mention_year_median_a",
"corpus_mention_year_mean_a",
"corpus_mention_year_max_a",
"corpus_mention_count_b",
"corpus_mention_year_min_b",
"corpus_mention_year_median_b",
"corpus_mention_year_mean_b",
"corpus_mention_year_max_b",
"gazeteer_mentions_count_a",
"gazeteer_mentions_count_b",
"gazeteer_stem_mentions_count_a",
"gazeteer_stem_mentions_count_b"
#"postfix_has_a"
#"postfix_has_b" 
#"ngram_a", #Don't run, it's just a word count and it's missing if it's never found
#"ngram_b"
)
vars_x_stem_corpus <- paste0(vars_x_string_corpus, "_stemmed")
vars_x_everything <-  c(
                        vars_x_string,
                        vars_x_stem#,
                        #vars_x_string_corpus, #excluding corpus features
                        #vars_x_stem_corpus  #excluding corpus features
                        )     
vars_x_onlystring <- c(vars_x_string)
vars_x_stringcorpus <-  c(vars_x_string,  vars_x_string_corpus)     
vars_x_string_and_stem <- c(vars_x_string, vars_x_stem)

Create Training Test Split

handlabeled <- MeasuringLandscape:::toponym_training_dataset_load()
[1] "Hand labels original"
[1] 20242     8

    0     1 
17050  3192 
[1] "Drop identical labels"
[1] 18405    10

    0     1 
17047  1358 
[1] "Invert Diads"
[1] 34496    10

    0     1 
32374  2122 
fromscratch=F
if(fromscratch){    
  
    toponym_training_dataset <- create_toponym_split_training_test( 
                                                           handlabeled=handlabeled,
                                                           vars_id=vars_id,
                                                           vars_weights=vars_weights,
                                                           vars_y=vars_y,
                                                           vars_x=vars_x_everything,
                                                           neg_count=0, #you lose so many to cosine distance
                                                           fromscratch=T
                                                         )
    
    saveRDS(toponym_training_dataset,
            file=glue(getwd(), "/../inst/extdata/toponym_training_dataset.Rds")
            )
}
toponym_training_dataset <- readRDS(system.file("extdata", "toponym_training_dataset.Rds", package = "MeasuringLandscape"))
  

Summarize training data

glue::glue("Handlabeled Training Dataset")
Handlabeled Training Dataset
dim(handlabeled)
[1] 34496    10
table(handlabeled$rex_match)

    0     1 
32374  2122 
glue::glue("All Data Resampled with Features \n")
All Data Resampled with Features 
dim(toponym_training_dataset$xy_all)
[1] 14936    62
table(toponym_training_dataset$xy_all$rex_match)

    0     1 
12820  2116 
glue::glue("Training Split \n")
Training Split 
dim(toponym_training_dataset$xy_train)
[1] 12611    62
table(toponym_training_dataset$xy_train$rex_match)

    0     1 
10803  1808 
glue::glue("Test Split \n")
Test Split 
dim(toponym_training_dataset$xy_test)
[1] 2325   62
table(toponym_training_dataset$xy_test$rex_match)

   0    1 
2017  308 

Fit XGBoost models predicting match

xgb.save(toponym_xb_string_and_stem,
       glue::glue(getwd(), "/../inst/extdata/toponym_xb_string_and_stem.bin")
         ) #Have to save separately for some reason
[1] TRUE

(Appendix Figure 5, AUC and Precision Recall curves for toponym fuzzy matcher stage 2, predicting likelihood of a match.)

dtest<-xgb.DMatrix(data= as.matrix(as.data.frame(global_test)[,vars_x_everything]),
                   label=as.numeric(global_test$rex_match),
                   weight = global_test$weights,
                   missing = NA)
global_test$toponym_xb_everything2 <- predict(toponym_xb_everything2, dtest )
global_test$toponym_xb_everything2 <- 1/(1 + exp(-global_test$toponym_xb_everything2))
#global_test$toponym_xb_onlystring <- predict(toponym_xb_onlystring, dtest )
#global_test$toponym_xb_onlystring <- 1/(1 + exp(-global_test$toponym_xb_onlystring))
global_test$toponym_xb_string_and_stem <- predict(toponym_xb_string_and_stem, dtest )
global_test$toponym_xb_string_and_stem <- 1/(1 + exp(-global_test$toponym_xb_string_and_stem))
#p_load(Metrics)
#p_load(MLmetrics)
tocompare <- c(#"toponym_xb_everything2",
               #"toponym_xb_onlystring"#,
               #"toponym_xb_stringcorpus"#,
               "toponym_xb_string_and_stem"
               ) # "toponym_xb_everything",, "toponym_xb_everything2_extra","toponym_xb_onlystring_extra","toponym_xb_everything2_extra_large", "toponym_xb_onlystring_extra_large", "toponym_xb_everything2_uber_large" , "toponym_xb_onlystring_uber_large"
print("ce")
[1] "ce"
sapply(tocompare, function(q) {  Metrics::ce(actual=global_test$rex_match, predicted=global_test[,q]>.5)  })
toponym_xb_string_and_stem 
                0.03526882 
print("F1_Score")
[1] "F1_Score"
sapply(tocompare, function(q) {  MLmetrics::F1_Score(global_test$rex_match, as.numeric( global_test[,q]>.5) )  })
toponym_xb_string_and_stem 
                 0.9795613 
print("logloss")
[1] "logloss"
sapply(tocompare, function(q) {  Metrics::logLoss(actual=global_test$rex_match, predicted=global_test[,q])  })
toponym_xb_string_and_stem 
                0.09686789 
print("confusion")
[1] "confusion"
sapply(tocompare, function(q) {  table(global_test$rex_match,  global_test[,q]>.5)  })
     toponym_xb_string_and_stem
[1,]                       1965
[2,]                         30
[3,]                         52
[4,]                        278
#p_load(precrec)
msmdat1 <- precrec::evalmod( precrec::mmdata(scores=list(
                                #global_test$toponym_xb_everything,
                                #global_test$toponym_rf_everything,
                                #global_test$toponym_xb_everything_extracted,
                                #global_test$toponym_xb_everything2, #highest PRC
                                #global_test$toponym_xb_everything3,
                                #global_test$toponym_xb_everything4,
                                global_test$toponym_xb_string_and_stem#,
                                #global_test$toponym_xb_onlystring#,
                                #global_test$toponym_xb_stringcorpus#,
                                #global_test$toponym_xb_onlystring_tiny
                                #global_test$toponym_xb_everything2_extra,
                                #global_test$toponym_xb_onlystring_extra,
                                #global_test$toponym_xb_everything2_extra_large,
                                #global_test$toponym_xb_onlystring_extra_large,
                                #global_test$toponym_xb_everything2_uber_large,
                                #global_test$toponym_xb_onlystring_uber_large
                                ),
                           labels=as.numeric(as.data.frame(global_test)$rex_match),
                           modnames = c(#"toponym_xb_everything",
                                        #"toponym_rf_everything",
                                        #"toponym_xb_everything_features",
                                        #"toponym_xb_everything2",
                                        #"toponym_xb_everything3",
                                        #"toponym_xb_everything4",
                                        "toponym_xb_string_and_stem"#,
                                        #"toponym_xb_onlystring"#,
                                        #"toponym_xb_stringcorpus"#,
                                        #"toponym_xb_onlystring_tiny"
                                        #"toponym_xb_everything2_extra",
                                        #"toponym_xb_onlystring_extra",
                                        #"toponym_xb_everything2_extra_large", 
                                        #"toponym_xb_onlystring_extra_large",
                                        #"toponym_xb_everything2_uber_large" ,
                                        #"toponym_xb_onlystring_uber_large"
                                        )
                           ) )
msmdat1

    === AUCs ===

                     Model name Dataset ID Curve type       AUC
   1 toponym_xb_string_and_stem          1        ROC 0.9882251
   2 toponym_xb_string_and_stem          1        PRC 0.9268585

    === Input data ===

                     Model name Dataset ID # of negatives # of positives
   1 toponym_xb_string_and_stem          1           2017            308
uauc1 <- precrec::evalmod(scores = global_test$toponym_xb_string_and_stem,
                 labels=as.numeric(as.data.frame(global_test)$rex_match)#,
                 #mode="aucroc"
                 )
plot(uauc1)
#dev.off()
pdf(file=glue::glue(dir_figures, "p_auc_stage2.pdf"), width=5.5)
plot(uauc1)
#autoplot(uauc1)
dev.off()
png 
  2 

autoplot(uauc1)

Evaluate the model (Appendix Figure 6, AUC and Precision Recall curves for toponym fuzzy matcher stage 2, predicting likelihood of a match.)

#p_load(DiagrammeR)
#xgb.plot.tree(model = xb)
#xgb.plot.tree(model = toponym_xb_everything2, trees = 0, show_node_id = TRUE, render = TRUE)
importance_importance <- xgb.importance(feature_names=vars_x_string_and_stem, model = toponym_xb_string_and_stem)
xgb.plot.importance(importance_importance)
pdf(file=glue::glue(dir_figures, "p_variable_importance_stage2.pdf"), width=5.5, height=6)
xgb.plot.importance(importance_importance)
#autoplot(uauc1)
dev.off()
png 
  2 

Analysis of Residuals

global_test$toponym_xb_everything2_correct <- (global_test$toponym_xb_everything2>.5) == global_test$rex_match
#global_test$toponym_xb_onlystring_uber_large_correct <- (global_test$toponym_xb_onlystring_uber_large>.5)==global_test$rex_match
#table(global_test$toponym_xb_onlystring_uber_large_correct)
---
title: "04 Fuzzy Matcher Stage 2 (XGBoost)"
author: "Rex W. Douglass and Kristen Harkness"
date: "March 9, 2018"
output: 
  html_notebook:
    toc: true
    toc_float: true
editor_options: 
  chunk_output_type: inline
---
<style>
    body .main-container {
        max-width: 100%;
    }
</style>

# Fuzzy Matcher Stage 2

The fuzzy matcher predicts the likelihood that two toponyms are the same place even though their spellings might be different. It has two stages.

This file develops Stage 2 of the fuzzy toponym matcher. Its job is to refine the prediction for the subset of plausible matches and provide an actual probability of a match.
 

```{r, results='hide', message=FALSE, warning=FALSE }
rm(list=ls()); gc()
library(MeasuringLandscape)
library(tidyverse)
dir_figures <- glue::glue(getwd(), "/../paper/figures/")

knitr::opts_knit$set(progress = TRUE, verbose = TRUE)
knitr::opts_chunk$set(fig.width=12, fig.height=8,  warning=FALSE, message=FALSE, cache=TRUE)
options(width = 160)

```

It is based on an events dataset built and cleaned in the file "01 Prep Events Counts".

```{r}

#Load Events
events_sf <- readRDS(system.file("extdata", "events_sf.Rdata", package = "MeasuringLandscape")) 
events_dt <- data.table::as.data.table(events_sf) #%>% unique() 
dim(events_sf)

unlist(strsplit(events_sf$name_cleaner,"")) %>% janitor::tabyl() %>% janitor::adorn_crosstab(., denom = "col", show_n = T, digits = 1, show_totals = T)

events_sf_text_coord_unique <- plyr::ddply(events_sf[,c('location_text','name_clean','name_cleaner','document_district_clean',
                                                  'map_coordinate_clean_latitude','map_coordinate_clean_longitude')],
                                     "location_text", transform,
      map_coordinate_has =sum(!is.na(map_coordinate_clean_latitude))
      )

#Load Gazeteers
flatfiles_sf_roi <- readRDS(system.file("extdata", "flatfiles_sf_roi.Rdata", package = "MeasuringLandscape")) 
dim(flatfiles_sf_roi)
flatfiles_dt <- data.table::as.data.table(flatfiles_sf_roi)
data.table::setkey(flatfiles_dt,place_hash)


```

Summary statistics

```{r}

table( !is.na(sf::st_coordinates(events_sf)[,1] ) ,
       !is.na(events_sf$name_cleaner) )

```


# Produce a dataset for hand labeling

This takes every event with a coordinate and looks up the 10 nearest points in the gazetteers. It saves this as a csv file that a human can label as a toponym match or mismatch.

```{r}

#Should only need to do this once
#MeasuringLandscape:::create_toponym_dataset_forlabeling()

```


# Choose Features 

```{r}

vars_id <- c("name_cleaner_a","name_cleaner_b",'test','extranegative')
vars_weights <- c("weights")
vars_y <- c("rex_match")

vars_x_string <- c(#"exact_match",               
"Jaro",
"Optimal_String_Alignment"    ,
"Levenshtein",
"Damerau_Levenshtein"    ,
"Longest_Common_Substring"     ,
"q_gram_1",
"q_gram_2",
"q_gram_3",
"q_gram_4",
"q_gram_5",
'Cosine_1',
'Cosine_2',
'Cosine_3',
'Cosine_4',
'Cosine_5',
"Jaccard"              ,
 "First_Mistmatch"         ,
"a_nchar"     ,
"b_nchar"   ,
"ab_nchar_diff"       ,             
"dJaro",
"dOptimal_String_Alignment"      ,
"dLevenshtein"     ,
"dDamerau_Levenshtein"  ,           
"dLongest_Common_Substring",
"dq_gram",
"dCosine",
"dJaccard"
# "OM",
# "OMloc",
# "OMslen"
# ,"OMspell",
# "TWED",
# "LCS",                 
# "LCP",
# "RLCP",
# "NMS",
# "NMSMST",                 
# "SVRspell",
# "CHI2"
) 

vars_x_stem <- paste0(vars_x_string, "_stemmed")

vars_x_string_corpus <- c(

"corpus_mention_count_a",
"corpus_mention_year_min_a",
"corpus_mention_year_median_a",
"corpus_mention_year_mean_a",
"corpus_mention_year_max_a",

"corpus_mention_count_b",
"corpus_mention_year_min_b",
"corpus_mention_year_median_b",
"corpus_mention_year_mean_b",
"corpus_mention_year_max_b",

"gazeteer_mentions_count_a",
"gazeteer_mentions_count_b",
"gazeteer_stem_mentions_count_a",
"gazeteer_stem_mentions_count_b"

#"postfix_has_a"
#"postfix_has_b" 

#"ngram_a", #Don't run, it's just a word count and it's missing if it's never found
#"ngram_b"
)


vars_x_stem_corpus <- paste0(vars_x_string_corpus, "_stemmed")
vars_x_everything <-  c(
                        vars_x_string,
                        vars_x_stem#,
                        #vars_x_string_corpus, #excluding corpus features
                        #vars_x_stem_corpus  #excluding corpus features
                        )     
vars_x_onlystring <- c(vars_x_string)
vars_x_stringcorpus <-  c(vars_x_string,  vars_x_string_corpus)     
vars_x_string_and_stem <- c(vars_x_string, vars_x_stem)

```

# Create Training Test Split

```{r}

handlabeled <- MeasuringLandscape:::toponym_training_dataset_load()

fromscratch=F
if(fromscratch){    
  
    toponym_training_dataset <- create_toponym_split_training_test( 
                                                           handlabeled=handlabeled,
                                                           vars_id=vars_id,
                                                           vars_weights=vars_weights,
                                                           vars_y=vars_y,
                                                           vars_x=vars_x_everything,
                                                           neg_count=0, #you lose so many to cosine distance
                                                           fromscratch=T
                                                         )
    
    saveRDS(toponym_training_dataset,
            file=glue(getwd(), "/../inst/extdata/toponym_training_dataset.Rds")
            )
}
toponym_training_dataset <- readRDS(system.file("extdata", "toponym_training_dataset.Rds", package = "MeasuringLandscape"))
  
```

# Summarize training data

```{r}


glue::glue("Handlabeled Training Dataset")
dim(handlabeled)
table(handlabeled$rex_match)


glue::glue("All Data Resampled with Features \n")
dim(toponym_training_dataset$xy_all)
table(toponym_training_dataset$xy_all$rex_match)


glue::glue("Training Split \n")
dim(toponym_training_dataset$xy_train)
table(toponym_training_dataset$xy_train$rex_match)


glue::glue("Test Split \n")
dim(toponym_training_dataset$xy_test)
table(toponym_training_dataset$xy_test$rex_match)

```


# Fit XGBoost models predicting match

```{r}


require(methods)
global_test <- toponym_training_dataset[['xy_all']] %>% filter(test %in% T  & !is.na(rex_match) )

sumwneg=sum(toponym_training_dataset[['xy_train']]$rex_match==0)
sumwpos=sum(toponym_training_dataset[['xy_train']]$rex_match==1)
param2 <- list("objective" = MeasuringLandscape:::logregobj, #"objective" ="binary:logistic", #
              "scale_pos_weight" = sumwneg / sumwpos,
              "eta" = 0.3,
              "max_depth" = 6,
              "eval_metric" = "auc",
              #"eval_metric" = area_under_pr_curve_metric,
              #"eval_metric" = "ams@0.15",
              "silent" = 1,
              "nthread" = 48,
              'maximize'=T)

toponym_xb_everything2 <- MeasuringLandscape:::train_an_xb(toponym_training_dataset[['xy_train']],
                                     toponym_training_dataset[['xy_test']] ,
                                     vars_x=vars_x_everything,
                                     param=param2,
                                     use_weights=F,
                                     extract_features=F )
xgb.save(toponym_xb_everything2,
       glue::glue(getwd(), "/../inst/extdata/toponym_xb_everything2.bin")
         ) #Have to save separately for some reason

toponym_xb_onlystring <- MeasuringLandscape:::train_an_xb(toponym_training_dataset[['xy_train']],
                                     toponym_training_dataset[['xy_test']] ,
                                     vars_x=vars_x_onlystring,
                                     param=param2,
                                     use_weights=F,
                                     extract_features=F,
                                     missing=NA)
xgb.save(toponym_xb_onlystring,
       glue::glue(getwd(), "/../inst/extdata/toponym_xb_onlystring.bin")
         ) #Have to save separately for some reason



toponym_xb_string_and_stem <- MeasuringLandscape:::train_an_xb(toponym_training_dataset[['xy_train']],
                                     toponym_training_dataset[['xy_test']] ,
                                     vars_x=vars_x_string_and_stem,
                                     param=param2,
                                     use_weights=F,
                                     extract_features=F,
                                     missing=NA)

xgb.save(toponym_xb_string_and_stem,
       glue::glue(getwd(), "/../inst/extdata/toponym_xb_string_and_stem.bin")
         ) #Have to save separately for some reason



```

(Appendix Figure 5, AUC and Precision Recall curves for toponym fuzzy matcher stage 2, predicting likelihood of a match.)

```{r, fig.width=12, fig.height=8}

dtest<-xgb.DMatrix(data= as.matrix(as.data.frame(global_test)[,vars_x_everything]),
                   label=as.numeric(global_test$rex_match),
                   weight = global_test$weights,
                   missing = NA)

global_test$toponym_xb_everything2 <- predict(toponym_xb_everything2, dtest )
global_test$toponym_xb_everything2 <- 1/(1 + exp(-global_test$toponym_xb_everything2))

#global_test$toponym_xb_onlystring <- predict(toponym_xb_onlystring, dtest )
#global_test$toponym_xb_onlystring <- 1/(1 + exp(-global_test$toponym_xb_onlystring))


global_test$toponym_xb_string_and_stem <- predict(toponym_xb_string_and_stem, dtest )
global_test$toponym_xb_string_and_stem <- 1/(1 + exp(-global_test$toponym_xb_string_and_stem))

#p_load(Metrics)
#p_load(MLmetrics)
tocompare <- c(#"toponym_xb_everything2",
               #"toponym_xb_onlystring"#,
               #"toponym_xb_stringcorpus"#,
               "toponym_xb_string_and_stem"
               ) # "toponym_xb_everything",, "toponym_xb_everything2_extra","toponym_xb_onlystring_extra","toponym_xb_everything2_extra_large", "toponym_xb_onlystring_extra_large", "toponym_xb_everything2_uber_large" , "toponym_xb_onlystring_uber_large"
print("ce")
sapply(tocompare, function(q) {  Metrics::ce(actual=global_test$rex_match, predicted=global_test[,q]>.5)  })
print("F1_Score")
sapply(tocompare, function(q) {  MLmetrics::F1_Score(global_test$rex_match, as.numeric( global_test[,q]>.5) )  })
print("logloss")
sapply(tocompare, function(q) {  Metrics::logLoss(actual=global_test$rex_match, predicted=global_test[,q])  })
print("confusion")
sapply(tocompare, function(q) {  table(global_test$rex_match,  global_test[,q]>.5)  })


#p_load(precrec)
msmdat1 <- precrec::evalmod( precrec::mmdata(scores=list(
                                #global_test$toponym_xb_everything,
                                #global_test$toponym_rf_everything,
                                #global_test$toponym_xb_everything_extracted,
                                #global_test$toponym_xb_everything2, #highest PRC
                                #global_test$toponym_xb_everything3,
                                #global_test$toponym_xb_everything4,
                                global_test$toponym_xb_string_and_stem#,
                                #global_test$toponym_xb_onlystring#,
                                #global_test$toponym_xb_stringcorpus#,
                                #global_test$toponym_xb_onlystring_tiny
                                #global_test$toponym_xb_everything2_extra,
                                #global_test$toponym_xb_onlystring_extra,
                                #global_test$toponym_xb_everything2_extra_large,
                                #global_test$toponym_xb_onlystring_extra_large,
                                #global_test$toponym_xb_everything2_uber_large,
                                #global_test$toponym_xb_onlystring_uber_large
                                ),
                           labels=as.numeric(as.data.frame(global_test)$rex_match),
                           modnames = c(#"toponym_xb_everything",
                                        #"toponym_rf_everything",
                                        #"toponym_xb_everything_features",
                                        #"toponym_xb_everything2",
                                        #"toponym_xb_everything3",
                                        #"toponym_xb_everything4",
                                        "toponym_xb_string_and_stem"#,
                                        #"toponym_xb_onlystring"#,
                                        #"toponym_xb_stringcorpus"#,
                                        #"toponym_xb_onlystring_tiny"
                                        #"toponym_xb_everything2_extra",
                                        #"toponym_xb_onlystring_extra",
                                        #"toponym_xb_everything2_extra_large", 
                                        #"toponym_xb_onlystring_extra_large",
                                        #"toponym_xb_everything2_uber_large" ,
                                        #"toponym_xb_onlystring_uber_large"
                                        )
                           ) )

msmdat1

uauc1 <- precrec::evalmod(scores = global_test$toponym_xb_string_and_stem,
                 labels=as.numeric(as.data.frame(global_test)$rex_match)#,
                 #mode="aucroc"
                 )

plot(uauc1)

#dev.off()
pdf(file=glue::glue(dir_figures, "p_auc_stage2.pdf"), width=5.5)
plot(uauc1)
#autoplot(uauc1)
dev.off()
autoplot(uauc1)

```


Evaluate the model
(Appendix Figure 6, AUC and Precision Recall curves for toponym fuzzy matcher stage 2, predicting likelihood of a match.)


```{r, fig.width=12, fig.height=8}
#p_load(DiagrammeR)
#xgb.plot.tree(model = xb)
#xgb.plot.tree(model = toponym_xb_everything2, trees = 0, show_node_id = TRUE, render = TRUE)

importance_importance <- xgb.importance(feature_names=vars_x_string_and_stem, model = toponym_xb_string_and_stem)
xgb.plot.importance(importance_importance)

pdf(file=glue::glue(dir_figures, "p_variable_importance_stage2.pdf"), width=5.5, height=6)
xgb.plot.importance(importance_importance)
#autoplot(uauc1)
dev.off()

```

# Analysis of Residuals

```{r}

global_test$toponym_xb_everything2_correct <- (global_test$toponym_xb_everything2>.5) == global_test$rex_match


#global_test$toponym_xb_onlystring_uber_large_correct <- (global_test$toponym_xb_onlystring_uber_large>.5)==global_test$rex_match
#table(global_test$toponym_xb_onlystring_uber_large_correct)


```


