Demonstrates that georeferencing options differ in how systematically they include some kinds of events while excluding others. Asks whether missingness of geographic information is truly random, or whether certain kinds of events or places are systematically being dropped.

rm(list=ls()); gc()
# !diagnostics off
library(MeasuringLandscape)
library(tidyverse)
dir_figures <- glue::glue(getwd(), "/../paper/figures/")
gc()
knitr::opts_knit$set(progress = TRUE, verbose = TRUE)
knitr::opts_chunk$set(fig.width=12, fig.height=8,  warning=FALSE, message=FALSE, cache=TRUE)
options(width = 160)
#Load Events
events_sf <- readRDS(system.file("extdata", "events_sf.Rdata", package = "MeasuringLandscape")) 
# For each distinct location_text, add map_coordinate_has: the number of rows sharing that text whose clean latitude is not missing
events_sf_text_coord_unique <- plyr::ddply(events_sf[,c('location_text','name_clean','name_cleaner','document_district_clean','map_coordinate_clean_latitude','map_coordinate_clean_longitude')],
                                     "location_text", transform,
      map_coordinate_has =sum(!is.na(map_coordinate_clean_latitude))
      )
#Reload from scratch each time in case we subset something weirdly
georef_all_dt <- readRDS(system.file("extdata", "georef_all_dt_recomendations.Rds", package = "MeasuringLandscape")) 
# Sanity check: how many events have a name_cleaner value that also appears in the georeferencing table
table(events_sf$name_cleaner %in% georef_all_dt$name_cleaner) #All events are in here

 TRUE 
10469 
table(events_sf$name_cleaner %in% georef_all_dt$name_cleaner[!is.na(georef_all_dt$georef_b)]) #7,942 events with at least one gazetteer suggestion

FALSE  TRUE 
 2527  7942 
#Exclude all distance = 0 obs, those are self matches
georef_all_dt <- subset(georef_all_dt, 
                        !is.na(name_cleaner) & # must have a name
                        (is.na(distance_km) | distance_km!=0)  ) #Can be either missing or not zero. Only thing we drop is zero because that's a self match

Predict missing coordinates

Proof of concept, show it works on original missingness

glue::glue("Military coordinates only")
Military coordinates only
y <- !is.na(events_sf$map_coordinate_clean_latitude); table(y)
y
FALSE  TRUE 
 4877  5592 
pred_cords <- MeasuringLandscape:::predict_missingness_dv(y)
[1] train-auc:0.804097+0.003315 test-auc:0.799740+0.006637 
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.

[21]    train-auc:0.889264+0.001468 test-auc:0.872310+0.006274 
[41]    train-auc:0.898854+0.002677 test-auc:0.877003+0.003656 
[50]    train-auc:0.901976+0.002753 test-auc:0.877772+0.003660 
auc_cords_dataset <- Metrics::auc(pred_cords$label, pred_cords$xb)
recall_cords_dataset <- sum(pred_cords$label)
glue::glue("Text only")
Text only
y <- !is.na(events_sf$name_cleaner); table(y)
y
FALSE  TRUE 
 1831  8638 
pred_text <- MeasuringLandscape:::predict_missingness_dv(y)
[1] train-auc:0.830160+0.001570 test-auc:0.825829+0.006723 
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.

[21]    train-auc:0.892263+0.001669 test-auc:0.877117+0.006347 
[41]    train-auc:0.905476+0.002618 test-auc:0.882680+0.005951 
[50]    train-auc:0.909041+0.002292 test-auc:0.883159+0.005973 
auc_text_dataset <- Metrics::auc(pred_text$label, pred_text$xb)
recall_text_dataset <- sum(pred_text$label)
glue::glue("Military or Text")
Military or Text
y <- !is.na(events_sf$name_cleaner) | #Name isn't missing
     !is.na(events_sf$map_coordinate_clean_latitude); #or it has coordinates
table(y)
y
FALSE  TRUE 
 1377  9092 
pred_cordstext <- MeasuringLandscape:::predict_missingness_dv(y)
[1] train-auc:0.841018+0.002213 test-auc:0.835831+0.011111 
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.

[21]    train-auc:0.905167+0.000436 test-auc:0.889299+0.005454 
[41]    train-auc:0.918015+0.002224 test-auc:0.891847+0.004062 
[50]    train-auc:0.922551+0.001271 test-auc:0.892790+0.004624 
auc_cordstext_dataset <- Metrics::auc(pred_cordstext$label, pred_cordstext$xb)
recall_cordstext_dataset <- sum(pred_cordstext$label)
#Hand Rule
glue::glue("Hand Rule")
Hand Rule
# Key (sort) the suggestions by handrule so that .SD[1] below keeps, for each event, the first row in handrule order
data.table::setkey(georef_all_dt, handrule)
georef_all_dt_handrule <- georef_all_dt[,.SD[1], by=list(event_hash) ]
y <- events_sf$event_hash %in% georef_all_dt_handrule$event_hash #Which events received a imputed location under this rule
# NOTE(review): keeping one row per event_hash never drops an event, so y does not depend on which rule is used as the key;
# the Ensemble Rule section tabulates the same 2248/8221 split -- confirm this is intended
table(y)
y
FALSE  TRUE 
 2248  8221 
pred_source_handrule <- MeasuringLandscape:::predict_missingness_dv(y)
[1] train-auc:0.777390+0.004390 test-auc:0.767687+0.011393 
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.

[21]    train-auc:0.849321+0.001842 test-auc:0.827999+0.007427 
[41]    train-auc:0.861369+0.001751 test-auc:0.832096+0.008182 
[50]    train-auc:0.866109+0.001986 test-auc:0.833090+0.008597 
auc_handrule_dataset <- Metrics::auc(pred_source_handrule$label, pred_source_handrule$xb)
recall_handrule_dataset <- sum(pred_source_handrule$label)
#Ensemble Rule
glue::glue("Ensemble Rule")
Ensemble Rule
# Key (sort) the suggestions by rule_ensemble so that .SD[1] below keeps, for each event, the first row in rule_ensemble order
data.table::setkey(georef_all_dt, rule_ensemble)
georef_all_dt_ensemble <- georef_all_dt[,.SD[1], by=list(event_hash) ]
y <- events_sf$event_hash %in% georef_all_dt_ensemble$event_hash #Which events received a imputed location under this rule
# NOTE(review): keeping one row per event_hash never drops an event, so y does not depend on which rule is used as the key;
# the Hand Rule section tabulates the same 2248/8221 split -- confirm this is intended
table(y)
y
FALSE  TRUE 
 2248  8221 
pred_source_ensemble <- MeasuringLandscape:::predict_missingness_dv(y)
[1] train-auc:0.780940+0.005652 test-auc:0.771963+0.009399 
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.

[21]    train-auc:0.851907+0.003191 test-auc:0.827541+0.006146 
[41]    train-auc:0.863698+0.003920 test-auc:0.831828+0.006781 
[50]    train-auc:0.868085+0.003498 test-auc:0.832048+0.006420 
auc_ensemble_dataset <- Metrics::auc(pred_source_ensemble$label, pred_source_ensemble$xb)
recall_ensemble_dataset <- sum(pred_source_ensemble$label)
#fuzzy
glue::glue("fuzzy")
fuzzy
georef_all_dt_byfuzzy <- georef_all_dt[,
                                       list(distance_km_min=min(distance_km, na.rm=T) ),
                                       by=list(event_hash, fuzzy)
                                       ]
No non-missing values found in at least one group. Returning 'Inf' for such groups to be consistent with base
summary(georef_all_dt_byfuzzy)
  event_hash          fuzzy         distance_km_min   
 Length:13593       Mode :logical   Min.   : 0.01807  
 Class :character   FALSE:5683      1st Qu.: 2.99298  
 Mode  :character   TRUE :7214      Median :37.37747  
                    NA's :696       Mean   :     Inf  
                                    3rd Qu.:     Inf  
                                    Max.   :     Inf  
pred_fuzzy_list <- list()
q_all <- na.omit(unique(georef_all_dt_byfuzzy$fuzzy)) ; table(q_all)
q_all
FALSE  TRUE 
    1     1 
for( q in q_all ){
  print(q)
  y <- events_sf$event_hash %in% 
      georef_all_dt_byfuzzy[fuzzy==q  #The right kind of fuzzy
                            ]$event_hash
  pred_fuzzy_list[[as.character(q)]] <- MeasuringLandscape:::predict_missingness_dv(y)
}
[1] FALSE
[1] train-auc:0.760763+0.002087 test-auc:0.751381+0.010016 
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.

[21]    train-auc:0.801600+0.002656 test-auc:0.772316+0.010604 
Stopping. Best iteration:
[22]    train-auc:0.802831+0.002088 test-auc:0.772947+0.010750

[1] TRUE
[1] train-auc:0.780915+0.003241 test-auc:0.773619+0.008622 
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.

[21]    train-auc:0.823140+0.003137 test-auc:0.796991+0.009034 
[41]    train-auc:0.834482+0.003847 test-auc:0.797205+0.008080 
Stopping. Best iteration:
[36]    train-auc:0.832137+0.003594 test-auc:0.797721+0.008106
auc_fuzzy <- sapply(pred_fuzzy_list, FUN=function(q) Metrics::auc(q$label, q$xb))
recall_fuzzy <- sapply(pred_fuzzy_list, FUN=function(q) sum(q$label) )
#Source dataset
glue::glue("Source dataset")
Source dataset
georef_all_dt_bysource <- georef_all_dt[,list(distance_km_min=min(distance_km, na.rm=T) ),by=list(event_hash, source_dataset)] ; dim(georef_all_dt_bysource)
No non-missing values found in at least one group. Returning 'Inf' for such groups to be consistent with base
[1] 58290     3
georef_all_dt_bysource[!is.finite(distance_km_min), distance_km_min:=NA]
pred_source_dataset_list <- list()
for( q in na.omit(unique(georef_all_dt_bysource$source_dataset)) ){
  print(q)
  y <- events_sf$event_hash %in% georef_all_dt_bysource[source_dataset==q ]$event_hash
  pred_source_dataset_list[[as.character(q)]] <- MeasuringLandscape:::predict_missingness_dv(y)
}
[1] "events"
[1] train-auc:0.769434+0.003422 test-auc:0.763054+0.011735 
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.

[21]    train-auc:0.815349+0.003901 test-auc:0.791311+0.006508 
Stopping. Best iteration:
[21]    train-auc:0.815349+0.003901 test-auc:0.791311+0.006508

[1] "bing"
[1] train-auc:0.750718+0.001182 test-auc:0.737795+0.012900 
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.

[21]    train-auc:0.794599+0.002226 test-auc:0.758811+0.006010 
Stopping. Best iteration:
[29]    train-auc:0.800159+0.001365 test-auc:0.759775+0.005683

[1] "nga"
[1] train-auc:0.770880+0.003116 test-auc:0.759014+0.007822 
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.

[21]    train-auc:0.819542+0.002779 test-auc:0.793420+0.009689 
[41]    train-auc:0.832146+0.002556 test-auc:0.795167+0.009888 
[50]    train-auc:0.835503+0.002805 test-auc:0.795277+0.009902 
[1] "geonames"
[1] train-auc:0.770527+0.001975 test-auc:0.760426+0.008883 
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.

[21]    train-auc:0.818685+0.002154 test-auc:0.793308+0.011331 
[41]    train-auc:0.830110+0.002676 test-auc:0.795345+0.011904 
Stopping. Best iteration:
[35]    train-auc:0.827043+0.001663 test-auc:0.795647+0.011786

[1] "livestock_points"
[1] train-auc:0.774481+0.002335 test-auc:0.767195+0.008053 
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.

[21]    train-auc:0.818804+0.003677 test-auc:0.789986+0.011579 
Stopping. Best iteration:
[25]    train-auc:0.821982+0.004149 test-auc:0.790738+0.011826

[1] "openstreetmap"
[1] train-auc:0.674669+0.001454 test-auc:0.665281+0.005709 
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.

[21]    train-auc:0.755103+0.002599 test-auc:0.710490+0.005086 
[41]    train-auc:0.772432+0.002472 test-auc:0.715184+0.007262 
[50]    train-auc:0.778505+0.001777 test-auc:0.715358+0.007452 
[1] "historical"
[1] train-auc:0.765187+0.002063 test-auc:0.752054+0.006422 
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.

[21]    train-auc:0.807012+0.003043 test-auc:0.778607+0.007958 
[41]    train-auc:0.818725+0.002731 test-auc:0.779360+0.008012 
[50]    train-auc:0.822632+0.001443 test-auc:0.779657+0.008408 
[1] "google"
[1] train-auc:0.762845+0.003636 test-auc:0.747845+0.015454 
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.

[21]    train-auc:0.804601+0.004104 test-auc:0.767300+0.014475 
[41]    train-auc:0.815128+0.003840 test-auc:0.766495+0.012439 
Stopping. Best iteration:
[33]    train-auc:0.811827+0.004018 test-auc:0.767676+0.012807

[1] "wikidata"
[1] train-auc:0.744611+0.002555 test-auc:0.728887+0.008280 
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.

[21]    train-auc:0.793132+0.001818 test-auc:0.753363+0.007966 
Stopping. Best iteration:
[29]    train-auc:0.800007+0.001360 test-auc:0.754428+0.007321

[1] "gadm"
[1] train-auc:0.770666+0.003388 test-auc:0.757944+0.011324 
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.

[21]    train-auc:0.817256+0.003392 test-auc:0.787340+0.014773 
[41]    train-auc:0.829924+0.003937 test-auc:0.788950+0.013814 
[50]    train-auc:0.834424+0.003656 test-auc:0.789107+0.014312 
[1] "kenya_cadastral_district"
[1] train-auc:0.773146+0.008101 test-auc:0.732873+0.016350 
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.

[21]    train-auc:0.845288+0.005454 test-auc:0.782971+0.019089 
Stopping. Best iteration:
[14]    train-auc:0.835930+0.003822 test-auc:0.784113+0.015809

[1] "livestock_boundaries"
[1] train-auc:0.767371+0.002642 test-auc:0.755384+0.010713 
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.

[21]    train-auc:0.810280+0.002879 test-auc:0.776519+0.010880 
[41]    train-auc:0.823395+0.001955 test-auc:0.777546+0.012048 
Stopping. Best iteration:
[39]    train-auc:0.822695+0.002227 test-auc:0.777960+0.011979

[1] "tgn"
[1] train-auc:0.721959+0.006640 test-auc:0.690729+0.012434 
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.

[21]    train-auc:0.825427+0.004686 test-auc:0.746431+0.014124 
[41]    train-auc:0.843449+0.004627 test-auc:0.751312+0.016043 
[50]    train-auc:0.848913+0.004747 test-auc:0.751344+0.015644 
[1] "kenya_district1962"
[1] train-auc:0.854161+0.003213 test-auc:0.817628+0.019381 
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.

[21]    train-auc:0.908943+0.004764 test-auc:0.847120+0.013950 
Stopping. Best iteration:
[11]    train-auc:0.896112+0.002746 test-auc:0.848098+0.016216

[1] "kenya_cadastral"
[1] train-auc:0.904636+0.006685 test-auc:0.852674+0.031010 
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.

[21]    train-auc:0.948584+0.001499 test-auc:0.888132+0.016014 
Stopping. Best iteration:
[19]    train-auc:0.946756+0.000714 test-auc:0.888853+0.015630
auc_source_dataset <- sapply(pred_source_dataset_list, FUN=function(q) Metrics::auc(q$label, q$xb))
recall_source_dataset <- sapply(pred_source_dataset_list, FUN=function(q) sum(q$label) )
#geometry_type
glue::glue("geometry_type")
geometry_type
georef_all_dt_bygeometry_type <- georef_all_dt[,list(distance_km_min=min(distance_km, na.rm=T) ),by=list(event_hash, geometry_type)] 
No non-missing values found in at least one group. Returning 'Inf' for such groups to be consistent with base
georef_all_dt_bygeometry_type[!is.finite(distance_km_min), distance_km_min:=NA]
pred_geometry_type_list <- list()
for( q in na.omit(unique(georef_all_dt_bygeometry_type$geometry_type)) ){
  print(q)
  y <- events_sf$event_hash %in% georef_all_dt_bygeometry_type[geometry_type==q ]$event_hash
  pred_geometry_type_list[[as.character(q)]] <- MeasuringLandscape:::predict_missingness_dv(y)
}
[1] "POINT"
[1] train-auc:0.793843+0.003068 test-auc:0.787794+0.012419 
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.

[21]    train-auc:0.834564+0.001957 test-auc:0.811295+0.009149 
[41]    train-auc:0.846155+0.001758 test-auc:0.812287+0.008703 
Stopping. Best iteration:
[37]    train-auc:0.844202+0.001934 test-auc:0.812594+0.008373

[1] "MULTIPOLYGON"
[1] train-auc:0.764748+0.002854 test-auc:0.754327+0.008761 
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.

[21]    train-auc:0.810010+0.004237 test-auc:0.777654+0.009797 
[41]    train-auc:0.822522+0.004614 test-auc:0.777865+0.009815 
Stopping. Best iteration:
[35]    train-auc:0.820118+0.004601 test-auc:0.778672+0.010421

[1] "POLYGON"
[1] train-auc:0.763042+0.001849 test-auc:0.752128+0.003365 
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.

[21]    train-auc:0.804911+0.001350 test-auc:0.773768+0.006505 
[41]    train-auc:0.819318+0.002769 test-auc:0.774795+0.006044 
Stopping. Best iteration:
[33]    train-auc:0.813731+0.001761 test-auc:0.774945+0.006811

[1] "LINESTRING"
[1] train-auc:0.729706+0.008462 test-auc:0.702811+0.026694 
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.

[21]    train-auc:0.827082+0.004572 test-auc:0.749591+0.020059 
Stopping. Best iteration:
[27]    train-auc:0.832162+0.005524 test-auc:0.751773+0.020394
auc_geometry_type <- sapply(pred_geometry_type_list, FUN=function(q) Metrics::auc(q$label, q$xb))
recall_geometry_type <- sapply(pred_geometry_type_list, FUN=function(q) sum(q$label) )
#Self Reference
glue::glue("Self Reference")
Self Reference
# Minimum distance_km per (event_hash, SelfReference) pair, then report the dimensions of the table just built
# (groups with no non-missing distance yield Inf here; they are set to NA right after)
georef_all_dt_byselfreference <- georef_all_dt[,list(distance_km_min=min(distance_km, na.rm=TRUE) ),by=list(event_hash, SelfReference )] ; dim(georef_all_dt_byselfreference)
No non-missing values found in at least one group. Returning 'Inf' for such groups to be consistent with base
[1] 58290     3
georef_all_dt_byselfreference[!is.finite(distance_km_min), distance_km_min:=NA]
pred_selfreference_list <- list()
for( q in na.omit(unique(georef_all_dt_byselfreference$SelfReference)) ){
  print(q)
  y <- events_sf$event_hash %in% georef_all_dt_byselfreference[SelfReference==q]$event_hash
  pred_selfreference_list[[as.character(q)]] <- MeasuringLandscape:::predict_missingness_dv(y)
}
[1] TRUE
[1] train-auc:0.770494+0.002377 test-auc:0.763557+0.004348 
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.

[21]    train-auc:0.815309+0.000789 test-auc:0.790626+0.003589 
[41]    train-auc:0.827838+0.001723 test-auc:0.791958+0.004529 
[50]    train-auc:0.831600+0.001346 test-auc:0.791737+0.004698 
[1] FALSE
[1] train-auc:0.773348+0.002983 test-auc:0.764485+0.008508 
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.

[21]    train-auc:0.820040+0.001409 test-auc:0.793394+0.006822 
[41]    train-auc:0.831762+0.001024 test-auc:0.796301+0.006800 
[50]    train-auc:0.836174+0.001096 test-auc:0.795641+0.006862 
auc_selfreference <- sapply(pred_selfreference_list, FUN=function(q) Metrics::auc(q$label, q$xb))
recall_selfreference <- sapply(pred_selfreference_list, FUN=function(q) sum(q$label) )
# Assemble one row per fitted missingness model with columns:
#   label  - display name of the georeferencing option
#   auc    - Metrics::auc() of that model's predictions against its labels
#   recall - sum of that model's label vector (number of events the option covers)
#   Type   - family the option belongs to (used to colour/group the plot)
# Scalar metrics produce a single row whose label is set explicitly; named
# vectors (one element per dataset / geometry / flag) produce one row per name.
# Every option must be paired with its OWN auc_*/recall_* objects.
bias_dv_df <- data.table::rbindlist(list(
    # Geographic information present in the original records
    cbind(auc=auc_cords_dataset, recall=recall_cords_dataset) %>% data.frame() %>% tibble::rownames_to_column("label") %>%
      mutate(label="Mil Coords") %>% mutate(Type="Original Geo Info"),
    cbind(auc=auc_text_dataset, recall=recall_text_dataset) %>% data.frame() %>% tibble::rownames_to_column("label") %>%
      mutate(label="Text Location") %>% mutate(Type="Original Geo Info"),
    cbind(auc=auc_cordstext_dataset, recall=recall_cordstext_dataset) %>% data.frame() %>% tibble::rownames_to_column("label") %>%
      mutate(label="Mil Coords or Text Location") %>% mutate(Type="Original Geo Info"),

    # Selection rules: use the metrics of the hand-rule and ensemble-rule models
    # (previously both rows reused the "Mil Coords or Text Location" metrics)
    cbind(auc=auc_handrule_dataset, recall=recall_handrule_dataset) %>% data.frame() %>% tibble::rownames_to_column("label") %>%
      mutate(label="Hand Rule") %>% mutate(Type="Rule"),
    cbind(auc=auc_ensemble_dataset, recall=recall_ensemble_dataset) %>% data.frame() %>% tibble::rownames_to_column("label") %>%
      mutate(label="Ensemble Rule") %>% mutate(Type="Rule"),

    # Row names are "TRUE"/"FALSE" (the SelfReference values); ifelse() coerces them to logical
    cbind(auc=auc_selfreference, recall=recall_selfreference) %>% data.frame() %>%
      tibble::rownames_to_column("label") %>% mutate(label=ifelse(label, "Match to Other Events","No Match to Other Events")) %>%
                        mutate(Type="Allow Match To Other Events"),

    # Row names are "TRUE"/"FALSE" (the fuzzy flag)
    cbind(auc=auc_fuzzy, recall=recall_fuzzy) %>% data.frame() %>% tibble::rownames_to_column("label") %>%
      mutate(label=ifelse(label, "Fuzzy","Exact")) %>% mutate(Type="Match Type"),

    # One row per source dataset, labelled by the dataset name
    cbind(auc=auc_source_dataset, recall=recall_source_dataset) %>% data.frame() %>%
      tibble::rownames_to_column("label") %>% mutate(Type="Source Dataset"),

    # One row per geometry type, labelled by the geometry name
    cbind(auc=auc_geometry_type, recall=recall_geometry_type) %>% data.frame() %>%
      tibble::rownames_to_column("label") %>% mutate(Type="Geometry Type")
))
saveRDS(bias_dv_df,
      file=glue::glue(getwd(), "/../inst/extdata/bias_dv_df.Rds")
)

Plot

(Appendix Figure 9)

#bias_dv_df <- readRDS(system.file("extdata", "bias_dv_df.Rds", package = "MeasuringLandscape"))
# Turn identifiers such as "kenya_cadastral_district" into display text:
# underscores become spaces, then the text is lower-cased and converted to sentence case.
sentence_case <- function(x) {
  spaced <- gsub("_", " ", x)
  lowered <- tolower(spaced)
  stringr::str_to_sentence(lowered)
}
#install.packages("extrafont");
#library(extrafont)
#library(extrafont)
#extrafont::font_import(prompt=F )
#capabilities()
#windowsFonts()
#sort(as.vector(unlist(windowsFonts())))
fonts <- c('Times New Roman',
           'Calibri',
           'Courier New',
           "Georgia",
           "Tunga",
           "Lucida Fax")
           #'serif','Helvetica','Bookman','Palatino')
library(ggplot2)
# Shorten the self-reference labels and rename their Type for plotting (data.table updates by reference).
# NOTE(review): where bias_dv_df is assembled, SelfReference == TRUE is labelled "Match to Other Events";
# here that label becomes "No Self Ref." and the FALSE case becomes "Self Ref." -- this looks inverted, confirm.
bias_dv_df[Type=="Allow Match To Other Events" & label=="No Match to Other Events", label:= "Self Ref.",]
bias_dv_df[Type=="Allow Match To Other Events" & label=="Match to Other Events", label:= "No Self Ref.",]
bias_dv_df[Type=="Allow Match To Other Events" , Type:= "Self Reference",]
# One font face per Type; indexed in aes() below by the integer code of the Type factor
fontfaces <- factor(c("plain","bold","italic","bold.italic","plain","plain"))
# NOTE(review): this palette is not referenced by the plot code below (no manual colour scale is added) -- confirm whether it should be
colours = c("Self Reference" = "#F8766D",
            "Geometry Type" = "#A3A500",
            "Match Type" = "#00BF7D",
            "Rule" = "#00B0F6",
            "Source Dataset"="#E76BF3",
            "Original Geo Info"="#53B400") 
#p_load(ggrepel, tools)
# Scatter of text labels: x = AUC of the missingness model, y = share of all events covered (recall / number of events, rounded to 2 digits)
p_bias_dv <- bias_dv_df %>% 
        #filter(!(label %in% c('kenya_district1962','kenya_cadastral','kenya_cadastral_district',"LINESTRING"))) %>%
        #filter(term != "(Intercept)") %>%
        #mutate(label[Type=="Match Type"]=gsub("FALSE", "Exact", label[Type=="Match Type"])) %>%
        #mutate(label[Type=="Match Type"]=gsub("True", "Fuzzy", label[Type=="Match Type"])) %>%
        # Convert Type and label to sentence case for display
        mutate(Type=sentence_case(Type),
               label=sentence_case(label)
               ) %>%
  # Font family and face are picked per Type via the integer code of the Type factor
  ggplot(aes(x=auc,
             y=round(recall/nrow(events_sf),2),
             color=Type,
             label=label,
             family = fonts[as.numeric(as.factor(Type))],
             fontface= fontfaces[as.numeric(as.factor(Type))]
             ))  + 
  ggrepel::geom_text_repel(size=3) +
  theme_bw() +
  xlab(sentence_case("Predictability of Missingness of Imputed Locations, Area Under the Curve")) +
  ylab(sentence_case("Recovery Rate")) +
  # NOTE(review): newer ggplot2 releases appear to deprecate a numeric legend.position in favour of legend.position.inside -- verify against the installed version
  theme(
    legend.position = c(0.9, 0.3), # c(0,0) bottom left, c(1,1) top-right.
  )
p_bias_dv

 #+ coord_cartesian(y="log")
ggsave(
  filename = glue::glue(dir_figures, "p_bias_dv.pdf"),
  plot = p_bias_dv, width = 9, height = 6,
  device = cairo_pdf #have to use cairo to correctly embed the fonts
) 
