Demonstrates that georeferencing options vary in how systematically they include some kinds of events while excluding other kinds. Asks whether missingness of geographic information is truly random, or whether certain kinds of events or places are being systematically dropped.
rm(list=ls()); gc()  # clear the workspace -- this is a knitted analysis script run top-to-bottom
# !diagnostics off
library(MeasuringLandscape)
library(tidyverse)
# Figures are written to the paper's figures directory, relative to the working dir
dir_figures <- glue::glue(getwd(), "/../paper/figures/")
gc()
knitr::opts_knit$set(progress = TRUE, verbose = TRUE)
knitr::opts_chunk$set(fig.width=12, fig.height=8, warning=FALSE, message=FALSE, cache=TRUE)
options(width = 160)
#Load Events
# events_sf: event-level spatial data shipped with the package
events_sf <- readRDS(system.file("extdata", "events_sf.Rdata", package = "MeasuringLandscape"))
# For each location_text, count how many of its events carry a hand-entered
# military map coordinate (map_coordinate_has)
events_sf_text_coord_unique <- plyr::ddply(events_sf[,c('location_text','name_clean','name_cleaner','document_district_clean','map_coordinate_clean_latitude','map_coordinate_clean_longitude')],
"location_text", transform,
map_coordinate_has =sum(!is.na(map_coordinate_clean_latitude))
)
#Reload from scratch each time in case we subset something weirdly
georef_all_dt <- readRDS(system.file("extdata", "georef_all_dt_recomendations.Rds", package = "MeasuringLandscape"))
# Sanity checks: coverage of events in the georeferencing suggestion table
table(events_sf$name_cleaner %in% georef_all_dt$name_cleaner) #All events are in here
TRUE
10469
table(events_sf$name_cleaner %in% georef_all_dt$name_cleaner[!is.na(georef_all_dt$georef_b)]) #7,942 events with at least one gazetteer suggestion
FALSE TRUE
2527 7942
#Exclude all distance = 0 obs, those are self matches
georef_all_dt <- subset(georef_all_dt,
!is.na(name_cleaner) & # must have a name
(is.na(distance_km) | distance_km!=0) ) #Can be either missing or not zero. Only thing we drop is zero because that's a self match
Proof of concept: show that the approach works on the original missingness.
glue::glue("Military coordinates only")
Military coordinates only
# DV: does the event carry a hand-entered military map coordinate?
y <- !is.na(events_sf$map_coordinate_clean_latitude); table(y)
y
FALSE TRUE
4877 5592
# Fit an xgboost classifier predicting this missingness indicator from event covariates
pred_cords <- MeasuringLandscape:::predict_missingness_dv(y)
[1] train-auc:0.804097+0.003315 test-auc:0.799740+0.006637
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.
[21] train-auc:0.889264+0.001468 test-auc:0.872310+0.006274
[41] train-auc:0.898854+0.002677 test-auc:0.877003+0.003656
[50] train-auc:0.901976+0.002753 test-auc:0.877772+0.003660
# AUC of the missingness model; "recall" here is the count of events with the DV present
auc_cords_dataset <- Metrics::auc(pred_cords$label, pred_cords$xb)
recall_cords_dataset <- sum(pred_cords$label)
glue::glue("Text only")
Text only
# DV: does the event have a (cleaned) location name in the text?
y <- !is.na(events_sf$name_cleaner); table(y)
y
FALSE TRUE
1831 8638
pred_text <- MeasuringLandscape:::predict_missingness_dv(y)
[1] train-auc:0.830160+0.001570 test-auc:0.825829+0.006723
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.
[21] train-auc:0.892263+0.001669 test-auc:0.877117+0.006347
[41] train-auc:0.905476+0.002618 test-auc:0.882680+0.005951
[50] train-auc:0.909041+0.002292 test-auc:0.883159+0.005973
# AUC and count of events with a text location
auc_text_dataset <- Metrics::auc(pred_text$label, pred_text$xb)
recall_text_dataset <- sum(pred_text$label)
glue::glue("Military or Text")
Military or Text
# DV: event has either a location name or a military coordinate
y <- !is.na(events_sf$name_cleaner) | #Name isn't missing
!is.na(events_sf$map_coordinate_clean_latitude); #or it has coordinates
table(y)
y
FALSE TRUE
1377 9092
pred_cordstext <- MeasuringLandscape:::predict_missingness_dv(y)
[1] train-auc:0.841018+0.002213 test-auc:0.835831+0.011111
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.
[21] train-auc:0.905167+0.000436 test-auc:0.889299+0.005454
[41] train-auc:0.918015+0.002224 test-auc:0.891847+0.004062
[50] train-auc:0.922551+0.001271 test-auc:0.892790+0.004624
# AUC and count of events with either coordinates or a text location
auc_cordstext_dataset <- Metrics::auc(pred_cordstext$label, pred_cordstext$xb)
recall_cordstext_dataset <- sum(pred_cordstext$label)
#Hand Rule
glue::glue("Hand Rule")
Hand Rule
# Sort suggestions by the hand rule's ranking and keep the top one per event
data.table::setkey(georef_all_dt, handrule)
georef_all_dt_handrule <- georef_all_dt[,.SD[1], by=list(event_hash) ]
y <- events_sf$event_hash %in% georef_all_dt_handrule$event_hash #Which events received a imputed location under this rule
table(y)
y
FALSE TRUE
2248 8221
pred_source_handrule <- MeasuringLandscape:::predict_missingness_dv(y)
[1] train-auc:0.777390+0.004390 test-auc:0.767687+0.011393
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.
[21] train-auc:0.849321+0.001842 test-auc:0.827999+0.007427
[41] train-auc:0.861369+0.001751 test-auc:0.832096+0.008182
[50] train-auc:0.866109+0.001986 test-auc:0.833090+0.008597
# AUC and count of events georeferenced under the hand rule
auc_handrule_dataset <- Metrics::auc(pred_source_handrule$label, pred_source_handrule$xb)
recall_handrule_dataset <- sum(pred_source_handrule$label)
#Ensemble Rule
glue::glue("Ensemble Rule")
Ensemble Rule
# Sort suggestions by the ensemble rule's ranking and keep the top one per event
data.table::setkey(georef_all_dt, rule_ensemble)
georef_all_dt_ensemble <- georef_all_dt[,.SD[1], by=list(event_hash) ]
y <- events_sf$event_hash %in% georef_all_dt_ensemble$event_hash #Which events received a imputed location under this rule
table(y)
y
FALSE TRUE
2248 8221
pred_source_ensemble <- MeasuringLandscape:::predict_missingness_dv(y)
[1] train-auc:0.780940+0.005652 test-auc:0.771963+0.009399
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.
[21] train-auc:0.851907+0.003191 test-auc:0.827541+0.006146
[41] train-auc:0.863698+0.003920 test-auc:0.831828+0.006781
[50] train-auc:0.868085+0.003498 test-auc:0.832048+0.006420
# AUC and count of events georeferenced under the ensemble rule
auc_ensemble_dataset <- Metrics::auc(pred_source_ensemble$label, pred_source_ensemble$xb)
recall_ensemble_dataset <- sum(pred_source_ensemble$label)
#fuzzy
glue::glue("fuzzy")
fuzzy
# Minimum suggestion distance per event, split by exact vs fuzzy name matching
georef_all_dt_byfuzzy <- georef_all_dt[,
list(distance_km_min=min(distance_km, na.rm=T) ),
by=list(event_hash, fuzzy)
]
No non-missing values found in at least one group. Returning 'Inf' for such groups to be consistent with base
summary(georef_all_dt_byfuzzy)
event_hash fuzzy distance_km_min
Length:13593 Mode :logical Min. : 0.01807
Class :character FALSE:5683 1st Qu.: 2.99298
Mode :character TRUE :7214 Median :37.37747
NA's :696 Mean : Inf
3rd Qu.: Inf
Max. : Inf
pred_fuzzy_list <- list()
q_all <- na.omit(unique(georef_all_dt_byfuzzy$fuzzy)) ; table(q_all)
q_all
FALSE TRUE
1 1
# Fit one missingness model per match type (exact vs fuzzy)
for( q in q_all ){
print(q)
y <- events_sf$event_hash %in%
georef_all_dt_byfuzzy[fuzzy==q #The right kind of fuzzy
]$event_hash
pred_fuzzy_list[[as.character(q)]] <- MeasuringLandscape:::predict_missingness_dv(y)
}
[1] FALSE
[1] train-auc:0.760763+0.002087 test-auc:0.751381+0.010016
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.
[21] train-auc:0.801600+0.002656 test-auc:0.772316+0.010604
Stopping. Best iteration:
[22] train-auc:0.802831+0.002088 test-auc:0.772947+0.010750
[1] TRUE
[1] train-auc:0.780915+0.003241 test-auc:0.773619+0.008622
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.
[21] train-auc:0.823140+0.003137 test-auc:0.796991+0.009034
[41] train-auc:0.834482+0.003847 test-auc:0.797205+0.008080
Stopping. Best iteration:
[36] train-auc:0.832137+0.003594 test-auc:0.797721+0.008106
# Per-match-type AUC and recovered-event counts (named by "TRUE"/"FALSE")
auc_fuzzy <- sapply(pred_fuzzy_list, FUN=function(q) Metrics::auc(q$label, q$xb))
recall_fuzzy <- sapply(pred_fuzzy_list, FUN=function(q) sum(q$label) )
#Source dataset
glue::glue("Source dataset")
Source dataset
georef_all_dt_bysource <- georef_all_dt[,list(distance_km_min=min(distance_km, na.rm=T) ),by=list(event_hash, source_dataset)] ; dim(georef_all_dt_bysource)
No non-missing values found in at least one group. Returning 'Inf' for such groups to be consistent with base
[1] 58290 3
# min() returns Inf when every distance in a group is NA; recode to NA
georef_all_dt_bysource[!is.finite(distance_km_min), distance_km_min:=NA]
pred_source_dataset_list <- list()
# Fit one missingness model per gazetteer source
for( q in na.omit(unique(georef_all_dt_bysource$source_dataset)) ){
print(q)
y <- events_sf$event_hash %in% georef_all_dt_bysource[source_dataset==q ]$event_hash
pred_source_dataset_list[[as.character(q)]] <- MeasuringLandscape:::predict_missingness_dv(y)
}
[1] "events"
[1] train-auc:0.769434+0.003422 test-auc:0.763054+0.011735
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.
[21] train-auc:0.815349+0.003901 test-auc:0.791311+0.006508
Stopping. Best iteration:
[21] train-auc:0.815349+0.003901 test-auc:0.791311+0.006508
[1] "bing"
[1] train-auc:0.750718+0.001182 test-auc:0.737795+0.012900
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.
[21] train-auc:0.794599+0.002226 test-auc:0.758811+0.006010
Stopping. Best iteration:
[29] train-auc:0.800159+0.001365 test-auc:0.759775+0.005683
[1] "nga"
[1] train-auc:0.770880+0.003116 test-auc:0.759014+0.007822
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.
[21] train-auc:0.819542+0.002779 test-auc:0.793420+0.009689
[41] train-auc:0.832146+0.002556 test-auc:0.795167+0.009888
[50] train-auc:0.835503+0.002805 test-auc:0.795277+0.009902
[1] "geonames"
[1] train-auc:0.770527+0.001975 test-auc:0.760426+0.008883
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.
[21] train-auc:0.818685+0.002154 test-auc:0.793308+0.011331
[41] train-auc:0.830110+0.002676 test-auc:0.795345+0.011904
Stopping. Best iteration:
[35] train-auc:0.827043+0.001663 test-auc:0.795647+0.011786
[1] "livestock_points"
[1] train-auc:0.774481+0.002335 test-auc:0.767195+0.008053
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.
[21] train-auc:0.818804+0.003677 test-auc:0.789986+0.011579
Stopping. Best iteration:
[25] train-auc:0.821982+0.004149 test-auc:0.790738+0.011826
[1] "openstreetmap"
[1] train-auc:0.674669+0.001454 test-auc:0.665281+0.005709
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.
[21] train-auc:0.755103+0.002599 test-auc:0.710490+0.005086
[41] train-auc:0.772432+0.002472 test-auc:0.715184+0.007262
[50] train-auc:0.778505+0.001777 test-auc:0.715358+0.007452
[1] "historical"
[1] train-auc:0.765187+0.002063 test-auc:0.752054+0.006422
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.
[21] train-auc:0.807012+0.003043 test-auc:0.778607+0.007958
[41] train-auc:0.818725+0.002731 test-auc:0.779360+0.008012
[50] train-auc:0.822632+0.001443 test-auc:0.779657+0.008408
[1] "google"
[1] train-auc:0.762845+0.003636 test-auc:0.747845+0.015454
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.
[21] train-auc:0.804601+0.004104 test-auc:0.767300+0.014475
[41] train-auc:0.815128+0.003840 test-auc:0.766495+0.012439
Stopping. Best iteration:
[33] train-auc:0.811827+0.004018 test-auc:0.767676+0.012807
[1] "wikidata"
[1] train-auc:0.744611+0.002555 test-auc:0.728887+0.008280
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.
[21] train-auc:0.793132+0.001818 test-auc:0.753363+0.007966
Stopping. Best iteration:
[29] train-auc:0.800007+0.001360 test-auc:0.754428+0.007321
[1] "gadm"
[1] train-auc:0.770666+0.003388 test-auc:0.757944+0.011324
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.
[21] train-auc:0.817256+0.003392 test-auc:0.787340+0.014773
[41] train-auc:0.829924+0.003937 test-auc:0.788950+0.013814
[50] train-auc:0.834424+0.003656 test-auc:0.789107+0.014312
[1] "kenya_cadastral_district"
[1] train-auc:0.773146+0.008101 test-auc:0.732873+0.016350
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.
[21] train-auc:0.845288+0.005454 test-auc:0.782971+0.019089
Stopping. Best iteration:
[14] train-auc:0.835930+0.003822 test-auc:0.784113+0.015809
[1] "livestock_boundaries"
[1] train-auc:0.767371+0.002642 test-auc:0.755384+0.010713
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.
[21] train-auc:0.810280+0.002879 test-auc:0.776519+0.010880
[41] train-auc:0.823395+0.001955 test-auc:0.777546+0.012048
Stopping. Best iteration:
[39] train-auc:0.822695+0.002227 test-auc:0.777960+0.011979
[1] "tgn"
[1] train-auc:0.721959+0.006640 test-auc:0.690729+0.012434
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.
[21] train-auc:0.825427+0.004686 test-auc:0.746431+0.014124
[41] train-auc:0.843449+0.004627 test-auc:0.751312+0.016043
[50] train-auc:0.848913+0.004747 test-auc:0.751344+0.015644
[1] "kenya_district1962"
[1] train-auc:0.854161+0.003213 test-auc:0.817628+0.019381
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.
[21] train-auc:0.908943+0.004764 test-auc:0.847120+0.013950
Stopping. Best iteration:
[11] train-auc:0.896112+0.002746 test-auc:0.848098+0.016216
[1] "kenya_cadastral"
[1] train-auc:0.904636+0.006685 test-auc:0.852674+0.031010
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.
[21] train-auc:0.948584+0.001499 test-auc:0.888132+0.016014
Stopping. Best iteration:
[19] train-auc:0.946756+0.000714 test-auc:0.888853+0.015630
# Per-source AUC and recovered-event counts (named by source_dataset)
auc_source_dataset <- sapply(pred_source_dataset_list, FUN=function(q) Metrics::auc(q$label, q$xb))
recall_source_dataset <- sapply(pred_source_dataset_list, FUN=function(q) sum(q$label) )
#geometry_type
glue::glue("geometry_type")
geometry_type
georef_all_dt_bygeometry_type <- georef_all_dt[,list(distance_km_min=min(distance_km, na.rm=T) ),by=list(event_hash, geometry_type)]
No non-missing values found in at least one group. Returning 'Inf' for such groups to be consistent with base
# min() returns Inf when every distance in a group is NA; recode to NA
georef_all_dt_bygeometry_type[!is.finite(distance_km_min), distance_km_min:=NA]
pred_geometry_type_list <- list()
# Fit one missingness model per gazetteer geometry type
for( q in na.omit(unique(georef_all_dt_bygeometry_type$geometry_type)) ){
print(q)
y <- events_sf$event_hash %in% georef_all_dt_bygeometry_type[geometry_type==q ]$event_hash
pred_geometry_type_list[[as.character(q)]] <- MeasuringLandscape:::predict_missingness_dv(y)
}
[1] "POINT"
[1] train-auc:0.793843+0.003068 test-auc:0.787794+0.012419
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.
[21] train-auc:0.834564+0.001957 test-auc:0.811295+0.009149
[41] train-auc:0.846155+0.001758 test-auc:0.812287+0.008703
Stopping. Best iteration:
[37] train-auc:0.844202+0.001934 test-auc:0.812594+0.008373
[1] "MULTIPOLYGON"
[1] train-auc:0.764748+0.002854 test-auc:0.754327+0.008761
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.
[21] train-auc:0.810010+0.004237 test-auc:0.777654+0.009797
[41] train-auc:0.822522+0.004614 test-auc:0.777865+0.009815
Stopping. Best iteration:
[35] train-auc:0.820118+0.004601 test-auc:0.778672+0.010421
[1] "POLYGON"
[1] train-auc:0.763042+0.001849 test-auc:0.752128+0.003365
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.
[21] train-auc:0.804911+0.001350 test-auc:0.773768+0.006505
[41] train-auc:0.819318+0.002769 test-auc:0.774795+0.006044
Stopping. Best iteration:
[33] train-auc:0.813731+0.001761 test-auc:0.774945+0.006811
[1] "LINESTRING"
[1] train-auc:0.729706+0.008462 test-auc:0.702811+0.026694
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.
[21] train-auc:0.827082+0.004572 test-auc:0.749591+0.020059
Stopping. Best iteration:
[27] train-auc:0.832162+0.005524 test-auc:0.751773+0.020394
# Per-geometry-type AUC and recovered-event counts
auc_geometry_type <- sapply(pred_geometry_type_list, FUN=function(q) Metrics::auc(q$label, q$xb))
recall_geometry_type <- sapply(pred_geometry_type_list, FUN=function(q) sum(q$label) )
#Self Reference
glue::glue("Self Reference")
Self Reference
# Minimum suggestion distance per event, split by whether the match was a
# self reference.
# FIX: the trailing diagnostic previously printed dim(georef_all_dt_bysource)
# (the wrong table, copied from the source-dataset section); report the table
# just built instead.
georef_all_dt_byselfreference <- georef_all_dt[,list(distance_km_min=min(distance_km, na.rm=T) ),by=list(event_hash, SelfReference )] ; dim(georef_all_dt_byselfreference)
No non-missing values found in at least one group. Returning 'Inf' for such groups to be consistent with base
[1] 58290 3
# min() returns Inf when every distance in a group is NA; recode to NA
georef_all_dt_byselfreference[!is.finite(distance_km_min), distance_km_min:=NA]
pred_selfreference_list <- list()
# Fit one missingness model per self-reference status (TRUE/FALSE)
for( q in na.omit(unique(georef_all_dt_byselfreference$SelfReference)) ){
print(q)
y <- events_sf$event_hash %in% georef_all_dt_byselfreference[SelfReference==q]$event_hash
pred_selfreference_list[[as.character(q)]] <- MeasuringLandscape:::predict_missingness_dv(y)
}
[1] TRUE
[1] train-auc:0.770494+0.002377 test-auc:0.763557+0.004348
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.
[21] train-auc:0.815309+0.000789 test-auc:0.790626+0.003589
[41] train-auc:0.827838+0.001723 test-auc:0.791958+0.004529
[50] train-auc:0.831600+0.001346 test-auc:0.791737+0.004698
[1] FALSE
[1] train-auc:0.773348+0.002983 test-auc:0.764485+0.008508
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.
[21] train-auc:0.820040+0.001409 test-auc:0.793394+0.006822
[41] train-auc:0.831762+0.001024 test-auc:0.796301+0.006800
[50] train-auc:0.836174+0.001096 test-auc:0.795641+0.006862
# Per-self-reference-status AUC and recovered-event counts (named "TRUE"/"FALSE")
auc_selfreference <- sapply(pred_selfreference_list, FUN=function(q) Metrics::auc(q$label, q$xb))
recall_selfreference <- sapply(pred_selfreference_list, FUN=function(q) sum(q$label) )
# Assemble one long table of (AUC, recall) for every georeferencing option,
# for the appendix figure. AUC = how predictable missingness is from event
# covariates (higher = more systematically biased exclusion); recall = count
# of events that received a location under that option.
# FIX: the "Hand Rule" and "Ensemble Rule" rows previously reused
# auc_cordstext_dataset/recall_cordstext_dataset (copy-paste error), leaving
# auc_handrule_dataset, recall_handrule_dataset, auc_ensemble_dataset, and
# recall_ensemble_dataset computed above but never used. Also compare label
# strings explicitly ("TRUE") instead of relying on ifelse() coercing
# character to logical.
bias_dv_df <- data.table::rbindlist(list(
  cbind(auc=auc_cords_dataset, recall=recall_cords_dataset) %>% data.frame() %>% tibble::rownames_to_column("label") %>%
    mutate(label="Mil Coords") %>% mutate(Type="Original Geo Info"),
  cbind(auc=auc_text_dataset, recall=recall_text_dataset) %>% data.frame() %>% tibble::rownames_to_column("label") %>%
    mutate(label="Text Location") %>% mutate(Type="Original Geo Info"),
  cbind(auc=auc_cordstext_dataset, recall=recall_cordstext_dataset) %>% data.frame() %>% tibble::rownames_to_column("label") %>%
    mutate(label="Mil Coords or Text Location") %>% mutate(Type="Original Geo Info"),
  cbind(auc=auc_handrule_dataset, recall=recall_handrule_dataset) %>% data.frame() %>% tibble::rownames_to_column("label") %>%
    mutate(label="Hand Rule") %>% mutate(Type="Rule"),
  cbind(auc=auc_ensemble_dataset, recall=recall_ensemble_dataset) %>% data.frame() %>% tibble::rownames_to_column("label") %>%
    mutate(label="Ensemble Rule") %>% mutate(Type="Rule"),
  # label holds the list names "TRUE"/"FALSE" as character strings
  cbind(auc=auc_selfreference, recall=recall_selfreference) %>% data.frame() %>%
    tibble::rownames_to_column("label") %>%
    mutate(label=ifelse(label=="TRUE", "Match to Other Events","No Match to Other Events")) %>%
    mutate(Type="Allow Match To Other Events"),
  cbind(auc=auc_fuzzy, recall=recall_fuzzy) %>% data.frame() %>% tibble::rownames_to_column("label") %>%
    mutate(label=ifelse(label=="TRUE", "Fuzzy","Exact")) %>% mutate(Type="Match Type"),
  cbind(auc=auc_source_dataset, recall=recall_source_dataset) %>% data.frame() %>%
    tibble::rownames_to_column("label") %>% mutate(Type="Source Dataset"),
  cbind(auc=auc_geometry_type, recall=recall_geometry_type) %>% data.frame() %>%
    tibble::rownames_to_column("label") %>% mutate(Type="Geometry Type")
))
# Cache the table inside the package's extdata so the plot below can be
# reproduced without refitting every model
saveRDS(bias_dv_df,
  file=glue::glue(getwd(), "/../inst/extdata/bias_dv_df.Rds")
)
Plot
(Appendix Figure 9)
#bias_dv_df <- readRDS(system.file("extdata", "bias_dv_df.Rds", package = "MeasuringLandscape"))
# Helper: "SOURCE_DATASET" -> "Source dataset" style labels for the figure
sentence_case <- function(x) stringr::str_to_sentence(tolower(gsub("_"," ",x)))
#install.packages("extrafont");
#library(extrafont)
#library(extrafont)
#extrafont::font_import(prompt=F )
#capabilities()
#windowsFonts()
#sort(as.vector(unlist(windowsFonts())))
# One font per Type so the categories are visually distinguishable in the plot
fonts <- c('Times New Roman',
'Calibri',
'Courier New',
"Georgia",
"Tunga",
"Lucida Fax")
#'serif','Helvetica','Bookman','Palatino')
library(ggplot2)
# Relabel the self-reference rows for the figure.
# NOTE(review): this maps "No Match to Other Events" -> "Self Ref." and
# "Match to Other Events" -> "No Self Ref.", which looks inverted relative to
# the ifelse() that created those labels -- confirm the intended semantics.
bias_dv_df[Type=="Allow Match To Other Events" & label=="No Match to Other Events", label:= "Self Ref.",]
bias_dv_df[Type=="Allow Match To Other Events" & label=="Match to Other Events", label:= "No Self Ref.",]
bias_dv_df[Type=="Allow Match To Other Events" , Type:= "Self Reference",]
# One fontface per Type (parallel to fonts above)
fontfaces <- factor(c("plain","bold","italic","bold.italic","plain","plain"))
# Fixed colour mapping so each Type keeps the same colour across revisions
colours = c("Self Reference" = "#F8766D",
"Geometry Type" = "#A3A500",
"Match Type" = "#00BF7D",
"Rule" = "#00B0F6",
"Source Dataset"="#E76BF3",
"Original Geo Info"="#53B400")
#p_load(ggrepel, tools)
# Scatter of AUC (x, predictability of missingness) vs recovery rate (y),
# with one repelled text label per georeferencing option.
p_bias_dv <- bias_dv_df %>%
#filter(!(label %in% c('kenya_district1962','kenya_cadastral','kenya_cadastral_district',"LINESTRING"))) %>%
#filter(term != "(Intercept)") %>%
#mutate(label[Type=="Match Type"]=gsub("FALSE", "Exact", label[Type=="Match Type"])) %>%
#mutate(label[Type=="Match Type"]=gsub("True", "Fuzzy", label[Type=="Match Type"])) %>%
mutate(Type=sentence_case(Type),
label=sentence_case(label)
) %>%
ggplot(aes(x=auc,
y=round(recall/nrow(events_sf),2),
color=Type,
label=label,
# NOTE(review): indexing fonts/fontfaces by as.numeric(as.factor(Type))
# assumes exactly six Type levels in alphabetical order -- verify if the
# set of Types ever changes
family = fonts[as.numeric(as.factor(Type))],
fontface= fontfaces[as.numeric(as.factor(Type))]
)) +
ggrepel::geom_text_repel(size=3) +
theme_bw() +
xlab(sentence_case("Predictability of Missingness of Imputed Locations, Area Under the Curve")) +
ylab(sentence_case("Recovery Rate")) +
theme(
legend.position = c(0.9, 0.3), # c(0,0) bottom left, c(1,1) top-right.
)
p_bias_dv
#+ coord_cartesian(y="log")
# Appendix Figure 9: written to the paper's figures directory
ggsave(
filename = glue::glue(dir_figures, "p_bias_dv.pdf"),
plot = p_bias_dv, width = 9, height = 6,
device = cairo_pdf #have to use cairo to correctly embed the fonts
)