Demonstrates that georeferencing options vary terms of recall (how many event locations they recover) and accuracy (how far away their imputed locations tend to be from the true location)
rm(list=ls()); gc()
# !diagnostics off
library(MeasuringLandscape)
library(tidyverse)
dir_figures <- glue::glue(getwd(), "/../paper/figures/")
gc()
knitr::opts_knit$set(progress = TRUE, verbose = TRUE)
knitr::opts_chunk$set(fig.width=12, fig.height=8, warning=FALSE, message=FALSE, cache=TRUE)
options(width = 160)
Load the events file
#Load Events
events_sf <- readRDS(system.file("extdata", "events_sf.Rdata", package = "MeasuringLandscape"))
print(dim(events_sf))
[1] 10469 104
events_sf_text_coord_unique <- plyr::ddply(events_sf[,c('location_text','name_clean','name_cleaner','document_district_clean','map_coordinate_clean_latitude','map_coordinate_clean_longitude')],
"location_text", transform,
map_coordinate_has =sum(!is.na(map_coordinate_clean_latitude))
)
Load the suggestions file
#Reload from scratch each time in case we subset sometehing weirdly
# georef_all_dt_recommendations <- readRDS(system.file("extdata", "georef_all_dt_recomendations.Rds", package = "MeasuringLandscape"))
georef_all_dt <- readRDS(system.file("extdata", "georef_all_dt_recomendations.Rds", package = "MeasuringLandscape"))
glue::glue("Size of the Suggestions dataset ", nrow(georef_all_dt))
Size of the Suggestions dataset 532711
table(!is.na(events_sf$name_cleaner)) #8,638 events have at least some text
FALSE TRUE
1831 8638
table(events_sf$name_cleaner %in% georef_all_dt$name_cleaner) #8,242 events have a name in our suggestions list
TRUE
10469
table(events_sf$name_cleaner %in% georef_all_dt$name_cleaner[!is.na(georef_all_dt$georef_b)]) #7,532 events with at least one gazetteer suggestion
FALSE TRUE
2527 7942
georef_all_dt %>% ggplot(aes(x=distance_km)) + geom_histogram(bins = 100)
georef_all_dt %>% summary(distance_km)
event_hash name_cleaner document_district_clean document_unit_type document_date_best_year georef_b toponym_xb_model_prediction place_hash source_dataset geometry_type
Length:532711 Length:532711 Length:532711 Length:532711 Min. :1952 Length:532711 Min. :0.5001 Length:532711 Length:532711 Length:532711
Class :character Class :character Class :character Class :character 1st Qu.:1953 Class :character 1st Qu.:0.9788 Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character Median :1954 Mode :character Median :0.9974 Mode :character Mode :character Mode :character
Mean :1954 Mean :0.9428
3rd Qu.:1954 3rd Qu.:0.9975
Max. :2052 Max. :0.9996
NA's :63468 NA's :2527
feature_code X1 Y1 X2 Y2 distance_km SelfReference fuzzy handrule1 handrule2 handrule3 handrule4
Length:532711 Min. :34.88 Min. :-2.70 Min. :27.27 Min. :-4.6667 Min. : 0.00 Mode :logical Mode :logical Min. :1.000 Min. :1.000 Min. :1.000 Min. : 1.000
Class :character 1st Qu.:36.85 1st Qu.:-0.84 1st Qu.:36.82 1st Qu.:-0.9375 1st Qu.: 12.79 FALSE:276022 FALSE:167425 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.: 1.000
Mode :character Median :36.99 Median :-0.65 Median :36.97 Median :-0.6723 Median : 27.83 TRUE :254162 TRUE :362759 Median :2.000 Median :2.000 Median :1.000 Median : 2.000
Mean :37.04 Mean :-0.65 Mean :37.02 Mean :-0.6804 Mean : 43.97 NA's :2527 NA's :2527 Mean :1.684 Mean :1.521 Mean :1.079 Mean : 3.195
3rd Qu.:37.20 3rd Qu.:-0.45 3rd Qu.:37.20 3rd Qu.:-0.4615 3rd Qu.: 62.52 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:1.000 3rd Qu.: 4.000
Max. :41.72 Max. : 3.86 Max. :41.52 Max. : 4.0616 Max. :747.43 Max. :2.000 Max. :2.000 Max. :4.000 Max. :15.000
NA's :195733 NA's :195733 NA's :2527 NA's :2527 NA's :196476 NA's :2527 NA's :2527 NA's :2527 NA's :2527
handrule rule_ensemble
Length:532711 Min. : 0.872
Class :character 1st Qu.: 16.220
Mode :character Median : 21.756
Mean : 31.592
3rd Qu.: 41.900
Max. :364.257
table(georef_all_dt$distance_km==0) #These are identical matches, same event on the left hand and right hand sides
FALSE TRUE
331127 5108
#Exclude all distance = 0 obs, those are self matches
georef_all_dt <- subset(georef_all_dt,
!is.na(name_cleaner) & # must have a name
(is.na(distance_km) | distance_km!=0) ) #Can be either missing or not zero. Only thing we drop is zero because that's a self match
table(georef_all_dt$distance_km==0) #These are identical matches, same event on the left hand and right hand sides
FALSE
331127
This section calculates the recall rate and geographic accuracy of gazetteer suggestions. Recall is the number of events that receive at least one suggestion under a given set of rules. Accuracy is the distance between an observed location with a military coordinate and an imputed location for the same event.
Accuracy is measured differently for hand rule/ensemble and every other specific decision. Hand rule/ensemble accuracy is the distance observed for the best match selected by that process.
Accuracy for the remainder of decisions is the BEST CASE accuracy for any suggestion matching that rule. For example, if the decision is “point” geometry type, then accuracy would be the smallest observed distance of every single suggestion with a point type.
Hand Rules
print("Hand Rule Orderings")
[1] "Hand Rule Orderings"
table(georef_all_dt$handrule)
01_01_01_01 01_02_01_02 01_02_01_03 01_02_01_04 01_02_01_06 01_02_01_07 01_02_01_09 01_02_01_10 01_02_01_13 01_02_01_14 01_02_02_05 01_02_02_11 01_02_02_12 01_02_02_14 01_02_03_08 01_02_03_15 01_02_04_14
116885 7253 7301 7772 6175 3063 1641 1320 2326 1846 3354 816 587 70 2075 22 169
02_01_01_01 02_02_01_02 02_02_01_03 02_02_01_04 02_02_01_06 02_02_01_07 02_02_01_09 02_02_01_10 02_02_01_13 02_02_01_14 02_02_02_05 02_02_02_11 02_02_02_12 02_02_02_14 02_02_02_15 02_02_03_08 02_02_03_14
132169 42709 49879 53216 29803 6373 9788 1510 8265 8933 7375 217 89 1953 242 6663 73
02_02_03_15 02_02_04_14 NA_NA_NA_NA
148 2996 696
#Hand Rule
data.table::setkey(georef_all_dt, event_hash, handrule) #Sort by event and then hand rule
georef_all_dt_handrule <- georef_all_dt[,.SD[1], by=list(event_hash) ] #Take the smallest hand rule score, lower is better
georef_all_dt_handrule_summary <- georef_all_dt_handrule[,list(
rule="Hand Rule",
distance_rmse=sqrt(mean(distance_km^2, na.rm=T)),
distance_mse=mean(distance_km^2, na.rm=T),
distance_me=mean(distance_km, na.rm=T),
distance_median=median(distance_km, na.rm=T),
source_dataset_count=.N
)
]
print("Distance for Hand Rule")
[1] "Distance for Hand Rule"
georef_all_dt_handrule_summary
Ensemble Rule
summary(georef_all_dt$rule_ensemble)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.872 16.538 21.756 31.803 42.354 364.257
georef_all_dt %>% ggplot(aes(x=rule_ensemble)) + geom_histogram(bins=100)
#Ensemble Rule
data.table::setkey(georef_all_dt, rule_ensemble)
georef_all_dt_ensemble <- georef_all_dt[,.SD[1], by=list(event_hash) ] #Take the smallest estimated distance, lower is better
georef_all_dt_ensemble_summary <- georef_all_dt_ensemble[,list(
rule="Ensemble Rule",
distance_rmse=sqrt(mean(distance_km^2, na.rm=T)) ,
distance_mse=mean(distance_km^2, na.rm=T),
distance_me=mean(distance_km, na.rm=T),
distance_median=median(distance_km, na.rm=T),
source_dataset_count=.N
)
]
print("Distance for Ensemble")
[1] "Distance for Ensemble"
georef_all_dt_ensemble_summary
rbind(georef_all_dt_handrule %>% mutate(label="hand"),
georef_all_dt_ensemble %>% mutate(label="ensemble")) %>%
ggplot(aes(x=distance_km, fill=label)) + geom_histogram()
Now calculate the minimum observed distance for any suggestion matching a specific decision.
#Minimum distance by event-source
georef_all_dt_bysource <- georef_all_dt[,
list(distance_min=min(distance_km, na.rm=T) ),
by=list(event_hash, source_dataset)] ; dim(georef_all_dt_bysource)
No non-missing values found in at least one group. Returning 'Inf' for such groups to be consistent with base
[1] 58290 3
georef_all_dt_bysource[!is.finite(distance_min), distance_min:=NA]
#Minimum distance and recall by source
georef_all_dt_bysource_summary <- georef_all_dt_bysource[,
list(
distance_rmse=sqrt(mean(distance_min^2, na.rm=T)) ,
source_dataset_count=.N),
by=list(source_dataset)
]
georef_all_dt_byfuzzy <- georef_all_dt[
,list(distance_min=min(distance_km, na.rm=T) ),
by=list(event_hash, fuzzy)] ;
No non-missing values found in at least one group. Returning 'Inf' for such groups to be consistent with base
georef_all_dt_byfuzzy[!is.finite(distance_min), distance_min:=NA]
georef_all_dt_byfuzzy_summary <- georef_all_dt_byfuzzy[,
list(distance_rmse=sqrt(mean(distance_min^2, na.rm=T)) ,
source_dataset_count=.N),
by=list(fuzzy) ]
georef_all_dt_byselfreference <- georef_all_dt[,list(distance_min=min(distance_km, na.rm=T) ),
by=list(event_hash, SelfReference)] ; dim(georef_all_dt_bysource)
No non-missing values found in at least one group. Returning 'Inf' for such groups to be consistent with base
[1] 58290 3
georef_all_dt_byselfreference[!is.finite(distance_min), distance_min:=NA]
georef_all_dt_byselfreference_summary <- georef_all_dt_byselfreference[,list(
distance_rmse=sqrt(mean(distance_min^2, na.rm=T)) ,
source_dataset_count=.N),
by=list(SelfReference)
]
georef_all_dt_bygeometry_type <- georef_all_dt[,list(distance_min=min(distance_km, na.rm=T) ),by=list(event_hash, geometry_type
)] ; dim(georef_all_dt_bysource)
No non-missing values found in at least one group. Returning 'Inf' for such groups to be consistent with base
[1] 58290 3
georef_all_dt_bygeometry_type[!is.finite(distance_min), distance_min:=NA]
georef_all_dt_bygeometry_type <- georef_all_dt_bygeometry_type[,list(
distance_rmse=sqrt(mean(distance_min^2, na.rm=T)) ,
source_dataset_count=.N),
by=list(geometry_type)
]
georef_all_dt_milcoords_summary <- data.frame(rule="Mil. Coord",
distance_rmse=0,
source_dataset_count=sum(!is.na(events_sf$map_coordinate_clean_latitude))
)
recall_accuracy <- data.table::rbindlist(list(
#georef_all_dt_milcoords_summary %>% mutate(label="Mil. Coord") %>% mutate(Type="Rule"),
georef_all_dt_handrule_summary %>% mutate(label="Hand Rule") %>% mutate(Type="Rule"),
georef_all_dt_ensemble_summary %>% mutate(label="Ensemble Rule") %>% mutate(Type="Rule"),
georef_all_dt_bysource_summary %>% mutate(label=source_dataset) %>% mutate(Type="Source Gazetteer"),
georef_all_dt_byfuzzy_summary %>% mutate(label=ifelse(fuzzy, "Fuzzy","Exact")) %>% mutate(Type="Match Type"),
georef_all_dt_byselfreference_summary %>% mutate(label=ifelse(SelfReference, "Match To Other Events","No Match To Other Events")) %>%
mutate(Type="Allow Match To Other Events"),
georef_all_dt_bygeometry_type %>% mutate(label=geometry_type) %>% mutate(Type="Geometry Type")
),fill=T) %>% mutate(recall_rate=source_dataset_count/10469)
Combine and plot the results for every specific decision and the ensembles.
(Figure 2, Comparison of recall and accuracy by georeferencing method)
sentence_case <- function(x) stringr::str_to_sentence(tolower(gsub("_"," ",x)))
#The first time you run this, you'll need to install these additional fonts
#install.packages("extrafont");
#library(extrafont)
#library(extrafont)
#extrafont::font_import(prompt=F )
#capabilities()
#windowsFonts()
#sort(as.vector(unlist(windowsFonts())))
fonts <- c('Times New Roman',
'Calibri',
'Courier New',
"Georgia",
"Tunga",
"Lucida Fax")
#'serif','Helvetica','Bookman','Palatino')
#library(ggplot2)
fontfaces <- factor(c("plain","bold","italic","bold.italic","plain","plain"))
colours = c("Self reference" = "#F8766D",
"Geometry type" = "#A3A500",
"Match type" = "#00BF7D",
"Rule" = "#00B0F6",
"Source dataset"="#E76BF3",
"Original geo info"="#53B400")
#p_load(ggrepel, tools)
p_RecallBias <- recall_accuracy %>%
filter(!label %in% "tgn") %>% #exclude TGN for being an outlier
mutate(
label=sentence_case(label),
Type=sentence_case(Type)
) %>%
ggplot(
aes(x=distance_rmse,
y=source_dataset_count / 10469,
color=Type,
label=label,
family = fonts[as.numeric(as.factor(Type))],
fontface= fontfaces[as.numeric(as.factor(Type))]
)
) +
ggrepel::geom_text_repel(size=3) +
theme_bw() +
xlab(stringr::str_to_sentence("Root Mean Squared Distance from Observed Coordinate (km)")) +
ylab(stringr::str_to_sentence("Recovery Rate")) +
theme(
legend.position = c(0.2, 0.3), # c(0,0) bottom left, c(1,1) top-right.
)
Error in eval(lhs, parent, parent) : object 'recall_accuracy' not found
In terms of accuracy, we find wide variation between decisions. In brief: Ensemble Rule > Hand Rule Exact > Fuzzy Point > Multipolygon > Polygon > Linestring Match to Other Events > No Match to Other Events events > nga > livestock_points" > historical > geonames > bing> kenya_cadastral_district > livestock_boundaries > gadm > kenya_district1962 > wikidata > google > kenya_cadastral > openstreetmap > tgn
The hand rules and supervised ensemble represent a compromise between high recovery rate and best case accuracy of each individual decision.