Given a location string, and multiple possible matches to real-world places, this provides some logic for how to rank some matches as better than others.
First, using simple hand rules of what kind of match to prefer over others.
Second, with a supervised model that attempts to predict which match will be geographically closest to the true location (fewest kilometers away from the right answer).
rm(list=ls()); gc()
# Hiding output and warnings
# !diagnostics off
library(MeasuringLandscape)
library(tidyverse)
dir_figures <- glue::glue(getwd(), "/../paper/figures/")
gc()
knitr::opts_knit$set(progress = TRUE, verbose = TRUE)
knitr::opts_chunk$set(fig.width=12, fig.height=8, warning=FALSE, message=FALSE, cache=TRUE)
options(width = 160)
georef_all_dt <- readRDS(paste0(here::here(), "/inst/extdata/georef_all_dt.Rds"))
#Calculate some features
dim(georef_all_dt)
[1] 532711 23
georef_all_dt[!is.finite(distance_km),distance_km:=NA]
georef_all_dt[,SelfReference:=source_dataset=="events"]
georef_all_dt[,fuzzy:=name_cleaner!=georef_b]
#georef_all_dt <- subset(georef_all_dt, distance_km=!0) #This excludes all self references , we need that here but not necessarily elsewhere
georef_all_dt <- georef_all_dt %>% mutate(handrule1 = dplyr::recode(as.character(fuzzy), "TRUE" = 2, "FALSE" = 1))
georef_all_dt <- georef_all_dt %>% mutate(handrule2 = dplyr::recode(as.character(SelfReference), "TRUE" = 1, "FALSE" = 2))
table(georef_all_dt$geometry_type)
LINESTRING MULTIPOLYGON POINT POLYGON
3165 14703 503335 8981
georef_all_dt <- georef_all_dt %>% mutate(handrule3 = dplyr::recode(as.character(geometry_type), "POINT" = 1, "MULTIPOLYGON"=2, "POLYGON" = 3,"LINESTRING"=4))
table(georef_all_dt$source_dataset, useNA="always") #why are their missing sources? A: Because there are events labels with no matching coordinates at all
bing events gadm geonames google historical kenya_cadastral kenya_cadastral_district
9436 254162 10729 60988 10591 49962 412 1033
kenya_district1962 livestock_boundaries livestock_points nga openstreetmap tgn wikidata <NA>
676 8738 35978 57180 16040 2830 11429 2527
georef_all_dt <- georef_all_dt %>%
mutate(handrule4 = dplyr::recode(as.character(source_dataset),
"events" = 1,
"historical"=2,
"nga"=3,
"geonames" = 4,
"gadm"=5,
"livestock_points"=6,
"bing"=7,
"livestock_boundaries"=8,
"wikidata"=9,
"tgn"=10,
"kenya_cadastral_district"=11,
"kenya_district1962"=12,
"google"=13,
"openstreetmap"=14,
"kenya_cadastral"=15
)) %>%
data.table::as.data.table()
table(georef_all_dt$source_dataset,
georef_all_dt$handrule4, useNA="always")
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 <NA>
bing 0 0 0 0 0 0 9436 0 0 0 0 0 0 0 0 0
events 254162 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
gadm 0 0 0 0 10729 0 0 0 0 0 0 0 0 0 0 0
geonames 0 0 0 60988 0 0 0 0 0 0 0 0 0 0 0 0
google 0 0 0 0 0 0 0 0 0 0 0 0 10591 0 0 0
historical 0 49962 0 0 0 0 0 0 0 0 0 0 0 0 0 0
kenya_cadastral 0 0 0 0 0 0 0 0 0 0 0 0 0 0 412 0
kenya_cadastral_district 0 0 0 0 0 0 0 0 0 0 1033 0 0 0 0 0
kenya_district1962 0 0 0 0 0 0 0 0 0 0 0 676 0 0 0 0
livestock_boundaries 0 0 0 0 0 0 0 8738 0 0 0 0 0 0 0 0
livestock_points 0 0 0 0 0 35978 0 0 0 0 0 0 0 0 0 0
nga 0 0 57180 0 0 0 0 0 0 0 0 0 0 0 0 0
openstreetmap 0 0 0 0 0 0 0 0 0 0 0 0 0 16040 0 0
tgn 0 0 0 0 0 0 0 0 0 2830 0 0 0 0 0 0
wikidata 0 0 0 0 0 0 0 0 11429 0 0 0 0 0 0 0
<NA> 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2527
georef_all_dt$handrule <- paste(stringr::str_pad(georef_all_dt$handrule1, 2, pad = "0"),
stringr::str_pad(georef_all_dt$handrule2,2, pad = "0"),
stringr::str_pad(georef_all_dt$handrule3,2, pad = "0"),
stringr::str_pad(georef_all_dt$handrule4, 2, pad = "0"),
sep="_")
data.table::setkey(georef_all_dt, handrule )
temp <- subset(georef_all_dt,event_hash=="be37b40f")
saveRDS(georef_all_dt,
file=glue::glue(getwd(), "/../inst/extdata/georef_all_dt.Rds")
)
Note, the immediately following code chunk is no longer used or maintained. It has been set to eval=F so it will not run automatically. A user running code by hand for replication should skip this chunk.
vars_x_string <- c(
"Jaro",
"Optimal_String_Alignment" ,
"Levenshtein",
"Damerau_Levenshtein" ,
"Longest_Common_Substring" ,
"q_gram_1",
"q_gram_2",
"q_gram_3",
"q_gram_4",
"q_gram_5",
'Cosine_1',
'Cosine_2',
'Cosine_3',
'Cosine_4',
'Cosine_5',
"Jaccard" ,
"First_Mistmatch" ,
"a_nchar" ,
"b_nchar" ,
"ab_nchar_diff" ,
"dJaro",
"dOptimal_String_Alignment" ,
"dLevenshtein" ,
"dDamerau_Levenshtein" ,
"dLongest_Common_Substring",
"dq_gram",
"dCosine",
"dJaccard"
)
georef_all_dt$a <- georef_all_dt$name_cleaner
georef_all_dt$b <- georef_all_dt$georef_b
georef_all_dt <- MeasuringLandscape:::toponym_add_distances_dt(georef_all_dt)
dpredict<-xgb.DMatrix(data= as.matrix(georef_all_dt[,vars_x_string, with=F]), missing = NA)
georef_all_dt$toponym_xb_model_prediction <- predict( toponym_xb_model, dpredict )
georef_all_dt$toponym_xb_model_prediction <- 1/(1 + exp(-georef_all_dt$toponym_xb_model_prediction ))
Select features
vars_y <- "distance_km"
vars_x <- c("source_dataset","geometry_type","SelfReference","fuzzy",
"document_district_clean",
"document_unit_type",
"document_date_best_year",
"toponym_xb_model_prediction",
"feature_code")
vars_all <- c(vars_x,vars_y)
xy_all <- georef_all_dt[,vars_all,with=F]
xy_all <- as.data.frame(xy_all)
dim(xy_all)
[1] 532711 10
#p_load(xgboost,dummies)
condition <- !is.na(xy_all$distance_km) & xy_all$distance_km!=0
x_all <- xy_all[,vars_x]
x_all <- dummies::dummy.data.frame(x_all,
all=T,
dummy.classes=c('character','factor','ordered') ) #Have to specify the dummy columns, because getOption("dummy.classes") is failing to be found in this namespace
x_train <- x_all[condition,]
label <- log(xy_all$distance_km[condition])
Develop training and test splits
ids <- georef_all_dt$name_cleaner[condition]
ids_unique <- sample(sort(unique(ids)))
chunks <- split(ids_unique, ceiling(seq_along(ids_unique)/ (length(ids_unique)/5)))
fold1 <- which(ids %in% chunks[[1]])
fold2 <- which(ids %in% chunks[[2]])
fold3 <- which(ids %in% chunks[[3]])
fold4 <- which(ids %in% chunks[[4]])
fold5 <- which(ids %in% chunks[[5]])
folds <- list(fold1,fold2,fold3,fold4,fold5)
#label= label #
condition <- !is.na(label)
dtrain <- xgboost::xgb.DMatrix(data=as.matrix( x_train ), label = label, missing = NA )
dpredict <- xgboost::xgb.DMatrix(data=as.matrix( x_all ), missing = NA )
param <- list("objective" ="reg:linear", #"objective" = logregobj,
#"scale_pos_weight" = sumwneg / sumwpos,
"eta" = 0.3,
"max_depth" = 6,
"eval_metric" = "rmse",
"silent" = 1,
"nthread" = 48,
'maximize'=T)
#choose folds by name so we're not cheating
#First cross validation to get credible out of sample accuracy
xb <- xgboost::xgb.cv(params=param,
data=dtrain,
nrounds = 100,
early_stopping_rounds=10,
#nfold=5,
folds=folds,
prediction=T)
[1] train-rmse:2.215960+0.006381 test-rmse:2.251821+0.050686
Multiple eval metrics are present. Will use test_rmse for early stopping.
Will train until test_rmse hasn't improved in 10 rounds.
[2] train-rmse:1.708904+0.004499 test-rmse:1.766501+0.062566
[3] train-rmse:1.392307+0.003548 test-rmse:1.465286+0.069398
[4] train-rmse:1.204573+0.003678 test-rmse:1.289565+0.068088
[5] train-rmse:1.097004+0.004138 test-rmse:1.188437+0.058479
[6] train-rmse:1.038022+0.004030 test-rmse:1.133218+0.051482
[7] train-rmse:1.005890+0.003023 test-rmse:1.108456+0.049016
[8] train-rmse:0.988091+0.004577 test-rmse:1.090660+0.044450
[9] train-rmse:0.978515+0.004071 test-rmse:1.081975+0.041584
[10] train-rmse:0.972519+0.004176 test-rmse:1.078056+0.040417
[11] train-rmse:0.968287+0.004509 test-rmse:1.075447+0.039519
[12] train-rmse:0.964832+0.004142 test-rmse:1.073183+0.038638
[13] train-rmse:0.962704+0.003764 test-rmse:1.071396+0.038265
[14] train-rmse:0.961593+0.003588 test-rmse:1.070196+0.037689
[15] train-rmse:0.960042+0.003683 test-rmse:1.071757+0.038962
[16] train-rmse:0.958017+0.003418 test-rmse:1.071095+0.038559
[17] train-rmse:0.957319+0.003479 test-rmse:1.070730+0.038417
[18] train-rmse:0.955255+0.003705 test-rmse:1.071056+0.038192
[19] train-rmse:0.953743+0.003818 test-rmse:1.070643+0.038123
[20] train-rmse:0.952704+0.003335 test-rmse:1.070461+0.038372
[21] train-rmse:0.950996+0.003626 test-rmse:1.069882+0.038533
[22] train-rmse:0.949852+0.003620 test-rmse:1.069466+0.038085
[23] train-rmse:0.948339+0.003458 test-rmse:1.069263+0.037830
[24] train-rmse:0.947634+0.003605 test-rmse:1.070079+0.038836
[25] train-rmse:0.947088+0.003525 test-rmse:1.069788+0.038960
[26] train-rmse:0.946164+0.003418 test-rmse:1.068958+0.039123
[27] train-rmse:0.945300+0.003965 test-rmse:1.068751+0.038951
[28] train-rmse:0.944482+0.003855 test-rmse:1.068460+0.038949
[29] train-rmse:0.942923+0.003708 test-rmse:1.068221+0.038972
[30] train-rmse:0.942078+0.003828 test-rmse:1.067969+0.038698
[31] train-rmse:0.941581+0.003534 test-rmse:1.068218+0.038652
[32] train-rmse:0.940857+0.003673 test-rmse:1.068140+0.038659
[33] train-rmse:0.940342+0.003730 test-rmse:1.067821+0.038559
[34] train-rmse:0.939569+0.004075 test-rmse:1.067525+0.038903
[35] train-rmse:0.938742+0.004188 test-rmse:1.067498+0.038906
[36] train-rmse:0.938097+0.004275 test-rmse:1.066835+0.038600
[37] train-rmse:0.937164+0.004648 test-rmse:1.066779+0.038505
[38] train-rmse:0.936795+0.004719 test-rmse:1.066720+0.038536
[39] train-rmse:0.936298+0.004657 test-rmse:1.066647+0.038519
[40] train-rmse:0.935870+0.004479 test-rmse:1.066568+0.038538
[41] train-rmse:0.935247+0.004268 test-rmse:1.066850+0.038711
[42] train-rmse:0.934551+0.004297 test-rmse:1.066563+0.038914
[43] train-rmse:0.934179+0.004342 test-rmse:1.066861+0.038994
[44] train-rmse:0.933954+0.004292 test-rmse:1.066875+0.039097
[45] train-rmse:0.933126+0.004360 test-rmse:1.067064+0.039080
[46] train-rmse:0.932713+0.004214 test-rmse:1.066859+0.039131
[47] train-rmse:0.932284+0.004249 test-rmse:1.066838+0.039025
[48] train-rmse:0.931705+0.003812 test-rmse:1.066420+0.040249
[49] train-rmse:0.931059+0.003889 test-rmse:1.066066+0.040467
[50] train-rmse:0.930395+0.004169 test-rmse:1.066799+0.041536
[51] train-rmse:0.929924+0.004181 test-rmse:1.066701+0.041543
[52] train-rmse:0.928958+0.003976 test-rmse:1.065950+0.041699
[53] train-rmse:0.928663+0.004079 test-rmse:1.065904+0.041826
[54] train-rmse:0.928058+0.003878 test-rmse:1.065524+0.041863
[55] train-rmse:0.927153+0.003885 test-rmse:1.064744+0.041728
[56] train-rmse:0.926067+0.004262 test-rmse:1.064443+0.041548
[57] train-rmse:0.925753+0.004443 test-rmse:1.062514+0.041334
[58] train-rmse:0.925019+0.004858 test-rmse:1.063515+0.041895
[59] train-rmse:0.924537+0.005106 test-rmse:1.063298+0.042186
[60] train-rmse:0.923764+0.005208 test-rmse:1.063202+0.042159
[61] train-rmse:0.923294+0.005300 test-rmse:1.063119+0.041993
[62] train-rmse:0.922900+0.005061 test-rmse:1.063272+0.041848
[63] train-rmse:0.922193+0.004803 test-rmse:1.063809+0.042485
[64] train-rmse:0.921936+0.004912 test-rmse:1.063832+0.042502
[65] train-rmse:0.921276+0.004392 test-rmse:1.063767+0.042799
[66] train-rmse:0.920213+0.004314 test-rmse:1.064766+0.043753
[67] train-rmse:0.919014+0.004270 test-rmse:1.064722+0.043891
Stopping. Best iteration:
[57] train-rmse:0.925753+0.004443 test-rmse:1.062514+0.041334
#Then fit a single model for variable importance
xb2 <- xgboost::xgb.train(params=param,
data=dtrain,
nrounds = 100,
#early_stopping_rounds=10,
#nfold=5,
prediction=T)
#predictions_train <- predict(xb2, dtrain)
predictions_all <- predict(xb2, dpredict) #predict over everything
#Replace with cross validation predictions
condition <- !is.na(xy_all$distance_km) & xy_all$distance_km!=0 #what was used to subset to train above
cross_valid_predictions <- xb$pred
predictions_all[condition] <- cross_valid_predictions #but for the subset we trained on, only use predictions from out of sample folds.
georef_all_dt$rule_ensemble <- exp(predictions_all)
setkey(georef_all_dt, handrule )
Error in setkey(georef_all_dt, handrule) :
could not find function "setkey"
Evaluate accuracy (Appendix Figure 7)
#plot(georef_all_dt$rule_ensemble, georef_all_dt$distance_km)
p_ensemble_risiduals <- georef_all_dt %>% ggplot(aes(x=log(distance_km), y=log(rule_ensemble) )) +
geom_point(alpha=.01) +
geom_smooth(span=.2) +
theme_bw() +
ggtitle("Observed versus predicted Distance to Match (Ensemble Residuals)") +
xlab("Observed distance between event and gazeteer suggestion (log km)") +
ylab("Predicted distance between event and gazeteer suggestion (log km) (out of sample)")
p_ensemble_risiduals
ggsave(
filename = glue::glue(dir_figures, "p_ensemble_risiduals.png"), #save as a png because there are too many points and it's crashing the pdf reader
plot = p_ensemble_risiduals,
width = 10,
height = 8#,
#device = cairo_pdf #have to use cairo to correctly embed the fonts
)
Variable importance (Appendix Figure 8)
importance_importance <- xgboost::xgb.importance(feature_names=colnames(dtrain), model = xb2) #won't calculate on cv
pdf(file=glue::glue(dir_figures, "p_variable_importance_supervised_ensemble.pdf"), width=5.5, height=6)
xgboost::xgb.plot.importance(importance_importance %>%
head(50) #Top 50 features
)
#autoplot(uauc1)
dev.off()
null device
1
xgboost::xgb.plot.importance(importance_importance %>%
head(50) #Top 50 features
)