Demonstrate which kinds of events tend to be systematically excluded, here in terms of whether or not the event received an original military map coordinate.

# Start from an empty workspace and reclaim memory before anything is loaded.
# NOTE(review): rm(list = ls()) wipes the caller's workspace when this chunk is
# run interactively; kept here because the notebook has always done so.
rm(list = ls())
gc()
# !diagnostics off
library(MeasuringLandscape)
library(tidyverse)

# Folder, relative to the working directory, where paper figures are written.
# The trailing slash matters: file names are appended to this string directly.
dir_figures <- glue::glue(getwd(), "/../paper/figures/")
gc()

# knitr behaviour for the whole notebook: show progress, default figure size,
# suppress warnings/messages in the output, and cache chunk results.
knitr::opts_knit$set(progress = TRUE, verbose = TRUE)
knitr::opts_chunk$set(
  fig.width = 12,
  fig.height = 8,
  warning = FALSE,
  message = FALSE,
  cache = TRUE
)
options(width = 160)

# Turn a label into sentence case: underscores are replaced by spaces, the
# text is lower-cased, and stringr::str_to_sentence() is applied to the result.
sentence_case <- function(x) {
  spaced <- gsub("_", " ", x)
  stringr::str_to_sentence(tolower(spaced))
}
# Load the events object shipped in the MeasuringLandscape package's extdata.
events_sf <- readRDS(
  system.file("extdata", "events_sf.Rdata", package = "MeasuringLandscape")
)

# Keep the location/name/district/coordinate columns and, within each distinct
# location_text, add map_coordinate_has: the count of rows in that group whose
# cleaned map latitude is not missing.
events_sf_text_coord_unique <- plyr::ddply(
  events_sf[, c(
    "location_text",
    "name_clean",
    "name_cleaner",
    "document_district_clean",
    "map_coordinate_clean_latitude",
    "map_coordinate_clean_longitude"
  )],
  "location_text",
  transform,
  map_coordinate_has = sum(!is.na(map_coordinate_clean_latitude))
)

Plot the predicted effects for a single model: whether an event has military coordinates or not.

# Fit the missingness model. The outcome passed in is TRUE for every event
# whose cleaned map latitude is NA.
pred_cords <- MeasuringLandscape:::predict_missingness_dv(
  is.na(events_sf$map_coordinate_clean_latitude)
)
[1] train-auc:0.805745+0.002232 test-auc:0.796851+0.005518 
Multiple eval metrics are present. Will use test_auc for early stopping.
Will train until test_auc hasn't improved in 10 rounds.

[21]    train-auc:0.888206+0.001797 test-auc:0.869619+0.010641 
[41]    train-auc:0.898716+0.002047 test-auc:0.874282+0.009499 
[50]    train-auc:0.904202+0.003233 test-auc:0.875923+0.008717 
# Unpack the components returned by predict_missingness_dv().
rf <- pred_cords$xb_model
train <- pred_cords$x_all_pre_dummy
label <- pred_cords$label
x_all_pre_dummy <- pred_cords$x_all_pre_dummy

# Expand character/factor/ordered predictors into dummy columns and wrap the
# result, together with the outcome labels, in an xgboost DMatrix.
x_all <- dummies::dummy.data.frame(
  pred_cords$x_all_pre_dummy,
  all = TRUE,
  dummy.classes = c("character", "factor", "ordered")
)
dtrain <- xgboost::xgb.DMatrix(
  data = as.matrix(x_all),
  label = label,
  missing = NA
)

#options(na.action='na.pass')
#testdata_dummy <-  model.matrix(~ . - 1, pred_cords$x_all_pre_dummy)
#options(na.action='na.omit')

# Dummy-coded, numeric-only copy of the predictors. tibble::as_tibble()
# replaces the deprecated as.tibble() alias.
testdata_dummy <- x_all_pre_dummy %>%
  tibble::as_tibble() %>%
  fastDummies::dummy_columns() %>%
  dplyr::select_if(is.numeric)

#testdata_dummy <- dummies::dummy.data.frame(x_all_pre_dummy, drop=F,
#                                   dummy.classes=c('character','factor','ordered'))

# dtest is built from the dummy-coded matrix returned by the model helper.
# A DMatrix built from testdata_dummy used to be assigned to dtest first and
# was overwritten immediately; that dead store is dropped, so the final value
# of dtest is unchanged.
dtest <- xgboost::xgb.DMatrix(
  data = as.matrix(pred_cords$postdummy),
  missing = NA
)

Importance scores for each variable, predicting the missingness of exact map coordinates as a function of each event’s details.

# Feature-importance table for the fitted booster, labelled with the
# dummy-coded column names, followed by its bar plot.
importance_importance <- xgboost::xgb.importance(
  feature_names = names(testdata_dummy),
  model = rf
)
xgboost::xgb.plot.importance(importance_importance)

# Settings shared by the partial-effect panels.
histogram <- TRUE
scale <- 2

# Panel a: partial-effects plot for the document date type.
a <- MeasuringLandscape:::plot_partial_effects(
  rf = pred_cords$xb_model,
  outcome = "mapcoordinate_clean_missing",
  var = "document_date_type",
  minsize = 100,
  scale = scale,
  histogram = TRUE
)
[1] "unknown"
[1] "missing"
[1] "on the"
[1] "to"
[1] "week"
[1] "period"
[1] "fortnight"
[1] "ending"
# Panel b: partial-effects plot for the best-guess document year.
b <- MeasuringLandscape:::plot_partial_effects(
  rf = pred_cords$xb_model,
  outcome = "mapcoordinate_clean_missing",
  var = "document_date_best_year",
  minsize = 100,
  scale = scale,
  histogram = TRUE
)
[1] "1952"
[1] "1953"
[1] "1954"
[1] "1955"
[1] "1956"
# Panel c: partial-effects plot for the (aggregated) initiator of the event.
c <- MeasuringLandscape:::plot_partial_effects(
  rf = pred_cords$xb_model,
  outcome = "mapcoordinate_clean_missing",
  var = "initiator_clean_1_aggmed",
  minsize = 100,
  scale = scale,
  histogram = TRUE
)
[1] "british military"
[1] "civilians"
[1] "colonial authorities"
[1] "home guard"
[1] "kings african rifles"
[1] "military (generic)"
[1] "police (generic)"
[1] "suspected insurgents"
# Panel d: partial-effects plot for the (aggregated) target of the event.
d <- MeasuringLandscape:::plot_partial_effects(
  rf = pred_cords$xb_model,
  outcome = "mapcoordinate_clean_missing",
  var = "target_clean_1_aggmed",
  minsize = 100,
  scale = scale,
  histogram = TRUE
)
[1] "british military"
[1] "civilians"
[1] "colonial authorities"
[1] "home guard"
[1] "kings african rifles"
[1] "military (generic)"
[1] "police (generic)"
[1] "suspected insurgents"
# Panel e: partial-effects plot for the (aggregated) act type.
e <- MeasuringLandscape:::plot_partial_effects(
  rf = pred_cords$xb_model,
  outcome = "mapcoordinate_clean_missing",
  var = "type_clean_aggmed",
  minsize = 100,
  scale = scale,
  histogram = TRUE
)
[1] "physical violence"
[1] "security operations"
[1] "property destruction"
[1] "rebel capture"
[1] "theft"
[1] "oathing"
# Panel f: partial-effects plot for the type of unit that issued the document.
f <- MeasuringLandscape:::plot_partial_effects(
  rf = pred_cords$xb_model,
  outcome = "mapcoordinate_clean_missing",
  var = "document_unit_type",
  minsize = 100,
  scale = scale,
  histogram = TRUE
)
[1] "City"
[1] "District"
[1] "Operation Jock Scott"
[1] "Province"
# Panel g: partial-effects plot for the document's district. This is the only
# panel that passes the pre-dummy training data explicitly via `train`.
g <- MeasuringLandscape:::plot_partial_effects(
  rf = pred_cords$xb_model,
  outcome = "mapcoordinate_clean_missing",
  var = "document_district_clean",
  minsize = 100,
  train = pred_cords$x_all_pre_dummy,
  scale = scale,
  histogram = TRUE
)
[1] "Central Province"
[1] "Embu"
[1] "FORT HALL"
[1] "Nairobi"
[1] "Kiambu"
[1] "Laikipia"
[1] "Machakos"
[1] "Meru"
[1] "Naivasha"
[1] "Nakuru"
[1] "Nanyuki"
[1] "Narok"
[1] "Nyeri"
[1] "Rift Valley"
[1] "Thika"
# Panel h: partial-effects plot for the year in which the event occurred.
h <- MeasuringLandscape:::plot_partial_effects(
  rf = pred_cords$xb_model,
  outcome = "mapcoordinate_clean_missing",
  var = "event_date_clean_year",
  minsize = 100,
  scale = scale,
  histogram = TRUE
)
[1] "1952"
[1] "1953"
[1] "1954"
[1] "1955"
[1] "1956"
#e <- plot_partial_effects(rf=pred_cords$xb_model,
#                  outcome="mapcoordinate_clean_missing",var="locationtext_ruleclean_suffix",minsize=100)
#p_load(cowplot)
# Combine the eight partial-effect panels into a three-column grid. Each panel
# gets a sentence-case title; only the last panel also gets a y-axis label.
final_histogram <- cowplot::plot_grid(
  a + ggtitle(sentence_case("Document Date Type")),
  b + ggtitle(sentence_case("Document Year")),
  c + ggtitle(sentence_case("Initiator")),
  d + ggtitle(sentence_case("Target")),
  e + ggtitle(sentence_case("Act Type")),
  f + ggtitle(sentence_case("Document Unit")),
  g + ggtitle(sentence_case("Document District")),
  h +
    ggtitle(sentence_case("Event Year")) +
    ylab(sentence_case("Probability of Missing Military Coordinates")),
  #i+ggtitle('Reporting Office'),
  ncol = 3,
  align = "hv"
) #,rel_heights=heights)
final_histogram

# Write the combined figure into the paper's figures directory as a PDF.
ggsave(
  filename = glue::glue(dir_figures, "rf_mapcoordinate_clean_missing.pdf"),
  plot = final_histogram,
  width = 10,
  height = 8
)
 
LS0tCnRpdGxlOiAiMDkgUHJlZGljdGVkIEVmZmVjdHMiCmF1dGhvcjogIlJleCBXLiBEb3VnbGFzcyBhbmQgS3Jpc3RlbiBIYXJrbmVzcyIKZGF0ZTogIk1hcmNoIDksIDIwMTgiCm91dHB1dDogCiAgaHRtbF9ub3RlYm9vazoKICAgIHRvYzogdHJ1ZQogICAgdG9jX2Zsb2F0OiB0cnVlCmVkaXRvcl9vcHRpb25zOiAKICBjaHVua19vdXRwdXRfdHlwZTogaW5saW5lCi0tLQo8c3R5bGU+CiAgICBib2R5IC5tYWluLWNvbnRhaW5lciB7CiAgICAgICAgbWF4LXdpZHRoOiAxMDAlOwogICAgfQo8L3N0eWxlPgoKCkRlbW9uc3RyYXRlIHdoYXQga2luZHMgb2YgZXZlbnRzIHRlbmQgdG8gc3lzdGVtYXRpY2FsbHkgZ2V0IGV4Y2x1ZGVkLiBIZXJlLCBpbiB0ZXJtcyBvZiB3aGV0aGVyIHRoZSBldmVudCB3b3VsZCBoYXZlIHJlY2VpdmVkIGFuIG9yaWdpbmFsIG1pbGl0YXJ5IGNvb3JkaW5hdGUgb3Igbm90LgoKYGBge3IgLCByZXN1bHRzPSdoaWRlJywgbWVzc2FnZT1GQUxTRSwgd2FybmluZz1GQUxTRX0Kcm0obGlzdD1scygpKTsgZ2MoKQojICFkaWFnbm9zdGljcyBvZmYKbGlicmFyeShNZWFzdXJpbmdMYW5kc2NhcGUpCmxpYnJhcnkodGlkeXZlcnNlKQoKZGlyX2ZpZ3VyZXMgPC0gZ2x1ZTo6Z2x1ZShnZXR3ZCgpLCAiLy4uL3BhcGVyL2ZpZ3VyZXMvIikKCmdjKCkKCmtuaXRyOjpvcHRzX2tuaXQkc2V0KHByb2dyZXNzID0gVFJVRSwgdmVyYm9zZSA9IFRSVUUpCmtuaXRyOjpvcHRzX2NodW5rJHNldChmaWcud2lkdGg9MTIsIGZpZy5oZWlnaHQ9OCwgIHdhcm5pbmc9RkFMU0UsIG1lc3NhZ2U9RkFMU0UsIGNhY2hlPVRSVUUpCm9wdGlvbnMod2lkdGggPSAxNjApCgpzZW50ZW5jZV9jYXNlIDwtIGZ1bmN0aW9uKHgpIHN0cmluZ3I6OnN0cl90b19zZW50ZW5jZSh0b2xvd2VyKGdzdWIoIl8iLCIgIix4KSkpCgpgYGAKCgpgYGB7cn0KCiNMb2FkIEV2ZW50cwpldmVudHNfc2YgPC0gcmVhZFJEUyhzeXN0ZW0uZmlsZSgiZXh0ZGF0YSIsICJldmVudHNfc2YuUmRhdGEiLCBwYWNrYWdlID0gIk1lYXN1cmluZ0xhbmRzY2FwZSIpKSAKCmV2ZW50c19zZl90ZXh0X2Nvb3JkX3VuaXF1ZSA8LSBwbHlyOjpkZHBseShldmVudHNfc2ZbLGMoJ2xvY2F0aW9uX3RleHQnLAogICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAnbmFtZV9jbGVhbicsJ25hbWVfY2xlYW5lcicsJ2RvY3VtZW50X2Rpc3RyaWN0X2NsZWFuJywnbWFwX2Nvb3JkaW5hdGVfY2xlYW5fbGF0aXR1ZGUnLCdtYXBfY29vcmRpbmF0ZV9jbGVhbl9sb25naXR1ZGUnKV0sCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAibG9jYXRpb25fdGV4dCIsIHRyYW5zZm9ybSwKICAgICAgbWFwX2Nvb3JkaW5hdGVfaGFzID1zdW0oIWlzLm5hKG1hcF9jb29yZGluYXRlX2NsZWFuX2xhdGl0dWRlKSkKICAgICAgKQoKYGBgCgoKUGxvdCB0aGUgcHJlZGljdGVkIGVmZmVjdHMgZm9yIGEgc2luZ2xlIG1vZGVsLCBNaWwuIENvb3JkcyBvciBubyBNaWwuIENvb3JkcwoKYGBge1Ig
fQoKCnByZWRfY29yZHMgPC0gTWVhc3VyaW5nTGFuZHNjYXBlOjo6cHJlZGljdF9taXNzaW5nbmVzc19kdihpcy5uYShldmVudHNfc2YkbWFwX2Nvb3JkaW5hdGVfY2xlYW5fbGF0aXR1ZGUpKQpyZiA8LSBwcmVkX2NvcmRzJHhiX21vZGVsCnRyYWluIDwtIHByZWRfY29yZHMkeF9hbGxfcHJlX2R1bW15CmxhYmVsIDwtIHByZWRfY29yZHMkbGFiZWwKeF9hbGxfcHJlX2R1bW15IDwtIHByZWRfY29yZHMkeF9hbGxfcHJlX2R1bW15CnhfYWxsIDwtIGR1bW1pZXM6OmR1bW15LmRhdGEuZnJhbWUocHJlZF9jb3JkcyR4X2FsbF9wcmVfZHVtbXksCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgYWxsPVQsCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgZHVtbXkuY2xhc3Nlcz1jKCdjaGFyYWN0ZXInLCdmYWN0b3InLCdvcmRlcmVkJykpCgpkdHJhaW4gPC0geGdib29zdDo6eGdiLkRNYXRyaXgoZGF0YT1hcy5tYXRyaXgoIHhfYWxsICksIAogICAgICAgICAgICAgICAgICAgICAgbGFiZWwgPSBsYWJlbCwgbWlzc2luZyA9IE5BICkKCiNvcHRpb25zKG5hLmFjdGlvbj0nbmEucGFzcycpCiN0ZXN0ZGF0YV9kdW1teSA8LSAgbW9kZWwubWF0cml4KH4gLiAtIDEsIHByZWRfY29yZHMkeF9hbGxfcHJlX2R1bW15KQojb3B0aW9ucyhuYS5hY3Rpb249J25hLm9taXQnKQp0ZXN0ZGF0YV9kdW1teSA8LSB4X2FsbF9wcmVfZHVtbXkgJT4lIGFzLnRpYmJsZSAlPiUgZmFzdER1bW1pZXM6OmR1bW15X2NvbHVtbnMoKSAlPiUgZHBseXI6OnNlbGVjdF9pZiggKGlzLm51bWVyaWMpICkKCiN0ZXN0ZGF0YV9kdW1teSA8LSBkdW1taWVzOjpkdW1teS5kYXRhLmZyYW1lKHhfYWxsX3ByZV9kdW1teSwgZHJvcD1GLAojICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICBkdW1teS5jbGFzc2VzPWMoJ2NoYXJhY3RlcicsJ2ZhY3RvcicsJ29yZGVyZWQnKSkKZHRlc3QgPC0geGdib29zdDo6eGdiLkRNYXRyaXgoZGF0YT1hcy5tYXRyaXgoIHRlc3RkYXRhX2R1bW15ICksICBtaXNzaW5nID0gTkEgKSAKZHRlc3QgPC0geGdib29zdDo6eGdiLkRNYXRyaXgoZGF0YT1hcy5tYXRyaXgoIHByZWRfY29yZHMkcG9zdGR1bW15ICksICBtaXNzaW5nID0gTkEgKQoKYGBgCgpJbXBvcnRhbmNlIHNjb3JlcyBmb3IgZWFjaCB2YXJpYWJsZSwgcHJlZGljdGluZyB0aGUgbWlzc2luZ25lc3Mgb2YgZXhhY3QgbWFwIGNvb3JkaW5hdGVzIGFzIGEgZnVuY3Rpb24gb2YgZWFjaCBldmVudCdzIGRldGFpbHMuCgpgYGB7ciwgZmlnLndpZHRoPTEyLCBmaWcuaGVpZ2h0PTh9CmltcG9ydGFuY2VfaW1wb3J0YW5jZSA8LSB4Z2Jvb3N0Ojp4Z2IuaW1wb3J0YW5jZShmZWF0dXJlX25hbWVzPW5hbWVzKHRlc3RkYXRhX2R1bW15KSwKICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIG1vZGVsID0gcmYpCnhnYm9vc3Q6OnhnYi5wbG90LmltcG9ydGFuY2UoaW1wb3J0YW5jZV9pbXBvcnRhbmNlKQoKCmBgYAoKCgpgYGB7cn0KCmhpc3RvZ3Jh
bT1UCnNjYWxlPTIKYSA8LSBNZWFzdXJpbmdMYW5kc2NhcGU6OjpwbG90X3BhcnRpYWxfZWZmZWN0cyhyZj1wcmVkX2NvcmRzJHhiX21vZGVsLAogICAgICAgICAgICAgICAgICBvdXRjb21lPSJtYXBjb29yZGluYXRlX2NsZWFuX21pc3NpbmciLHZhcj0iZG9jdW1lbnRfZGF0ZV90eXBlIixtaW5zaXplPTEwMCwKICAgICAgICAgICAgICAgICAgc2NhbGU9c2NhbGUsaGlzdG9ncmFtPVQpCgpiIDwtIE1lYXN1cmluZ0xhbmRzY2FwZTo6OnBsb3RfcGFydGlhbF9lZmZlY3RzKHJmPXByZWRfY29yZHMkeGJfbW9kZWwsCiAgICAgICAgICAgICAgICAgIG91dGNvbWU9Im1hcGNvb3JkaW5hdGVfY2xlYW5fbWlzc2luZyIsdmFyPSJkb2N1bWVudF9kYXRlX2Jlc3RfeWVhciIsbWluc2l6ZT0xMDAsCiAgICAgICAgICAgICAgICAgIHNjYWxlPXNjYWxlLGhpc3RvZ3JhbT1UKQoKYyA8LSBNZWFzdXJpbmdMYW5kc2NhcGU6OjpwbG90X3BhcnRpYWxfZWZmZWN0cyhyZj1wcmVkX2NvcmRzJHhiX21vZGVsLAogICAgICAgICAgICAgICAgICBvdXRjb21lPSJtYXBjb29yZGluYXRlX2NsZWFuX21pc3NpbmciLHZhcj0iaW5pdGlhdG9yX2NsZWFuXzFfYWdnbWVkIixtaW5zaXplPTEwMCwKICAgICAgICAgICAgICAgICAgc2NhbGU9c2NhbGUsaGlzdG9ncmFtPVQpCgoKCmQgPC0gTWVhc3VyaW5nTGFuZHNjYXBlOjo6cGxvdF9wYXJ0aWFsX2VmZmVjdHMocmY9cHJlZF9jb3JkcyR4Yl9tb2RlbCwKICAgICAgICAgICAgICAgICAgb3V0Y29tZT0ibWFwY29vcmRpbmF0ZV9jbGVhbl9taXNzaW5nIix2YXI9InRhcmdldF9jbGVhbl8xX2FnZ21lZCIsbWluc2l6ZT0xMDAsCiAgICAgICAgICAgICAgICAgIHNjYWxlPXNjYWxlLGhpc3RvZ3JhbT1UKQoKZSA8LSBNZWFzdXJpbmdMYW5kc2NhcGU6OjpwbG90X3BhcnRpYWxfZWZmZWN0cyhyZj1wcmVkX2NvcmRzJHhiX21vZGVsLAogICAgICAgICAgICAgICAgICBvdXRjb21lPSJtYXBjb29yZGluYXRlX2NsZWFuX21pc3NpbmciLHZhcj0idHlwZV9jbGVhbl9hZ2dtZWQiLG1pbnNpemU9MTAwLAogICAgICAgICAgICAgICAgICBzY2FsZT1zY2FsZSxoaXN0b2dyYW09VCkKCgpmIDwtIE1lYXN1cmluZ0xhbmRzY2FwZTo6OnBsb3RfcGFydGlhbF9lZmZlY3RzKHJmPXByZWRfY29yZHMkeGJfbW9kZWwsCiAgICAgICAgICAgICAgICAgIG91dGNvbWU9Im1hcGNvb3JkaW5hdGVfY2xlYW5fbWlzc2luZyIsdmFyPSJkb2N1bWVudF91bml0X3R5cGUiLG1pbnNpemU9MTAwLAogICAgICAgICAgICAgICAgICBzY2FsZT1zY2FsZSxoaXN0b2dyYW09VCkKCgpnIDwtIE1lYXN1cmluZ0xhbmRzY2FwZTo6OnBsb3RfcGFydGlhbF9lZmZlY3RzKHJmPXByZWRfY29yZHMkeGJfbW9kZWwsCiAgICAgICAgICAgICAgICAgIG91dGNvbWU9Im1hcGNvb3JkaW5hdGVfY2xlYW5fbWlzc2luZyIsIHZhcj0iZG9jdW1lbnRfZGlzdHJpY3RfY2xlYW4iLAogICAgICAgICAgICAgICAgICBtaW5zaXplPTEwMCwgdHJhaW49cHJlZF9jb3JkcyR4X2FsbF9wcmVf
ZHVtbXkgLAogICAgICAgICAgICAgICAgICBzY2FsZT1zY2FsZSxoaXN0b2dyYW09VCkKCmggPC0gTWVhc3VyaW5nTGFuZHNjYXBlOjo6cGxvdF9wYXJ0aWFsX2VmZmVjdHMocmY9cHJlZF9jb3JkcyR4Yl9tb2RlbCwKICAgICAgICAgICAgICAgICAgb3V0Y29tZT0ibWFwY29vcmRpbmF0ZV9jbGVhbl9taXNzaW5nIix2YXI9ImV2ZW50X2RhdGVfY2xlYW5feWVhciIsbWluc2l6ZT0xMDAsCiAgICAgICAgICAgICAgICAgIHNjYWxlPXNjYWxlLGhpc3RvZ3JhbT1UKQoKI2UgPC0gcGxvdF9wYXJ0aWFsX2VmZmVjdHMocmY9cHJlZF9jb3JkcyR4Yl9tb2RlbCwKIyAgICAgICAgICAgICAgICAgIG91dGNvbWU9Im1hcGNvb3JkaW5hdGVfY2xlYW5fbWlzc2luZyIsdmFyPSJsb2NhdGlvbnRleHRfcnVsZWNsZWFuX3N1ZmZpeCIsbWluc2l6ZT0xMDApCgpgYGAKCgpgYGB7ciwgZmlnLndpZHRoPTEyLCBmaWcuaGVpZ2h0PTh9CgojcF9sb2FkKGNvd3Bsb3QpCmZpbmFsX2hpc3RvZ3JhbSA8LSBjb3dwbG90OjpwbG90X2dyaWQoCiAgCiAgYStnZ3RpdGxlKHNlbnRlbmNlX2Nhc2UoJ0RvY3VtZW50IERhdGUgVHlwZScpKSwKICBiK2dndGl0bGUoc2VudGVuY2VfY2FzZSgnRG9jdW1lbnQgWWVhcicpKSwKICBjK2dndGl0bGUoc2VudGVuY2VfY2FzZSgnSW5pdGlhdG9yJykpLAogIGQrZ2d0aXRsZShzZW50ZW5jZV9jYXNlKCdUYXJnZXQnKSksCiAgZStnZ3RpdGxlKHNlbnRlbmNlX2Nhc2UoJ0FjdCBUeXBlJykpLAogIGYrZ2d0aXRsZShzZW50ZW5jZV9jYXNlKCdEb2N1bWVudCBVbml0JykpLAogIGcrZ2d0aXRsZShzZW50ZW5jZV9jYXNlKCdEb2N1bWVudCBEaXN0cmljdCcpKSwKICBoK2dndGl0bGUoc2VudGVuY2VfY2FzZSgnRXZlbnQgWWVhcicpKSArIHlsYWIoc2VudGVuY2VfY2FzZSgiUHJvYmFiaWxpdHkgb2YgTWlzc2luZyBNaWxpdGFyeSBDb29yZGluYXRlcyIpKSwKICAjaStnZ3RpdGxlKCdSZXBvcnRpbmcgT2ZmaWNlJyksCiAgbmNvbCA9IDMsIGFsaWduID0gImh2IiApICMscmVsX2hlaWdodHM9aGVpZ2h0cykKZmluYWxfaGlzdG9ncmFtCgpnZ3NhdmUoCiAgZmlsZW5hbWUgPSBnbHVlOjpnbHVlKGRpcl9maWd1cmVzLCAicmZfbWFwY29vcmRpbmF0ZV9jbGVhbl9taXNzaW5nLnBkZiIpLAogIHBsb3QgPSBmaW5hbF9oaXN0b2dyYW0sIHdpZHRoID0gMTAsIGhlaWdodCA9IDgKKQogCmBgYAoKCgoKCgo=