Arrest Risk Prediction in Chicago, IL

0.1 Selection bias in arrests
- 0.1.1 Map of arrests in point form
- 0.1.2 Map of arrests joined to the fishnet
0.2 Risk factors for arrests
- 0.2.1 Map of arrests risk factors in the fishnet
0.3 Spatial process of arrests
- 0.3.1 Local Moran’s I-related map of arrests
0.4 Correlation tests
- 0.4.1 Scatterplot with correlations
0.5 Risk prediction
0.6 Conclusion
0.7 Reference

knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(sf)
library(RSocrata)
library(viridis)
library(spatstat)
library(raster)
library(spdep)
library(FNN)
library(grid)
library(gridExtra)
library(knitr)
library(kableExtra)
library(tidycensus)
library(mapview)
root.dir = "https://raw.githubusercontent.com/urbanSpatial/Public-Policy-Analytics-Landing/master/DATA/"
source("https://raw.githubusercontent.com/urbanSpatial/Public-Policy-Analytics-Landing/master/functions.r")

paletteGray <- c("gray90", "gray70", "gray50", "gray30", "gray10")

policeDistricts <- 
  st_read("https://data.cityofchicago.org/api/geospatial/fthy-xz3r?method=export&format=GeoJSON") %>%
  st_transform('ESRI:102271') %>%
  dplyr::select(District = dist_num)
  
policeBeats <- 
  st_read("https://data.cityofchicago.org/api/geospatial/aerh-rz74?method=export&format=GeoJSON") %>%
  st_transform('ESRI:102271') %>%
  dplyr::select(District = beat_num)

bothPoliceUnits <- rbind(mutate(policeDistricts, Legend = "Police Districts"), 
                         mutate(policeBeats, Legend = "Police Beats"))

chicagoBound <- 
  st_read(file.path(root.dir,"/Chapter5/chicagoBoundary.geojson")) %>%
  st_transform('ESRI:102271') 

fishnet <- 
  st_make_grid(chicagoBound, cellsize = 500) %>%
  st_sf() %>%
  mutate(uniqueID = rownames(.))

#####risk factors below#####
#Proximity to drug dealing areas; Prostitution areas; Public transport; Bars, pubs and exotic clubs; Schools, banks and cash points; Post-offices, Leisure and fast-food outlets. 
abandonCars <- 
  read.socrata("https://data.cityofchicago.org/Service-Requests/311-Service-Requests-Abandoned-Vehicles/3c9v-pnva") %>%
    mutate(year = substr(creation_date,1,4)) %>% filter(year == "2017") %>%
    dplyr::select(Y = latitude, X = longitude) %>%
    na.omit() %>%
    st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
    st_transform(st_crs(fishnet)) %>%
    mutate(Legend = "Abandoned_Cars")
  
abandonBuildings <- 
  read.socrata("https://data.cityofchicago.org/Service-Requests/311-Service-Requests-Vacant-and-Abandoned-Building/7nii-7srd") %>%
    mutate(year = substr(date_service_request_was_received,1,4)) %>%  filter(year == "2017") %>%
    dplyr::select(Y = latitude, X = longitude) %>%
    na.omit() %>%
    st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
    st_transform(st_crs(fishnet)) %>%
    mutate(Legend = "Abandoned_Buildings")

graffiti <- 
  read.socrata("https://data.cityofchicago.org/Service-Requests/311-Service-Requests-Graffiti-Removal-Historical/hec5-y4x5") %>%
    mutate(year = substr(creation_date,1,4)) %>% filter(year == "2017") %>%
    filter(where_is_the_graffiti_located_ %in% c("Front", "Rear", "Side")) %>%
    dplyr::select(Y = latitude, X = longitude) %>%
    na.omit() %>%
    st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
    st_transform(st_crs(fishnet)) %>%
    mutate(Legend = "Graffiti")

streetLightsOut <- 
  read.socrata("https://data.cityofchicago.org/Service-Requests/311-Service-Requests-Street-Lights-All-Out/zuxi-7xem") %>%
    mutate(year = substr(creation_date,1,4)) %>% filter(year == "2017") %>%
    dplyr::select(Y = latitude, X = longitude) %>%
    na.omit() %>%
    st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
    st_transform(st_crs(fishnet)) %>%
    mutate(Legend = "Street_Lights_Out")

sanitation <-
  read.socrata("https://data.cityofchicago.org/Service-Requests/311-Service-Requests-Sanitation-Code-Complaints-Hi/me59-5fac") %>%
    mutate(year = substr(creation_date,1,4)) %>% filter(year == "2017") %>%
    dplyr::select(Y = latitude, X = longitude) %>%
    na.omit() %>%
    st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
    st_transform(st_crs(fishnet)) %>%
    mutate(Legend = "Sanitation")

liquorRetail <- 
  read.socrata("https://data.cityofchicago.org/resource/nrmj-3kcf.json") %>%  
    filter(business_activity == "Retail Sales of Packaged Liquor") %>%
    dplyr::select(Y = latitude, X = longitude) %>%
    na.omit() %>%
    st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
    st_transform(st_crs(fishnet)) %>%
    mutate(Legend = "Liquor_Retail")

bus_stops <-
  st_read("E:/Upenn/CPLN508/assign3/Project-3-Risk-Prediction/CTA_BusStops/CTA_BusStops.shp") %>%
  #st_read("/Users/annaduan/Documents/GitHub/Project-3-Risk-Prediction/CTA_BusStops/CTA_BusStops.shp") %>%
    dplyr::select(Y = POINT_Y, X = POINT_X) %>%
    na.omit() %>%
    st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
    st_transform(st_crs(fishnet)) %>%
    mutate(Legend = "Bus_stops")
#https://data.cityofchicago.org/Transportation/CTA-Bus-Stops-Shapefile/pxug-u72f


#CENSUS
census_api_key("d9ebfd04caa0138647fbacd94c657cdecbf705e9", install = TRUE, overwrite = TRUE)
#row 1: vacant, total housing units, mhhinc
#row 2: white, population, renter occ, owner occ, #no HS degree
#row 3: male adults, female adults, poverty level, youth unemployed (18-34, veteran), youth unemployed (non veteran)
#row 4-5: male 15-17, male 18-19, male 20, male 21, male 22-24, male 25-29, male 30-34
#row 6-7: female 18-34
acs <-
  get_acs(geography = "tract", variables = c("B25002_003E", "B25001_001E", "B19013_001E", "B01001A_001E", "B01003_001E", "B07013_002E", "B07013_003E", "B06009_002E", 
"B05003_008E", "B05003_019E", "B06012_002", "B21005_006E", "B21005_011E", 
"B01001_006E", "B01001_007E", "B01001_008E", "B01001_009E","B01001_010E", "B01001_011E", "B01001_012E",
"B01001_031E", "B01001_032E", "B01001_033E", "B01001_034E", "B01001_035E", "B01001_036E"
), year=2018, state=17, county=031, geometry=T) %>%
  st_transform(st_crs(fishnet))
  
#filter for chicago tracts
acs <-
  rbind(
    st_centroid(acs)[chicagoBound,] %>%
      st_drop_geometry() %>%
      left_join(acs) %>%
      st_sf() %>%
      mutate(inChicago = "YES"),
    st_centroid(acs)[chicagoBound, op = st_disjoint] %>%
      st_drop_geometry() %>%
      left_join(acs) %>%
      st_sf() %>%
      mutate(inChicago = "NO")) %>%
  filter(inChicago == "YES") %>%
  dplyr::select(-inChicago)
#long to wide form
acs <-
  acs %>%
  dplyr::select(-moe, -GEOID) %>%
  spread(variable, estimate) %>%
  dplyr::select(-geometry) %>%
  rename(vacantUnits = B25002_003,
         totalUnits = B25001_001,
         medHHInc = B19013_001,
         white = B01001A_001,
         population = B01003_001,
         ownerOcc = B07013_002,
         renterOcc = B07013_003,
         noHsDegree = B06009_002,
         maleAdult = B05003_008,
         femaleAdult = B05003_019,
         poverty = B06012_002,
         youthUnempVet = B21005_006,
         youthUnempNonVet = B21005_011,
         male1517 = B01001_006,
         male1819 = B01001_007,
         male20 = B01001_008,
         male21 = B01001_009,
         male2224 = B01001_010,
         male2529 = B01001_011,
         male3034 = B01001_012,
         female1819 = B01001_031,
         female20 = B01001_032,
         female21 = B01001_033,
         female2224 = B01001_034,
         female2529 = B01001_035,
         female3034 = B01001_036)

acs <- 
  acs %>%
  mutate(pctVacant = ifelse(totalUnits > 0, vacantUnits / totalUnits, 0),
         pctWhite = ifelse(population > 0, white / population, 0), 
         pctRenterOcc = renterOcc/ (renterOcc + ownerOcc),
         pctNoHS = noHsDegree/ (maleAdult + femaleAdult),
         pctPoverty = ifelse(population > 0, poverty / population, 0),
         youthUnemploy = (youthUnempVet + youthUnempNonVet) / (male1819 + male20 + male21 + male2224 + male2529 + male3034 + female1819 + female20 + female21 + female2224 + female2529 + female3034),
         pctMaleYouth = ifelse(population > 0, (male1517 + male1819 + male20 + male21 + male2224 + male2529 + male3034) / population, 0)) %>%
  dplyr::select(-totalUnits,-vacantUnits,-white,-renterOcc,-ownerOcc, -noHsDegree, -maleAdult, -femaleAdult, -youthUnempVet, -youthUnempNonVet, -male1517, -male1819, -male20, -male21, -male2224, -male2529, -male3034, -female1819, -female20, -female21, -female2224, -female2529, -female3034, -poverty)


#attach variables to fishnet
vars_net <- 
  rbind(abandonCars,streetLightsOut,abandonBuildings,
        liquorRetail, graffiti, sanitation) %>%
  st_join(., fishnet, join=st_within) %>%
  st_drop_geometry() %>%
  group_by(uniqueID, Legend) %>%
  summarize(count = n()) %>%
    full_join(fishnet) %>%
    spread(Legend, count, fill=0) %>%
    st_sf() %>%
    dplyr::select(-`<NA>`) %>%
    na.omit() %>%
    ungroup()
vars_net <- vars_net %>%
  st_join(., acs)


##nearest neighbor##

nn_function <- function(measureFrom,measureTo,k) {
  measureFrom_Matrix <-
    as.matrix(measureFrom)
  measureTo_Matrix <-
    as.matrix(measureTo)
  nn <-   
    get.knnx(measureTo, measureFrom, k)$nn.dist
    output <-
      as.data.frame(nn) %>%
      rownames_to_column(var = "thisPoint") %>%
      gather(points, point_distance, V1:ncol(.)) %>%
      arrange(as.numeric(thisPoint)) %>%
      group_by(thisPoint) %>%
      summarize(pointDistance = mean(point_distance)) %>%
      arrange(as.numeric(thisPoint)) %>% 
      dplyr::select(-thisPoint) %>%
      pull()
  
  return(output) 
}


st_c <- st_coordinates
st_coid <- st_centroid

vars_net <-
  vars_net %>%
    mutate(
      Abandoned_Buildings.nn =
        nn_function(st_c(st_coid(vars_net)), st_c(abandonBuildings),3),
      Abandoned_Cars.nn =
        nn_function(st_c(st_coid(vars_net)), st_c(abandonCars),3),
      Graffiti.nn =
        nn_function(st_c(st_coid(vars_net)), st_c(graffiti),3),
      Liquor_Retail.nn =
        nn_function(st_c(st_coid(vars_net)), st_c(liquorRetail),3),
      Street_Lights_Out.nn =
        nn_function(st_c(st_coid(vars_net)), st_c(streetLightsOut),3),
      Sanitation.nn =
        nn_function(st_c(st_coid(vars_net)), st_c(sanitation),3))


vars_net.long.nn <- 
  dplyr::select(vars_net, ends_with(".nn")) %>%
    gather(Variable, value, -geometry)


# Add distance to downtown as variable
neighborhoods <- 
  st_read("https://raw.githubusercontent.com/blackmad/neighborhoods/master/chicago.geojson") %>%
  st_transform(st_crs(fishnet)) 

loopPoint <- neighborhoods %>%
  filter(name == "Loop") %>%
  st_centroid()

vars_net$loopDistance =
  st_distance(st_centroid(vars_net),loopPoint) %>%
  as.numeric() 


#download 2018 arrests data
arrests18 <- 
  read.socrata("https://data.cityofchicago.org/Public-Safety/Crimes-2018/3i3m-jwuy?Arrest=true") %>%
  mutate(x = gsub("[()]", "", Location)) %>%
  separate(x,into= c("Y","X"), sep=",") %>%
  mutate(X = as.numeric(X),
         Y = as.numeric(Y)) %>% 
  na.omit %>%
  st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant") %>%
  st_transform('ESRI:102271') %>% 
  distinct() %>%
  .[fishnet,]

0.1 Selection bias in arrests

Selection bias in arrests on African Americans has received tons of public attention following the death of George Floyd. Concerns about racial biased in predictive algorithms have long been expressed and discussed in many news articles of national organizations as well as many researches articles (Brantingham P.J. et, al., 2018). It’s said that minority communities are often targeted in cases of arrest and this leads to discriminatory consequences under race context. This bias on minority groups can be a issue because it deepen the feeling of untruest between minority groups and the policy, which harms the overall stability of the society and adds to the difficulty in police’s job to ensure a safe local environment. This is especially the case when the selection bias results in higher rate of arrest in minority communities, then more patrols are sent to these communities given more arrest incidents happened (even though some or even many of them may be false arrests). This vicious cycle will eventually result in more innocent individuals in those minority neighborhoods being arrested and challenge the principles of fairness and equity in this democratic country. In order to end this vicious cycle, selection bias in the arrest predicting algorithms needs to be fully identified and adjusted.

0.1.1 Map of arrests in point form

Total arrest incidents in Chicago 2017 are shown in Figure 1.1. Apart from some places in the far south and the far north, as well as some in the middle, arrests seem to happen quite commonly across Chicago.

arrests <- 
  read.socrata("https://data.cityofchicago.org/Public-Safety/Crimes-2017/d62x-nvdr?Arrest=true") %>% 
    mutate(x = gsub("[()]", "", Location)) %>%
    separate(x,into= c("Y","X"), sep=",") %>%
    mutate(X = as.numeric(X),Y = as.numeric(Y)) %>% 
    na.omit() %>%
    st_as_sf(coords = c("X", "Y"), crs = 4326, agr = "constant")%>%
    st_transform('ESRI:102271') %>% 
    distinct()

ggplot() +
  geom_sf(data = chicagoBound, fill = "black") +
  geom_sf(data = arrests, colour = "red", size = 0.1) +
  labs(title = "Figure 1.1: Arrests Made, 2017", subtitle = "Chicago, IL") +  mapTheme()

0.1.2 Map of arrests joined to the fishnet

This incidents are joined to fish net in order to study and examine the local spatial process of arrests. As shown in Figure 1.2, arrests are most frequent in the northwestern and middle-southern part of Chicago .

crime_net <- 
  dplyr::select(arrests) %>% 
  mutate(countArrests = 1) %>% 
  aggregate(., fishnet, sum) %>%
  mutate(countArrests = replace_na(countArrests, 0),
         uniqueID = rownames(.),
         cvID = sample(round(nrow(fishnet) / 24), size=nrow(fishnet), replace = TRUE))

ggplot() +
  geom_sf(data = crime_net, aes(fill = countArrests)) +
  scale_fill_viridis() +
  labs(title = "Figure 2.1: Count of Arrests for the fishnet") +
  mapTheme()

0.2 Risk factors for arrests

0.2.1 Map of arrests risk factors in the fishnet

We consider 22 risk factors relating to arrests as shown in figure 2.1. Abandon buildings, percent of population without high school degree (pctNOHS) and percent of poverty have similar spatial distribution/concentration as observed arrest incidents across Chicago.

vas_net_noname <- subset(vars_net, select = -c(NAME))
vars_net.long <- 
  gather(vas_net_noname, Variable, value, -geometry, -uniqueID)
vars <- unique(vars_net.long$Variable)
mapList <- list()

for(i in vars){
  mapList[[i]] <- 
    ggplot() +
      geom_sf(data = filter(vars_net.long, Variable == i), aes(fill=value), colour=NA) +
      scale_fill_viridis(name="") +
      labs(title=i) +
      mapTheme()}

do.call(grid.arrange,c(mapList, ncol =3, top = "Figure 3.1: Risk Factors by Fishnet"))

Nearest Neighborhood risk factors indicate that the southern Chicago has the lowest incidents of abandoned_cars, graffiti, street lights out and sanitation problems. Also, it’s furthest away from liquor retail places.

vars.nn <- unique(vars_net.long.nn$Variable)
mapList.nn <- list()

for(i in vars.nn){
  mapList.nn[[i]] <- 
    ggplot() +
      geom_sf(data = filter(vars_net.long.nn, Variable == i), aes(fill=value), colour=NA) +
      scale_fill_viridis(name="") +
      labs(title=i) +
      mapTheme()}   

do.call(grid.arrange,c(mapList.nn, ncol = 3, top = "Figure 3.2: Nearest Neighborhood Risk Factors by Fishnet"))

0.3 Spatial process of arrests

0.3.1 Local Moran’s I-related map of arrests

Local Moran’s-I suggests a clustering of arrests in the northwestern area, which can be confirmed in the maps of arrest-count and significant hot spots (Figure 4.1). P values are higher in area with few arrest data.

##final net##
final_net <-
  left_join(crime_net, st_drop_geometry(vars_net), by="uniqueID") 

final_net <-
  st_centroid(final_net) %>%
    st_join(dplyr::select(neighborhoods, name)) %>%
    st_join(dplyr::select(policeDistricts, District)) %>%
      st_drop_geometry() %>%
      left_join(dplyr::select(final_net, geometry, uniqueID)) %>%
      st_sf() %>%
  na.omit() 


##Prepare for moran's
final_net.nb <- poly2nb(as_Spatial(final_net), queen=TRUE)
final_net.weights <- nb2listw(final_net.nb, style="W", zero.policy=TRUE)


final_net.localMorans <- 
  cbind(
    as.data.frame(localmoran(final_net$countArrests, final_net.weights)),
    as.data.frame(final_net)) %>% 
    st_sf() %>%
      dplyr::select(Arrest_Count = countArrests, 
                    Local_Morans_I = Ii, 
                    P_Value = `Pr(z > 0)`) %>%
      mutate(Significant_Hotspots = ifelse(P_Value <= 0.0000001, 1, 0)) %>%
      gather(Variable, Value, -geometry)
  
vars <- unique(final_net.localMorans$Variable)
varList <- list()

for(i in vars){
  varList[[i]] <- 
    ggplot() +
      geom_sf(data = filter(final_net.localMorans, Variable == i), 
              aes(fill = Value), colour=NA) +
      scale_fill_viridis(name="") +
      labs(title=i) +
      mapTheme() + theme(legend.position="bottom")}

do.call(grid.arrange,c(varList, ncol = 4, top = "Figure 4.1: Local Morans I statistics, Arrests"))

##distance to hotspots## 
final_net <-
  final_net %>% 
  mutate(arrests.isSig = 
           ifelse(localmoran(final_net$countArrests, 
                             final_net.weights)[,5] <= 0.0000001, 1, 0)) %>%
  mutate(arrests.isSig.dist = 
           nn_function(st_coordinates(st_centroid(final_net)),
                       st_coordinates(st_centroid(
                         filter(final_net, arrests.isSig == 1))), 1))

0.4 Correlation tests

0.4.1 Scatterplot with correlations

Arrest counts have positive correlation with abandon buildings, distance to downtown, sanitation, liquor retail, percent of population with high school degree, percent of vacancy/renter-occupied housing/poverty/youth unemployment and street lights-out risk factors. Little correlation exits between arrest counts and abandoned cars, graffiti, percent of young males, population. Negative correlation exists for the rest of the risk factors.

correlation.long <-
  st_drop_geometry(final_net) %>%
    dplyr::select(-uniqueID, -cvID, -loopDistance, -name, -District, -NAME) %>%
    gather(Variable, Value, -countArrests) %>%
  mutate(Value = as.numeric(Value))

correlation.cor <-
  correlation.long %>%
    group_by(Variable) %>%
    summarize(correlation = cor(Value, countArrests, use = "complete.obs"))


ggplot(correlation.long, aes(Value, countArrests)) +  
  geom_point(size = 0.1) +
  geom_text(data = correlation.cor, aes(label = paste("r =", round(correlation, 2))),
            x=-Inf, y=Inf, vjust = 1.5, hjust = -.1) +
  geom_smooth(method = "lm", se = FALSE, colour = "black") +
  facet_wrap(~Variable, ncol = 2, scales = "free") +
  labs(title = "Figure 5.1: Arrest count as a function of risk factors") +
  plotTheme()

0.5 Risk prediction

0.5.1 Histogram of arrests count

Learn from Figure 6.1, many places have no arrests count, and a few hot-spot places have the most arrests count.

hist(final_net$countArrests, main = "Figure 6.1: Histogram of Arrest Count")

reg.vars <- c("Abandoned_Buildings.nn", "Abandoned_Cars.nn", "Graffiti.nn", 
              "Liquor_Retail.nn", "Street_Lights_Out.nn", "Sanitation.nn", 
              "loopDistance", "medHHInc", "pctVacant", "pctWhite", "pctRenterOcc", "pctNoHS", "pctPoverty", "youthUnemploy", "pctMaleYouth")

reg.ss.vars <- c("Abandoned_Buildings.nn", "Abandoned_Cars.nn", "Graffiti.nn", 
                 "Liquor_Retail.nn", "Street_Lights_Out.nn", "Sanitation.nn", 
                 "loopDistance", "arrests.isSig", "arrests.isSig.dist")

0.5.2 Map of model errors by random k-fold and spatial cross validation

Generally, LOGO-CV model has a more evenly-distributed MAE, and spatial process feature has a better performance in narrowing the MAE for LOGO-CV model than the k-fold model (Figure 7.1). This confirms that shared local arrests exist in Chicago. Spatially, hot-spot areas tend to have high errors, yet the adding of spatial feature helps improve this distribution of errors (Figure 7.2).

error_by_reg_and_fold %>%
  ggplot(aes(MAE)) + 
    geom_histogram(bins = 30, colour="black", fill = "#FDE725FF") +
    facet_wrap(~Regression) +  
    geom_vline(xintercept = 0) + scale_x_continuous(breaks = seq(0, 8, by = 1)) + 
    labs(title="Figure 7.1: Distribution of MAE", subtitle = "k-fold cross validation vs. LOGO-CV",
         x="Mean Absolute Error", y="Count")

map_of_k_and_validation <-
  ggplot() +
  geom_sf(data = error_by_reg_and_fold, aes(fill = MAE, colour = MAE)) +
        scale_fill_viridis() +
        facet_wrap(~Regression) + 
        scale_colour_viridis()+
        labs(title="Figure 7.2: Distribution of MAE", subtitle = "k-fold cross validation vs. LOGO-CV",
        x="Mean Absolute Error", y="Count")+
        mapTheme()

map_of_k_and_validation

0.5.3 Table of MAE and standard deviation MAE by regression

In both regression methods, it’s clear that spatial features help improve the model (Table 1). Because LOGO-CV has a more conservative assumption, its regression results have generally higher errors than the k-fold method.

st_drop_geometry(error_by_reg_and_fold) %>%
  group_by(Regression) %>% 
    summarize(Mean_MAE = round(mean(MAE), 2),
              SD_MAE = round(sd(MAE), 2)) %>%
  kable(caption = "Table 1: MAE and SF by regression") %>%
    kable_styling("striped", full_width = F) %>%
    row_spec(2, color = "black", background = "#FDE725FF") %>%
    row_spec(4, color = "black", background = "#FDE725FF")

Table 1: MAE and SF by regression
Regression	Mean_MAE	SD_MAE
Random k-fold CV: Just Risk Factors	3.86	3.31
Random k-fold CV: Spatial Process	3.52	3.05
Spatial LOGO-CV: Just Risk Factors	6.63	6.66
Spatial LOGO-CV: Spatial Process	5.33	4.75

neighborhood.weights <-
  filter(error_by_reg_and_fold, Regression == "Spatial LOGO-CV: Spatial Process") %>%
    group_by(cvID) %>%
      poly2nb(as_Spatial(.), queen=TRUE) %>%
      nb2listw(., style="W", zero.policy=TRUE)

filter(error_by_reg_and_fold, str_detect(Regression, "LOGO"))  %>% 
    st_drop_geometry() %>%
    group_by(Regression) %>%
    summarize(Morans_I = moran.mc(abs(Mean_Error), neighborhood.weights, 
                                 nsim = 999, zero.policy = TRUE, 
                                 na.action=na.omit)[[1]],
              p_value = moran.mc(abs(Mean_Error), neighborhood.weights, 
                                 nsim = 999, zero.policy = TRUE, 
                                 na.action=na.omit)[[3]])%>%
    kable(caption = "Table 2: Moran I's Error by Regression") %>%
    kable_styling("striped", full_width = F) %>%
    row_spec(2, color = "black", background = "#FDE725FF") 

#predictions
reg.summary %>%
  ggplot() +
  geom_sf(aes(fill = Prediction, colour = Prediction)) +
  scale_fill_viridis() +
  scale_colour_viridis() +
  facet_wrap(~Regression) +  
  mapTheme()+
  theme(strip.text = element_text(size=5))+
  labs(title = "Figure 8.1: Predictions With Different Methods ")

#actual counts, for comparison
reg.ss.cv %>%
  ggplot() +
  geom_sf(aes(fill = countArrests, colour = countArrests)) +
  scale_fill_viridis() +
  scale_colour_viridis()+
  mapTheme()+
  labs(title = "Figure 8.2: Actual Arrest Counts in 2017")

st_drop_geometry(reg.summary) %>%
  group_by(Regression) %>%
    mutate(arrests_Decile = ntile(countArrests, 10)) %>%
  group_by(Regression, arrests_Decile) %>%
    summarize(meanObserved = mean(countArrests, na.rm=T),
              meanPrediction = mean(Prediction, na.rm=T)) %>%
    gather(Variable, Value, -Regression, -arrests_Decile) %>%          
    ggplot(aes(arrests_Decile, Value, shape = Variable)) +
      geom_point(size = 2) + geom_path(aes(group = arrests_Decile), colour = "black") +
      scale_shape_manual(values = c(2, 17)) +
      facet_wrap(~Regression) + xlim(0,10) +
      labs(title = "Figure 8.3: Predicted and Observed Arrests by Observed Arrest Decile")

Table 2 suggests that spatial features help account for some of the spatial variance in arrests, but some still remain. Models cooperating spatial process feature have more smooth risk surface. Both k-fold and LOGO-CV methods predict distinct hotpots in the southern part of Chicago (Figure 8.1), which deviates from the actual observation (Figure 8.2). Additionally, both prediction methods yield a higher proportion of hot-spot areas than observed. These predictions indicate that there is latent arrests risk in Chicago that hasn’t been observed yet, especially in the southern part. All models over-predict areas with lower risks and under-predict hot-spot areas. This suggests that these models highlight low-risk areas and may not be adequate in predicting hot spots (Figure 8.3).

0.5.4 A table of raw errors by race context for a random k-fold vs. spatial cross validation regression

In general, LOGO-CV model generalizes well in terms of race, although it slightly under-predicts the majority non-white neighborhoods while over-predicts the majority white neighborhoods (Table 3). For some reasons, the spatial process model has higher mean error for both majority white and non-white neighborhoods.This may indicate that arrests across Chicago have a weak spatial relationship. Compared the actual arrests map in 2017 (Figure 8.2) with the map of racial context (Figure 9.1), the arrests concentrate in northwestern Chicago, which comprises mainly the majority non-white neighborhoods. However, it’s noted that even though there are non-white neighborhoods in the southern part, fewer arrests happened here. Some non-white neighborhoods in the south and the east have nearly 0 arrests. It’s either due to the low crime rate or the latent risk that was less observed before due to the selection bias.

raceContextACS <- acs %>%
  dplyr::select(pctWhite) %>%
  mutate(raceContext = ifelse(pctWhite > .5, "Majority_White", "Majority_Non_White"))

reg.summary %>% 
  filter(str_detect(Regression, "LOGO")) %>%
    st_centroid() %>%
    st_join(raceContextACS) %>%
    na.omit() %>%
      st_drop_geometry() %>%
      group_by(Regression, raceContext) %>%
      summarize(mean.Error = mean(Error, na.rm = T)) %>%
      spread(raceContext, mean.Error) %>%
      kable(caption = "Table 3: Mean Error by Neighborhood Racial Context") %>%
        kable_styling("striped", full_width = F)

Table 3: Mean Error by Neighborhood Racial Context
Regression	Majority_Non_White	Majority_White
Spatial LOGO-CV: Just Risk Factors	-0.6930006	0.5130669
Spatial LOGO-CV: Spatial Process	-2.2361356	2.0669150

ggplot() +
  geom_sf(data = raceContextACS, aes(fill = raceContext)) +
  labs(title = "Figure 9.1: Racial Context of Chicago") +
  mapTheme()

0.5.5 Map comparing kernel density to risk predictions for the next year’s arrests

Risk prediction model has a better spatial performance than the kernel density model. For the risk prediction morel, in areas with the lowest risk(1-29%), nearly no points fall in this purple zone, while in areas with the highest risk(90-100%), almost no areas are left uncovered by the points of incidents (Figure 10.1).

#compare kernel density vs risk prediction, overlay 2018 burglaries
arrests_ppp <- as.ppp(st_coordinates(arrests), W = st_bbox(final_net))
arrests_KD <- spatstat::density.ppp(arrests_ppp, 1000)

arrests_KDE_sf <- as.data.frame(arrests_KD) %>%
  st_as_sf(coords = c("x", "y"), crs = st_crs(final_net)) %>%
  aggregate(., final_net, mean) %>%
  mutate(label = "Kernel Density 2018",
         Risk_Category = ntile(value, 100),
         Risk_Category = case_when(
           Risk_Category >= 90 ~ "90% to 100%",
           Risk_Category >= 70 & Risk_Category <= 89 ~ "70% to 89%",
           Risk_Category >= 50 & Risk_Category <= 69 ~ "50% to 69%",
           Risk_Category >= 30 & Risk_Category <= 49 ~ "30% to 49%",
           Risk_Category >= 1 & Risk_Category <= 29 ~ "1% to 29%")) %>%
  cbind(
    aggregate(
      dplyr::select(arrests18) %>% mutate(arrestsCount = 1), ., sum) %>%
    mutate(arrestsCount = replace_na(arrestsCount, 0))) %>%
  dplyr::select(label, Risk_Category, arrestsCount)

arrests_risk_sf <-
  filter(reg.summary, Regression == "Spatial LOGO-CV: Spatial Process") %>%
  mutate(label = "Risk Predictions 2018",
         Risk_Category = ntile(Prediction, 100),
         Risk_Category = case_when(
           Risk_Category >= 90 ~ "90% to 100%",
           Risk_Category >= 70 & Risk_Category <= 89 ~ "70% to 89%",
           Risk_Category >= 50 & Risk_Category <= 69 ~ "50% to 69%",
           Risk_Category >= 30 & Risk_Category <= 49 ~ "30% to 49%",
           Risk_Category >= 1 & Risk_Category <= 29 ~ "1% to 29%")) %>%
  cbind(
    aggregate(
      dplyr::select(arrests18) %>% mutate(arrestsCount = 1), ., sum) %>%
      mutate(arrestsCount = replace_na(arrestsCount, 0))) %>%
  dplyr::select(label,Risk_Category, arrestsCount)

#Map
rbind(arrests_KDE_sf, arrests_risk_sf) %>%
  na.omit() %>%
  gather(Variable, Value, -label, -Risk_Category, -geometry) %>%
  ggplot() +
    geom_sf(aes(fill = Risk_Category), colour = NA) +
    geom_sf(data = sample_n(arrests18, 3000), size = .5, colour = "black") +
    facet_wrap(~label, ) +
    scale_fill_viridis(discrete = TRUE) +
    labs(title="Figure 10.1: Comparison of Kernel Density and Risk Predictions",
         subtitle="2018 arrest risk predictions; 2018 arrests") +
    mapTheme()

0.5.6 Bar plot for next years’ arrests

The risk prediction model narrowly edges out the Kernel Density in the highest risk category (90-100%), but is less effective in the middle to second highest risk category (50-89%). Thus this model has some value compared with the Kernel Density in areas suffering from the highest risk, but may not powerful enough in capturing the less-risky yet still vulnerable areas (Figure 11.1).

rbind(arrests_KDE_sf, arrests_risk_sf) %>%
  st_set_geometry(NULL) %>% na.omit() %>%
  gather(Variable, Value, -label, -Risk_Category) %>%
  group_by(label, Risk_Category) %>%
  summarize(countArrests = sum(Value)) %>%
  ungroup() %>%
  group_by(label) %>%
  mutate(Rate_of_test_set_crimes = countArrests / sum(countArrests)) %>%
    ggplot(aes(Risk_Category,Rate_of_test_set_crimes)) +
      geom_bar(aes(fill=label), position="dodge", stat="identity") +
      scale_fill_viridis(discrete = TRUE) +
      labs(title = "Figure 11.1: Risk prediction vs. Kernel density, 2018 Arrests") +
      theme(axis.text.x = element_text(angle = 45, vjust = 0.5))

0.6 Conclusion

In conclusion, I would recommend this model into production. This model has a generally satisfying accuracy (Table 1) and generalizibility in explaining the clustering (Figure 4.1) and local spatial relationship of the arrests (Figure 8.1). Additionally, as shown in Figure 11.1, the risk prediction model outperforms in areas with the highest risk and areas with no risk. Also, in Figure 10.1, this algorithm does a great job in dividing and classifying the hierarchy of risk areas spatially. Although this algorithm under-predicts majority non-white neighborhoods as well as some of the hot spots areas, it might help in adjusting the selection bias embedded in the observed arrest incidents data.

0.7 Reference

P. Jeffrey Brantingham, Matthew Valasik & George O. Mohler (2018) Does Predictive Policing Lead to Biased Arrests? Results From a Randomized Controlled Trial, Statistics and Public Policy, 5:1, 1-6, DOI: 10.1080/2330443X.2018.1438940