I. Introduction

This assignment developed a predictive policing model for the shootings in Philadelphia with the goal of model generalizability and reducing systematic racism.

# 1. load Libraries
library(sf)
library(tidyverse)
# install.packages('mapview')
library(mapview)
library(spdep)
library(caret)
library(ckanr) # for opening data APIs built on CKAN technology
library(FNN)
library(grid)
library(gridExtra)
library(ggcorrplot)
library(jtools)     # for regression model plots
library(stargazer) # for creating table
library(broom)
library(tufte)
library(rmarkdown)
library(kableExtra)
library(tidycensus)
# new in Lab6
library(RSocrata)
library(viridis)
library(spatstat) # make kernel density map
library(raster)
library(knitr)
library(rgdal)

# 2. Identify functions
mapTheme <- function(base_size = 12) {
  theme(
    text = element_text( color = "black"),
    plot.title = element_text(size = 15,colour = "black"),
    plot.subtitle=element_text(face="italic"),
    plot.caption=element_text(hjust=0),
    axis.ticks = element_blank(),
    panel.background = element_blank(),axis.title = element_blank(),
    axis.text = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_rect(colour = "black", fill=NA, size=2),
    strip.text.x = element_text(size = 14))
}

plotTheme <- function(base_size = 12) {
  theme(
    text = element_text( color = "black"),
    plot.title = element_text(color = "darkred", size=15, face="bold"),
    plot.subtitle = element_text(face="italic"),
    plot.caption = element_text(hjust=0),
    axis.ticks = element_blank(),
    panel.background = element_blank(),
    panel.grid.major = element_line("grey80", size = 0.1),
    panel.grid.minor = element_blank(),
    panel.border = element_rect(colour = "black", fill=NA, size=2),
    strip.background = element_rect(fill = "grey80", color = "white"),
    strip.text = element_text(size=12),
    axis.title = element_text(size=12),
    axis.text = element_text(size=10),
    plot.background = element_blank(),
    legend.background = element_blank(),
    legend.title = element_text(colour = "black", face = "italic"),
    legend.text = element_text(colour = "black", face = "italic"),
    strip.text.x = element_text(size = 14)
  )
}

# 3. Load Quantile break functions

qBr <- function(df, variable, rnd) {
  if (missing(rnd)) {
    as.character(quantile(round(df[[variable]],0),
                          c(.01,.2,.4,.6,.8), na.rm=T))
  } else if (rnd == FALSE | rnd == F) {
    as.character(formatC(quantile(df[[variable]]), digits = 3),
                 c(.01,.2,.4,.6,.8), na.rm=T)
  }
}

q5 <- function(variable) {as.factor(ntile(variable, 5))}


# Load hexadecimal color palette

palette <- c('#feedde', '#fdbe85', '#fd8d3c', '#e6550d', '#a63603')

# for calculating average nearest neighbor distance.

nn_function <- function(measureFrom,measureTo,k) {
  measureFrom_Matrix <- as.matrix(measureFrom)
  measureTo_Matrix <- as.matrix(measureTo)
  nn <-   
    get.knnx(measureTo, measureFrom, k)$nn.dist
  output <-
    as.data.frame(nn) %>%
    rownames_to_column(var = "thisPoint") %>%
    gather(points, point_distance, V1:ncol(.)) %>%
    arrange(as.numeric(thisPoint)) %>%
    group_by(thisPoint) %>%
    dplyr::summarize(pointDistance = mean(point_distance)) %>%
    arrange(as.numeric(thisPoint)) %>% 
    dplyr::select(-thisPoint) %>%
    pull() # pull() is similar to $. It's mostly useful because it looks a little nicer in pipes, it also works with remote data frames, and it can optionally name the output.
  
  return(output)  
}

II.Data loading

1. Base Map Datasets

2. Crime Data: A type of crime other than burglary that suffers from more selection bias.

  • Shootings: City-wide shootings occurred in 2018.

3. Risk Factors Data: point level features that may intrigue crime. See each dataset below:

  • Building Demolitions: A point feature class of demolitions occurred in 2018, performed by private owners/contractors and by the Department of Licenses and Inspections due to dangerous building conditions.

  • Vacant Land: A point feature class of the location of properties across Philadelphia that are likely to be a vacant land based on an assessment of City of Philadelphia administrative datasets.

  • Tobacco Retailer: A point feature class of the location of tobacco retailers that applied for a tobacco permit through an online application system in 2018.

  • Tobacco Youth Sales Violations: This dataset contains violations for tobacco sale to minors in 2018.

  • Affordable Housing: A point feature class of the affordable housing units built in or before 2018.

# polygon
phillypoliceDistricts <- 
  st_read("http://data-phl.opendata.arcgis.com/datasets/62ec63afb8824a15953399b1fa819df2_0.geojson") %>%
  st_transform('ESRI:102728') %>%
  dplyr::select(District = DIST_NUM)

phillyBoundary <- 
  st_read("http://data.phl.opendata.arcgis.com/datasets/405ec3da942d4e20869d4e1449a2be48_0.geojson") %>%
  st_transform('ESRI:102728') 

# 2018 Shooting Victims
shootings <- 
  st_read('https://phl.carto.com/api/v2/sql?q=SELECT+*+FROM+shootings&filename=shootings&format=geojson&skipfields=cartodb_id') %>% 
  filter(year == "2018") %>%
  na.omit() %>%
  st_transform('ESRI:102728')  

# Creating a fishnet grid

philly_fishnet <- 
  st_make_grid(phillyBoundary,
               cellsize = 500, 
               square = TRUE) %>%
  .[phillyBoundary] %>% 
  st_sf() %>%
  mutate(uniqueID = rownames(.))

phillyneigh <-
  st_read("https://raw.githubusercontent.com/azavea/geo-data/master/Neighborhoods_Philadelphia/Neighborhoods_Philadelphia.geojson") %>%
  st_transform('ESRI:102728') %>%
  st_transform(st_crs(philly_fishnet)) 

# import risk factors

building_demolition <-
  st_read('https://phl.carto.com/api/v2/sql?q=SELECT+*+FROM+demolitions&filename=demolitions&format=geojson&skipfields=cartodb_id') %>% 
  mutate(year = substr(start_date,1,4)) %>%
  filter(year == '2018') %>%
  st_transform('ESRI:102728') %>%
  st_transform(st_crs(philly_fishnet)) %>%
  mutate(Legend = "Building Demolition") %>%
  dplyr::select(geometry, Legend)

vacant_land <-
  st_read('https://opendata.arcgis.com/datasets/b990222a527849229b4192feb4c42dc0_0.geojson') %>% 
  filter(VACANT_FLAG == "Land") %>%
  na.omit() %>%
  st_transform('ESRI:102728') %>%
  st_transform(st_crs(philly_fishnet)) %>%
  mutate(Legend = "Vacant Land") %>%
  dplyr::select(geometry, Legend)

tobacco <-
  st_read('http://data-phl.opendata.arcgis.com/datasets/853a1421e738471b8cc0d6ff755d47ff_0.geojson') %>% 
  st_transform('ESRI:102728') %>%
  st_transform(st_crs(philly_fishnet)) %>%
  mutate(Legend = "Tobacco") %>%
  dplyr::select(geometry, Legend) %>%
  na.omit() 

tobacco_violation <-
  st_read('https://opendata.arcgis.com/datasets/25b43fb8cae84e8a89d74d8707bbb5f2_0.geojson') %>% 
  filter(COMP_CHK_Y == '2018')%>%
  na.omit() %>%
  st_transform('ESRI:102728') %>%
  st_transform(st_crs(philly_fishnet)) %>%
  mutate(Legend = "Tobacco Violation") %>%
  dplyr::select(geometry, Legend)

affordable_housing <-
  st_read('https://opendata.arcgis.com/datasets/ca8944190b604b2aae7eff17c8dd9ef5_0.geojson') %>% 
  filter(FISCAL_YEAR_COMPLETE <= "2018") %>%
  st_transform('ESRI:102728') %>%
  st_transform(st_crs(philly_fishnet)) %>%
  mutate(Legend = "Affordable Housing") %>%
  dplyr::select(geometry, Legend)

III. Visualizaion and Analysis

1. A map of shooting crime in 2018, Philadelphia

From the density map we can see that shooting crime in Philadelphia were clustering in North and West Philadelphia. A small proportion of the shooting crime data released by Philadelphia involved officer shooting. Most of the shooting victims were report injured, certain proportion of the crime were fatal.

Discussion about Stop & Frisk tends to refer to randomly stopping people and arbitrarily searching them without cause or reason. If selective enforcement is the result of an officer’s preconceived or biased beliefs, then this ‘selection bias’ will be baked into the crime outcome. If a model fails to account for this bias, it will fall into the error term of the regression and lead to predictions that do not generalize across space. The selection bias of policing is severe especially in the year of 2020. The clustering pattern of shooting crime in Philadelphia might be partially explained by the inherited bias in police enforcement.

# 1. A map of shooting crime in 2018, Philadelphia
grid.arrange(ncol=2,
             ggplot() + 
               geom_sf(data = phillyBoundary) +
               geom_sf(data = shootings, colour="darkred", size=0.5, show.legend = "point") +
               labs(title= "Shootings, Philadelphia, in 2018\n",
                    caption = 'Figure 1.1') +
               mapTheme() +
               plotTheme(),
             
             ggplot() + 
               geom_sf(data = phillyBoundary, fill = "#E5E5E5") +
               stat_density2d(data = data.frame(st_coordinates(shootings)), 
                              aes(X, Y, fill = ..level.., alpha = ..level..),
                              size = 0.01, bins = 40, geom = 'polygon') +
               scale_fill_viridis_c(option = "plasma") +
               scale_alpha(range = c(0.00, 0.35), guide = FALSE) +
               labs(title = "Density of Shootings\n",
                    caption = 'Figure 1.2') +
               mapTheme() + 
               theme(legend.position = "none") +
               plotTheme())

2. A map of shooting crime joined to the fishnet

In order to make the predictive model practical, fishnet (500ft by 500ft grid cells) is the approprite unit of analysis for predicting geospatial shooting crime risk. Figure 2 maps the count of shootings by grid cell. North and West Philly are the shooting-clustered areas.

# Aggregate points to the fishnet
shooting_net <- 
  dplyr::select(shootings) %>% 
  mutate(countshootings = 1) %>% 
  aggregate(., philly_fishnet, sum) %>%
  mutate(countshootings = replace_na(countshootings, 0),
         uniqueID = rownames(.),
         cvID = sample(round(nrow(philly_fishnet) / 24), 
                       size=nrow(philly_fishnet), replace = TRUE))

ggplot() +
  geom_sf(data = shooting_net, aes(fill = countshootings), color = NA) +
  scale_fill_viridis_c(option = "plasma",
                       name = 'Shooting Counts') +
  labs(title = "Count of Shootings for the fishnet\n",
       caption = 'Figure 2') +
  mapTheme() +
  plotTheme()

3. Map set of risk factors in the fishnet

The five risk factors: affordable housing, building demolition, tobacco retailers, tobacco violation and vacant land showed clear spatial patterns. Among all the risk factors, vacant land and building demolition and tobacco violation showed similar pattern with the shooting crime.

In the second map set, risk factors spatial pattern were illustrated by average nearest neighbor distance. The nearest neighbor map set showed similar patterns. North, South and West Philly have shorter distance to the nearest building demolition, affordable house, tobacco violation and vacant land, while the distance in deep south and far north Philly is relatively long.

# All variables in fishnet 
vars_net <- 
  rbind(building_demolition, tobacco, affordable_housing, tobacco_violation,
        vacant_land) %>%
  st_join(., philly_fishnet, join=st_within) %>%
  st_drop_geometry() %>%
  group_by(uniqueID, Legend) %>%
  summarize(count = n()) %>%
  full_join(philly_fishnet, by = "uniqueID") %>%
  spread(Legend, count, fill=0) %>%
  st_sf() %>%
  na.omit() %>% 
  dplyr::select(-`<NA>`) %>%
  ungroup()

### Multiple map for feature counts in fishnet
vars_net.long <- 
  gather(vars_net, Variable, value, -geometry, -uniqueID)

vars <- unique(vars_net.long$Variable)
mapList <- list()

for(i in vars){
  mapList[[i]] <- 
    ggplot() +
    geom_sf(data = filter(vars_net.long, Variable == i), aes(fill=value), colour=NA) +
    scale_fill_viridis_c(option = "plasma",
                         name = " ") +
    labs(title=i) +
    mapTheme()}

do.call(grid.arrange,c(mapList, ncol =2, top = "Risk Factors by Fishnet\n"))

## 3.2 Nearest Neighbor Feature
# convenience to reduce length of function names.
st_c <- st_coordinates
st_coid <- st_centroid

## create NN from abandoned cars, k = 5
'%!in%' <- function(x,y)!('%in%'(x,y))

vars_net$tobacco.nn <- 
  nn_function(st_c(st_coid(vars_net)), 
              st_c(tobacco),
              k = 3)
vars_net$building_demolition.nn <-
           nn_function(st_c(st_coid(vars_net)), 
                                           st_c(building_demolition),
                                           k = 3)

vars_net$affordable_housing.nn <- 
           nn_function(st_c(st_coid(vars_net)), 
                                         st_c(affordable_housing %>%
                                                filter(geometry %!in% 'c(NaN, NaN)')),
                                         k = 3)
vars_net$tobacco_violation.nn <- 
           nn_function(st_c(st_coid(vars_net)), 
                                 st_c(tobacco_violation),
                                 k = 3)
vars_net$vacant_land.nn <-
           nn_function(st_c(st_coid(vars_net)), 
                                               st_c(vacant_land),
                                               k = 3)

## Visualize the nearest three features
vars_net.long.nn <- 
  dplyr::select(vars_net, ends_with(".nn")) %>%
  gather(Variable, value, -geometry)

vars <- unique(vars_net.long.nn$Variable)
mapList <- list()

for(i in vars){
  mapList[[i]] <- 
    ggplot() +
    geom_sf(data = filter(vars_net.long.nn, Variable == i), aes(fill=value), colour=NA) +
    scale_fill_viridis_c(option = "plasma",
                         name = " ") +
    labs(title=i) +
    mapTheme() +
    plotTheme()}

do.call(grid.arrange,c(mapList, ncol = 2, top = "Nearest Neighbor Risk Factors by Fishnet\n"))

# IV and DVs all in fishnet
philly_final_net <-
  left_join(shooting_net, st_drop_geometry(vars_net), by="uniqueID") 

philly_final_net <-
  st_centroid(philly_final_net) %>%
  st_join(dplyr::select(phillyneigh, mapname), by = "uniqueID") %>%
  st_join(dplyr::select(phillypoliceDistricts, District), by = "uniqueID") %>%
  st_drop_geometry() %>%
  left_join(dplyr::select(philly_final_net, geometry, uniqueID)) %>%
  st_sf() %>%
  na.omit()

5. A small multiple scatterplot with correlations

The following scatterplots showed the relationship between shooting count and risk factors.

# 5. A small multiple scatterplot with correlations
philly.correlation.long <-
  st_drop_geometry(philly_final_net) %>%
  dplyr::select(-uniqueID, -cvID,-starts_with('mapname'),-starts_with('District')) %>%
  gather(Variable, Value, -countshootings)

philly.correlation.cor <-
  philly.correlation.long %>%
  group_by(Variable) %>%
  summarize(philly.correlation = cor(Value, countshootings, use = "complete.obs"))

ggplot(philly.correlation.long, aes(Value, countshootings)) +
  geom_point(size = 0.1) +
  geom_text(data = philly.correlation.cor, aes(label = paste("r =", round(philly.correlation, 2))),
            x=-Inf, y=Inf, vjust = 1.5, hjust = -.1) +
  geom_smooth(method = "lm", se = FALSE, colour = "black") +
  facet_wrap(~Variable, ncol = 4, scales = "free") +
  labs(title = "Shooting count as a function of risk factors") +
  plotTheme()

6. A histogram of dependent variable

From the histogram below, we can see that most of the grid have no shooting at all. Shootings in the rest of the grids varies from 1 to 5.

# 6. A histogram of dependent variable

# Given shooting is a relatively rare event, it is reasonable for most grid cells to contain no crime events.
# And this distribution is Poisson distribution

ggplot(data = philly_final_net) +
  geom_histogram(aes(x = countshootings), fill = 'orange') +
  labs(title="Histogram of Dependent Variable: Count of Shootings\n",
       caption = "Figure 6") +
  xlab('Count of Shootings') +
  ylab('Count') +
  plotTheme()

7. A small multiple map of model errors by random k-fold and spatial cross validation.

To test the generalizability of the model and record the goodness of fit, one local area was hold out each time, while the remaining areas was trained.

For a given risk factor, I selected feature counts in each fishnet and distance to shooting hotpots as well as distance to significant shooting hotpots as features used in the model to avoid collinerity. I also added Local Moran’s I spatial process features based on the result in Figure 4. Therefore, we have seven features in total. reg.ss.cv performs random k-fold cross validation using spatial process features, while reg.ss.spatialCV performs LOGO-CV, spatial cross-validation on neighborhood name, using the same features.

7.1 Distribution of MAE

With the Spatial Process features added, the MAE decreased significantly when performing LOGO-CV instead of k-fold cross validation.

#### 7.1 Distribution of MAE
error_by_reg_and_fold %>%
  ggplot(aes(MAE)) + 
  geom_histogram(bins = 30, colour="black", fill = "#5dccb9") +
  facet_wrap(~Regression) +  
  geom_vline(xintercept = 0) + 
  scale_x_continuous(breaks = seq(0, 8, by = 1)) + 
  labs(title="Distribution of MAE", 
       subtitle = "k-fold cross validation vs. LOGO-CV\n",
       caption = "Figure 7.1",
       x="Mean Absolute Error", y="Count") +
  plotTheme()

7.2 visualizes the random k-fold and LOGO-CV errors spatially.

Figure 7.2 visualizes the LOGO-CV errors spatially. These maps visualize where the higher errors occur when the local spatial process is not accounted for. Not surprisingly, the largest errors are in the hotspot locations.

error_by_reg_and_fold %>%
  ggplot() +
  geom_sf(aes(fill = MAE)) +
  facet_wrap(~Regression) +
  scale_fill_viridis_c(option = "plasma") +
  labs(title = "Shooting Errors by Rondom k-fold and LOGO-CV Regression\n",
       caption = 'Figure 7.2') +
  mapTheme() + theme(legend.position="bottom") +
  plotTheme()

7.3 Predicted shooting and observed shooting

Figure 7.3 below shows that all models over-predict in low shooting areas and under-predict in hot spot areas. Over-predictions in lower shooting areas may highlight areas of latent risk. Under-prediction in higher shooting areas may reflect difficulty predicting the hotspots.

st_drop_geometry(reg.summary) %>%
  group_by(Regression) %>%
  mutate(burglary_Decile = ntile(countshootings, 10)) %>%
  group_by(Regression, burglary_Decile) %>%
  summarize(meanObserved = mean(countshootings, na.rm=T),
            meanPrediction = mean(Prediction, na.rm=T)) %>%
  gather(Variable, Value, -Regression, -burglary_Decile) %>%          
  ggplot(aes(burglary_Decile, Value, shape = Variable)) +
  geom_point(size = 2) + 
  geom_path(aes(group = burglary_Decile), colour = "black") +
  scale_shape_manual(values = c(2, 17)) +
  facet_wrap(~Regression) + xlim(0,10) +
  labs(title = "Predicted and Observed Shootings by Observed Burglary Decile\n",
       caption = 'Figure 7.3') +
  plotTheme()

8. A table of MAE and standard deviation MAE by regression.

Table 8 shows that there is no significant difference between the mean and standard deviation in error between k-fold and LOGO CV regression.

st_drop_geometry(error_by_reg_and_fold) %>%
  group_by(Regression) %>% 
  summarize(Mean_MAE = round(mean(MAE), 2),
            SD_MAE = round(sd(MAE), 2)) %>%
  kable(caption = "Table 8. MAE by regression") %>%
  kable_styling("striped", full_width = F) 
Table 8. MAE by regression
Regression Mean_MAE SD_MAE
Random k-fold CV: Spatial Process 0.05 0.04
Spatial LOGO-CV: Spatial Process 0.04 0.05

9. A table of raw errors by race context for a random k-fold vs. spatial cross validation regression.

The model should be generalize across race context. To test this proposition, census tracts were divided into Majority_White and Majority_Non_White tracts. From the map we can see that Philadelphia is a race-segregate city.

Table 9 presents the error (subtracting the observed burglary count from the prediction) of the two race groups. The model under-predicts in Majority_Non_White neighborhoods and over-predicts in Majority_White neighborhoods.The Spatial Process model did not bring much difference to the errors, instead it further over-predict in Majority_White neighborhoods.

ggplot() + 
  geom_sf(data = na.omit(tracts18), aes(fill = raceContext)) +
  scale_fill_manual(values = c("#25CB10", "#ff9966"), name="Race Context") +
  labs(title = "Race Context in Philly\n",
       caption = 'Figure 9') +
  mapTheme() + 
  theme(legend.position="bottom") +
  plotTheme()

# The model on average, under-predicts in Majority_Non_White neighborhoods and over-predicts in Majority_White neighborhoods. 
# It looks like this algorithm generalizes well with respect to race.
reg.summary %>% 
  st_centroid() %>%
  st_join(tracts18) %>%
  na.omit() %>%
  st_drop_geometry() %>%
  group_by(Regression, raceContext) %>%
  summarize(mean.Error = mean(Error, na.rm = T)) %>%
  spread(raceContext, mean.Error) %>%
  kable(caption = "Table 9. Mean Error by neighborhood racial context") %>%
  kable_styling("striped", full_width = F) 
Table 9. Mean Error by neighborhood racial context
Regression Majority_Non_White Majority_White
Random k-fold CV: Spatial Process -0.0327744 0.0004754
Spatial LOGO-CV: Spatial Process -0.0314771 0.0006632

10. The map comparing kernel density to risk predictions for the next year’s crime.

10.1 Make Kernel Density Map

Figure 10.1 visualizes three Kernel density maps at three different scales.

### 10.1 Make Kernel Density Map
sho_ppp <- as.ppp(st_coordinates(shootings), W = st_bbox(philly_final_net))
sho_KD.1000 <- spatstat::density.ppp(sho_ppp, 1000)
sho_KD.1500 <- spatstat::density.ppp(sho_ppp, 1500)
sho_KD.2000 <- spatstat::density.ppp(sho_ppp, 2000)
sho_KD.df <- rbind(
  mutate(data.frame(rasterToPoints(mask(raster(sho_KD.1000), as(phillyneigh, 'Spatial')))), Legend = "1000 Ft."),
  mutate(data.frame(rasterToPoints(mask(raster(sho_KD.1500), as(phillyneigh, 'Spatial')))), Legend = "1500 Ft."),
  mutate(data.frame(rasterToPoints(mask(raster(sho_KD.2000), as(phillyneigh, 'Spatial')))), Legend = "2000 Ft.")) 

sho_KD.df$Legend <- factor(sho_KD.df$Legend, levels = c("1000 Ft.", "1500 Ft.", "2000 Ft."))

ggplot(data = sho_KD.df, aes(x = x, y = y)) +
  geom_raster(aes(fill=layer)) + 
  facet_wrap(~Legend) +
  coord_sf(crs=st_crs(philly_final_net)) + 
  scale_fill_viridis_c(option = "plasma",
                       name = "Density") +
  labs(title = "Kernel Density with 3 Different Search Radius Scales\n",
       caption = 'Figure 10.1') +
  mapTheme() +
  plotTheme()

10.2 Comparison Map

Figure 10.2 is generated of the risk categories for both model types with a sample of shooting19 points overlaid. A strongly fit model should show that the highest risk category is uniquely targeted to places with a high density of shooting points. Figure 10.2 compares the crime capture of 2019 kernel density and 2019 risk predictions. However, it’s hard to tell whether risk predictions or kernel density capture more observed shootings, meaning that it’s not clear which model provides a more robust targeting tool for allocating police resources. To this context, the bar plot in Figure.11 provides a more accurate comparison.

### 10.3 Prediction by Kernel Density Model
sho_KDE_sf <- as.data.frame(sho_KD.1000) %>%
  st_as_sf(coords = c("x", "y"), crs = st_crs(philly_final_net)) %>%
  aggregate(., philly_final_net, mean) %>%
  mutate(label = "Kernel Density",
         Risk_Category = ntile(value, 100),
         Risk_Category = case_when(
           Risk_Category >= 90 ~ "90% to 100%",
           Risk_Category >= 70 & Risk_Category <= 89 ~ "70% to 89%",
           Risk_Category >= 50 & Risk_Category <= 69 ~ "50% to 69%",
           Risk_Category >= 30 & Risk_Category <= 49 ~ "30% to 49%",
           Risk_Category >= 1 & Risk_Category <= 29 ~ "1% to 29%")) %>%
  cbind(
    aggregate(
      dplyr::select(shootings19) %>% mutate(shootingCount = 1), ., sum) %>%
      mutate(shootingCount = replace_na(shootingCount, 0))) %>%
  dplyr::select(label, Risk_Category, shootingCount)

##### 10.4 Prediction by Risk Prediction Model
sho_risk_sf <-
  reg.ss.spatialCV %>%
  mutate(label = "Risk Predictions",
         Risk_Category = ntile(Prediction, 100),
         Risk_Category = case_when(
           Risk_Category >= 90 ~ "90% to 100%",
           Risk_Category >= 70 & Risk_Category <= 89 ~ "70% to 89%",
           Risk_Category >= 50 & Risk_Category <= 69 ~ "50% to 69%",
           Risk_Category >= 30 & Risk_Category <= 49 ~ "30% to 49%",
           Risk_Category >= 1 & Risk_Category <= 29 ~ "1% to 29%")) %>%
  cbind(
    aggregate(
      dplyr::select(shootings19) %>% mutate(shootingCount = 1), ., sum) %>%
      mutate(shootingCount = replace_na(shootingCount, 0))) %>%
  dplyr::select(label,Risk_Category, shootingCount)

##### 10.5 Comparison Maps
# Below a map is generated of the risk categories for both model types with a sample of shooting19 points overlaid. A strongly fit model should show that the highest risk category is uniquely targeted to places with a high density of shooting points.
rbind(sho_KDE_sf, sho_risk_sf) %>%
  na.omit() %>%
  gather(Variable, Value, -label, -Risk_Category, -geometry) %>%
  ggplot() +
  geom_sf(aes(fill = Risk_Category), colour = NA) +
  geom_sf(data = sample_n(shootings19, 1130), size = .5, colour = "black") +
  facet_wrap(~label, ) +
  scale_fill_viridis_d(option = "plasma",
                       name = 'Risk Category') + 
  labs(title="Comparison of Kernel Density and Risk Predictions",
       subtitle="Bottom layer is 2019 predicted shooting counts.\nDot is observed 2019 shooting counts.\n",
       caption = 'Figure 10.2') +
  mapTheme() +
  plotTheme()

11. The bar plot making this comparison.

The risk prediction model narrowly edges out the Kernel Density in the highest risk categories, while the business-as-usual hot spot approach performs much better on the second highest risk category. It’s not clear which one is better based on the current model.

## 11. The bar plot making this comparison.

rbind(sho_KDE_sf, sho_risk_sf) %>%
  st_set_geometry(NULL) %>% 
  na.omit() %>%
  gather(Variable, Value, -label, -Risk_Category) %>%
  group_by(label, Risk_Category) %>%
  summarize(countShootings = sum(Value)) %>%
  ungroup() %>%
  group_by(label) %>%
  mutate(Rate_of_test_set_crimes = countShootings / sum(countShootings)) %>%
  ggplot(aes(Risk_Category,Rate_of_test_set_crimes)) +
  geom_bar(aes(fill=label), position="dodge", stat="identity") +
  scale_fill_viridis_d(option = "plasma",
                       name = 'Model') +
  labs(title = "Risk Prediction vs. Kernel Density, 2019 Shooing Crime\n",
       caption = 'Figure 11',
       x = 'Risk Category',
       y = 'Rate of Test Set Shooting Crimes') +
  plotTheme()

Conclusion

Overall, this algorithm is not ready to put into production for the Philadelphia Police Department. Despite that the model generalizes well across different neighborhood contexts, as a resource allocation tool, it still suffer from selection bias. Indeed, this algorithm addressed the issue of systematic racism against black community and is not over-predicting the risk of majority-black neighborhoods. However, over-predicting the risk of majority-white community does not make it righteous either.

Despite the selection bias, this algorithm predicts the shooting crime in Philadelphia at a extremely low error rate. Also, this algorithm reached a good balance between generalizability and accuracy and did a great job at revealing latent risk of shooting crime.