Skip to content

Instantly share code, notes, and snippets.

@bohdanszymanik
Last active July 31, 2025 02:04
Show Gist options
  • Save bohdanszymanik/f8db6cb2f286cec27390691d5e066ea7 to your computer and use it in GitHub Desktop.
Save bohdanszymanik/f8db6cb2f286cec27390691d5e066ea7 to your computer and use it in GitHub Desktop.
Analysis of the impact of publicly monitored CCTV on Wellington crime
# Core stack: tidyverse for data wrangling/plotting, sf for spatial features
library(tidyverse)
library(sf)
# quick aside... testing out sql querying in R - never done this before
# works well
# library(sqldf)
# VARB <- read_csv("C:\\Users\\...\\VictimisationsAgeROVBoundary.csv")
# slice <- sqldf("select * from VARB limit 10")
###################################################################
# comparing monitored street areas in meshblocks with victimisation
library(remotes)
# Fix: only install esri2sf when it is missing - an unconditional
# install_github() re-downloads and rebuilds the package on every run.
if (!requireNamespace("esri2sf", quietly = TRUE)) {
  install_github("yonghah/esri2sf")
}
library("esri2sf")
# Wellington CCTV viewshed polygons (ArcGIS feature service)
wgtn_cctv_url <- "https://services1.arcgis.com/CPYspmTk3abe6d7i/arcgis/rest/services/CCTV_Viewsheds_(View_layer)/FeatureServer/0"
wgtn_cctv <- esri2sf(wgtn_cctv_url)
# Lower Hutt CCTV camera locations (fetched but not used further below)
hv_cctv_url <- "https://maps.huttcity.govt.nz/hosting/rest/services/Hosted/CCTV_Camera_Location/FeatureServer/1"
hv_cctv <- esri2sf(hv_cctv_url)
# at this stage I guess... I could either extract and load into ArcGIS, or load in the MB2013 areas and do a tally up
# of cctv locations/meshblock
# might do both
glimpse(wgtn_cctv)
# I want to look at the geometry column - normal glimpse won't show me enough
sf::st_as_text(wgtn_cctv$geoms) # this works
# Fix: a constant colour belongs outside aes(). aes(colour = "red") maps the
# literal string "red" as data - you get the default palette colour plus a
# spurious legend, not red outlines.
ggplot(wgtn_cctv) +
geom_sf(colour = "red") +
coord_sf(xlim = c(174.77, 174.78), ylim = c(-41.275, -41.30)) +
geom_sf_label(aes(label = Camera_Name), size = 2)
# I think I should look at the monitored areas as a fraction of
# the meshblock area. This isn't great actually - the crime is probably
# largely happening in the streets and some meshblocks will have
# a lot of street area, others very little.
# load in the meshblocks 2013 for wellington - it's already been cropped down to just
# the wellington region using mapshaper.org first
# NOTE(review): hard-coded local Windows path - rerunning elsewhere needs this edited
mb2013_wgtn <- st_read("C:\\Users\\...\\wd\\Areas\\statsnz-meshblock-2013-SHP-wgtn\\meshblock-2013.shp")
plot(mb2013_wgtn) # lots of columns to colourise
# let's just choose one eg LandAreaSQ
ggplot(data = mb2013_wgtn) +
geom_sf(aes(fill = LandAreaSQ)) +
theme_minimal()
# ahhh... a performance problem. This operation takes a couple few minutes to complete, we need to figure out how to
# parallelize these st_* operations
# Row-wise erase of every camera viewshed polygon from every meshblock
erased <- st_difference(mb2013_wgtn, wgtn_cctv)
glimpse(erased)
# this won't plot - I think ggplot gives up due to the complexity over the total area being plotted
# Fix: the chain previously ended at coord_sf(), leaving theme_minimal() as a
# stand-alone no-op expression on the next line; join it with "+".
ggplot(data = erased) +
geom_sf() +
coord_sf(xlim = c(174.773, 174.777), ylim = c(-41.2815, -41.2855)) +
theme_minimal()
# if I make the overall area smaller with less meshblocks then it works, here I concentrate on just the intersection from willis st to lambton quay
glimpse(erased |> filter(MeshblockN %in% c('2124400', '2125700', '2126200')))
# why so many rows?? because wgtn_cctv has many overlaying geometries - multiple viewsheds from nearby cameras
# this time it plots ok
# NOTE(review): meshblock IDs are matched as character strings - assumes
# MeshblockN is a character column; verify against the shapefile schema
ggplot(data = erased |> filter(MeshblockN %in% c('2124400', '2125700', '2126200', '2125900', '2126502', '2127200', '2126600', '2127300', '2125600', '2124000', '2125300'))) +
geom_sf() +
theme_minimal()
# it appears we have some LandAreaSQ measures that don't tally with what is calculated by st_area() so let's add a calculated area
# st_area() returns a units vector; the unit depends on the layer's CRS
# (presumably m^2 here - confirm with st_crs(mb2013_wgtn))
mb2013_wgtn$LandAreaSQ_calc <- mb2013_wgtn |> st_area()
# I've just noticed that the order of geom_sf to coord_sf is important - I guess
# that figures - the operations here are additive
ggplot(data = mb2013_wgtn) +
geom_sf() +
coord_sf(xlim = c(174.772, 174.778), ylim = c(-41.281, -41.285)) +
theme_minimal()
# we can erase the cctv viewsheds from the meshblocks
erase_viewsheds_from_mbs <- st_difference(mb2013_wgtn, wgtn_cctv)
# or we can erase the meshblocks from the viewsheds - we don't want that
# erase_mbs_from_viewsheds <- st_difference(wgtn_cctv, mb2013_wgtn)
# hmmm - this doesn't work - I think it's due to the overlying viewsheds
# (the commented-out coord_sf line below is skipped by the parser, so the
# trailing "+" still joins geom_sf() to theme_minimal())
ggplot(data = erase_viewsheds_from_mbs) +
geom_sf() +
# coord_sf(xlim = c(174.772, 174.778), ylim = c(-41.281, -41.285)) +
theme_minimal()
# and notice when you run this that the viewsheds consist of many overlaying geometries
# Fix: erase_mbs_from_viewsheds only existed in a commented-out line further
# up, so this plot errored with "object 'erase_mbs_from_viewsheds' not found";
# compute it here before plotting.
erase_mbs_from_viewsheds <- st_difference(wgtn_cctv, mb2013_wgtn)
ggplot(data = erase_mbs_from_viewsheds) +
geom_sf() +
coord_sf(xlim = c(174.772, 174.778), ylim = c(-41.281, -41.285)) +
theme_minimal()
# let's union together all those overlaying viewsheds to simplify the geometry
# st_union() dissolves the overlapping per-camera viewshed polygons into a
# single combined geometry, so the later difference is done once, not per camera
wgtn_cctv_union <- wgtn_cctv |> st_union()
ggplot(data = wgtn_cctv_union) +
geom_sf() +
theme_minimal()
# this works much better!!!
# NOTE(review): mb2013_wgtn_small mentioned below is not defined in this file
wgtn_mbs_less_viewsheds <- st_difference(mb2013_wgtn, wgtn_cctv_union) # a tiny bit faster with mb2013_wgtn_small
ggplot(data = wgtn_mbs_less_viewsheds) +
geom_sf() +
theme_minimal()
#################################################################
# Next task is to figure out by how much the area of a meshblock has decreased as a result of
# the viewshed - usefully, the meshblocks contain a km^2 area - but without units
# st_area() generates areas but with units - so let's assign some units
# hmmm, I think there's something odd about some of those LandAreaSQ quantities - I'm going to add a calculated column to compare against
library(units)
# Tag the shapefile's unitless LandAreaSQ column as km^2 for comparison
wgtn_mbs_less_viewsheds$LandAreaSQ_km2 <- set_units(wgtn_mbs_less_viewsheds$LandAreaSQ, km^2)
# Remaining area of each meshblock after the viewshed union has been erased
wgtn_mbs_less_viewsheds$AreaMinusViewshed <- st_area(wgtn_mbs_less_viewsheds)
wgtn_mbs_less_viewsheds$AreaMinusViewshed_km2 <- set_units(st_area(wgtn_mbs_less_viewsheds), km^2)
# this doesn't give sensible answers because some of the LandAreaSQ values aren't right
#wgtn_mbs_less_viewsheds$ChangeInArea <- 1 - drop_units(wgtn_mbs_less_viewsheds$AreaMinusViewshed_km2 / wgtn_mbs_less_viewsheds$LandAreaSQ_km2)
glimpse(wgtn_mbs_less_viewsheds)
# Absolute area removed by the viewshed, based on the recalculated area
wgtn_mbs_less_viewsheds$ChangeInArea <- wgtn_mbs_less_viewsheds$LandAreaSQ_calc - wgtn_mbs_less_viewsheds$AreaMinusViewshed
# Fraction of the meshblock covered by camera viewsheds (0 = none, 1 = fully)
wgtn_mbs_less_viewsheds$ChangeInAreaRatio <- 1 - drop_units(wgtn_mbs_less_viewsheds$AreaMinusViewshed / wgtn_mbs_less_viewsheds$LandAreaSQ_calc)
wgtn_mbs_less_viewsheds |> mutate (ChangeInArea = drop_units(ChangeInArea)) |> filter(ChangeInArea > 1 )
wgtn_mbs_less_viewsheds |> filter(ChangeInAreaRatio > 0.1)
#################################################################
# and now need to join to the crime data
# I have two choices for the victimisation data
# the first choice is retail specific obtained through OIA
v <- read_csv("C:\\Users\\...\\wd\\RetailVictimisation\\v.csv")
glimpse(v)
# the second is public - the Victimisations Time and Place data set on https://www.police.govt.nz/about-us/publications-statistics/data-and-statistics/policedatanz
# I think actually that second data set might be better, it excludes victimisations occurring in dwellings but we're interested
# not in dwellings but crime on the street and retail crime
# Read Meshblock as character so it can join against the (character)
# MeshblockN key later without losing any leading zeros
vtp <- read_csv("C:\\Users\\...\\wd\\VictimisationByMeshblock\\vtp_jul2025.csv",
col_types = cols(
`Meshblock` = col_character()
))
glimpse(vtp)
# I could try and assign the datetime of columns on the import but... unfortunately there
# are a few rows where the occurrence datetime is not like "%d%b%Y %H:%M:%S" - just hours:mins unfortunately
# v <- read_csv("C:\\Users\\...\\wd\\RetailVictimisation\\v.csv",
# col_types = cols(
# occurrence_datetime = col_datetime("%d%b%Y %H:%M:%S"),
# reported_date = col_date("%d%b%Y")
# ))
# so maybe better to read in and then create a new column for a clean date that we calculate based on both
# occurred and reported dates
# lubridate can handle multiple datetime formats with parse_date_time()
# Fix: attach lubridate before its first use - it was previously loaded only
# after the parse_date_time() calls below (which only worked if the installed
# tidyverse version attaches lubridate as a core package).
library(lubridate)
v$parsed_occurrence_datetime <- parse_date_time(v$occurrence_datetime, orders = c("%d/%m/%Y %H:%M", "%d%b%Y %H:%M:%S", "%d%b%Y %H:%M"))
v$parsed_reported_date <- parse_date_time(v$reported_date, orders = c("%d/%m/%Y", "%d%b%Y"))
glimpse(v)
# turns out that in some of the victimisation data we have some occurrence datetimes that are plainly wrong - either last century or beyond! or in the future!!
# peek at the 20 earliest and 20 latest raw occurrence datetimes
arrange(v, occurrence_datetime) |> slice_head(n = 20) |> select(occurrence_datetime)
v |> arrange(desc(occurrence_datetime)) |> slice_head(n = 20) |> select(occurrence_datetime)
# so best apply some logic to generate a clean occurrence date
#' Pick a plausible occurrence date.
#'
#' Returns date1 unless its year falls outside [min_year, current year], in
#' which case date2 (the reported date) is used instead. NA dates stay NA.
#'
#' @param date1 primary date(s) - coerced with as.Date()
#' @param date2 fallback date(s) - recycled to the length of date1
#' @param min_year earliest believable year (default 2019, as before)
#' @return a Date vector the same length as date1
#'
#' Fixes vs the original: a stray bare `return` on its own line merely
#' evaluated the `return` function object (a no-op) rather than returning,
#' and the 2019 cutoff was hard-coded; it is now the min_year parameter.
#' Implemented in base R (keeps the Date class, unlike base ifelse()).
invalid_year_picker <- function(date1, date2, min_year = 2019) {
  d1 <- as.Date(date1)
  d2 <- rep(as.Date(date2), length.out = length(d1))  # recycle fallback
  yr <- as.integer(format(d1, "%Y"))
  this_year <- as.integer(format(Sys.Date(), "%Y"))
  bad <- !is.na(yr) & (yr < min_year | yr > this_year)
  out <- d1
  out[bad] <- d2[bad]
  out
}
# quick smoke test: 2016 is before the cutoff, so the fallback is returned
invalid_year_picker(as.Date("2016-01-01"), as.Date("2020-01-01"))
# Clean occurrence date: fall back to the reported date when the parsed
# occurrence year is implausible
v <- v |>
mutate(clean_occurrence_date = invalid_year_picker(parsed_occurrence_datetime, parsed_reported_date))
glimpse(v)
# I want to aggregate the victimisation data up to year and meshblock
# still not sure which dataset I should use - maybe use v for the moment, and then swap to vtp
# NOTE: summarise() drops the last grouping level, so the result is still
# grouped by year - fine for the filters/pivots below
v_year_mb <-
v |>
group_by(year = year(clean_occurrence_date), meshblock_2013_census) |>
summarise( sum_v = sum(victimisations))
# so joining on v.meshblock_2013_census = wgtn_mbs_less_viewsheds.MeshblockN
# and I don't need all the columns in wgtn_mbs_less_viewsheds so I'll filter that
# (select() on an sf object keeps the geometry column implicitly)
wgtn_mbs_less_viewsheds_v <-
wgtn_mbs_less_viewsheds |>
select(MeshblockN, ChangeInArea, ChangeInAreaRatio, AreaMinusViewshed, LandAreaSQ_calc ) |>
left_join(v_year_mb, join_by(MeshblockN == meshblock_2013_census))
glimpse(wgtn_mbs_less_viewsheds_v)
# now I can compare the two extreme ends of our data eg in the v dataset from 2019 to 2024 (since we're interested in full years)
# want to view victimisations cf change in area cf 2019 versus 2024
# Fix: only install viridis when it is missing - an unconditional
# install.packages() hits CRAN on every run of the script.
if (!requireNamespace("viridis", quietly = TRUE)) {
  install.packages("viridis")
}
library(viridis)
ggplot(data = wgtn_mbs_less_viewsheds_v |> filter(year %in% c(2019, 2020, 2021, 2022, 2023, 2024))) +
geom_point(mapping = aes(x = ChangeInArea, y = sum_v, colour = year)) +
scale_colour_viridis(option = "B")
wgtn_mbs_less_viewsheds_v |> filter(year %in% c(2019, 2024))
# I need to pivot_wider wgtn_mbs_less_viewsheds_v for the two years 2019 and 2024 to get the difference in sum_v
# (comment previously said 2020; the code compares 2019 with 2024)
wgtn_mbs_less_viewsheds_v_2019_vrs_2024 <-
wgtn_mbs_less_viewsheds_v |> filter(year %in% c(2019, 2024)) |>
pivot_wider(
names_from = year,
values_from = sum_v
)
# positive v_2019_2024 means 2024 was worse than 2019 for that meshblock
wgtn_mbs_less_viewsheds_v_2019_vrs_2024$v_2019_2024 <- wgtn_mbs_less_viewsheds_v_2019_vrs_2024$`2024` - wgtn_mbs_less_viewsheds_v_2019_vrs_2024$`2019`
ggplot(data = wgtn_mbs_less_viewsheds_v_2019_vrs_2024) +
geom_point(mapping = aes(x = ChangeInAreaRatio, y = v_2019_2024)) +
scale_colour_viridis(option = "B") +
labs(x = "Proportion of Meshblock in Viewshed", y = "Victimisations 2024 minus victimisations 2019")
# let's include the ANZSOC Division this time and use it as a colour aesthetic
# Same aggregation as v_year_mb but with anzsoc_division as an extra group
v_year_mb_anzsocdivision <-
v |>
group_by(year = year(clean_occurrence_date), meshblock_2013_census, anzsoc_division) |>
summarise( sum_v = sum(victimisations))
wgtn_mbs_less_viewsheds_v_anzsocdivision <-
wgtn_mbs_less_viewsheds |>
select(MeshblockN, ChangeInArea, ChangeInAreaRatio, AreaMinusViewshed, LandAreaSQ_calc ) |>
left_join(v_year_mb_anzsocdivision, join_by(MeshblockN == meshblock_2013_census))
# Widen so 2019 and 2024 victimisation counts sit in separate columns
wgtn_mbs_less_viewsheds_v_anzsocdiv_2019_vrs_2024 <-
wgtn_mbs_less_viewsheds_v_anzsocdivision |> filter(year %in% c(2019, 2024)) |>
pivot_wider(
names_from = year,
values_from = sum_v
)
wgtn_mbs_less_viewsheds_v_anzsocdiv_2019_vrs_2024$v_2019_2024 <- wgtn_mbs_less_viewsheds_v_anzsocdiv_2019_vrs_2024$`2024` - wgtn_mbs_less_viewsheds_v_anzsocdiv_2019_vrs_2024$`2019`
ggplot(data = wgtn_mbs_less_viewsheds_v_anzsocdiv_2019_vrs_2024) +
geom_point(mapping = aes(x = ChangeInAreaRatio, y = v_2019_2024, colour = anzsoc_division)) +
labs(x = "Proportion of Meshblock in Viewshed", y = "Victimisations 2024 minus victimisations 2019")
# now I want to
# 1. repeat the exercise but use the vtp dataset - I think this might better highlight street level crime
# 2. aggregate for the change in area - maybe 0-0.2, 0.2-0.4 ... 0.8-1.0
# 3. figure out what are the meshblocks that show the largest change in area
###############################################################################
# just quickly - what are these meshblocks that have a large change in area
# (printed to console, most-covered meshblocks first; not assigned)
wgtn_mbs_less_viewsheds |>
select(MeshblockN, TLAName, AreaUnitNa, LandAreaSQ_calc, AreaMinusViewshed, ChangeInArea, ChangeInAreaRatio) |>
arrange(desc(ChangeInAreaRatio))
# that's bollocks! the LandAreaSQ_km2 just can't be right - have added a calculated area and recalculating the areaminusviewshed
########################
# vtp dataset version
library(stringr)
# `Month Year` looks like "June 2021"; the last 4 characters are the year
vtp$year <- as.numeric( vtp$`Month Year` |> str_sub(-4) )
glimpse(vtp)
vtp |>select(`ANZSOC Division`) |> distinct()
# Aggregate vtp victimisations to year x meshblock
vtp_year_mb <-
vtp |>
group_by(year, meshblock = Meshblock) |>
summarise( sum_v = sum(Victimisations))
glimpse(vtp_year_mb)
wgtn_mbs_less_viewsheds_vtp <-
wgtn_mbs_less_viewsheds |>
select(MeshblockN, ChangeInArea, ChangeInAreaRatio, AreaMinusViewshed, LandAreaSQ_calc) |>
left_join(vtp_year_mb, join_by(MeshblockN == meshblock))
# Fix: glimpse the data frame just built (the vtp join) - the original
# glimpsed the earlier v-based join, a copy/paste slip.
glimpse(wgtn_mbs_less_viewsheds_vtp)
# the vtp data only goes back to june 2021 - that sux as it's impacted by covid and it's a midyear value
wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024 <-
wgtn_mbs_less_viewsheds_vtp |> filter(year %in% c(2022, 2024)) |>
pivot_wider(
names_from = year,
values_from = sum_v
)
# positive v_2022_2024 means 2024 was worse than 2022 for that meshblock
wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024$v_2022_2024 <- wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024$`2024` - wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024$`2022`
# not sure there's a significant movement below the 0 line
# NOTE: scale_colour_viridis has no effect here - no colour aesthetic is mapped
ggplot(data = wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024) +
geom_point(mapping = aes(x = ChangeInAreaRatio, y = v_2022_2024)) +
scale_colour_viridis(option = "B") +
labs(x = "Proportion of Meshblock in Viewshed", y = "Victimisations 2024 minus victimisations 2022")
## what happens if I just look at ANZSOC Division = 'Theft and Related Offences'
# Same year x meshblock aggregation, restricted to theft offences
vtp_year_mb_theft <-
vtp |>
filter(`ANZSOC Division` == 'Theft and Related Offences') |>
group_by(year, meshblock = Meshblock) |>
summarise( sum_v = sum(Victimisations))
glimpse(vtp_year_mb_theft)
wgtn_mbs_less_viewsheds_vtp_theft <-
wgtn_mbs_less_viewsheds |>
select(MeshblockN, ChangeInArea, ChangeInAreaRatio, AreaMinusViewshed, LandAreaSQ_calc) |>
left_join(vtp_year_mb_theft, join_by(MeshblockN == meshblock))
glimpse(wgtn_mbs_less_viewsheds_vtp_theft)
# the vtp data only goes back to june 2021 - that sux as it's impacted by covid and it's a midyear value
wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024_theft <-
wgtn_mbs_less_viewsheds_vtp_theft |>
filter(year %in% c(2022, 2024)) |>
pivot_wider(
names_from = year,
values_from = sum_v
)
wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024_theft$v_2022_2024 <- wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024_theft$`2024` - wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024_theft$`2022`
# not sure there's a significant movement below the 0 line
ggplot(data = wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024_theft) +
geom_point(mapping = aes(x = ChangeInAreaRatio, y = v_2022_2024)) +
scale_colour_viridis(option = "B") +
labs(x = "Proportion of Meshblock in Viewshed", y = "Victimisations 2024 minus victimisations 2022", title = 'For ANZSOC Division = Theft and Related Offences')
###########################################
## why not just using the ANZSOC Division as an aesthetic to colour the points?
# NOTE: this redefines vtp_year_mb and wgtn_mbs_less_viewsheds_vtp from above,
# now with anzsoc_division as an extra grouping column
vtp_year_mb <-
vtp |>
group_by(year, meshblock = Meshblock, anzsoc_division = `ANZSOC Division`) |>
summarise( sum_v = sum(Victimisations))
glimpse(vtp_year_mb)
wgtn_mbs_less_viewsheds_vtp <-
wgtn_mbs_less_viewsheds |>
select(MeshblockN, ChangeInArea, ChangeInAreaRatio, AreaMinusViewshed, LandAreaSQ_calc) |>
left_join(vtp_year_mb, join_by(MeshblockN == meshblock))
glimpse(wgtn_mbs_less_viewsheds_vtp)
# the vtp data only goes back to june 2021 - that sux as it's impacted by covid and it's a midyear value
wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024 <-
wgtn_mbs_less_viewsheds_vtp |> filter(year %in% c(2022, 2024)) |>
pivot_wider(
names_from = year,
values_from = sum_v
)
wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024$v_2022_2024 <- wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024$`2024` - wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024$`2022`
# not sure there's a significant movement below the 0 line
ggplot(data = wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024) +
geom_point(mapping = aes(x = ChangeInAreaRatio, y = v_2022_2024, colour = anzsoc_division)) +
labs(x = "Proportion of Meshblock in Viewshed", y = "Victimisations 2024 minus victimisations 2022")
### todo - I could simply count the below the line versus above the line victimisations for ChangeInAreaRatio > some amount eg 0.1
# or maybe bracket it between values
# 2x2 summary: did the meshblock improve (fewer victimisations in 2024 than
# 2022) crossed with whether cameras cover more than 20% of it
outcome_vtp <-
wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024 |>
group_by(Improved = v_2022_2024 < 0, CameraImpact = ChangeInAreaRatio > 0.2) |>
summarise(sum_sum_v = sum(v_2022_2024))
# Got better Got worse
# Camera impact -... ...
# No camera impact -... ...
# So, basically...
# Areas under camera impact had twice the victimisations in 2024 versus 2022
# While areas without camera impact had about 1.5x worse
# comparing back to the retail victimisation data obtained through data sharing
# (lower CameraImpact threshold of 5% here)
outcome_v <-
wgtn_mbs_less_viewsheds_v_2019_vrs_2024 |>
group_by(Improved = v_2019_2024 < 0, CameraImpact = ChangeInAreaRatio > 0.05) |>
summarise(sum_sum_v = sum(v_2019_2024))
# gives
# Got better Got worse
# Camera impact -... ...
# No camera impact -... ...
outcome_v_anzsocd <-
wgtn_mbs_less_viewsheds_v_anzsocdiv_2019_vrs_2024 |>
group_by(Improved = v_2019_2024 < 0, CameraImpact = ChangeInAreaRatio > 0.05, anzsoc_division) |>
summarise(sum_sum_v = sum(v_2019_2024))
# gives for 'Theft'
# Got better Got worse
# Camera impact -... ...
# No camera impact -... ...
# why don't we try this a different way? How about just the victimisations in 2019 compared to 2024 for the two cases of camera impact?
outcome_v1 <-
wgtn_mbs_less_viewsheds_v |>
filter(year %in% c(2019, 2024)) |>
group_by(CameraImpact = ChangeInAreaRatio > 0.05, year) |>
summarise(sum_sum_v = sum(sum_v))
# I think this is easier to understand
# 2019 2024 Percentage Increase
# Camera Impact ... ... ...
# No camera impact ... ... ...%
# Now we can repeat but for ANZSOC Division
outcome_v_abs_anzsocd <-
wgtn_mbs_less_viewsheds_v_anzsocdivision |>
filter(year %in% c(2019, 2024)) |>
group_by(CameraImpact = ChangeInAreaRatio > 0.05, year, anzsoc_division) |>
summarise(sum_sum_v = sum(sum_v))
# NOTE(review): this reassigns outcome_v, discarding the Improved breakdown
# computed above - confirm the overwrite is intentional
outcome_v <-
wgtn_mbs_less_viewsheds_v_2019_vrs_2024 |>
group_by(CameraImpact = ChangeInAreaRatio > 0.05) |>
summarise(sum_sum_v = sum(v_2019_2024))
# todo - double check if 2022 to 2024 makes for a sensible time range
## what happens if I just look at ANZSOC Division = 'Theft and Related Offences'
vtp_year_mb_theft <-
vtp |>
filter(`ANZSOC Division` == 'Theft and Related Offences') |>
group_by(year, meshblock = Meshblock) |>
summarise( sum_v = sum(Victimisations))
glimpse(vtp_year_mb_theft)
wgtn_mbs_less_viewsheds_vtp_theft <-
wgtn_mbs_less_viewsheds |>
# Fix: keep ChangeInAreaRatio through the select() - the plot below maps
# x = ChangeInAreaRatio and errored once the column had been dropped. Also
# select LandAreaSQ_calc, consistent with the earlier theft block, instead
# of the suspect LandAreaSQ_km2.
select(MeshblockN, ChangeInArea, ChangeInAreaRatio, AreaMinusViewshed, LandAreaSQ_calc) |>
left_join(vtp_year_mb_theft, join_by(MeshblockN == meshblock))
glimpse(wgtn_mbs_less_viewsheds_vtp_theft)
# the vtp data only goes back to june 2021 - that sux as it's impacted by covid and it's a midyear value
wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024_theft <-
wgtn_mbs_less_viewsheds_vtp_theft |>
filter(year %in% c(2022, 2024)) |>
pivot_wider(
names_from = year,
values_from = sum_v
)
wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024_theft$v_2022_2024 <- wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024_theft$`2024` - wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024_theft$`2022`
# not sure there's a significant movement below the 0 line
ggplot(data = wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024_theft) +
geom_point(mapping = aes(x = ChangeInAreaRatio, y = v_2022_2024)) +
scale_colour_viridis(option = "B") +
labs(x = "Proportion of Meshblock in Viewshed", y = "Victimisations 2024 minus victimisations 2022", title = 'For ANZSOC Division = Theft and Related Offences')
# no crime category stands out
################################################
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment