Skip to content

Instantly share code, notes, and snippets.

@bohdanszymanik
Last active July 31, 2025 02:04
Show Gist options
  • Save bohdanszymanik/f8db6cb2f286cec27390691d5e066ea7 to your computer and use it in GitHub Desktop.
Save bohdanszymanik/f8db6cb2f286cec27390691d5e066ea7 to your computer and use it in GitHub Desktop.
Analysis of the impact of publicly monitored CCTV on Wellington crime
# Core stack: tidyverse for data wrangling/plotting, sf for spatial features
library(tidyverse)
library(sf)
# quick aside... testing out sql querying in R - never done this before
# works well
# library(sqldf)
# VARB <- read_csv("C:\\Users\\...\\VictimisationsAgeROVBoundary.csv")
# slice <- sqldf("select * from VARB limit 10")
###################################################################
# comparing monitored street areas in meshblocks with victimisation
library(remotes)
# Fix: only install esri2sf when it is missing - an unconditional
# install_github() re-downloads and rebuilds the package on every run.
if (!requireNamespace("esri2sf", quietly = TRUE)) {
  install_github("yonghah/esri2sf")
}
library("esri2sf")
# Wellington CCTV viewshed polygons (ArcGIS feature service)
wgtn_cctv_url <- "https://services1.arcgis.com/CPYspmTk3abe6d7i/arcgis/rest/services/CCTV_Viewsheds_(View_layer)/FeatureServer/0"
wgtn_cctv <- esri2sf(wgtn_cctv_url)
# Lower Hutt CCTV camera locations (fetched but not used further below)
hv_cctv_url <- "https://maps.huttcity.govt.nz/hosting/rest/services/Hosted/CCTV_Camera_Location/FeatureServer/1"
hv_cctv <- esri2sf(hv_cctv_url)
# at this stage I guess... I could either extract and load into ArcGIS, or load in the MB2013 areas and do a tally up
# of cctv locations/meshblock
# might do both
glimpse(wgtn_cctv)
# I want to look at the geometry column - normal glimpse won't show me enough
sf::st_as_text(wgtn_cctv$geoms) # this works
# Fix: a constant colour belongs outside aes(). aes(colour = "red") maps the
# literal string "red" as data - you get the default palette colour plus a
# spurious legend, not red outlines.
ggplot(wgtn_cctv) +
geom_sf(colour = "red") +
coord_sf(xlim = c(174.77, 174.78), ylim = c(-41.275, -41.30)) +
geom_sf_label(aes(label = Camera_Name), size = 2)
# I think I should look at the monitored areas as a fraction of
# the meshblock area. This isn't great actually - the crime is probably
# largely happening in the streets and some meshblocks will have
# a lot of street area, others very little.
# load in the meshblocks 2013 for wellington - it's already been cropped down to just
# the wellington region using mapshaper.org first
# NOTE(review): hard-coded local Windows path - rerunning elsewhere needs this edited
mb2013_wgtn <- st_read("C:\\Users\\...\\wd\\Areas\\statsnz-meshblock-2013-SHP-wgtn\\meshblock-2013.shp")
plot(mb2013_wgtn) # lots of columns to colourise
# let's just choose one eg LandAreaSQ
ggplot(data = mb2013_wgtn) +
geom_sf(aes(fill = LandAreaSQ)) +
theme_minimal()
# ahhh... a performance problem. This operation takes a couple few minutes to complete, we need to figure out how to
# parallelize these st_* operations
# Row-wise erase of every camera viewshed polygon from every meshblock
erased <- st_difference(mb2013_wgtn, wgtn_cctv)
glimpse(erased)
# this won't plot - I think ggplot gives up due to the complexity over the total area being plotted
# Fix: the chain previously ended at coord_sf(), leaving theme_minimal() as a
# stand-alone no-op expression on the next line; join it with "+".
ggplot(data = erased) +
geom_sf() +
coord_sf(xlim = c(174.773, 174.777), ylim = c(-41.2815, -41.2855)) +
theme_minimal()
# if I make the overall area smaller with less meshblocks then it works, here I concentrate on just the intersection from willis st to lambton quay
glimpse(erased |> filter(MeshblockN %in% c('2124400', '2125700', '2126200')))
# why so many rows?? because wgtn_cctv has many overlaying geometries - multiple viewsheds from nearby cameras
# this time it plots ok
# NOTE(review): meshblock IDs are matched as character strings - assumes
# MeshblockN is a character column; verify against the shapefile schema
ggplot(data = erased |> filter(MeshblockN %in% c('2124400', '2125700', '2126200', '2125900', '2126502', '2127200', '2126600', '2127300', '2125600', '2124000', '2125300'))) +
geom_sf() +
theme_minimal()
# it appears we have some LandAreaSQ measures that don't tally with what is calculated by st_area() so let's add a calculated area
# st_area() returns a units vector; the unit depends on the layer's CRS
# (presumably m^2 here - confirm with st_crs(mb2013_wgtn))
mb2013_wgtn$LandAreaSQ_calc <- mb2013_wgtn |> st_area()
# I've just noticed that the order of geom_sf to coord_sf is important - I guess
# that figures - the operations here are additive
ggplot(data = mb2013_wgtn) +
geom_sf() +
coord_sf(xlim = c(174.772, 174.778), ylim = c(-41.281, -41.285)) +
theme_minimal()
# we can erase the cctv viewsheds from the meshblocks
erase_viewsheds_from_mbs <- st_difference(mb2013_wgtn, wgtn_cctv)
# or we can erase the meshblocks from the viewsheds - we don't want that
# erase_mbs_from_viewsheds <- st_difference(wgtn_cctv, mb2013_wgtn)
# hmmm - this doesn't work - I think it's due to the overlying viewsheds
# (the commented-out coord_sf line below is skipped by the parser, so the
# trailing "+" still joins geom_sf() to theme_minimal())
ggplot(data = erase_viewsheds_from_mbs) +
geom_sf() +
# coord_sf(xlim = c(174.772, 174.778), ylim = c(-41.281, -41.285)) +
theme_minimal()
# and notice when you run this that the viewsheds consist of many overlaying geometries
# Fix: erase_mbs_from_viewsheds only existed in a commented-out line further
# up, so this plot errored with "object 'erase_mbs_from_viewsheds' not found";
# compute it here before plotting.
erase_mbs_from_viewsheds <- st_difference(wgtn_cctv, mb2013_wgtn)
ggplot(data = erase_mbs_from_viewsheds) +
geom_sf() +
coord_sf(xlim = c(174.772, 174.778), ylim = c(-41.281, -41.285)) +
theme_minimal()
# let's union together all those overlaying viewsheds to simplify the geometry
# st_union() dissolves the overlapping per-camera viewshed polygons into a
# single combined geometry, so the later difference is done once, not per camera
wgtn_cctv_union <- wgtn_cctv |> st_union()
ggplot(data = wgtn_cctv_union) +
geom_sf() +
theme_minimal()
# this works much better!!!
# NOTE(review): mb2013_wgtn_small mentioned below is not defined in this file
wgtn_mbs_less_viewsheds <- st_difference(mb2013_wgtn, wgtn_cctv_union) # a tiny bit faster with mb2013_wgtn_small
ggplot(data = wgtn_mbs_less_viewsheds) +
geom_sf() +
theme_minimal()
#################################################################
# Next task is to figure out by how much the area of a meshblock has decreased as a result of
# the viewshed - usefully, the meshblocks contain a km^2 area - but without units
# st_area() generates areas but with units - so let's assign some units
# hmmm, I think there's something odd about some of those LandAreaSQ quantities - I'm going to add a calculated column to compare against
library(units)
# Tag the shapefile's unitless LandAreaSQ column as km^2 for comparison
wgtn_mbs_less_viewsheds$LandAreaSQ_km2 <- set_units(wgtn_mbs_less_viewsheds$LandAreaSQ, km^2)
# Remaining area of each meshblock after the viewshed union has been erased
wgtn_mbs_less_viewsheds$AreaMinusViewshed <- st_area(wgtn_mbs_less_viewsheds)
wgtn_mbs_less_viewsheds$AreaMinusViewshed_km2 <- set_units(st_area(wgtn_mbs_less_viewsheds), km^2)
# this doesn't give sensible answers because some of the LandAreaSQ values aren't right
#wgtn_mbs_less_viewsheds$ChangeInArea <- 1 - drop_units(wgtn_mbs_less_viewsheds$AreaMinusViewshed_km2 / wgtn_mbs_less_viewsheds$LandAreaSQ_km2)
glimpse(wgtn_mbs_less_viewsheds)
# Absolute area removed by the viewshed, based on the recalculated area
wgtn_mbs_less_viewsheds$ChangeInArea <- wgtn_mbs_less_viewsheds$LandAreaSQ_calc - wgtn_mbs_less_viewsheds$AreaMinusViewshed
# Fraction of the meshblock covered by camera viewsheds (0 = none, 1 = fully)
wgtn_mbs_less_viewsheds$ChangeInAreaRatio <- 1 - drop_units(wgtn_mbs_less_viewsheds$AreaMinusViewshed / wgtn_mbs_less_viewsheds$LandAreaSQ_calc)
wgtn_mbs_less_viewsheds |> mutate (ChangeInArea = drop_units(ChangeInArea)) |> filter(ChangeInArea > 1 )
wgtn_mbs_less_viewsheds |> filter(ChangeInAreaRatio > 0.1)
#################################################################
# and now need to join to the crime data
# I have two choices for the victimisation data
# the first choice is retail specific obtained through OIA
v <- read_csv("C:\\Users\\...\\wd\\RetailVictimisation\\v.csv")
glimpse(v)
# the second is public - the Victimisations Time and Place data set on https://www.police.govt.nz/about-us/publications-statistics/data-and-statistics/policedatanz
# I think actually that second data set might be better, it excludes victimisations occurring in dwellings but we're interested
# not in dwellings but crime on the street and retail crime
# Read Meshblock as character so it can join against the (character)
# MeshblockN key later without losing any leading zeros
vtp <- read_csv("C:\\Users\\...\\wd\\VictimisationByMeshblock\\vtp_jul2025.csv",
col_types = cols(
`Meshblock` = col_character()
))
glimpse(vtp)
# I could try and assign the datetime of columns on the import but... unfortunately there
# are a few rows where the occurrence datetime is not like "%d%b%Y %H:%M:%S" - just hours:mins unfortunately
# v <- read_csv("C:\\Users\\...\\wd\\RetailVictimisation\\v.csv",
# col_types = cols(
# occurrence_datetime = col_datetime("%d%b%Y %H:%M:%S"),
# reported_date = col_date("%d%b%Y")
# ))
# so maybe better to read in and then create a new column for a clean date that we calculate based on both
# occurred and reported dates
# lubridate can handle multiple datetime formats with parse_date_time()
# Fix: attach lubridate before its first use - it was previously loaded only
# after the parse_date_time() calls below (which only worked if the installed
# tidyverse version attaches lubridate as a core package).
library(lubridate)
v$parsed_occurrence_datetime <- parse_date_time(v$occurrence_datetime, orders = c("%d/%m/%Y %H:%M", "%d%b%Y %H:%M:%S", "%d%b%Y %H:%M"))
v$parsed_reported_date <- parse_date_time(v$reported_date, orders = c("%d/%m/%Y", "%d%b%Y"))
glimpse(v)
# turns out that in some of the victimisation data we have some occurrence datetimes that are plainly wrong - either last century or beyond! or in the future!!
# peek at the 20 earliest and 20 latest raw occurrence datetimes
arrange(v, occurrence_datetime) |> slice_head(n = 20) |> select(occurrence_datetime)
v |> arrange(desc(occurrence_datetime)) |> slice_head(n = 20) |> select(occurrence_datetime)
# so best apply some logic to generate a clean occurrence date
#' Pick a plausible occurrence date.
#'
#' Returns date1 unless its year falls outside [min_year, current year], in
#' which case date2 (the reported date) is used instead. NA dates stay NA.
#'
#' @param date1 primary date(s) - coerced with as.Date()
#' @param date2 fallback date(s) - recycled to the length of date1
#' @param min_year earliest believable year (default 2019, as before)
#' @return a Date vector the same length as date1
#'
#' Fixes vs the original: a stray bare `return` on its own line merely
#' evaluated the `return` function object (a no-op) rather than returning,
#' and the 2019 cutoff was hard-coded; it is now the min_year parameter.
#' Implemented in base R (keeps the Date class, unlike base ifelse()).
invalid_year_picker <- function(date1, date2, min_year = 2019) {
  d1 <- as.Date(date1)
  d2 <- rep(as.Date(date2), length.out = length(d1))  # recycle fallback
  yr <- as.integer(format(d1, "%Y"))
  this_year <- as.integer(format(Sys.Date(), "%Y"))
  bad <- !is.na(yr) & (yr < min_year | yr > this_year)
  out <- d1
  out[bad] <- d2[bad]
  out
}
# quick smoke test: 2016 is before the cutoff, so the fallback is returned
invalid_year_picker(as.Date("2016-01-01"), as.Date("2020-01-01"))
# Clean occurrence date: fall back to the reported date when the parsed
# occurrence year is implausible
v <- v |>
mutate(clean_occurrence_date = invalid_year_picker(parsed_occurrence_datetime, parsed_reported_date))
glimpse(v)
# I want to aggregate the victimisation data up to year and meshblock
# still not sure which dataset I should use - maybe use v for the moment, and then swap to vtp
# NOTE: summarise() drops the last grouping level, so the result is still
# grouped by year - fine for the filters/pivots below
v_year_mb <-
v |>
group_by(year = year(clean_occurrence_date), meshblock_2013_census) |>
summarise( sum_v = sum(victimisations))
# so joining on v.meshblock_2013_census = wgtn_mbs_less_viewsheds.MeshblockN
# and I don't need all the columns in wgtn_mbs_less_viewsheds so I'll filter that
# (select() on an sf object keeps the geometry column implicitly)
wgtn_mbs_less_viewsheds_v <-
wgtn_mbs_less_viewsheds |>
select(MeshblockN, ChangeInArea, ChangeInAreaRatio, AreaMinusViewshed, LandAreaSQ_calc ) |>
left_join(v_year_mb, join_by(MeshblockN == meshblock_2013_census))
glimpse(wgtn_mbs_less_viewsheds_v)
# now I can compare the two extreme ends of our data eg in the v dataset from 2019 to 2024 (since we're interested in full years)
# want to view victimisations cf change in area cf 2019 versus 2024
# Fix: only install viridis when it is missing - an unconditional
# install.packages() hits CRAN on every run of the script.
if (!requireNamespace("viridis", quietly = TRUE)) {
  install.packages("viridis")
}
library(viridis)
ggplot(data = wgtn_mbs_less_viewsheds_v |> filter(year %in% c(2019, 2020, 2021, 2022, 2023, 2024))) +
geom_point(mapping = aes(x = ChangeInArea, y = sum_v, colour = year)) +
scale_colour_viridis(option = "B")
wgtn_mbs_less_viewsheds_v |> filter(year %in% c(2019, 2024))
# I need to pivot_wider wgtn_mbs_less_viewsheds_v for the two years 2019 and 2024 to get the difference in sum_v
# (comment previously said 2020; the code compares 2019 with 2024)
wgtn_mbs_less_viewsheds_v_2019_vrs_2024 <-
wgtn_mbs_less_viewsheds_v |> filter(year %in% c(2019, 2024)) |>
pivot_wider(
names_from = year,
values_from = sum_v
)
# positive v_2019_2024 means 2024 was worse than 2019 for that meshblock
wgtn_mbs_less_viewsheds_v_2019_vrs_2024$v_2019_2024 <- wgtn_mbs_less_viewsheds_v_2019_vrs_2024$`2024` - wgtn_mbs_less_viewsheds_v_2019_vrs_2024$`2019`
ggplot(data = wgtn_mbs_less_viewsheds_v_2019_vrs_2024) +
geom_point(mapping = aes(x = ChangeInAreaRatio, y = v_2019_2024)) +
scale_colour_viridis(option = "B") +
labs(x = "Proportion of Meshblock in Viewshed", y = "Victimisations 2024 minus victimisations 2019")
# let's include the ANZSOC Division this time and use it as a colour aesthetic
# Same aggregation as v_year_mb but with anzsoc_division as an extra group
v_year_mb_anzsocdivision <-
v |>
group_by(year = year(clean_occurrence_date), meshblock_2013_census, anzsoc_division) |>
summarise( sum_v = sum(victimisations))
wgtn_mbs_less_viewsheds_v_anzsocdivision <-
wgtn_mbs_less_viewsheds |>
select(MeshblockN, ChangeInArea, ChangeInAreaRatio, AreaMinusViewshed, LandAreaSQ_calc ) |>
left_join(v_year_mb_anzsocdivision, join_by(MeshblockN == meshblock_2013_census))
# Widen so 2019 and 2024 victimisation counts sit in separate columns
wgtn_mbs_less_viewsheds_v_anzsocdiv_2019_vrs_2024 <-
wgtn_mbs_less_viewsheds_v_anzsocdivision |> filter(year %in% c(2019, 2024)) |>
pivot_wider(
names_from = year,
values_from = sum_v
)
wgtn_mbs_less_viewsheds_v_anzsocdiv_2019_vrs_2024$v_2019_2024 <- wgtn_mbs_less_viewsheds_v_anzsocdiv_2019_vrs_2024$`2024` - wgtn_mbs_less_viewsheds_v_anzsocdiv_2019_vrs_2024$`2019`
ggplot(data = wgtn_mbs_less_viewsheds_v_anzsocdiv_2019_vrs_2024) +
geom_point(mapping = aes(x = ChangeInAreaRatio, y = v_2019_2024, colour = anzsoc_division)) +
labs(x = "Proportion of Meshblock in Viewshed", y = "Victimisations 2024 minus victimisations 2019")
# now I want to
# 1. repeat the exercise but use the vtp dataset - I think this might better highlight street level crime
# 2. aggregate for the change in area - maybe 0-0.2, 0.2-0.4 ... 0.8-1.0
# 3. figure out what are the meshblocks that show the largest change in area
###############################################################################
# just quickly - what are these meshblocks that have a large change in area
# (printed to console, most-covered meshblocks first; not assigned)
wgtn_mbs_less_viewsheds |>
select(MeshblockN, TLAName, AreaUnitNa, LandAreaSQ_calc, AreaMinusViewshed, ChangeInArea, ChangeInAreaRatio) |>
arrange(desc(ChangeInAreaRatio))
# that's bollocks! the LandAreaSQ_km2 just can't be right - have added a calculated area and recalculating the areaminusviewshed
########################
# vtp dataset version
library(stringr)
# `Month Year` looks like "June 2021"; the last 4 characters are the year
vtp$year <- as.numeric( vtp$`Month Year` |> str_sub(-4) )
glimpse(vtp)
vtp |>select(`ANZSOC Division`) |> distinct()
# Aggregate vtp victimisations to year x meshblock
vtp_year_mb <-
vtp |>
group_by(year, meshblock = Meshblock) |>
summarise( sum_v = sum(Victimisations))
glimpse(vtp_year_mb)
wgtn_mbs_less_viewsheds_vtp <-
wgtn_mbs_less_viewsheds |>
select(MeshblockN, ChangeInArea, ChangeInAreaRatio, AreaMinusViewshed, LandAreaSQ_calc) |>
left_join(vtp_year_mb, join_by(MeshblockN == meshblock))
# Fix: glimpse the data frame just built (the vtp join) - the original
# glimpsed the earlier v-based join, a copy/paste slip.
glimpse(wgtn_mbs_less_viewsheds_vtp)
# the vtp data only goes back to june 2021 - that sux as it's impacted by covid and it's a midyear value
wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024 <-
wgtn_mbs_less_viewsheds_vtp |> filter(year %in% c(2022, 2024)) |>
pivot_wider(
names_from = year,
values_from = sum_v
)
# positive v_2022_2024 means 2024 was worse than 2022 for that meshblock
wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024$v_2022_2024 <- wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024$`2024` - wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024$`2022`
# not sure there's a significant movement below the 0 line
# NOTE: scale_colour_viridis has no effect here - no colour aesthetic is mapped
ggplot(data = wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024) +
geom_point(mapping = aes(x = ChangeInAreaRatio, y = v_2022_2024)) +
scale_colour_viridis(option = "B") +
labs(x = "Proportion of Meshblock in Viewshed", y = "Victimisations 2024 minus victimisations 2022")
## what happens if I just look at ANZSOC Division = 'Theft and Related Offences'
# Same year x meshblock aggregation, restricted to theft offences
vtp_year_mb_theft <-
vtp |>
filter(`ANZSOC Division` == 'Theft and Related Offences') |>
group_by(year, meshblock = Meshblock) |>
summarise( sum_v = sum(Victimisations))
glimpse(vtp_year_mb_theft)
wgtn_mbs_less_viewsheds_vtp_theft <-
wgtn_mbs_less_viewsheds |>
select(MeshblockN, ChangeInArea, ChangeInAreaRatio, AreaMinusViewshed, LandAreaSQ_calc) |>
left_join(vtp_year_mb_theft, join_by(MeshblockN == meshblock))
glimpse(wgtn_mbs_less_viewsheds_vtp_theft)
# the vtp data only goes back to june 2021 - that sux as it's impacted by covid and it's a midyear value
wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024_theft <-
wgtn_mbs_less_viewsheds_vtp_theft |>
filter(year %in% c(2022, 2024)) |>
pivot_wider(
names_from = year,
values_from = sum_v
)
wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024_theft$v_2022_2024 <- wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024_theft$`2024` - wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024_theft$`2022`
# not sure there's a significant movement below the 0 line
ggplot(data = wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024_theft) +
geom_point(mapping = aes(x = ChangeInAreaRatio, y = v_2022_2024)) +
scale_colour_viridis(option = "B") +
labs(x = "Proportion of Meshblock in Viewshed", y = "Victimisations 2024 minus victimisations 2022", title = 'For ANZSOC Division = Theft and Related Offences')
###########################################
## why not just using the ANZSOC Division as an aesthetic to colour the points?
# NOTE: this redefines vtp_year_mb and wgtn_mbs_less_viewsheds_vtp from above,
# now with anzsoc_division as an extra grouping column
vtp_year_mb <-
vtp |>
group_by(year, meshblock = Meshblock, anzsoc_division = `ANZSOC Division`) |>
summarise( sum_v = sum(Victimisations))
glimpse(vtp_year_mb)
wgtn_mbs_less_viewsheds_vtp <-
wgtn_mbs_less_viewsheds |>
select(MeshblockN, ChangeInArea, ChangeInAreaRatio, AreaMinusViewshed, LandAreaSQ_calc) |>
left_join(vtp_year_mb, join_by(MeshblockN == meshblock))
glimpse(wgtn_mbs_less_viewsheds_vtp)
# the vtp data only goes back to june 2021 - that sux as it's impacted by covid and it's a midyear value
wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024 <-
wgtn_mbs_less_viewsheds_vtp |> filter(year %in% c(2022, 2024)) |>
pivot_wider(
names_from = year,
values_from = sum_v
)
wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024$v_2022_2024 <- wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024$`2024` - wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024$`2022`
# not sure there's a significant movement below the 0 line
ggplot(data = wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024) +
geom_point(mapping = aes(x = ChangeInAreaRatio, y = v_2022_2024, colour = anzsoc_division)) +
labs(x = "Proportion of Meshblock in Viewshed", y = "Victimisations 2024 minus victimisations 2022")
### todo - I could simply count the below the line versus above the line victimisations for ChangeInAreaRatio > some amount eg 0.1
# or maybe bracket it between values
# 2x2 summary: did the meshblock improve (fewer victimisations in 2024 than
# 2022) crossed with whether cameras cover more than 20% of it
outcome_vtp <-
wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024 |>
group_by(Improved = v_2022_2024 < 0, CameraImpact = ChangeInAreaRatio > 0.2) |>
summarise(sum_sum_v = sum(v_2022_2024))
# Got better Got worse
# Camera impact -... ...
# No camera impact -... ...
# So, basically...
# Areas under camera impact had twice the victimisations in 2024 versus 2022
# While areas without camera impact had about 1.5x worse
# comparing back to the retail victimisation data obtained through data sharing
# (lower CameraImpact threshold of 5% here)
outcome_v <-
wgtn_mbs_less_viewsheds_v_2019_vrs_2024 |>
group_by(Improved = v_2019_2024 < 0, CameraImpact = ChangeInAreaRatio > 0.05) |>
summarise(sum_sum_v = sum(v_2019_2024))
# gives
# Got better Got worse
# Camera impact -... ...
# No camera impact -... ...
outcome_v_anzsocd <-
wgtn_mbs_less_viewsheds_v_anzsocdiv_2019_vrs_2024 |>
group_by(Improved = v_2019_2024 < 0, CameraImpact = ChangeInAreaRatio > 0.05, anzsoc_division) |>
summarise(sum_sum_v = sum(v_2019_2024))
# gives for 'Theft'
# Got better Got worse
# Camera impact -... ...
# No camera impact -... ...
# why don't we try this a different way? How about just the victimisations in 2019 compared to 2024 for the two cases of camera impact?
outcome_v1 <-
wgtn_mbs_less_viewsheds_v |>
filter(year %in% c(2019, 2024)) |>
group_by(CameraImpact = ChangeInAreaRatio > 0.05, year) |>
summarise(sum_sum_v = sum(sum_v))
# I think this is easier to understand
# 2019 2024 Percentage Increase
# Camera Impact ... ... ...
# No camera impact ... ... ...%
# Now we can repeat but for ANZSOC Division
outcome_v_abs_anzsocd <-
wgtn_mbs_less_viewsheds_v_anzsocdivision |>
filter(year %in% c(2019, 2024)) |>
group_by(CameraImpact = ChangeInAreaRatio > 0.05, year, anzsoc_division) |>
summarise(sum_sum_v = sum(sum_v))
# NOTE(review): this reassigns outcome_v, discarding the Improved breakdown
# computed above - confirm the overwrite is intentional
outcome_v <-
wgtn_mbs_less_viewsheds_v_2019_vrs_2024 |>
group_by(CameraImpact = ChangeInAreaRatio > 0.05) |>
summarise(sum_sum_v = sum(v_2019_2024))
# todo - double check if 2022 to 2024 makes for a sensible time range
## what happens if I just look at ANZSOC Division = 'Theft and Related Offences'
vtp_year_mb_theft <-
vtp |>
filter(`ANZSOC Division` == 'Theft and Related Offences') |>
group_by(year, meshblock = Meshblock) |>
summarise( sum_v = sum(Victimisations))
glimpse(vtp_year_mb_theft)
wgtn_mbs_less_viewsheds_vtp_theft <-
wgtn_mbs_less_viewsheds |>
# Fix: keep ChangeInAreaRatio through the select() - the plot below maps
# x = ChangeInAreaRatio and errored once the column had been dropped. Also
# select LandAreaSQ_calc, consistent with the earlier theft block, instead
# of the suspect LandAreaSQ_km2.
select(MeshblockN, ChangeInArea, ChangeInAreaRatio, AreaMinusViewshed, LandAreaSQ_calc) |>
left_join(vtp_year_mb_theft, join_by(MeshblockN == meshblock))
glimpse(wgtn_mbs_less_viewsheds_vtp_theft)
# the vtp data only goes back to june 2021 - that sux as it's impacted by covid and it's a midyear value
wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024_theft <-
wgtn_mbs_less_viewsheds_vtp_theft |>
filter(year %in% c(2022, 2024)) |>
pivot_wider(
names_from = year,
values_from = sum_v
)
wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024_theft$v_2022_2024 <- wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024_theft$`2024` - wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024_theft$`2022`
# not sure there's a significant movement below the 0 line
ggplot(data = wgtn_mbs_less_viewsheds_vtp_2022_vrs_2024_theft) +
geom_point(mapping = aes(x = ChangeInAreaRatio, y = v_2022_2024)) +
scale_colour_viridis(option = "B") +
labs(x = "Proportion of Meshblock in Viewshed", y = "Victimisations 2024 minus victimisations 2022", title = 'For ANZSOC Division = Theft and Related Offences')
# no crime category stands out
################################################
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment