Multiple Regions

Author

Juan Fonseca

Based on the main analysis

library(sf)

Linking to GEOS 3.13.0, GDAL 3.10.1, PROJ 9.5.1; sf_use_s2() is TRUE

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4

── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(tmap)

NHS boundaries

boundaries for the NSH regions

CCG_boundaries <- geojsonsf::geojson_sf(
  "https://openprescribing.net/api/1.0/org_location/?org_type=ccg"
  ) |>
  st_transform(27700)

Loading the built-up areas

Reading the built-up areas data

builtup_bounds <- st_read("OS Open Built Up Areas.gpkg",
                          layer = "os_open_built_up_areas")

Reading layer `os_open_built_up_areas' from data source 
  `C:\Users\ts18jpf\OneDrive - University of Leeds\03_PhD\00_Misc_projects\Eng-Presc-Data\OS Open Built Up Areas.gpkg' 
  using driver `GPKG'
Simple feature collection with 8585 features and 7 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: 65300 ymin: 10000 xmax: 655625 ymax: 1177650
Projected CRS: OSGB36 / British National Grid

Identifying the largest built-up areas within each region

all_BA_selected <- do.call(
  bind_rows,
  lapply(CCG_boundaries$code,
         \(t_code){
           
           t_nhs <- CCG_boundaries[CCG_boundaries$code==t_code,]
           t_nhs_buffered <- st_buffer(t_nhs,dist = 2e3)               
                          
           # Subsetting the biggest built-up NHS area
           builtup_bounds[t_nhs,
                          ][t_nhs_buffered,
                            op = st_within] |>
             slice_max(geometry_area_m) |>
                        mutate(
                          org_code  = t_code) 
  }))

A quick check of the largest built-up areas within each NHS region

tmap_mode("view")

ℹ tmap mode set to "view".

tm_shape(CCG_boundaries)+
  tm_polygons(border.col = "black",col = "white",alpha = 0.2)+
  tm_shape(all_BA_selected)+
  tm_fill("dodgerblue3")


── tmap v3 code detected ───────────────────────────────────────────────────────
[v3->v4] `tm_polygons()`: use 'fill' for the fill color of polygons/symbols
(instead of 'col'), and 'col' for the outlines (instead of 'border.col').[v3->v4] `tm_polygons()`: use `fill_alpha` instead of `alpha`.

Running the analisys for all regions

all_saba_raw <- do.call(
  bind_rows,
  lapply(CCG_boundaries$code, \(t_code) {
    read_csv(
      paste0(
        "https://openprescribing.net/api/1.0/measure_by_practice/?format=csv&org=",
        t_code,
        "&parent_org_type=ccg&measure=saba"
      ),
      col_types = cols(
        measure = col_character(),
        org_type = col_character(),
        org_id = col_character(),
        org_name = col_character(),
        date = col_date(format = ""),
        numerator = col_double(),
        denominator = col_double(),
        calc_value = col_double(),
        percentile = col_double()
      )
    )
  }))

all_practices_raw <- do.call(
  bind_rows,
  lapply(CCG_boundaries$code, \(t_code) {
    geojsonsf::geojson_sf(
      paste0("https://openprescribing.net/api/1.0/org_location/?q=",
         t_code)) |>
      st_transform(27700) |> 
      mutate(par_code = t_code)
  }))

Overall trends

Identifying the practices without all records

count_records <- all_saba_raw |>
        drop_na() |>
        summarise(n_reports = n(), .by = org_id)

Plotting the distribution of records per practice

ggplot(count_records,
       aes(n_reports))+
  stat_ecdf()+
  scale_y_continuous(labels = scales::label_percent())

Extracting the IDs of the practices with all records

practices_complete <- count_records |>
  filter(n_reports == max(n_reports)) |> 
  pull(org_id)

Calculating overall trend using the prectices with complete records

overall_saba <- all_saba_raw |>
  filter(org_id %in% practices_complete) |> 
  summarise(across(numerator:denominator, sum), .by = c(date)) |>
        mutate(calc_value = numerator / denominator)


# Defining function for month delta
diff_month <- function(start, end) {
  length(seq(from = start, to = end, by = 'month')) - 1
}

# Doing all the month transforming for fitting a model
start_month <- min(overall_saba$date)
overall_saba$month <- vapply(overall_saba$date, \(x) {
  diff_month(start_month, x)
}, numeric(1))

Averaged time series

overall_saba |> 
ggplot(aes(x = date, y = calc_value))+
  geom_line(col = "dodgerblue2", alpha = 0.3)+
  # geom_smooth(method = "lm",se = F,
  #             col = "dodgerblue4",
  #             linewidth = 1)+
  scale_y_continuous(labels = scales::label_percent())+
  theme_minimal()

Average linear trend

overall_saba |> 
ggplot(aes(x = date, y = calc_value))+
  geom_line(col = "dodgerblue2", alpha = 0.3)+
  geom_smooth(method = "lm",se = F,
              col = "dodgerblue4",
              linewidth = 1)+
  scale_y_continuous(labels = scales::label_percent())+
  theme_minimal()

`geom_smooth()` using formula = 'y ~ x'

Time series analysis using `timetk`

library(timetk)
anomalised_saba <- overall_saba |> anomalize(date,calc_value)

frequency = 12 observations per 1 year

trend = 12 observations per 1 year

Decomposing the time series

anomalised_saba |>
plot_anomalies_decomp(
.date_var = date,
.interactive = FALSE
)

Showing anomalies

anomalised_saba |>
  plot_anomalies(date,
                 .interactive = F)

Comparing NHS regions

Summarising trends for the major built-up area in each region

all_ccg_summarised <- do.call(
  bind_rows,
  lapply(
    CCG_boundaries$code,
    \(t_code){
      # Subsetting the biggest built-up NHS area
      main_BA <- all_BA_selected[all_BA_selected$org_code == t_code, ]
      
      # Extracting the practices for that NHS area
      Practices <- all_practices_raw[all_practices_raw$par_code == t_code, ][main_BA, ]
      
      # Extracting the SABA results for that area
      saba <- all_saba_raw[all_saba_raw$org_id %in% Practices$code, ]
      # # Identifying the practices with more than 10 records
      # ids_to_include <- saba |>
      #   drop_na() |>
      #   summarise(n_reports = n(), .by = org_id) |>
      #   arrange(n_reports) |>
      #   filter(n_reports > 10) |>
      #   pull(org_id)
      
      # Defining function for month delta
      diff_month <- function(start, end) {
        length(seq(from = start, to = end, by = 'month')) - 1
      }
      
      # Doing all the month transforming for fitting a model
      start_month <- min(saba$date)
      saba$month <- vapply(saba$date, \(x) {
        diff_month(start_month, x)
      }, numeric(1))
      
      # Identifying the practices within the main built-up area
      city_practices <- Practices[main_BA, ] |> pull(code)
      
      # Producing a clean SABA dataset
      # (no outliers nor practices out of main urban area)
      
      clean_data <- saba |>
        filter(org_id %in% practices_complete, org_id %in% city_practices)
      
      clean_data |>
        summarise(across(numerator:denominator, sum), .by = c(date, month)) |>
        mutate(calc_value = numerator / denominator, org_code  = t_code)
    }
  )
)

Joining with the CCG table to obtain the names

all_data_names <- all_ccg_summarised |>
  left_join(CCG_boundaries |>
              st_drop_geometry() |>
              select(name,code),by = c("org_code"="code"))

A quick visualisation of the linear trends of Bradford and Liverpool

all_data_names|> 
  ggplot(aes(x = date,y = calc_value,group = org_code))+
  geom_line(col = "dodgerblue2",alpha = 0.3)+
  geom_smooth(data =all_data_names |> 
                filter(org_code %in% c("36J","99A")),
              aes(col = name,fill = name),
              method = "lm",
              # se = F,
              # col = "dodgerblue4",
              linewidth = 1)+
  theme_minimal()+
  theme(legend.position = "bottom")

`geom_smooth()` using formula = 'y ~ x'

Processing Built-up areas with CAZ

CAZ_lst <- read_csv("CAZ_list.csv") |> select(Area, Type)

Rows: 14 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): CAZ, Area, Type, file
dbl (1): Start

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Sub-setting the built-up areas with CAZ zones

CAZ_BA <- builtup_bounds |> semi_join(CAZ_lst,by=c("gsscode"="Area"))

all_caz_summarised <- do.call(
  bind_rows,
  lapply(
    CAZ_BA$gsscode,
    \(t_code){
      # Subsetting the biggest built-up NHS area
      main_BA <- CAZ_BA[CAZ_BA$gsscode == t_code, ]
      
      # Extracting the practices for that NHS area
      Practices <- all_practices_raw[main_BA, ]
      
      # Extracting the SABA results for that area
      saba <- all_saba_raw[all_saba_raw$org_id %in% Practices$code, ]
      
      # Defining function for month delta
      diff_month <- function(start, end) {
        length(seq(from = start, to = end, by = 'month')) - 1
      }
      
      # Doing all the month transforming for fitting a model
      start_month <- min(saba$date)
      saba$month <- vapply(saba$date, \(x) {
        diff_month(start_month, x)
      }, numeric(1))
      
      
      # Producing a clean SABA dataset
      # (no outliers nor practices out of main urban area)
      
      clean_data <- saba |>
        filter(org_id %in% practices_complete)
      
      clean_data |>
        summarise(across(numerator:denominator, sum), .by = c(date, month)) |>
        mutate(calc_value = numerator / denominator,
               name  = CAZ_BA$name1_text[CAZ_BA$gsscode == t_code],
               type = CAZ_lst$Type[CAZ_lst$Area==t_code])
    }
  )
)