Model eel abundance in the Vilaine bassin

Summary

In this notebook, using ASPE data about eel abundance in the Vilaine bassin, I explore

  • the package INLA to fit bayesian spatio-temporal models
  • how to model temporal autocorrelation (AR, RW)
  • how to model spatial autocorrelation (spde, besag)

0. Setup

Load dependencies and targets.

library(renv)
renv::load()
renv::restore() # Install missing librairies.
renv::status() # Check the project state.

library(targets)
library(here)
tar_config_set(
  store = here::here("_targets")
)
library(sf)
library(sfnetworks)
library(dplyr)
library(tidyr)
library(ggplot2)
set_theme(theme_minimal())
options(
    ggplot2.continuous.colour = "viridis",
    ggplot2.continuous.fill = "viridis"
)
library(terra)
library(INLA)
library(bayestestR)
library(fmesher)
library(gstat)
library(tidygraph)
library(igraph)
library(mapview)
library(Matrix)

1. River data

First, we focus on the river geographic data. Let’s load it.

dir <- tar_read(workshop_unzipped)
base::load(here(dir, "processed_data", "reseaux_hydrographiques.rda"))

For now, we will focus on the Vilaine bassin. We can view it using sfnetworks.

net <- as_sfnetwork(rht_vilaine)
plot(net, cex = 0.5)

Caution

We can see that the hydrographic network is not entirely connected. For example, a small portion in the south west is disconnected from the rest of the bassin. We should keep this in mind for later, as it could bias our statistical model.

We want to check that the network data is valid. In particular, we want to verify that there is no duplicate in the nodes, or edges data.

nodes <- net |>
    activate("nodes") |>
    st_as_sf()
sum(duplicated(nodes))
[1] 0
edges <- net |>
    activate("edges") |>
    st_as_sf()
sum(duplicated(select(edges, c("from", "to"))))
[1] 0

We see that there is no duplication for the Vilaine network. We can proceed to the next step.

2. Environmental data

2.1 Load data

base::load(here(dir, "processed_data", "poissons_env_temp.RData"))

env
# A tibble: 5,777 × 14
   sta_id pop_id ope_id ope_date            annee distance_mer altitude
    <int>  <int>  <int> <dttm>              <dbl>        <int>    <int>
 1   9655  37720  22622 1996-09-24 10:09:00  1996           NA      305
 2   9225  35420  16755 1998-06-19 09:30:00  1998           NA       80
 3   9622  37544  16012 2000-05-22 14:45:00  2000          328       98
 4   9762  38357  17091 2006-09-28 14:00:00  2006          216       31
 5   9384  36157  17349 2006-06-21 14:30:00  2006          482      140
 6   9712  38066  17621 2016-09-08 10:30:00  2016          400      130
 7   9454  36580  17379 2010-09-14 15:00:00  2010          500      124
 8   9520  36941  17079 2007-06-14 14:30:00  2007          224       37
 9   9381  36135  17154 2006-06-20 00:00:00  2006          387      100
10   9284  35724  17596 2016-09-29 10:00:00  2016          212       32
# ℹ 5,767 more rows
# ℹ 7 more variables: surface_bv <dbl>, distance_source <dbl>, largeur <dbl>,
#   pente <dbl>, profondeur <dbl>, temp_juillet <dbl>, temp_janvier <dbl>

Let’s rename column for clarity.

env <- env |> rename(year = annee)

Station geographical coordinates are stored in points_geo_<river>. We visualize them over the hydrographic network.

plot(net, col = "black", cex = 0.5)
plot(points_geo_vilaine, col = "red", add = TRUE, pch = 16, cex = 0.5)

2.2. Visualisation of altitude data

To begin with, let’s focus on a single environment variable: the altitude.

d_env <- env |>
    select(c("sta_id", "pop_id", "altitude")) |>
    distinct()
d_env |> filter(is.na(altitude))
# A tibble: 2 × 3
  sta_id pop_id altitude
   <int>  <int>    <int>
1   8853  33379       NA
2  13994  51300       NA

We see that we have missing values for two stations: 8853 and 13994. Next, we want to relate altitude values to geographical coordinates.

pts_vilaine <- points_geo_vilaine |>
    select(c("pop_id", "geometry")) |>
    left_join(d_env, by = "pop_id") |>
    filter(!is.na(altitude))
pts_vilaine
Simple feature collection with 37 features and 3 fields
Geometry type: POINT
Dimension:     XY
Bounding box:  xmin: -2.965245 ymin: 47.46923 xmax: -1.051178 ymax: 48.3493
Geodetic CRS:  WGS 84
First 10 features:
   pop_id sta_id altitude                   geometry
1   46885  12253       25 POINT (-2.429369 47.71654)
2   46166  11854      176 POINT (-2.965245 48.32017)
3   47601  12434       11 POINT (-1.796815 47.46923)
4   46671  12118        9 POINT (-2.332695 47.76529)
5   47659  12447        3  POINT (-2.142875 47.5878)
6   47613  12436        5 POINT (-1.867647 47.70129)
7   46484  12007       41  POINT (-2.35679 48.01553)
8   47460  12407       48 POINT (-1.410521 47.71718)
9   47677  12451       16 POINT (-2.563233 47.59471)
10  47326  12380       30 POINT (-1.562385 48.03841)

We see that the altitude is not measured on every sampling points, but only in 37 locations. Let’s plot them.

ggplot() +
    geom_sf(data = points_geo_vilaine, color = "grey80", alpha = 0.5) +
    geom_sf(data = pts_vilaine, aes(color = altitude), size = 5) +
    labs(
        color = "Altitude (m)"
    )

We see in grey every location were we have sample of fish abundances, and in colour where we have altitude data. We have to interpolate altitude data for missing locations. The simplest way to do it is to the “nearest neighbour” interpolation.

2.3. Interpolation of altitude data

We first build proximity polygons with the terra::voronoi function.

d_elev <- select(pts_vilaine, c("geometry", "altitude"))
d_elev.vect <- terra::vect(d_elev)
v <- voronoi(d_elev.vect)
plot(v, "altitude")
points(d_elev.vect)

v.sf <- st_as_sf(v)
pred <- st_intersection(v.sf, points_geo_vilaine)
d_elev.nn <- select(pred, c("altitude", "pop_id", "geometry"))

ggplot() +
    geom_sf(data = pred, aes(color = altitude)) +
    geom_sf(data = pts_vilaine, aes(color = altitude), size = 5) +
    labs(
        color = "Altitude (m)"
    )

Predicted altitudes seem consistent with the nearest neighbour interpolation.

2.4. Cross-validation of altitude interpolation

Now, that we have a prediction pipeline working we will estimate its performance using cross-validation. To do so, we will do a LOOCV (Leave-One-Out Cross-Validation).

n <- nrow(d_elev)
preds <- numeric(n) # Vector filled with 0s.
for (i in 1:n) {
    train <- d_elev[-i, ]
    test <- d_elev[i, ]
    v_train <- voronoi(terra::vect(train))
    preds[i] <- st_intersection(st_as_sf(v_train), test)$altitude
}
d_elev$altitude.loo <- preds

ggplot(d_elev, aes(x = altitude, y = altitude.loo)) +
    geom_point() +
    geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
    labs(
        x = "Observed altitude (m)",
        y = "Predicted altitude (m)"
    )

rmse <- sqrt(mean((d_elev$altitude - d_elev$altitude.loo)^2))
rmse
[1] 18.74797

3. Abundance data

3.1. Load data

catches <- catches |>
    rename(
        species = esp_code_alternatif,
        year = annee,
        abundance = effectif
    )
catches
# A tibble: 69,823 × 6
   sta_id pop_id ope_id  year species abundance
    <int>  <int>  <int> <dbl> <chr>       <int>
 1   8530  31870  19943  2017 TRF           159
 2   8530  31870  19943  2017 VAI            82
 3   8530  31870  19944  2011 TRF           217
 4   8530  31870  19944  2011 VAI             7
 5   8530  31870  19945  2009 TRF           232
 6   8530  31870  19945  2009 VAI             9
 7   8530  31870  19946  2007 TRF           213
 8   8530  31870  19946  2007 VAI            36
 9   8530  31870  83029  2019 TRF           126
10   8530  31870  83029  2019 VAI           104
# ℹ 69,813 more rows
species_names <- unique(catches$species)

Next, let’s add zeros where species are not found.

catches <- catches |>
    pivot_wider(
        names_from = species,
        values_from = abundance,
        values_fill = list(abundance = 0)
    ) |>
    pivot_longer(
        cols = species_names,
        names_to = "species",
        values_to = "abundance"
    )

catches
# A tibble: 450,606 × 6
   sta_id pop_id ope_id  year species abundance
    <int>  <int>  <int> <dbl> <chr>       <int>
 1   8530  31870  19943  2017 TRF           159
 2   8530  31870  19943  2017 VAI            82
 3   8530  31870  19943  2017 LOF             0
 4   8530  31870  19943  2017 CHA             0
 5   8530  31870  19943  2017 SDF             0
 6   8530  31870  19943  2017 PFL             0
 7   8530  31870  19943  2017 BAF             0
 8   8530  31870  19943  2017 CHE             0
 9   8530  31870  19943  2017 GOU             0
10   8530  31870  19943  2017 OBR             0
# ℹ 450,596 more rows

3.2. Preliminary plots

Let’s plot abundance histogram for each species. As most species abundance are dominated by zeros, because of absence data, we plot log(abundance + 1) instead of raw abundances. This also implies that we will need to use zero-inflated distributions in our statistical model to account for this absence data.

catches |>
    ggplot(aes(x = abundance + 1)) + # add 1 to avoid log(0)
    geom_histogram(binwidth = 0.5) +
    scale_x_log10() +
    facet_wrap(~species)

Let’s plot abundance time series for the 10 most abundant species.

n_species <- 10
common_species <- catches |>
    group_by(species) |>
    summarise(mean_abundance = mean(abundance)) |>
    arrange(desc(mean_abundance)) |>
    slice(1:n_species) |>
    pull(species)

catches |>
    filter(species %in% common_species) |>
    group_by(year, species) |>
    summarise(mean_abundance = mean(abundance)) |>
    ggplot(aes(x = year, y = mean_abundance, colour = species)) +
    geom_line()

There is no clear trend, but we can’t say much at this point because trend may be masked by sampling efforts and environmental covariates.

Note

Here it would be interesting to plot the number of operations per year to see the evolution of the sampling effort.

d_eel <- catches |> filter(species == "ANG")
d_eel.avg <- d_eel |>
    group_by(pop_id) |>
    summarise(abundance_mean = mean(abundance))
data <- inner_join(d_elev.nn, d_eel.avg, by = "pop_id")

ggplot(data, aes(x = altitude, y = abundance_mean)) +
    geom_point() +
    labs(
        x = "Altitude (m)",
        y = "Eel abundance"
    )

We clearly observe the expected trend of eel abundance with altitude.

Next, we want to model the effect of altitude, while accounting for time using INLA.

4. Statistical models

Data is ready to model eel abundances in space, time and with altitude. In this section, we will build increasingly complex models.

Before doing anything fancy, let’s load INLA, and scale our predictor variable (altitude) as it is good practice.

data <- inner_join(d_eel, d_elev.nn, by = "pop_id")
data$altitude_s <- scale(data$altitude)

4.1. Abundance distribution and overdisperion

Here, we focus on the choice of the distribution of our target variable: eel abundance. We will focus particularly on overdispersion, as we know that eel distribution is zero-inflated due to absence data. For the following model, we add a random effect for stations and year. We will discuss in further details these points in following sections.

4.1.1. Poisson model

We begin with a poisson GLM.

m.poisson <- inla(
    abundance ~ 1 + altitude_s +
        f(year, model = "iid") +
        f(pop_id, model = "iid"),
    family = "poisson",
    data = data,
    control.compute = list(config = TRUE, waic = TRUE, dic = TRUE)
)
summary(m.poisson)
Time used:
    Pre = 0.423, Running = 0.192, Post = 0.0507, Total = 0.666 
Fixed effects:
              mean    sd 0.025quant 0.5quant 0.975quant   mode kld
(Intercept)  1.921 0.165      1.594    1.922      2.244  1.922   0
altitude_s  -1.404 0.169     -1.739   -1.403     -1.075 -1.403   0

Random effects:
  Name    Model
    year IID model
   pop_id IID model

Model hyperparameters:
                      mean    sd 0.025quant 0.5quant 0.975quant  mode
Precision for year   10.86 2.719      6.395    10.57      17.03 10.03
Precision for pop_id  1.26 0.331      0.723     1.22       2.02  1.15

Deviance Information Criterion (DIC) ...............: 4552.01
Deviance Information Criterion (DIC, saturated) ....: 2685.27
Effective number of parameters .....................: -737.88

Watanabe-Akaike information criterion (WAIC) ...: 10992.89
Effective number of parameters .................: 2810.03

Marginal log-Likelihood:  -3209.39 
 is computed 
Posterior summaries for the linear predictor and the fitted values are computed
(Posterior marginals needs also 'control.compute=list(return.marginals.predictor=TRUE)')

We can plot the distribution of the fixed effect (altitude).

marginal.altitude <- m.poisson$marginals.fixed$altitude_s
ggplot(as.data.frame(marginal.altitude)) +
    geom_line(aes(x = x, y = y)) +
    labs(
        x = "Altitude effect size",
        y = "Probability"
    )

We can compute the highest posterior density (HDP) credible interval.

inla.hpdmarginal(0.97, m.poisson$marginals.fixed$altitude_s)
                 low      high
level:0.97 -1.774432 -1.037384

We can also sample from the posterior distribution.

m.poisson.samples <- inla.posterior.sample(100, m.poisson)

We represent the sampled effect size of altitude against the marginal distribution obtained before.

fun <- function(...) {
    altitude_s
}
altitude.sample <- inla.posterior.sample.eval(fun, m.poisson.samples)
tab <- data.frame(altitude = altitude.sample[1, ])
ggplot(tab, aes(x = altitude)) +
    geom_histogram(aes(y = after_stat(density)), bins = 18) +
    geom_line(data = as.data.frame(marginal.altitude), aes(x = x, y = y)) +
    labs(
        x = "Altitude effect size",
        y = "Density"
    )

Next, we want to simulate data from the model. This step is important as it allows us to see if our model makes sense or not. In particular, we identify where our model fails and improve it.

alt_vals <- seq(-1, 3, length.out = 100)
y.sample <- inla.posterior.sample.eval(
    fun = function(...) {
        mu <- exp(Intercept + altitude_s * alt_vals)
        abundance <- rpois(length(mu), mu)
    },
    samples = m.poisson.samples
)

y.sum <- apply(y.sample, 1, function(x) {
    y.hdi <- hdi(x, ci = 0.89)
    c(mean = mean(x), hdi_low = y.hdi$CI_low, hdi_high = y.hdi$CI_high)
})
data.sim <- data.frame(
    altitude_s = alt_vals,
    t(y.sum)
)

ggplot(data.sim, aes(x = altitude_s, y = mean)) +
    # geom_point(data = data, aes(x = altitude_s, y = abundance), colour = "grey80") +
    geom_ribbon(aes(ymin = hdi_low, ymax = hdi_high), fill = "steelblue", alpha = 0.3) +
    geom_line(colour = "steelblue") +
    labs(
        x = "Scaled altitude",
        y = "Eel abundance"
    )

Next, we want to plot actual data against this prediction. Because we have not included spatial and temporal random effects while sampling the posterior, we are making predictions for an average site and year. Therefore, we want to average abundance data over site and year.

data.avg <- data |>
    group_by(altitude_s) |>
    summarise(
        abundance_avg = mean(abundance),
        abundance_var = var(abundance)
    )
data.avg
# A tibble: 32 × 3
   altitude_s[,1] abundance_avg abundance_var
            <dbl>         <dbl>         <dbl>
 1         -0.979         13.0           52.4
 2         -0.954         35.1          416. 
 3         -0.929         50.2         1692. 
 4         -0.904         30.3          235. 
 5         -0.830         43.1         2204. 
 6         -0.780         50.1         1004. 
 7         -0.705         37.6          350. 
 8         -0.680         42.4          624. 
 9         -0.655         26.6          332. 
10         -0.630          9.44          15.3
# ℹ 22 more rows
ggplot(data.sim, aes(x = altitude_s, y = mean)) +
    geom_point(data = data.avg, aes(x = altitude_s, y = abundance_avg), colour = "grey80") +
    geom_ribbon(aes(ymin = hdi_low, ymax = hdi_high), fill = "steelblue", alpha = 0.3) +
    geom_line(colour = "steelblue") +
    labs(
        x = "Scaled altitude",
        y = "Eel abundance"
    )

We see that our model capture the clear decreasing trend of eel abundances with altitude.

Let’s investigate overdispersion with a posterior predictive check.

y_pred_avg <- apply(y.sample, 1, mean)
y_pred_var <- apply(y.sample, 1, var)
pred_df <- data.frame(
    abundance_avg = y_pred_avg,
    abundance_var = y_pred_var
)
pred_df$type <- "Posterior predictive"
pred_df$altitude_s <- alt_vals
data.avg$type <- "Observed"
plot_df <- rbind(pred_df, data.avg)

ggplot(plot_df, aes(x = abundance_avg, y = abundance_var, colour = type)) +
    geom_point() +
    geom_abline(slope = 1, intercept = 1) +
    labs(
        x = "Observed mean abundance",
        y = "Observed variance in abundace"
    )

We see clearly that observed data is overdispersed (variance >> mean) compared to the data generated by our model. This suggest that the poisson distribution is not suited for this task.

4.1.2. Negative binomial model

To account for overdispersion, we will test the negative binomial distribution. Basically, we will repeat the previous section but with a model where poisson distribution is replaced with a negative binomial one.

m.nbinom <- inla(
    abundance ~ 1 + altitude_s +
        f(year, model = "iid") +
        f(pop_id, model = "iid"),
    family = "nbinomial",
    data = data,
    control.compute = list(config = TRUE, waic = TRUE, dic = TRUE)
)
summary(m.nbinom)
Time used:
    Pre = 0.262, Running = 0.191, Post = 0.0357, Total = 0.49 
Fixed effects:
              mean    sd 0.025quant 0.5quant 0.975quant   mode kld
(Intercept)  2.036 0.158      1.721    2.037      2.344  2.037   0
altitude_s  -1.412 0.164     -1.737   -1.412     -1.092 -1.412   0

Random effects:
  Name    Model
    year IID model
   pop_id IID model

Model hyperparameters:
                                                        mean    sd 0.025quant
size for the nbinomial observations (1/overdispersion)  2.18 0.185      1.836
Precision for year                                     18.59 8.698      7.432
Precision for pop_id                                    1.46 0.411      0.806
                                                       0.5quant 0.975quant
size for the nbinomial observations (1/overdispersion)     2.17       2.56
Precision for year                                        16.73      40.80
Precision for pop_id                                       1.41       2.41
                                                        mode
size for the nbinomial observations (1/overdispersion)  2.16
Precision for year                                     13.62
Precision for pop_id                                    1.31

Deviance Information Criterion (DIC) ...............: 3440.05
Deviance Information Criterion (DIC, saturated) ....: 622.13
Effective number of parameters .....................: 53.89

Watanabe-Akaike information criterion (WAIC) ...: 3443.63
Effective number of parameters .................: 50.78

Marginal log-Likelihood:  -1783.55 
 is computed 
Posterior summaries for the linear predictor and the fitted values are computed
(Posterior marginals needs also 'control.compute=list(return.marginals.predictor=TRUE)')

Let’s plot the model prediction against actual data.

m.nbinom.samples <- inla.posterior.sample(1000, m.nbinom)

alt_vals <- seq(-1, 3, length.out = 100)
y.sample <- inla.posterior.sample.eval(
    fun = function(...) {
        mu <- exp(Intercept + altitude_s * alt_vals)
        abundance <- rnbinom(length(mu), mu = mu, size = theta[1])
    },
    samples = m.nbinom.samples
)

y.sum <- apply(y.sample, 1, function(x) {
    y.hdi <- hdi(x, ci = 0.89)
    c(mean = mean(x), hdi_low = y.hdi$CI_low, hdi_high = y.hdi$CI_high)
})
data.sim <- data.frame(
    altitude_s = alt_vals,
    t(y.sum)
)

ggplot(data.sim, aes(x = altitude_s, y = mean)) +
    geom_point(data = data.avg, aes(x = altitude_s, y = abundance_avg), colour = "grey80") +
    geom_ribbon(aes(ymin = hdi_low, ymax = hdi_high), fill = "steelblue", alpha = 0.3) +
    geom_line(colour = "steelblue") +
    labs(
        x = "Scaled altitude",
        y = "Eel abundance"
    )

We see that the negative binomial distribution better account for the variance. We will stick to this distribution for now, although we could have tried other distributions. Notably, the ZIP-poisson could be useful when we have many 0s.

4.2. Modeling time

Let’s now focus on modelling the effect of time. To begin with, let’s inspect the raw data.

ggplot(data, aes(x = year, y = abundance, color = altitude_s)) +
    geom_jitter(width = 0.2) +
    stat_summary(fun = mean, geom = "line", colour = "grey", size = 1) +
    labs(x = "Year", ylab = "Eel abundance", color = "Altitude (scaled)")

The plain line indicates the trend of the mean abundance over all sites. This trend is decreasing, we expect to retrieve this trend in our temporal random effects.

4.2.1. IID random effect

We begin by assuming that year random effects are independent from one another. We expect this assumption to be wrong, but let’s start from there and then explore more realistic modeling choices.

m.iid <- inla(
    abundance ~ 1 + altitude_s +
        f(year, model = "iid") +
        f(pop_id, model = "iid"),
    family = "nbinomial",
    data = data,
    control.compute = list(config = TRUE, waic = TRUE)
)
re.year.iid <- m.iid$summary.random$year

Here was our first try, yet we expect some correlation from one year to another. For this reason, time random effect are often modelled with autoregressive model (AR1 if lag 1).

4.2.2. AR1 Random effect

We modify the previous model changing the year random effect from IID to AR1.

m.ar <- inla(
    abundance ~ 1 + altitude_s +
        f(year, model = "ar1") +
        f(pop_id, model = "iid"),
    family = "nbinomial",
    data = data,
    control.compute = list(config = TRUE, waic = TRUE)
)
re.year.ar <- m.ar$summary.random$year

4.2.3. RW1 random effect

Same with RW1 random effect.

m.rw <- inla(
    abundance ~ 1 + altitude_s +
        f(year, model = "rw1") +
        f(pop_id, model = "iid"),
    family = "nbinomial",
    data = data,
    control.compute = list(config = TRUE, waic = TRUE)
)
re.year.rw <- m.rw$summary.random$year

4.2.4. Model comparisons

Now that we have the three models, we can compare them.

First, we can plot random effect vs year.

re.year.iid$type <- "iid"
re.year.ar$type <- "ar"
re.year.rw$type <- "rw"
re.year.all <- rbind(re.year.iid, re.year.ar, re.year.rw)

ggplot(re.year.all, aes(x = ID, y = mean, color = type)) +
    geom_hline(yintercept = 0, linetype = "dashed", color = "grey50") +
    geom_ribbon(aes(ymin = `0.025quant`, ymax = `0.975quant`), alpha = 0.1) +
    geom_line() +
    geom_point() +
    labs(
        x = "Year",
        y = "Year random effect",
    )

We see that AR1 and RW1 produce random effects with a smooth trend. By contrast, IID random effects produces ‘chaotic’ random effects due to their independence. Second, we can inspect model WAIC, where lower values implies better out-of-sample predictive performance.

m.iid$waic$waic
[1] 3443.629
m.ar$waic$waic
[1] 3428.845
m.rw$waic$waic
[1] 3428.495

We see that AR1 and RW1 have lower WAIC signaling better predictive ability, as we could have expected.

We can also do a predictive check by plotting predicted trends against the observed one.

pred_iid <- tapply(m.iid$summary.fitted.values$mean, data$year, mean)
pred_ar1 <- tapply(m.ar$summary.fitted.values$mean, data$year, mean)
pred_rw1 <- tapply(m.rw$summary.fitted.values$mean, data$year, mean)

obs <- tapply(data$abundance, data$year, mean)

df <- data.frame(
    year = sort(unique(data$year)),
    obs = obs,
    iid = pred_iid,
    ar1 = pred_ar1,
    rw1 = pred_rw1
)
df <- df |> pivot_longer(
    cols = c("obs", "iid", "ar1", "rw1"),
    names_to = "type",
    values_to = "mean_abundance"
)

ggplot(df, aes(x = year, y = mean_abundance, color = type)) +
    geom_line() +
    labs(
        y = "Mean abundance",
        x = "Year"
    )

Here, we see that there is no strong differences: all three models track well the observed trend. The IID model might be a bit overfitting the trend at the beginning, but otherwise there is not much to say.

4.3. Modeling space

Now that we have seen the different option to model time dynamics, we will focus on space.

4.3.1. IID random effect

First, we begin with IID random effects, assuming site are independent from one another. This assumption is probably unrealistic but still provide a useful baseline.

We fit the model, extract spatial random effects and join them to the dataframe containing their geographical positions.

m.iid <- inla(
    abundance ~ 1 + altitude_s +
        f(year, model = "ar1") +
        f(pop_id, model = "iid"),
    family = "nbinomial",
    data = data,
    control.compute = list(config = TRUE, waic = TRUE)
)

summary(m.iid)
Time used:
    Pre = 0.297, Running = 0.196, Post = 0.0389, Total = 0.532 
Fixed effects:
              mean    sd 0.025quant 0.5quant 0.975quant   mode kld
(Intercept)  2.068 0.216      1.645    2.066      2.509  2.067   0
altitude_s  -1.414 0.164     -1.738   -1.413     -1.093 -1.413   0

Random effects:
  Name    Model
    year AR1 model
   pop_id IID model

Model hyperparameters:
                                                         mean    sd 0.025quant
size for the nbinomial observations (1/overdispersion)  2.216 0.186      1.875
Precision for year                                     14.427 7.434      4.427
Rho for year                                            0.766 0.118      0.485
Precision for pop_id                                    1.457 0.411      0.804
                                                       0.5quant 0.975quant
size for the nbinomial observations (1/overdispersion)    2.208      2.608
Precision for year                                       12.945     32.880
Rho for year                                              0.786      0.937
Precision for pop_id                                      1.405      2.409
                                                         mode
size for the nbinomial observations (1/overdispersion)  2.188
Precision for year                                     10.174
Rho for year                                            0.832
Precision for pop_id                                    1.309

Watanabe-Akaike information criterion (WAIC) ...: 3428.85
Effective number of parameters .................: 44.95

Marginal log-Likelihood:  -1777.68 
 is computed 
Posterior summaries for the linear predictor and the fitted values are computed
(Posterior marginals needs also 'control.compute=list(return.marginals.predictor=TRUE)')
re.site.iid <- m.iid$summary.random$pop_id

pts.re.iid <- pts_vilaine |> left_join(
    re.site.iid |> rename(pop_id = "ID"),
    by = "pop_id"
)

Next, we can plot the random effect on a map, to check for spatial correlation.

ggplot(pts.re.iid) +
    geom_sf(aes(color = mean), size = 2) +
    scale_color_gradient2(
        low = "blue",
        mid = "white",
        high = "red",
        midpoint = 0
    ) +
    labs(
        color = "RE site",
    )

We do see a clear spatial correlation, from west to east going from negative to positive random effects. This spatial correlation in the residuals motivates the use of spatially structured residuals. We explore the different options to do so in the following subsections.

4.3.2. SPDE random effect

First step is to get site coordinates in meters, because angles distort distances between sites, which is going to bias our spatial model.

pts_vilaine.proj <- st_transform(pts_vilaine, 2154) # Coordinates in meters.
coords <- st_coordinates(pts_vilaine.proj)

From these coordinates we can build the mesh using fmesher and visualise it.

mesh <- fm_mesh_2d_inla(loc = coords)
plot(mesh)
points(coords, col = "red", pch = 16, cex = 2)

Note

The mesh can be customised to increase or decrease its resolution using additional parameters. For our simple case study, we won’t focus on these details.

Next, we define the SPDE model and the projection matrix \(A\). \(A\) projects the spatial random field to actual observation locations.

spde <- inla.spde2.matern(mesh)
A <- inla.spde.make.A(mesh = mesh, loc = coords)
spatial_index <- inla.spde.make.index(
    name = "spatial.field",
    n.spde = spde$n.spde
)
dim(A)
[1] 37 79
nrow(mesh$loc)
[1] 79

We see that \(A\) is of dimension (number of sites x number of mesh knots).

Because we have multiple observations per sites we have to expand A so the spatial random field is projected for each observations (and not locations). To do so, we have to match each observation to its location.

site_idx <- match(data$pop_id, pts_vilaine$pop_id)
A_expanded <- A[site_idx, ]
dim(A_expanded)
[1] 526  79
nrow(data)
[1] 526

Next, we can build the data stack which combines: data, effects (i.e. predictors) and observation matrices.

data_stack <- inla.stack(
    data = list(y = data$abundance),
    A = list(A_expanded, 1),
    effects = list(
        spatial.field = spatial_index,
        data.frame(
            intercept = rep(1, nrow(data)),
            altitude_s = data$altitude_s,
            year = data$year,
            pop_id = data$pop_id
        )
    ),
    tag = "est"
)

data_stack
Data stack with
  data:    (y), size: 526
  effects: (spatial.field, spatial.field.group, spatial.field.repl, intercept, altitude_s, year, pop_id), size: 563
  A:       526 times 563

Now everything is (finally) ready to fit the model.

m.spde <- inla(
    y ~ -1 + intercept + altitude_s +
        f(year, model = "ar1") +
        f(spatial.field, model = spde),
    family = "nbinomial",
    data = inla.stack.data(data_stack),
    control.predictor = list(A = inla.stack.A(data_stack)),
    control.compute = list(config = TRUE, waic = TRUE)
)

summary(m.spde)
Time used:
    Pre = 0.357, Running = 0.292, Post = 0.0473, Total = 0.696 
Fixed effects:
             mean    sd 0.025quant 0.5quant 0.975quant   mode kld
intercept   2.056 0.314      1.433    2.055      2.692  2.055   0
altitude_s -1.396 0.229     -1.848   -1.396     -0.940 -1.396   0

Random effects:
  Name    Model
    year AR1 model
   spatial.field SPDE2 model

Model hyperparameters:
                                                        mean    sd 0.025quant
size for the nbinomial observations (1/overdispersion)  2.21 0.186      1.868
Precision for year                                     14.12 7.220      4.363
Rho for year                                            0.77 0.116      0.494
Theta1 for spatial.field                                7.51 0.398      6.687
Theta2 for spatial.field                               -8.77 0.323     -9.373
                                                       0.5quant 0.975quant
size for the nbinomial observations (1/overdispersion)     2.20      2.599
Precision for year                                        12.69     32.010
Rho for year                                               0.79      0.938
Theta1 for spatial.field                                   7.53      8.250
Theta2 for spatial.field                                  -8.78     -8.105
                                                         mode
size for the nbinomial observations (1/overdispersion)  2.187
Precision for year                                     10.009
Rho for year                                            0.836
Theta1 for spatial.field                                7.594
Theta2 for spatial.field                               -8.835

Watanabe-Akaike information criterion (WAIC) ...: 3431.85
Effective number of parameters .................: 45.45

Marginal log-Likelihood:  -1778.47 
 is computed 
Posterior summaries for the linear predictor and the fitted values are computed
(Posterior marginals needs also 'control.compute=list(return.marginals.predictor=TRUE)')

First, let’s compare the WAIC of the SPDE model with the IID one.

m.iid$waic$waic
[1] 3428.847
m.spde$waic$waic
[1] 3431.848

We see that they perform similarly in terms of predictive performance. This inspection doesn’t indicate that a model is clearly better than another.

Important

WAIC is a measure of out-of-sample predictive performance. A model can be wrong (in the sense of causal structure) and predict well, while a model can be right and predict poorly. It is important to keep this fact in mind while comparing modelling performance with WAIC.

To go further, we extract spatial random effects and want to visual them.

re.site.spde <- m.spde$summary.random$spatial.field
proj <- fm_evaluator(mesh)

# Get spatial field.
spatial_mean <- proj$proj$A %*% re.site.spde$mean
spatial_sd <- proj$proj$A %*% re.site.spde$sd

# Put it in a dataframe for ggplot.
spatial_proj <- data.frame(
    x = rep(proj$x, length(proj$y)),
    y = rep(proj$y, each = length(proj$x)),
    mean = as.vector(spatial_mean),
    sd = as.vector(spatial_sd)
)

We also extract the edges of the river network to plot it on the subsequent figure.

edges_sf <- net %>%
    activate("edges") %>%
    st_as_sf()
edges_sf <- st_transform(edges_sf, st_crs(pts_vilaine.proj))

Visualize the spatial random effects on a map.

ggplot(spatial_proj, aes(x = x, y = y, fill = mean)) +
    geom_raster() +
    scale_fill_gradient2(
        low = "blue",
        mid = "white",
        high = "red",
        midpoint = 0
    ) +
    geom_sf(
        data = edges_sf,
        inherit.aes = FALSE,
        size = 0.3,
        alpha = 0.3
    ) +
    geom_sf(
        data = pts_vilaine.proj,
        aes(geometry = geometry),
        inherit.aes = FALSE
    ) +
    labs(
        fill = "SPDE RE",
        x = "",
        y = ""
    )

As well as their uncertainty.

ggplot(spatial_proj, aes(x = x, y = y, fill = sd)) +
    geom_raster() +
    geom_sf(data = edges_sf, inherit.aes = FALSE, alpha = 0.3, size = 0.3) +
    geom_sf(
        data = pts_vilaine.proj,
        aes(geometry = geometry),
        inherit.aes = FALSE
    ) +
    labs(
        fill = "SD RE",
        x = "",
        y = ""
    )

As expected we see that uncertainty is lower near sites, and increases as we go further from them.

We see from plot above that SPDE makes sense for continuous process in space, as it can interpolate anywhere in the plane based on some assumed correlation structure. However, we know that eel do not live on land and therefore that their abundane is highly discontinuous. Some methods have adpated the SPDE framework to account for spatial barriers. Yet, this method doesn’t seem suited for river networks as we would have mostly barrier with a very complex topology. So let’s try something else.

4.3.3. Variograms (euclidian distance)

We can visualise the spatial correlation between sites with variograms. Variograms represent the spatial dependence between pair of points dependending on their distance.

For a spatial variable \(z(s)\), the semivariaogram \(\gamma(h)\) is

\[ \gamma(h) = \frac{1}{2} \mathbb{E}[(z(s) - z(s + h))^2] \]

where \(h\) is the distance between points. For our case study, we can compare semivariograms where we use the euclidian distance and the hydrographic distance. This comparison may allow us to identify which of the distances is more relevant to model spatial dependence.

Prepare data for variogram analysis.

site_vals <- data |>
    group_by(pop_id) |>
    summarise(mean_abundance = mean(abundance), .groups = "drop")

pts_sites <- pts_vilaine.proj %>%
    left_join(site_vals, by = "pop_id") %>%
    as_Spatial() # gstat works with Spatial* objects

Compute and plot empirical semivariogram using Euclidean distance.

vg_euclidian <- variogram(mean_abundance ~ 1, locations = pts_sites)
plot(vg_euclidian, xlab = "Euclidian distance (m)", ylab = "Semivariance")

4.3.4. Fixing river network

First, let’s check the connectivity of the network. Disconnected nodes will break the path calculation.

net_proj <- st_transform(net, st_crs(pts_vilaine.proj))
net_proj <- net_proj %>% to_undirected()

components <- components(net_proj)
components$no
[1] 15

We see that we have more than one connected components. Let’s extract the largest one.

comp_sizes <- table(components$membership)
largest_comp <- as.numeric(names(sort(comp_sizes, decreasing = TRUE)[1]))

net_largest <- net_proj |>
    activate("nodes") |>
    filter(components$membership == largest_comp) |>
    activate("edges")

plot(net_largest, cex = 0.5)

OK, so we see that there is probably in issue with the edges because a large part of the Vilaine bassin is removed.

Let’s identify which edges are broken by finding edges that connect nodes from different components.

After visual inspection, I’ve identified that there are two nodes very close (1504 and 1167) but not connected, although they are the same point. I’ll simply add a new edge to connect them.

link_nodes <- function(net) {
    n1 <- 1167
    n2 <- 1504
    nodes_sf <- net |>
        activate("nodes") |>
        st_as_sf()
    edges_sf <- net |>
        activate("edges") |>
        st_as_sf()
    coords <- rbind(
        st_coordinates(nodes_sf[n1, ]),
        st_coordinates(nodes_sf[n2, ])
    )
    crs <- st_crs(nodes_sf)
    line <- st_sfc(st_linestring(coords), crs = crs)
    new_edge <- edges_sf[1, ]
    for (col in names(new_edge)) new_edge[[col]] <- NA
    new_edge$from <- n1
    new_edge$to <- n2
    st_geometry(new_edge) <- line
    edges_sf2 <- bind_rows(edges_sf, new_edge)
    net_fixed <- sfnetwork(nodes_sf, edges_sf2, directed = FALSE)
    net_fixed
}

net_fixed <- link_nodes(net)

Let’s check the connected components.

net_proj <- st_transform(net_fixed, st_crs(pts_vilaine.proj))

largest_component <- function(net) {
    components <- components(net)

    comp_sizes <- table(components$membership)
    largest_comp <- as.numeric(names(sort(comp_sizes, decreasing = TRUE)[1]))

    net_largest <- net |>
        activate("nodes") |>
        filter(components$membership == largest_comp) |>
        activate("edges")
    net_largest
}

net_largest <- largest_component(net_proj)
plot(net_largest, cex = 0.5)

OK it seems that we have fixed the disconnection issues. Let’s try to compute hydrographic distances. First, we have to snap the stations nodes to the network.

net_with_sites <- net_largest %>%
    st_network_blend(pts_vilaine.proj)

nodes <- net_with_sites |>
    activate("nodes") |>
    st_as_sf()
site_node_ids <- which(!is.na(nodes$pop_id))

plot(net_with_sites, cex = 0.3)
plot(pts_vilaine.proj, add = TRUE, col = "red", pch = 16, cex = 1.5)
text(nodes[site_node_ids, ]$geometry, labels = site_node_ids)

We see that a station is away from the remaining connected component. After another inspection with mapview, we can see that the points in the estuary are disconnected. So I suggest to create a new point, representing the estuary which connects all of these points.

This involves multiple steps. First, create the estuary point. Second, connect the new node the the one in the estuary. Third, check the new network with visual inspection.

estuary_nodes <- c(1664, 1661, 1365) # Do not keep branch with only a few points.

nodes_all <- net |>
    activate("nodes") |>
    st_as_sf()

centroid <- nodes_all |>
    slice(estuary_nodes) |>
    st_union() |>
    st_centroid()

nodes_all_up <- rbind(nodes_all, centroid)
n_centroid <- nrow(nodes_all_up)

edges_all <- net |>
    activate("edges") |>
    st_as_sf()
crs <- st_crs(net)
edges_all_up <- edges_all

for (n in estuary_nodes) {
    coords <- rbind(
        st_coordinates(nodes_all_up[n_centroid, ]),
        st_coordinates(nodes_all_up[n, ])
    )
    geom <- st_sfc(st_linestring(coords), crs = crs)
    new_edge <- edges_all[1, ]
    for (col in names(new_edge)) new_edge[[col]] <- NA
    new_edge$to <- n_centroid
    new_edge$from <- n
    st_geometry(new_edge) <- geom
    edges_all_up <- bind_rows(edges_all_up, new_edge)
}

net_up <- sfnetwork(nodes_all_up, edges_all_up, directed = FALSE)
net_up <- link_nodes(net_up)
# Extract nodes and edges as sf
nodes_sf <- net_up %>%
    activate("nodes") %>%
    st_as_sf()
edges_sf <- net_up %>%
    activate("edges") %>%
    st_as_sf()

nodes_sf$label <- as.character(seq_len(nrow(nodes_sf)))
edges_sf$label <- paste(edges_sf$from, edges_sf$to, sep = "-")

# Basic map: nodes (with labels) over edges
mapview(edges_sf, color = "steelblue", alpha.regions = 0.5, legend = FALSE) +
    mapview(nodes_sf, zcol = NULL, popup = nodes_sf$label, label = nodes_sf$label, col.region = "orange")
net_largest <- largest_component(net_up)
net_largest <- st_transform(net_largest, st_crs(pts_vilaine.proj))

net_with_sites <- net_largest |>
    st_network_blend(pts_vilaine.proj)

nodes <- net_with_sites |>
    activate("nodes") |>
    st_as_sf()
site_node_ids <- which(!is.na(nodes$pop_id))

plot(net_with_sites, cex = 0.3)
plot(pts_vilaine.proj, add = TRUE, col = "red", pch = 16, cex = 1.5)
text(nodes[site_node_ids, ]$geometry, labels = site_node_ids)

OK, so now all of our stations are on the hydrographic network. We can now compute hydrographic distances.

4.3.5. Variograms (hydrographic distance)

net_with_sites <- net_with_sites |>
    activate("edges") |>
    mutate(weight = edge_length())

n1 <- 2453
n2 <- 2454
paths <- st_network_paths(net_with_sites, from = n1, to = n2)
paths
# A tibble: 1 × 2
  node_paths edge_paths
  <list>     <list>    
1 <int [18]> <int [17]>
node_path <- paths |>
    slice(1) |>
    pull(node_paths) |>
    unlist()
node_path
 [1] 2453  933  934 1321  523  524  945 1333 1634 2425 1341 1340  958  956 1867
[16] 1329 1325 2454
plot(net_with_sites, col = "grey", cex = 0.3)
plot(slice(activate(net_with_sites, "nodes"), node_path), col = "red", add = TRUE)

The path calculated seems right. We can further proceed and compute the distance. We begin by defining a function between a pair of points, and then we will loop between all possible pairs.

get_distance <- function(net, i, j) {
    paths <- st_network_paths(net, from = i, to = j)
    edges <- net |>
        activate("edges") |>
        st_as_sf()
    edge_path <- paths |>
        slice(1) |>
        pull(edge_paths) |>
        unlist()
    edges |>
        slice(edge_path) |>
        pull(weight) |>
        sum()
}

get_distance(net_with_sites, n1, n2)
52358.15 [m]

The pairwise function appears to work, let’s loop now.

n_sites <- length(site_node_ids)
D_hydro <- matrix(NA, n_sites, n_sites)
for (i in 1:n_sites) {
    node_i <- site_node_ids[i]
    for (j in i:n_sites) {
        if (i != j) {
            node_j <- site_node_ids[j]
            D_hydro[i, j] <- get_distance(net_with_sites, node_i, node_j)
            D_hydro[j, i] <- D_hydro[i, j]
        }
    }
}

Now that we have pairwise hydrographic distance between sites, we can build a new statistical model that account for this spatial dependency.

4.3.6. Besag random effect

Build adjacency matrix connecting each site to its nearest neighbors along the river.

k <- 2 # Connect to 2 nearest neighbors (typically upstream/downstream)
adj_list <- vector("list", n_sites)
for (i in seq_len(n_sites)) {
    drow <- D_hydro[i, ]
    drow[i] <- NA # Exclude self-link.
    nn <- order(drow, na.last = NA)[1:k]
    nn <- nn[!is.na(nn)]
    adj_list[[i]] <- unique(nn)
}

# Make adjacency symmetric (if i~j then j~i).
for (i in seq_len(n_sites)) {
    for (j in adj_list[[i]]) {
        adj_list[[j]] <- unique(c(adj_list[[j]], i))
    }
}

Let’s check the resulting graph.

g <- make_empty_graph(n_sites, directed = FALSE)
for (i in seq_len(n_sites)) {
    if (length(adj_list[[i]]) > 0) {
        for (j in adj_list[[i]]) g <- add_edges(g, c(i, j))
    }
}
n_components <- components(g)$no
cat("Number of connected components:", n_components, "\n")
Number of connected components: 3 

Now we visualize the adjacency structure on a map.

# Create edges sf object from adjacency list.
edges_adj <- data.frame()
for (i in seq_len(n_sites)) {
    for (j in adj_list[[i]]) {
        if (i < j) { # avoid duplicate edges
            edges_adj <- rbind(edges_adj, data.frame(from = i, to = j))
        }
    }
}

nodes_sf <- net_with_sites |>
    activate("nodes") |>
    st_as_sf()

# Get coordinates for each site.
coords_sites <- st_coordinates(nodes_sf[site_node_ids, ])
coords_sites <- data.frame(site_idx = 1:n_sites, coords_sites)

# Create line geometries between neighbors.
edges_geom <- list()
for (k_edge in 1:nrow(edges_adj)) {
    i <- edges_adj$from[k_edge]
    j <- edges_adj$to[k_edge]
    coords <- rbind(
        coords_sites[i, c("X", "Y")],
        coords_sites[j, c("X", "Y")]
    )
    line <- st_linestring(as.matrix(coords))
    edges_geom[[k_edge]] <- line
}
edges_geom <- st_sfc(edges_geom, crs = st_crs(nodes_sf))

edges_sf <- st_sf(
    data = edges_adj,
    geometry = edges_geom
)

edges_river <- net_with_sites %>%
    activate("edges") %>%
    st_as_sf()

# Plot: sites with adjacency edges.
ggplot() +
    geom_sf(data = edges_sf, color = "steelblue", size = 0.8, alpha = 0.6) +
    geom_sf(data = nodes_sf[site_node_ids, ], color = "darkred", size = 2) +
    geom_sf(
        data = edges_river,
        inherit.aes = FALSE,
        size = 0.3,
        alpha = 0.3
    ) +
    labs(
        x = "",
        y = ""
    )

Edges are really ruled by hydrographic and not euclidian distance. For example, the two sites in the North-West are very close according to the euclidian distance, but are not connected because they belong to two different branch arms.

Caution

We have several connected components. This is probably an issue because it means that part of our river are independent from another, although belonging to the hydrographic bassin. A solution might be to connect components by their closest sites.

To give the graph to inla let’s create a sparse adjacency matrix.

# 1. Build sparse adjacency matrix from adj_list
adj_matrix <- matrix(0, nrow = n_sites, ncol = n_sites)
for (i in seq_len(n_sites)) {
    for (j in adj_list[[i]]) {
        adj_matrix[i, j] <- 1
    }
}
adj_matrix_sparse <- as(adj_matrix, "dgCMatrix")

node_sites <- nodes_sf[site_node_ids, ]
data$node_id <- match(data$pop_id, node_sites$pop_id)

m.besag <- inla(
    abundance ~ 1 + altitude_s +
        f(year, model = "ar1") +
        f(node_id, model = "besagproper", graph = adj_matrix_sparse),
    family = "nbinomial",
    data = data,
    control.compute = list(waic = TRUE, config = TRUE)
)

summary(m.besag)
Time used:
    Pre = 0.4, Running = 0.2, Post = 0.0439, Total = 0.644 
Fixed effects:
              mean    sd 0.025quant 0.5quant 0.975quant   mode kld
(Intercept)  2.068 0.319      1.433    2.067      2.714  2.067   0
altitude_s  -1.285 0.196     -1.668   -1.286     -0.896 -1.286   0

Random effects:
  Name    Model
    year AR1 model
   node_id Proper version of Besags ICAR model

Model hyperparameters:
                                                         mean    sd 0.025quant
size for the nbinomial observations (1/overdispersion)  2.213 0.185      1.869
Precision for year                                     14.102 7.404      4.285
Rho for year                                            0.769 0.120      0.481
Precision for node_id                                   0.499 0.208      0.212
Diagonal for node_id                                    1.296 1.027      0.240
                                                       0.5quant 0.975quant
size for the nbinomial observations (1/overdispersion)     2.21      2.599
Precision for year                                        12.59     32.589
Rho for year                                               0.79      0.939
Precision for node_id                                      0.46      1.014
Diagonal for node_id                                       1.02      4.008
                                                        mode
size for the nbinomial observations (1/overdispersion) 2.191
Precision for year                                     9.810
Rho for year                                           0.838
Precision for node_id                                  0.392
Diagonal for node_id                                   0.608

Watanabe-Akaike information criterion (WAIC) ...: 3429.82
Effective number of parameters .................: 44.93

Marginal log-Likelihood:  -1775.97 
 is computed 
Posterior summaries for the linear predictor and the fitted values are computed
(Posterior marginals needs also 'control.compute=list(return.marginals.predictor=TRUE)')

Plot spatial RE.

re.site.besag <- m.besag$summary.random$node_id

# Create sf for plotting
pts_besag <- node_sites %>%
    bind_cols(re.site.besag)

# Plot Besag RE on map
ggplot(pts_besag) +
    geom_sf(
        data = edges_river,
        inherit.aes = FALSE,
        size = 0.3,
        alpha = 0.3
    ) +
    geom_sf(aes(color = mean), size = 3) +
    scale_color_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0) +
    labs(
        color = "Mean RE"
    )

pred_df <- data.frame(
    pop_id = data$pop_id,
    year = data$year,
    true_abundance = data$abundance,
    pred_abundance = m.besag$summary.fitted.values$mean
) %>%
    left_join(pts_vilaine %>% st_drop_geometry() %>% select(pop_id), by = "pop_id")

d_plot <- pred_df %>%
    filter(year %in% seq(min(year), max(year), by = 6)) %>% # every 5 years
    left_join(pts_vilaine, by = "pop_id") %>%
    st_as_sf()

d_plot$res <- log(abs(d_plot$true_abundance - d_plot$pred_abundance))

ggplot() +
    geom_sf(data = edges_river, color = "lightgrey", size = 0.3, inherit.aes = FALSE) +
    geom_sf(data = d_plot, aes(color = true_abundance), size = 3) +
    facet_wrap(~year) +
    labs(color = "Obs. abund.")

ggplot() +
    geom_sf(data = edges_river, color = "lightgrey", size = 0.3, inherit.aes = FALSE) +
    geom_sf(data = d_plot, aes(color = pred_abundance), size = 3) +
    facet_wrap(~year) +
    labs(color = "Pred. abund.")

ggplot() +
    geom_sf(data = edges_river, color = "lightgrey", size = 0.3, inherit.aes = FALSE) +
    geom_sf(data = d_plot, aes(color = res), size = 3) +
    facet_wrap(~year) +
    labs(color = "Res.")

TODO