Infer missing fish sizes

Summary

In this notebook, we

  • we infer missing fish sizes for sea campagnes assuming a normal distribution
  • validate our inference using cross-validation

0. Setup

library(renv)
renv::load()
renv::restore() # Install missing librairies.
renv::status() # Check the project state.

library(targets)
library(here)
tar_config_set(
  store = here::here("_targets")
)
library(dplyr)
library(tidyr)
library(ggplot2)
library(ggVennDiagram)
set_theme(theme_minimal())
library(riversea)

1. Illustrate the inference with synthetic data

The idea of the inference is quite simple. We observe a given distribution of values, representing fish sizes. We assume that the underlying distribution is a normal distribution. This choice can be seen as a choice toward simplicity. We say that we only know about the mean and the variance of the distribution, and in this case the most probable distribution is a normal distribution.

To begin with, we illustrate the procedure on synthetic data. So let’s simulate values from a normal distribution.

n <- 1000 # Sample size.
obs <- rnorm(n)
n_missing <- 100
obs_missing <- data.frame(true = rnorm(n_missing))

infer_missing_sizes <- function(obs, n_missing) {
  mu <- mean(obs)
  sigma <- sd(obs)
  rnorm(n_missing, mu, sigma)
}

obs_missing$pred <- infer_missing_sizes(obs, n_missing)

ggplot(obs_missing, aes(x = true, y = pred)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1)

We see that this method is quite bad at predicting fish size at the individual level. But what really matters is whether the infered distribution of missing sizes corresponds to the true one.

obs_missing_long <- obs_missing |>
  pivot_longer(
    cols = c(true, pred),
    names_to = "type",
    values_to = "value"
  )

ggplot(obs_missing_long, aes(x = value, fill = type)) +
  geom_histogram(alpha = 0.5, position = "identity")

We see that here it works quite well, because the sample size is large, and we are sampling missing values from a normal distribution, which won’t be the case for real data.

2. Apply the methods to campagne data

First, we load our data set and summarise the different surveys.

tidy_data <- tar_read(sea_data_tidy)
head(tidy_data$size)
     trait       species length year survey species_valid is_valid
1 61839708       Abramis    5.2 2012  pomet          <NA>    FALSE
2 61836630 Abramis brama    6.3 2016  pomet Abramis brama     TRUE
3 61836630 Abramis brama    5.5 2016  pomet Abramis brama     TRUE
4 61836630 Abramis brama    7.0 2016  pomet Abramis brama     TRUE
5 61836630 Abramis brama    6.5 2016  pomet Abramis brama     TRUE
6 61836630 Abramis brama   19.3 2016  pomet Abramis brama     TRUE

The first step of the processing is to check the validity of the species name. For now, we only verify whether the name is recognized by fishbase. We filter out unrecognized species.

valid_data <- filter_valid(tidy_data)
head(valid_data$size)
     trait length year survey species_valid
1 61836630    6.3 2016  pomet Abramis brama
2 61836630    5.5 2016  pomet Abramis brama
3 61836630    7.0 2016  pomet Abramis brama
4 61836630    6.5 2016  pomet Abramis brama
5 61836630   19.3 2016  pomet Abramis brama
6 61836630    6.1 2016  pomet Abramis brama
tar_read(venn_diagram)
[1] "/home/ilajaaiti/Documents/riversea/figures/venn-diagram.png"

Next, we compute the minimal, mean and maximal length for each species.

size_extrema <- valid_data$size |>
  group_by(species_valid) |>
  summarise(
    length_min = min(length),
    length_mean = mean(length),
    length_max = max(length),
    n = n(),
    .groups = "drop"
  ) |>
  arrange(desc(n))
head(size_extrema)
# A tibble: 6 × 5
  species_valid          length_min length_mean length_max     n
  <chr>                       <dbl>       <dbl>      <dbl> <int>
1 Pomatoschistus minutus        0.1        5.60       13.4 97057
2 Solea solea                   1         12.5        55.5 91992
3 Trisopterus luscus            1         10.8        35   75035
4 Callionymus lyra              2.5        9.95       24   41395
5 Pomatoschistus microps        0.1        4.00       10   34296
6 Buglossidium luteum           0.4        7.90       19.6 27748
data <- add_observation_number(valid_data)
head(data$catch)
     trait year abundance survey species_valid n_measured abundance_cor
1 61836379 2018        49  pomet Abramis brama         30            49
2 61831377 2018         1  pomet Abramis brama          1             1
3 61837714 2009        53  pomet Abramis brama         32            53
4 61839934 2013         1  pomet Abramis brama          1             1
5 61832476 2010         1  pomet Abramis brama          1             1
6 61836364 2012        16  pomet Abramis brama         16            16
data_imp <- impute_size(data)

For sanity we can check that we have added the right number of sizes to the data frame.

head(data_imp$size)
     trait length year survey species_valid measured
1 61836630    6.3 2016  pomet Abramis brama     TRUE
2 61836630    5.5 2016  pomet Abramis brama     TRUE
3 61836630    7.0 2016  pomet Abramis brama     TRUE
4 61836630    6.5 2016  pomet Abramis brama     TRUE
5 61836630   19.3 2016  pomet Abramis brama     TRUE
6 61836630    6.1 2016  pomet Abramis brama     TRUE
data_imp$catch |>
  mutate(n_miss = abundance_cor - n_measured) |>
  pull(n_miss) |>
  sum()
[1] 254915
sum(!data_imp$size$measured)
[1] 254915