2 Data processing and analysis

Due to the large volumes of data involved, and confidentiality agreements for the majority of the data, we will share the codes starting when the metrics for the dyads have already been computed. The data shared in this repository are only at the scale of dyads, and not locations.

2.1 Setting up configuration previous to the analysis

Calling packages, setting up data, auxiliary code, stats and figure paths, and sourcing functions:

knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE)
library(lubridate) # date function
library(magrittr) # pipes
library(stringr) #str_replace_all
library(ggplot2) 
library(xtable)
library(maps) # get background
library(maptools) # map2SpatialPolygons
library(mapdata) # worldHires
library(animation) # videos
library(igraph) # network representation
library(viridis)
library(dplyr) # filter, mutate
library(Rmixmod)
library(FactoMineR)
library(tidyr) # gather
library(mvtnorm) # for overlap index

dossier.data.outputs <- './DataOutputs/' 
dossier.stats.outputs <- './images/' 
dossier.codes <- './R/'



source(paste0(dossier.codes, 'dyad_info_general_allgears.r')) # get summary of dyads
source(paste0(dossier.codes, 'pre_video_modif_allgears.r')) # get data into format for video functions
source(paste0(dossier.codes, 'VideoHtmlDyadAllGears.r')) # get animation in html
source(paste0(dossier.codes, 'VideoGIFDyadAllGears.R')) # get animation in gif
source(paste0(dossier.codes, 'plot_dyad_general_allgears.R')) # plot tracks
source(paste0(dossier.codes, 'subplot_uni.r')) # plot metric time series
source(paste0(dossier.codes, 'PlotVideoDyadsGMM.R')) # general function to plot and animate, using functions above
source(paste0(dossier.codes, 'FleetStats-AllGears.r')) # get fleet stats and network graph
source(paste0(dossier.codes, 'stats_clusters.R')) # get general statistics of groups
source(paste0(dossier.codes, 'dyad_classif.R')) # uses model for pair trawlers to classify the dyads from the other fleets and produce graphs

These functions are here.

2.2 Gaussian mixture modeling

# load(paste0(dossier.data.outputs, "all_dyads.RData")) # dyads and metrics
dyads <- readRDS(file = paste0(dossier.data.outputs,"all_dyads_vJ.rds")) 
# dyads and metrics

variables <- c("Prox", "DI.theta", "DI.d")
gear <- "all"

# just PTM and variables for analysis
dyads %>%  filter(gear.1 == "PTM") %>% select(variables) -> dta

##################### GMM estimation ####################################

# specifying some parameters for the GMM parameter estimation
strat <-
mixmodStrategy(algo = "EM", initMethod = "smallEM")

# Then GMM with the 3 metrics, for 3 clusters
N_rep <- 30 # number of replicates
test <- list()
for(i in 1:N_rep){
  test[[i]] <-
    Rmixmod::mixmodCluster(
      data = as.data.frame(dta[, 1:3]),
      nbCluster = 3,
      strategy = strat,
      criterion = c("ICL"),
      models = Rmixmod::mixmodGaussianModel(
        family = "general",
        free.proportions = TRUE,
        equal.proportions = FALSE,
        listModels =
          c("Gaussian_pk_Lk_Ck")
      )
    )
  
}

# choosing the best model from the replicates based on ICL
BestModel <-
test[[order(rapply(test, function(x)
x@bestResult@criterionValue))[2]]]

# saving the best model
saveRDS(
BestModel,
file = paste0(
dossier.stats.outputs,
'BestFreeModel-PTM-',
gear,
'-',
paste0(variables, collapse = '-'),
'.rds'
)
)

##################### Extracting output stats ####################################

# now we are extracting variance values to compare the variance between clusters
var_par <- BestModel@bestResult@parameters@variance
# var_par is a list with a number of elements equal to the number of clusters, and each element is a variance matrix for the metrics (for a given cluster)
# we'll pay attention to the variances and not covariances, so we'll use the list to make a matrix where each column is a metric and each row is a cluster, and each element ij is the variance of the metric j in cluster i.
# if we wanted correlations:
correlation_list <- lapply(var_par, function(x){
  y <- matrix(NA,ncol=ncol(x),nrow = ncol(x))
  for (i in 1:ncol(x)){
    for (j in 1:ncol(x)){
      if (i != j){
        y[i,j] <- x[i,j]/(sqrt(x[i,i])*sqrt(x[j,j]))
      }
    }
  }
  return(y)
})

variances <-
t(sapply(BestModel@bestResult@parameters@variance, function(mat_) {
diag(mat_)
}))
# we then get the minimum variance per metric accross clusters. And, now that I think about it, this is biased by the fact that not all variables are in the same range. DI.theta goes from -1 to 1.
minimal_var <- apply(variances, 2, min)
# now, how much is variance of each metric in each cluster is higher than the minimum variance of each metric throughout clusters?
ratio_var <- sweep(variances, 2, STATS = minimal_var, FUN = '/')
# another form of comparison can be done by using the variances (not covariances) and means to calculate the coefficients of variation
cv <- sqrt(t(
sapply(BestModel@bestResult@parameters@variance,
function(mat_) {
diag(mat_)
})
)) /  BestModel@bestResult@parameters@mean

# saving these results
cluster.file = paste0(
dossier.stats.outputs,
'cluster_var_PTM_',
gear,
'-',
paste(str_replace_all(variables, "[[:punct:]]", ""), collapse = '-'),
'.txt'
)
sink(cluster.file)
cat("Number of clusters: ", BestModel@bestResult@nbCluster, "\n")
cat("\n")
cat("\n")
cat("Metric variances: ", "\n")
print(variances)
cat("\n")
cat("Ratios: ", "\n")
print(ratio_var)
cat("\n")
cat("CV: ", "\n")
print(cv)
cat("\n")
cat("\n")
sink()

# adding the cluster classification and associated probability to the PTM df
dta %>% mutate(
clust_prox = factor(BestModel@bestResult@partition),
p_cluster = apply(BestModel@bestResult@proba, 1, max)
) -> dta

# overlapping index
mean_variance <- NULL
mean_variance$means <- BestModel@bestResult@parameters@mean

mean_variance$variance <- BestModel@bestResult@parameters@variance

means <- mean_variance$means
variances <- mean_variance$variance
delta <- 0.025 # pas de discrétisation des lois gaussiennes
x <- seq(-1, 2, delta) # discrétisation de l'[] de fluctuation +/- 1
y <- seq(-1, 2, delta) # discrétisation de l'[] de fluctuation +/- 1
z <- seq(-1, 2, delta) # discrétisation de l'[] de fluctuation +/- 1
grid <- expand.grid(x, y, z)
gaussian.overlap.trivar <-
array(NA, dim = c(3, 3), dimnames = list(c("c1", "c2", "c3"), c("c1", "c2", "c3")))
for (i in 1:2) {
for (j in (i + 1):3) {
#i <- 3 ; j <- 2
p1 <-
dmvnorm(x = grid,
mean = t(means[i, ]),
sigma = variances[[i]]) ### tri-Gaussian for cluster i
p2 <-
dmvnorm(x = grid,
mean = t(means[j, ]),
sigma = variances[[j]]) ### tri-Gaussian for cluster j
gaussian.overlap.trivar[i, j] <-
sum((p1 < p2) * p1 + (p2 < p1) * p2) * delta ^ 3
}
}
print("overlapping index")
print(gaussian.overlap.trivar)

##################### Ordering clusters ####################################

# Now, we need to find a way to automatically order the clusters by degree of joint movement.
# For that, we will use the mean of the scaled and centered variables
mean_metrics <- dta %>%
mutate(
prox_z = scale(Prox, center = TRUE, scale = TRUE),
di_theta_z = scale(DI.theta, center = TRUE, scale = TRUE),
di_d_z = scale(DI.d, center = TRUE, scale = TRUE)
) %>%
group_by(clust_prox) %>%
summarise(
m_prox_z = mean(prox_z),
m_di_theta_z = mean(di_theta_z),
m_di_d_z = mean(di_d_z)
)

order_clust <-
  order(apply(mean_metrics[, 2:4], 1, FUN = "mean"), decreasing = TRUE)
  
  # once we get the order in which clusters should be, we change the label of clusters so that 1 corresponds to high joint movement... and so on
  # the clust_prox column was a factor so let's first transform it into numeric
  dta$clust_prox <- as.numeric(as.character(dta$clust_prox))
  
  # stock the old label into a vector so that we change things around in this vector
  new_order_vector <- dta$clust_prox
  
  # I could have done this one before: stock the number of clusters in an object
  k <- length(unique(new_order_vector))
  # for each 'new label', we get its corresponding old label, then search in the old column for those observations and change them for the new label in the new vector
  for (cl in 1:k) {
  new_order_vector[which(dta$clust_prox == order_clust[cl])] <- cl
  }
  # then add a new column to the df with the new order
  dta$clust_new_order <- new_order_vector
  # and add a column that is a factor of that
  dta$group <- as.factor(dta$clust_new_order)
  
##################### Plots and stats ####################################
  
  # Now let's do some plots with the new order and save them as pdf (for resolution and vectorial graphics) and png (for quick views)
  # I'm using viridis color palette since it should be colorblind friendly
  # First plot: first metric vs. second metric
  dta %>% ggplot(aes(x = dta[, 1], y = dta[, 2], col = group)) + geom_point() + xlab('Prox') + ylab('DItheta') + scale_colour_viridis_d() + theme_bw()
  ggsave(
  paste0(
  dossier.stats.outputs,
  "clustFree_Prox_DItheta_PTM_",
  gear,
  "_ordered.pdf"
  )
  )
  ggsave(
  paste0(
  dossier.stats.outputs,
  "clustFree_Prox_DItheta_PTM_",
  gear,
  "_ordered.png"
  )
  )
  
  # First metric vs. third metric
  dta %>% ggplot(aes(x = dta[, 1], y = dta[, 3], col = group)) + geom_point() + xlab('Prox') + ylab('DId') + scale_colour_viridis_d() + theme_bw()
  ggsave(paste0(
  dossier.stats.outputs,
  "clustFree_Prox_DId_PTM_",
  gear,
  "_ordered.pdf"
  ))
  ggsave(paste0(
  dossier.stats.outputs,
  "clustFree_Prox_DId_PTM_",
  gear,
  "_ordered.png"
  ))
  
  # Now plotting distributions of probabilities per cluster
  
  # median posterior probability per cluster
  dta %>% group_by(group) %>% summarise(median(p_cluster)) 
  
  ggplot(data = dta, aes (x = group, y = p_cluster, fill = group)) +
  geom_boxplot() +
  scale_fill_viridis(discrete = TRUE)  +
  theme_bw() +
  xlab('') + ylab("") +
  theme(legend.position = "none", text = element_text(size = 18))
  ggsave(paste0(
  dossier.stats.outputs,
  "clustFree_proba_PTM_",
  gear,
  "_ordered.pdf"
  ))
  ggsave(paste0(
  dossier.stats.outputs,
  "clustFree_proba_PTM_",
  gear,
  "_ordered.png"
  ))
  
  # calculating the percentage of PTM dyads in each cluster and then saving the information
  perc_table <- round(prop.table(table(dta$group)) * 100, 1)
  
  sink(cluster.file, append = TRUE)
  cat("Order of old labels: ", "\n")
  print(order_clust)
  cat("\n")
  cat("\n")
  cat("Percentage of dyads in each ordered cluster: ", "\n")
  print(perc_table)
  cat("\n")
  cat("\n")
  sink()
  
  # Now I want to know what the distribution of each metric looks like in each cluster. I'll do that through violin plots. For that, I'll need to work on the format of the df to plot
  # Selecting just the columns of interest
  dta_facto  <- dta[, c(1, 2, 3, 4, 7)]
  # Gathering into more rows and less columns
  dta_gather <- dta_facto %>%
  gather(key = metric, value = value,-clust_prox,-group)
  # Transforming the new metric column into a factor with levels ordered based on the contribution of each metric to the first principal component
  dta_gather$metric <-
  factor(dta_gather$metric, levels = c("Prox", "DI.theta", "DI.d"))
  #names(weight_var)[order(weight_var,
  # decreasing = TRUE)])
  
  # histograms
  ggplot(data = dta_gather, aes(x = value, color = group)) +
  geom_histogram(fill = "white", position = "dodge") +
  # geom_density(alpha=0.6)+
  facet_wrap(facets = vars(metric), scales = "free") +
  scale_color_viridis(discrete = TRUE) +
  theme_bw() +
  xlab('') + ylab("") +
  theme(legend.position = "none", text = element_text(size = 16))
  ggsave(
  paste0(
  dossier.stats.outputs,
  "clustFree_MetricsHist_PTM_",
  gear,
  "_ordered.pdf"
  ),
  width = 8
  )
  ggsave(
  paste0(
  dossier.stats.outputs,
  "clustFree_MetricsHist_PTM_",
  gear,
  "_ordered.png"
  ),
  width = 8
  )
  
## Here we'll be computing stats about the number of dyads, couples, vessels, etc., in total and each cluster

# First, we'll save a df with everything we could care about for PTM dyads
dyads_cluster <- dyads %>% filter(gear.1 == "PTM")
dyads_cluster <-
  cbind.data.frame(dyads_cluster, dta[, c("p_cluster", "clust_new_order")])
saveRDS(
  dyads_cluster,
  file = paste0(
    dossier.data.outputs,
    'ClustPCAoutputs-PTM-',
    gear,
    '-',
    paste0(variables, collapse = '-'),
    '_ordered.rds'
  )
)

# now we compute the stats and save them in .RData and a LaTeX table format
stats.clusters <- stats_clusters(k, dyads_cluster)

save(
  stats.clusters,
  file = paste0(
    dossier.data.outputs,
    "ClusterTable-",
    k,
    '-PTM-',
    gear,
    '-',
    paste(variables, collapse = '-'),
    'ordered.RData'
  )
)

print(
  xtable(t(stats.clusters)),
  file = paste0(
    dossier.stats.outputs,
    "ClusterLatexTable-",
    k,
    '-PTM-',
    gear,
    '-',
    paste(variables, collapse = '-'),
    'ordered.txt'
  )
)

2.3 Classifying the dyads of the other fleets

We use the Best GMM for pelagic pair trawlers to classify the other dyads into the 3 groups, by fleet. We also do the same graphs as for the pair trawlers (and some extras) and produce the statistics by cluster. We use functions called above.

# Large bottom otter trawlers
dyad_classif(
gear = "LOTB",
variables,
dyads,
BestModel,
k,
order_clust,
cluster.file,
dossier.stats.outputs,
dossier.data.outputs,
dta
)

# Small bottom otter trawlers
dyad_classif(
gear = "SOTB",
variables,
dyads,
BestModel,
k,
order_clust,
cluster.file,
dossier.stats.outputs,
dossier.data.outputs,
dta
)

# Mid-water otter trawlers
dyad_classif(
gear = "OTM",
variables,
dyads,
BestModel,
k,
order_clust,
cluster.file,
dossier.stats.outputs,
dossier.data.outputs,
dta
)

# Tuna purse-seiners
dyad_classif(
gear = "thon",
variables,
dyads,
BestModel,
k,
order_clust,
cluster.file,
dossier.stats.outputs,
dossier.data.outputs,
dta
)

# Anchovy purse-seiners
dyad_classif(
gear = "APS-10",
variables,
dyads,
BestModel,
k,
order_clust,
cluster.file,
dossier.stats.outputs,
dossier.data.outputs,
dta
)

2.4 Plots and animations of dyad’s tracks and time series, and network graphs

In order to reproduce this code, you would need the actual VMS data from the fleets, which I cannot provide. Due to the same confidentiality issues, these plots are anonymous, without coastlines and the longitude and latitude coordinates are transformed.

# We first set some values for parameters
video.par <- NULL
video.par$type <-
'plot' # choose between video, plot, series and all
video.par$mp4 <-
FALSE # IF SO, ffmpeg function must be able to run in the konsole
video.par$gif <- FALSE # Or TRUE
video.par$html <-
FALSE # we need this to be TRUE to create mp4 anyway
video.par$series_gif <-
FALSE # Or TRUE. To produce gifs of time series of the metrics
video.par$series_pdf <-
FALSE # Or TRUE. To produce pdf graphs of time series of the metrics
video.par$traza = 10 # number of hours for leaving a trace at each frame in the video
video.par$epais = 4 # width of trajectories
video.par$sleep = 0.25 # time between frames (quarter of a second)
video.par$point.size = 1.2 # size of points in the plots and animations

# VMS data
load(paste0(dossier.data.outputs,'DATA_TRAJECTORIES_YEAR_2012_GAP_3H.rdata'))
vms_data_2012 <- data.trajectories$DATA.TRAJECTORIES
load(paste0(dossier.data.outputs,'DATA_TRAJECTORIES_YEAR_2013_GAP_3H.rdata'))
vms_data_2013 <- data.trajectories$DATA.TRAJECTORIES
vms_trawlers <- rbind.data.frame(vms_data_2012, vms_data_2013)
old_names <- colnames(vms_trawlers)
colnames(vms_trawlers) <- c("x", "y", "date", old_names[4:8],
                             "abs.angle", old_names[10], "id", "burst",
                             old_names[13:30], "gear.maree", old_names[32],
                             "LONGITUDE", "LATITUDE", 'line', old_names[36:38])
rm(vms_data_2012, vms_data_2013, data.trajectories)

load(paste0(dossier.data.outputs, 'vms_thon_2011-2013.RData'))
vms_thon <- vms.data.years
load(paste0(dossier.data.outputs, "vms_data_10.RData"))
vms_aps <- vms.data

# For pelagic pair trawlers

vms_ptm <-
vms_trawlers[which(as.character(vms_trawlers$gear.maree) == "PTM"), ]
dyads_ptm <- readRDS(file = paste0(
dossier.data.outputs,
'ClustPCAoutputs-PTM-all-',
paste0(variables, collapse = '-'),
'_ordered.rds'
))

columns <-
c("x",
"y",
"date",
"id",
"burst",
"abs.angle",
"LONGITUDE",
"LATITUDE",
"line")
vms.data <- vms_ptm[, columns]

Prox_delta <- c(1, 3, 5)
DI_beta <- 1

plots.video.dyads.allgears(
vms.data,
dyads = dyads_ptm,
n = 1,
gear = "PTM",
new.variables = variables,
dossier.data.outputs,
dossier.stats.outputs,
text.dyads = TRUE,
video.par,
colores = c("#66c2a5", "#fc8d62", "#8da0cb")
)

Fleet.Analysis(
vms.data,
dyads = dyads_ptm,
dossier.data.outputs,
dossier.stats.outputs,
gear = "PTM"
)

# For larger bottom otter trawlers

vms_lotb <-
vms_trawlers[which(as.character(vms_trawlers$gear.maree) == "OTB"), ]
dyads_lotb <- readRDS(file = paste0(
dossier.data.outputs,
'ClustPCAoutputs-PTM-LOTB-',
paste0(variables, collapse = '-'),
'_ordered.rds'
))

columns <-
c("x",
"y",
"date",
"id",
"burst",
"abs.angle",
"LONGITUDE",
"LATITUDE",
"line")
vms.data <- vms_lotb[, columns]

Prox_delta <- c(1, 3, 5)
DI_beta <- 1

plots.video.dyads.allgears(
vms.data,
dyads = dyads_lotb,
n = 1,
gear = "LOTB",
new.variables = variables,
dossier.data.outputs,
dossier.stats.outputs,
text.dyads = TRUE,
video.par,
colores = c("#66c2a5", "#fc8d62", "#8da0cb")
)

Fleet.Analysis(
vms.data,
dyads = dyads_lotb,
dossier.data.outputs,
dossier.stats.outputs,
gear = "LOTB"
)

# For small bottom otter trawlers

vms_sotb <-
vms_trawlers[which(as.character(vms_trawlers$gear.maree) == "OTB_SMALL"), ]
dyads_sotb <- readRDS(file = paste0(
dossier.data.outputs,
'ClustPCAoutputs-PTM-SOTB-',
paste0(variables, collapse = '-'),
'_ordered.rds'
))

columns <-
c("x",
"y",
"date",
"id",
"burst",
"abs.angle",
"LONGITUDE",
"LATITUDE",
"line")
vms.data <- vms_sotb[, columns]

Prox_delta <- c(1, 3, 5)
DI_beta <- 1

plots.video.dyads.allgears(
vms.data,
dyads = dyads_sotb,
n = 1,
gear = "SOTB",
new.variables = variables,
dossier.data.outputs,
dossier.stats.outputs,
text.dyads = TRUE,
video.par,
colores = c("#66c2a5", "#fc8d62", "#8da0cb")
)

Fleet.Analysis(
vms.data,
dyads = dyads_sotb,
dossier.data.outputs,
dossier.stats.outputs,
gear = "SOTB"
)

# For mid-water otter trawlers

vms_otm <-
vms_trawlers[which(as.character(vms_trawlers$gear.maree) == "OTM"), ]
dyads_otm <- readRDS(file = paste0(
dossier.data.outputs,
'ClustPCAoutputs-PTM-OTM-',
paste0(variables, collapse = '-'),
'_ordered.rds'
))

columns <-
c("x",
"y",
"date",
"id",
"burst",
"abs.angle",
"LONGITUDE",
"LATITUDE",
"line")
vms.data <- vms_otm[, columns]

Prox_delta <- c(1, 3, 5)
DI_beta <- 1

plots.video.dyads.allgears(
vms.data,
dyads = dyads_otm,
n = 1,
gear = "OTM",
new.variables = variables,
dossier.data.outputs,
dossier.stats.outputs,
text.dyads = TRUE,
video.par,
colores = c("#66c2a5", "#fc8d62", "#8da0cb")
)

Fleet.Analysis(
vms.data,
dyads = dyads_otm,
dossier.data.outputs,
dossier.stats.outputs,
gear = "OTM"
)

# For tuna purse-seiners

dyads_tps <- readRDS(file = paste0(
dossier.data.outputs,
'ClustPCAoutputs-PTM-thon-',
paste0(variables, collapse = '-'),
'_ordered.rds'
))

columns <-
c("x",
"y",
"date",
"id",
"burst",
"abs.angle",
"LONGITUDE",
"LATITUDE",
"lignes")
vms.data <- vms_thon[, columns]
names(vms.data) <-
columns <-
c("x",
"y",
"date",
"id",
"burst",
"abs.angle",
"LONGITUDE",
"LATITUDE",
"line")

Prox_delta <- c(10, 30, 60)
DI_beta <- 1

plots.video.dyads.allgears(
vms.data,
dyads = dyads_tps,
n = 1,
gear = "TPS",
new.variables = variables,
dossier.data.outputs,
dossier.stats.outputs,
text.dyads = TRUE,
video.par,
colores = c("#66c2a5", "#fc8d62", "#8da0cb")
)
# Since there are no dyads in group 1, there is no network analysis for this fleet.

# For anchovy purse-seiners

dyads_aps <- readRDS(
file = paste0(
dossier.data.outputs,
'ClustPCAoutputs-PTM-APS-10-',
paste0(variables, collapse = '-'),
'_ordered.rds'
)
)

columns <-
c("x",
"y",
"date",
"id",
"burst",
"abs.angle",
"LONGITUDE",
"LATITUDE",
"ligne")
vms.data <- vms_aps[, columns]
names(vms.data) <-
columns <-
c("x",
"y",
"date",
"id",
"burst",
"abs.angle",
"LONGITUDE",
"LATITUDE",
"line")

Prox_delta <- c(1, 3, 5)
DI_beta <- 1

plots.video.dyads.allgears(
vms.data,
dyads = dyads_aps,
n = 1,
gear = "APS",
new.variables = variables,
dossier.data.outputs,
dossier.stats.outputs,
text.dyads = TRUE,
video.par,
colores = c("#66c2a5", "#fc8d62", "#8da0cb")
)

Fleet.Analysis(
vms.data,
dyads = dyads_aps,
dossier.data.outputs,
dossier.stats.outputs,
gear = "APS"
)