16S_Analysis_pipeline_v4-3.Rmd

title: "16S Data Analysis"
author: "Simeon Lim Rossmann, Marte Persdatter Tangvik"
date: "23.12.2024"
output:
  html_document: default
urlcolor: blue
# CHANGE ME to the directory that contains 'seqtab_nochim.rds'
path <- "16S_DADA2_results_260821/"

# CHANGE ME to TRUE to list all samples and generate an empty metadata file
optional_sample_check <- TRUE

# CHANGE ME to TRUE to update cuphyr
update_cuphyr <- TRUE

# Initiate by loading packages and setting knit options
################# NO CHANGES NECESSARY BELOW #################
# Echo the code, hide messages and warnings in the rendered document.
# NOTE(review): 'root.dir' is documented by knitr as a package option
# (knitr::opts_knit), not a chunk option - confirm whether setting it here has
# the intended effect before relying on it.
knitr::opts_chunk$set(
  echo = TRUE,
  root.dir = paste0(path),
  message = FALSE,
  warning = FALSE
)

# Optionally (re)install cuphyr from GitHub before it is loaded
if (update_cuphyr) {
  devtools::install_github("simeross/cuphyr")
}

# Sequence and microbiome specific libraries
library(dada2)
library(Biostrings)
library(DECIPHER)
library(cuphyr)
# The export of phyloseq objects to a BIOM format and the generation of fancier
# ordination plots require the phyloseq-extended package. It is installed from
# the 'dev' branch of the author's repository, but only when it is not
# available yet, so that re-running the pipeline does not trigger a GitHub
# install every time.
if (!requireNamespace("phyloseq.extended", quietly = TRUE)) {
  remotes::install_github("mahendra-mariadassou/phyloseq-extended", ref = "dev")
}
# Source some extra functions, including the better ordination plot
# implementation.
# NOTE(review): the helpers are sourced from the 'master' branch while the
# package itself comes from 'dev' - confirm that this mix is intended.
source("https://raw.githubusercontent.com/mahendra-mariadassou/phyloseq-extended/master/load-extra-functions.R")

library(phyloseq)
library(SIAMCAT)

# Phylogeny libraries
library(phangorn)
library(ape)

# Plotting and figure export
library(gridExtra)
library(viridis)
library(ggpubr)

# Tidyverse
library(tidyverse)
library(stringr)

# Various packages for specific analysis
library(readxl)
library(openxlsx)
library(ggpmisc)
library(betareg)
library(BBmisc)
library(aod)
library(betareg)
#install.packages('MicrobiomeStat')
library(MicrobiomeStat)

# Create the output directory below 'path'. dir.create() emits a warning (and
# leaves the directory untouched) when it already exists.
outp <- paste0(path,"/analysis_output")
dir.create(outp)

# Optional helper: list the samples found in 'seqtab_nochim.rds' and write an
# empty, tab-separated metadata template ('descriptors_blank.txt') for them.
if (optional_sample_check) {
  seqtabcheck <- readRDS(paste0(path,"/seqtab_nochim.rds"))
  samps <- rownames(seqtabcheck)
  lensamps <- length(samps)
  # One empty string per sample, reused for every placeholder column
  blankcol <- character(lensamps)
  blanktable <- data.frame(
    SampleIDs = samps,
    ExampleProperty1 = blankcol,
    ExampleProperty2 = blankcol,
    ExampleProperty3 = blankcol
  )
  write.table(
    blanktable,
    file = paste0(path, "/descriptors_blank.txt"),
    sep = "\t",
    row.names = FALSE
  )
  cat("'seqtab_nochim.rds' contains samples in the following order:\n",
      samps, "\nThe number of samples in the file is:", lensamps, sep = "\n")
  # Drop the helper objects that only this check needed
  rm(seqtabcheck, samps, lensamps, blankcol, blanktable)
}
# The two switches are not needed after this point
rm(optional_sample_check, update_cuphyr)
# All global analysis settings are collected in one dedicated environment for a
# better overview and a collected export of the settings.
parameters <- new.env()

# CHANGE ME to "TRUE" to remove control samples from the analysis or "FALSE" to
# analyse all samples.
parameters$prune_controls <- "TRUE"
# CHANGE ME to a list of unique identifiers that only occur in the names of
# samples you do NOT want to analyse. Common examples are provided.
parameters$controls <- c(
  "Pos", "NegativK", "PositivK", "Vann", "H2O", "Neg", "Kontr", "Contr", "POSK"
)

# CHANGE ME to "TRUE" to remove certain taxonomic groups from the analysis by
# name. This is useful to exclude non-target organisms or noise from organelles
# such as Chloroplasts and Mitochondria. It is recommended to first look at all
# data before using this setting.
parameters$prune_noise_taxgroups <- "FALSE"
# CHANGE ME to define the taxonomic groups to be removed as noise.
parameters$noise_taxgroups <- c("Chloroplast", "Mitochondria")

# CHANGE ME to the number of ASV counts [~reads] that analyzed samples should
# minimally have. Samples with fewer ASV counts than 'minASVcount' are pruned.
# Set to 0 to not prune any samples.
parameters$minASVcount <- 3000

# CHANGE ME to "TRUE", if you want to provide a custom taxonomy table instead of
# using the default dada2 output ('taxa.rds').
parameters$customTax <- "TRUE"
# CHANGE ME to the location of the custom taxonomy file. This only matters if
# parameters$customTax is "TRUE", otherwise it is ignored.
parameters$taxfile <- "16S_DADA2_results_260821/custom_BLAST_taxonomy_nt.txt"

# CHANGE ME to "TRUE" to generate a phylogenetic tree. This process takes a
# long time depending on the number of sequences (up to days for thousands).
# If a tree is provided as 'phylotree.rds' in 'path', then it will be used
# regardless of the value of 'parameters$maketree'.
parameters$maketree <- "FALSE"

# CHANGE ME to "TRUE" to root the used phylogenetic tree (if one exists) on the
# leaf with the longest branch (outgroup). This makes analyses that rely on the
# phylogenetic tree reproducible instead of picking a random leaf as root when
# calculating UNIFRAC distances. Implementation based on
# http://john-quensen.com/r/unifrac-and-tree-roots/ and answers
# in https://github.com/joey711/phyloseq/issues/597
parameters$roottree <- "TRUE"

# CHANGE ME to "TRUE" to export all generated phyloseq objects as .biom objects
parameters$biom_export <- "FALSE"
############### NO NEED FOR CHANGES BELOW ###############
# Dedicated environments: 'tmp' holds temporary values, 'plots' the generated
# plot objects and 'set' the settings of the section that is currently running.
tmp <- new.env()
plots <- new.env()
set <- new.env()

# ASV table as written by the DADA2 run
tmp$seqtabp <- readRDS(paste0(path,"/seqtab_nochim.rds"))

# Taxonomy: the default dada2 output, or a custom tab-separated table whose rows
# are labelled with the ASV sequences (the column names of the ASV table)
if (parameters$customTax != "TRUE") {
  tmp$taxap <- readRDS(paste0(path,"/taxa.rds"))
} else {
  tmp$taxap <- read.delim(parameters$taxfile, header = TRUE, sep = "\t")
  rownames(tmp$taxap) <- colnames(tmp$seqtabp)
  tmp$taxap <- as.matrix(tmp$taxap)
}

# Sample metadata and the sample names found in the ASV table
tmp$samp_table <- read.delim(
  paste0(path, "/descriptors.txt"), header = TRUE, sep = "\t"
)
tmp$samp_list <- rownames(tmp$seqtabp)

# The descriptor file has to list exactly as many samples as the ASV table;
# differing names or order in its first column only trigger a warning.
if (nrow(tmp$samp_table) != length(tmp$samp_list)) {
  stop("There are ", nrow(tmp$samp_table),
    " samples in 'descriptors.txt', but ", length(tmp$samp_list),
    " samples in 'seqtab_nochim.rds'. Please make sure that the correct samples
    are contained in descriptors.txt.

    You may use 'optional_sample_check <- TRUE' in the first chunk to generate an
    empty template for 'descriptors.txt'" )
} else if (!identical(tmp$samp_table[, 1], tmp$samp_list)) {
  warning("Warning: The samples in 'descriptors.txt' do not have the same names
          or order as the samples in 'seqtab_nochim.rds'. This may be fine if
          abbreviated names were used or the sample names are not contained in
          the first column of 'descriptors.txt'. Double-checking never hurts!")
}


# Build a phylogenetic tree of the ASVs only when there is no file called
# 'phylotree.rds' in 'path' AND 'parameters$maketree' is "TRUE". The fitted
# model is stored as 'phylotree.rds' so that later runs can reuse it.
if (!file.exists(paste0(path, "/phylotree.rds")) &&
    parameters$maketree == "TRUE") {
  # ASV sequences, named by themselves
  tmp$ASVs <- getSequences(tmp$seqtabp)
  names(tmp$ASVs) <- tmp$ASVs

  # Alignment, converted to the phangorn data format
  tmp$ASV_align <- AlignSeqs(DNAStringSet(tmp$ASVs), anchor = NA)
  tmp$ASV_phang <- phyDat(as(tmp$ASV_align, "matrix"), type = "DNA")

  # Distance-based starting tree
  tmp$dm <- dist.ml(tmp$ASV_phang)
  tmp$treeNJ <- NJ(tmp$dm)

  # Likelihood fit on the starting tree, then optimisation of a GTR model with
  # stochastic tree rearrangement
  tmp$fit <- pml(tmp$treeNJ, data = tmp$ASV_phang)
  tmp$fitGTR <- update(tmp$fit, k = 4, inv = 0.2)
  tmp$fitGTR <- optim.pml(
    tmp$fitGTR,
    model = "GTR",
    optInv = TRUE,
    optGamma = TRUE,
    rearrangement = "stochastic",
    control = pml.control(trace = 0)
  )

  saveRDS(tmp$fitGTR, file = paste0(path, "/phylotree.rds"))
}

## Parse everything into a phyloseq object ('p' = all samples, all ASVs)
# Label the metadata rows with the sample names of the ASV table
row.names(tmp$samp_table) <- tmp$samp_list

if (!file.exists(paste0(path, "/phylotree.rds"))) {
  # No stored tree: ASV table, sample metadata and taxonomy only
  p <- phyloseq(
    otu_table(tmp$seqtabp, taxa_are_rows = FALSE),
    sample_data(tmp$samp_table),
    tax_table(tmp$taxap)
  )
} else {
  # A stored model fit exists: additionally attach its tree
  tmp$treep <- readRDS(paste0(path, "/phylotree.rds"))
  p <- phyloseq(
    otu_table(tmp$seqtabp, taxa_are_rows = FALSE),
    sample_data(tmp$samp_table),
    tax_table(tmp$taxap),
    phy_tree(tmp$treep$tree)
  )
}

## Keep the nucleotide sequences as reference sequences and replace the taxa
## names (the sequences themselves) with short 'ASV<number>' identifiers
tmp$ASV_sequences <- Biostrings::DNAStringSet(taxa_names(p))
taxa_names(p) <- paste0("ASV", seq(ntaxa(p)))
names(tmp$ASV_sequences) <- taxa_names(p)
p <- merge_phyloseq(p, tmp$ASV_sequences)

## Optional pruning of control samples
# Creates 'ps' (the object all further analyses use). When controls are pruned
# it also creates 'ps.contr' (controls only, zero-count ASVs removed) and
# 'ps.transcontr' (controls as per-sample relative abundance).
if (parameters$prune_controls == "TRUE") {
  # length() instead of is.null(): an empty character vector would otherwise
  # produce the pattern "" which matches (and removes) every sample
  if (length(parameters$controls) > 0) {
    # A sample counts as control when its name contains any of the identifiers
    tmp$samp_clean <- tmp$samp_list[!grepl(
      paste0(parameters$controls, collapse = "|"), tmp$samp_list)]
    tmp$contr_pruned <- setdiff(tmp$samp_list, tmp$samp_clean)
    ps <- prune_samples(tmp$samp_clean, p)
    # Physeq object for just the controls
    ps.contr <- prune_samples(tmp$contr_pruned, p)
    ps.contr <- prune_taxa(taxa_sums(ps.contr) > 0, ps.contr)
    ps.transcontr <- transform_sample_counts(
      ps.contr, function(ASV) ASV/sum(ASV))

    # Report on standard output, like the other pruning steps do. cat() returns
    # NULL, so it must not be wrapped in message().
    cat(
      "\n",
      "Number of control samples that were pruned and will not be analysed:\n",
      length(tmp$samp_list) - length(tmp$samp_clean),
      "\n",
      "The following controls were pruned:\n",
      tmp$contr_pruned,
      "The controls are contained in a separate phyloseq object: ps.contr",
      "\n",
      sep = "\n")
  } else {
    # Nothing to prune by: continue with all samples so that 'ps' exists for
    # the following steps, and raise a warning that carries the actual text
    ps <- p
    warning(
      "parameters$prune_controls is TRUE but 'parameters$controls' is empty. ",
      "No samples were pruned.",
      call. = FALSE)
  }
} else {
  ps <- p
}

# Prune ASVs defined as noise
if (parameters$prune_noise_taxgroups == "TRUE") {
  # Query every taxonomic level of the taxonomy table for the noise groups and
  # pool the ASV names that cuphyr::list_subset_ASVs() returns
  tmp$ps_taxlvls <- colnames(tax_table(ps))
  tmp$noise_ASVs <- character(0)
  for (lvl in tmp$ps_taxlvls) {
    tmp$noise_ASVs <- c(
      tmp$noise_ASVs,
      cuphyr::list_subset_ASVs(
        physeq = ps,
        subv = parameters$noise_taxgroups,
        taxlvlsub = lvl
      )
    )
  }
  tmp$noise_ASVs <- unique(tmp$noise_ASVs)

  # Everything that was not flagged is kept
  tmp$no_noise_ASVs <- setdiff(colnames(otu_table(ps)), tmp$noise_ASVs)

  if (length(tmp$noise_ASVs) == 0) {
    cat("No ASVs were recognized as belonging to the following taxonomic groups
        defined as noise:\n")
  } else {
    ps <- prune_taxa(tmp$no_noise_ASVs, ps)
    # Snapshot of the object right after noise removal (used for an overview
    # panel later on)
    tmp$no_noise_ps <- ps
    cat(length(tmp$noise_ASVs),
        "ASVs were pruned because they belonged to the following
        taxonomic groups:\n")
  }
  # Both outcomes end by listing the configured noise groups
  cat(parameters$noise_taxgroups, "\n", sep = "\n")
}

# Prune samples with fewer ASV counts than 'parameters$minASVcount'
if (parameters$minASVcount > 0) {
  # Names of the samples below the cutoff
  tmp$samp_pruned <- names(which(sample_sums(ps) < parameters$minASVcount))

  # Report them (if any) before they are dropped
  if (length(tmp$samp_pruned) > 0) {
    cat("The following samples were pruned because ASV counts were lower than",
        parameters$minASVcount,  ":\n")
    cat(tmp$samp_pruned, "\n", sep = "\n")
  }

  # Keep every sample that is not on the list
  ps <- prune_samples(!sample_names(ps) %in% tmp$samp_pruned, ps)
}

# Remove 0 count ASVs (e.g. control ASVs that remain) from the base object
ps <- prune_taxa(taxa_sums(ps) > 0, ps)

# Transformed per sample (per-sample relative abundance)
ps.trans <- transform_sample_counts(ps, function(ASV) ASV / sum(ASV))

# Read NDVI values as numeric
sample_data(ps)[["ndvi"]] <- as.numeric(sample_data(ps)[["ndvi"]])
sample_data(ps.trans)[["ndvi"]] <- as.numeric(sample_data(ps.trans)[["ndvi"]])

# Get tbls of the base object and of the relative-abundance object for easier
# access in some phyloseq-independent analyses. Takes some seconds, potentially
# up to minutes, so each object is melted only once and only after the NDVI
# column has been converted.
ps_tbl <- as_tibble(psmelt(ps))
ps_trans_tbl <- as_tibble(psmelt(ps.trans))

# Condensing to Abundance per Genus and Sample: one row per Genus/Sample pair
# holding the summed relative abundance of all ASVs of that genus. The grouping
# is removed again so that the result is a plain tibble.
genus_abundance_tbl_per_sample <- ps_trans_tbl %>%
  group_by(Genus, Sample) %>%
  mutate(Genus_Sample_Abundance = sum(Abundance)) %>%
  ungroup() %>%
  select(Genus, Sample, ndvi, Genus_Sample_Abundance, Alias) %>%
  distinct()

# Root the phylogenetic tree in its outgroup whenever rooting is requested and
# the phyloseq object actually carries a tree. The tree may come from a provided
# 'phylotree.rds' even when 'parameters$maketree' is "FALSE", so the presence of
# the tree is tested instead of the 'maketree' switch.
if (parameters$roottree == "TRUE" &&
    !is.null(phyloseq::phy_tree(ps, errorIfNULL = FALSE))) {
  phyloseq::phy_tree(ps) <- cuphyr::root_tree_in_outgroup(physeq = ps)
  # 'ps.trans' was derived from 'ps' before rooting; hand it the same rooted
  # tree so that both objects behave identically in tree-based analyses
  phyloseq::phy_tree(ps.trans) <- phyloseq::phy_tree(ps)
}

# Optional export of the phyloseq objects as BIOM files into 'path'
if (parameters$biom_export == "TRUE") {
  # All file names are built with file.path() so that the export does not
  # depend on 'path' ending in a slash
  suppressWarnings(phyloseq.extended::write_phyloseq(
    p, biom_file = file.path(path, "all_samples.biom"),
    biom_format = "standard"))
  suppressWarnings(phyloseq.extended::write_phyloseq(
    ps, biom_file = file.path(path, "samples_without_controls.biom"),
    biom_format = "standard"))
  suppressWarnings(phyloseq.extended::write_phyloseq(
    ps.trans, biom_file = file.path(
      path, "samples_without_controls_rel_abundance.biom"),
    biom_format = "standard"))
  # 'ps.contr' only exists when control samples were pruned
  if (exists("ps.contr")) {
    suppressWarnings(phyloseq.extended::write_phyloseq(
      ps.contr, biom_file = file.path(path, "just_controls.biom"),
      biom_format = "standard"))
  }
}

# Print a summary of the base object
ps
##### Optional settings (sensible defaults) #####
# Can be changed to adjust the output format for all plots. Default "pdf",
# possible "eps"/"ps", "tex" (pictex), "jpeg", "tiff", "png", "bmp" and "svg"
parameters$output_format <- "pdf"

# Can be changed to preferred ggplot2 theme. Recommended: "theme_bw()".
theme_set(theme_bw())

############### NO NEED FOR CHANGES BELOW ###############

# Default discrete viridis scales for color and fill
my_scale_col <- scale_color_viridis(discrete = TRUE)
my_scale_fill <- scale_fill_viridis(discrete = TRUE)

# Custom, more narrow color ranges based on viridis
# Number of colors per range
tmp$n_col <- 20
# Index order 1, 6, 11, 16, 2, 7, ... so that adjacent colors are distinct from
# each other
tmp$sort_colors <- as.vector(matrix(1:20, nrow = 4, byrow = TRUE))

# Customized vectors, each a reordered slice of a viridis color map
tmp$viridis_greens <- viridis(
  tmp$n_col, option = "D", begin = 0.85, end = 0.7
)[tmp$sort_colors]
tmp$viridis_reds <- viridis(
  tmp$n_col, option = "B", begin = 0.7, end = 0.5
)[tmp$sort_colors]
tmp$viridis_blues <- viridis(
  tmp$n_col, option = "D", begin = 0.2, end = 0.4
)[tmp$sort_colors]
tmp$viridis_yellows <- viridis(
  tmp$n_col, option = "D", begin = 1, end = 0.9
)[tmp$sort_colors]
tmp$viridis_dark <- viridis(
  tmp$n_col, option = "A", begin = 0, end = 0.1
)[tmp$sort_colors]
tmp$viridis_light <- viridis(
  tmp$n_col, option = "A", begin = 1, end = 0.9
)[tmp$sort_colors]

# Collected, named list that is available in the global environment
sub_viridis <- list(
  greens = tmp$viridis_greens,
  blues = tmp$viridis_blues,
  yellows = tmp$viridis_yellows,
  lights = tmp$viridis_light,
  reds = tmp$viridis_reds,
  darks = tmp$viridis_dark
)

# File extension (with leading dot) used when saving plots
tmp$out <- paste0(".", parameters$output_format)

#################### Function ############################

# Generic save function for plots.
#
# Writes the plot into the output directory 'outp' via ggsave(). If a file with
# the requested name already exists and 'overwrite' is FALSE, a d-m-y + time
# stamp is appended to the file name so that nothing is overwritten.
#
# Arguments:
#   pl        plot object handed to ggsave()
#   filetype  file extension including the leading dot, e.g. ".pdf"
#   plot_name file name without extension
#   overwrite set to TRUE to replace an existing file of the same name
#
# Width and height (in cm) and resolution (in dpi) are taken from 'wp', 'hp'
# and 'res' in the 'set' environment; missing values fall back to 20 x 20 cm at
# 300 dpi. Returns whatever ggsave() returns.
save_plot <- function(
  pl, filetype = ".pdf", plot_name = "my_plot", overwrite = FALSE) {
  wp <- if (!is.null(set$wp)) set$wp else 20
  hp <- if (!is.null(set$hp)) set$hp else 20
  res <- if (!is.null(set$res)) set$res else 300

  # file.path() supplies the separator, so the name itself carries no slash
  target <- file.path(outp, paste0(plot_name, filetype))
  if (file.exists(target) && !overwrite) {
    stamp <- format(Sys.time(), "%d-%m-%y_%H%M%S")
    target <- file.path(outp, paste0(plot_name, "_", stamp, filetype))
  }

  ggsave(target, pl, width = wp, height = hp, units = "cm", dpi = res)
}

################################################
# CHANGE ME to the sample group for color coding. Accepted values are the column
# headers in the descriptor file.
set$color_by <- "mel_mol_asv"

##### Optional settings (sensible defaults) #####

# Width (in cm) of the saved plot.
set$wp <- 17
# Height (in cm) of the saved plot.
set$hp <- 20
# Resolution (in dpi) of the saved plot.
set$res <- 300

############### NO NEED FOR CHANGES BELOW ###############
# Rank samples: ranked per-sample sums for all samples ('p') and for the pruned
# base object ('ps').
# NOTE(review): 'tmp$subset_id' is not assigned anywhere in the visible code, so
# NULL is passed as 'myset' unless it is defined elsewhere - confirm intent.
set$ranked <- cuphyr::make_ranked_sums(p, myset = tmp$subset_id)
set$ranked_ps <- cuphyr::make_ranked_sums(ps, myset = tmp$subset_id)

# Shared axis limits (10 % headroom on y) so that all panels are comparable
set$ymax <- max(set$ranked$Abundance)
set$ymax <- set$ymax + round(set$ymax / 10)
set$xmax <- nrow(set$ranked) + 1
set$title2 <- "Samples (without controls)"

# Stabilize colors: one fixed color per value of the 'color_by' column, derived
# from all samples so that every panel uses the same mapping
set$color_vars <- set$ranked[, set$color_by] %>%
  unlist() %>% as.character() %>% unique()
set$color_vars <- sort(set$color_vars)
set$color_varsPalette <- viridis(length(set$color_vars))
names(set$color_varsPalette) <- set$color_vars
set$my_scale_fill <- scale_fill_manual(values = set$color_varsPalette)

# A noise panel is only drawn when noise ASVs were actually pruned
set$has_noise_panel <- length(tmp$noise_ASVs) > 0

# Panel: all samples
plots$overview_all <- ggplot(data = set$ranked, aes(x = Rank, y = Abundance)) +
  aes_string(fill = set$color_by) +
  geom_col() + set$my_scale_fill + ggtitle("All samples") + ylim(0, set$ymax) +
  xlim(0, set$xmax) + ylab("ASV counts ('reads')")

# Panel: samples after noise removal (optional)
if (set$has_noise_panel) {
  set$ranked_nonoise <- cuphyr::make_ranked_sums(
    tmp$no_noise_ps, myset = tmp$subset_id)
  plots$overview_noise <- ggplot(
    data = set$ranked_nonoise, aes(x = Rank, y = Abundance)) +
    aes_string(fill = set$color_by) +
    geom_col() + set$my_scale_fill +
    ggtitle("Samples (without controls), noise ASVs removed") +
    ylim(0, set$ymax) +
    xlim(0, set$xmax) + ylab("ASV counts ('reads')")
}

# Mark the ASV count cutoff in the 'all samples' panel when one is in use
if (parameters$minASVcount > 0) {
  plots$overview_all <- plots$overview_all +
    geom_hline(yintercept = parameters$minASVcount, linetype = "dashed") +
    ggtitle("All samples (ASV count cutoff indicated)")
  set$title2 <- "Samples (without controls and low count samps)"
}

# Panel: samples that remain in the analysis
plots$overview_ps <- ggplot(
  data = set$ranked_ps, aes(x = Rank, y = Abundance)) +
  aes_string(fill = set$color_by) +
  geom_col() + set$my_scale_fill + ggtitle(set$title2) + ylim(0, set$ymax) +
  xlim(0, set$xmax) + ylab("ASV counts ('reads')")

# Combine the panels. The three-row layout is used only when the noise panel
# exists; otherwise the figure would contain an empty middle row.
if (set$has_noise_panel) {
  plots$combo_overview <- ggarrange(
    plots$overview_all, plots$overview_noise, plots$overview_ps,
    nrow = 3, align = "v",
    common.legend = TRUE, legend = "right")
} else {
  plots$combo_overview <- ggarrange(
    plots$overview_all, plots$overview_ps, nrow = 2, align = "v",
    common.legend = TRUE, legend = "right")
}

# Save plots
save_plot(plots$combo_overview, plot_name = "Overview_all_and_pruned",
          filetype = tmp$out)

# Clean up plot parameters
rm(list = ls(set), envir = set)

# Print plots
plots$combo_overview
# CHANGE ME to the desired sample categories on the x-axis. In this case it
# should be the Sample names.
set$x_axis_value <- "SampleIDs"

# CHANGE ME to the taxonomic level for color coding. Use "OTU" for ASVs,
# "Genus", "Species" or "OTU" recommended to compare pos. controls.
set$color_by_taxlvl <- "Species"

# CHANGE ME to the taxonomic level for labeling the tree tips (if phylogenetic
# tree is available). Use "OTU" for ASVs.
set$label_by_taxlvl <- "OTU"

# CHANGE ME to a sample category to shape the tree tip labels by (if
# phylogenetic tree is available).
set$label_shape_by <- "Symptoms"

##### Optional settings (sensible defaults) #####

# Can be changed to generate a tree for just the control sequences IF no
# phylogenetic tree for all sequences is provided. This may slow down this
# chunk when running it for the first time.
set$control_tree <- TRUE

# Width (in cm) of the saved plot.
set$wp <- 17
# Height (in cm) of the saved plot.
set$hp <- 20
# Resolution (in dpi) of the saved plot.
set$res <- 300

############### NO NEED FOR CHANGES BELOW ###############
# If a control tree is requested and the control object carries no tree yet
# (phy_tree() fails), build one for the control ASVs only. The fitted model is
# cached as 'controls_phylotree.rds' in 'path' and reused when that file exists.
if (set$control_tree &&
    inherits(try(phy_tree(ps.transcontr), silent = TRUE), "try-error")) {
  if (!file.exists(paste0(path, "/controls_phylotree.rds"))) {
    set$ASVs <- phyloseq::refseq(ps.transcontr)
    set$ASV_align <- AlignSeqs(set$ASVs, anchor = NA)
    set$ASV_phang <- phyDat(as(set$ASV_align, "matrix"), type = "DNA")
    set$dm <- dist.ml(set$ASV_phang)
    set$treeNJ <- NJ(set$dm)
    set$fit <- pml(set$treeNJ, data = set$ASV_phang)
    set$fitGTR <- update(set$fit, k = 4, inv = 0.2)
    set$fitGTR <- optim.pml(set$fitGTR, model = "GTR",
                            optInv = TRUE, optGamma = TRUE,
                            rearrangement = "stochastic",
                            control = pml.control(trace = 0))
    saveRDS(set$fitGTR, file = paste0(path, "/controls_phylotree.rds"))
  }
  set$fitGTR <- readRDS(paste0(path, "/controls_phylotree.rds"))
  phyloseq::phy_tree(ps.transcontr) <- set$fitGTR$tree
}

# Bar plot of the controls: absolute ASV counts
plots$topnpplot <- plot_bar(ps.contr, x = set$x_axis_value,
                            fill = set$color_by_taxlvl) + my_scale_fill +
  theme(axis.title.x = element_blank(), legend.position = "none",
        legend.key.size = unit(3, "mm")) +
  ylab("ASV counts") + guides(col = guide_legend(ncol = 3))

# Bar plot of the controls: per-sample relative abundance
plots$topntplot <- plot_bar(ps.transcontr, x = set$x_axis_value,
                            fill = set$color_by_taxlvl) + my_scale_fill +
  theme(axis.title.x = element_blank(), legend.position = "none",
        legend.key.size = unit(3, "mm")) +
  ylab("Relative abundance") + guides(col = guide_legend(ncol = 3))

# Default layout: the two bar plots side by side
plots$combo_contr <- ggarrange(plots$topnpplot, plots$topntplot, ncol = 2,
                               labels = c("A", "B"), align = "hv",
                               common.legend = TRUE, legend = "right")

# With a tree available: tree on top (A), bar plots below (B, C)
if (!inherits(try(phy_tree(ps.transcontr), silent = TRUE), "try-error")) {
  plots$tre <- plot_tree(
    ps.transcontr, ladderize = "left", label.tips = set$label_by_taxlvl,
    color = "abundance", text.size = 2.5, shape = set$label_shape_by) +
    scale_color_viridis_c(aesthetics = c("color","fill")) +
    theme(legend.position = "left", panel.border = element_blank())
  plots$combo_contr <- ggarrange(
    plots$tre,
    ggarrange(plots$topnpplot, plots$topntplot, ncol = 2,
              labels = c("B", "C"), align = "hv",
              common.legend = TRUE, legend = "right"),
    nrow = 2, legend = "right", labels = c("A"))
}

# save
save_plot(plots$combo_contr, plot_name = "Controls", filetype = tmp$out)

plots$combo_contr
# The character vector can later be accessed by calling 'tmp$tops'
tmp$tops <- cuphyr::abundant_tax_physeq(physeq = ps,
                                        lvl = "Superkingdom",
                                        top = 10,
                                        output_format = "tops",
                                        ignore_na = TRUE,
                                        silent = FALSE)

# Number of ASVs per superkingdom
ps_trans_tbl %>%
  select(OTU, Superkingdom) %>%
  unique() %>%
  group_by(Superkingdom) %>%
  summarise(length(OTU))

# Total ASV counts
ps_tbl %>%
  select(OTU, Abundance) %>%
  summarise(total_sum = sum(Abundance))

# Total ASV counts per superkingdom
ps_tbl %>%
  select(OTU, Superkingdom, Abundance) %>%
  group_by(Superkingdom) %>%
  summarise(Abundance_sum = sum(Abundance))

# Percentage of ASV counts belonging to Bacteria:
# ASV counts Bacteria / Total ASV counts, computed from the current data instead
# of numbers copied from an earlier run. %in% treats a missing superkingdom as
# "not Bacteria".
# NOTE(review): assumes the superkingdom is labelled exactly "Bacteria" in the
# taxonomy table - confirm against the table printed above.
bacteria_percentage <- sum(
  ps_tbl$Abundance[ps_tbl$Superkingdom %in% "Bacteria"]
) / sum(ps_tbl$Abundance) * 100
bacteria_percentage
# CHANGE ME to the sample category that will be shown in separate panels.
# Accepted values are the column headers in your descriptor file.
set$panel_by <- "Symptoms"

# CHANGE ME to the desired sample categories on the x-axis.
# Accepted values are the column headers in the descriptor file.
set$x_axis_value <- "Alias"

# CHANGE ME to the count of top ASVs you want to plot (e.g. 'set$topASVs = 20'
# plots the 20 most abundant ASVs)
set$topASVs <- 100

# CHANGE ME to the taxonomic level of interest (color coding). Accepted values
# are the column headers of the taxonomy table.
set$taxlvl <- "Genus"

# CHANGE ME to change the number of Top n taxa to be plotted at taxlvl.
set$top_n <- 100

# CHANGE ME to an entry at the chosen taxonomic level you want to highlight.
# Comment out to not highlight anything.
# NOTE(review): the value ends in a double 's' - verify that it matches an
# entry of the taxonomy table, otherwise nothing is highlighted.
set$highlight <- "Arthrobotryss"

##### Optional settings (sensible defaults) #####

# Can be changed to turn unified coloring on or off (same taxonomy term = same
# color in both plots). Highlighting will unify colors even if unify_colors
# is FALSE.
set$unify_colors <- TRUE

# Can be changed to include (FALSE) or exclude (TRUE) NA values in the barplot
set$ignore_na <- TRUE

# Can be changed to remove ASV segmentation in the top n taxlvl plot. This
# improves visual clarity when a bar segment appears black due to the border of
# many small ASVs overlapping.
set$fuse_ASVs <- TRUE

# Width (in cm) of the saved plot.
set$wp <- 40
# Height (in cm) of the saved plot.
set$hp <- 20
# Resolution (in dpi) of the saved plot.
set$res <- 300

# y-axis label
set$y_axis_label <- "Relative abundance"

############### NO NEED FOR CHANGES BELOW ###############

# Make physeq objects of top n taxa and top n ASVs
set$ps.topnTax <- cuphyr::abundant_tax_physeq(ps.trans, lvl = set$taxlvl,
                                              top = set$top_n,
                                              ignore_na = set$ignore_na)
# head() returns at most 'topASVs' names and never pads with NA when the data
# set holds fewer ASVs than requested
set$topnASVs <- head(names(sort(taxa_sums(ps), decreasing = TRUE)),
                     set$topASVs)
set$ps.topnASVs <- prune_taxa(set$topnASVs, ps.trans)

# A shared, named palette is needed for unified colors, for highlighting and
# for fused ASVs. 'inherits = FALSE' restricts the lookup to the 'set'
# environment, so an unrelated global object called 'highlight' cannot switch
# highlighting on.
set$has_highlight <- exists("highlight", envir = set, inherits = FALSE)
set$use_palette <- set$unify_colors || set$has_highlight || set$fuse_ASVs

if (set$use_palette) {
  # One color per taxonomy term that occurs in either plot
  set$toptax <- union(phyloseq::tax_table(set$ps.topnTax)[, set$taxlvl],
                      phyloseq::tax_table(set$ps.topnASVs)[, set$taxlvl])
  set$toptax <- sort(set$toptax)
  set$taxlvlPalette <- viridis(length(set$toptax))
  names(set$taxlvlPalette) <- set$toptax
  if (set$has_highlight) {
    # It is possible to change the highlight color here by substituting
    # 'sub_viridis$reds[4]' with a hexcode-string, e.g. '#ff7f7f'
    set$taxlvlPalette[set$highlight] <- sub_viridis$reds[4]
  }
  set$taxlvlPalette <- set$taxlvlPalette[sort(names(set$taxlvlPalette))]
  set$my_scale_fill <- scale_fill_manual(values = set$taxlvlPalette,
                                         na.value = "grey")
} else {
  set$my_scale_fill <- my_scale_fill
}

# Plot: top n taxa. The fill scale is restricted to the terms of this plot.
if (set$use_palette) {
  set$my_scale_fill <- scale_fill_manual(
    values = set$taxlvlPalette[
      sort(unique(phyloseq::tax_table(set$ps.topnTax)[, set$taxlvl]))],
    na.value = "grey")
}
plots$topn_tax <- plot_bar(set$ps.topnTax,
                           x = set$x_axis_value,
                           fill = set$taxlvl,
                           title = paste0("Top", set$top_n, "_", set$taxlvl)) +
  facet_grid(paste0("~", set$panel_by), scales = "free", space = "free") +
  set$my_scale_fill +
  ylab(set$y_axis_label)

# Draw the bar outlines in the fill color so that ASV segments of the same
# taxon visually fuse
if (set$fuse_ASVs) {
  plots$topn_tax <- plots$topn_tax + geom_bar(
    aes_string(color = set$taxlvl, fill = set$taxlvl),
    stat = "identity", position = "stack") +
    scale_color_manual(values = set$taxlvlPalette, na.value = NA)
}

# Plot: top n ASVs. The fill scale is restricted to the terms of this plot.
if (set$use_palette) {
  set$my_scale_fill <- scale_fill_manual(
    values = set$taxlvlPalette[
      sort(unique(phyloseq::tax_table(set$ps.topnASVs)[, set$taxlvl]))],
    na.value = "grey")
}
plots$topn_ASVs <- plot_bar(set$ps.topnASVs,
                            x = set$x_axis_value,
                            fill = set$taxlvl,
                            title = paste0("Top", set$topASVs, "_ASVs")) +
  facet_wrap(paste0("~", set$panel_by), scales = "free_x") +
  set$my_scale_fill +
  ylab(set$y_axis_label)

# save
save_plot(plots$topn_tax, plot_name = paste0("Top", set$top_n, "_", set$taxlvl),
          filetype = tmp$out)
save_plot(plots$topn_ASVs, plot_name = paste0("Top", set$topASVs, "_ASVs"),
          filetype = tmp$out)

# Clean up plot parameters
rm(list = ls(set), envir = set)

# Print to standard out
plots$topn_tax
plots$topn_ASVs


# CHANGE ME to the desired sample categories on the x-axis.
# Accepted values are the column headers in the descriptor file.
set$x_axis_value <- "ndvi"

# CHANGE ME to the taxonomic level of interest (color coding). Accepted values
# are the column headers of the taxonomy table.
set$taxlvl <- "Genus"

# CHANGE ME to change the number of Top n taxa to be plotted at taxlvl.
set$top_n <- 10

# Can be changed to include (FALSE) or exclude (TRUE) NA values in the barplot
set$ignore_na <- TRUE

# CHANGE ME to an entry at the chosen taxonomic level you want to highlight.
# Comment out to not highlight anything.
#set$highlight = "Meloidogyne"

##### Optional settings (sensible defaults) #####

# Width (in cm) of the saved plot.
set$wp <- 20
# Height (in cm) of the saved plot.
set$hp <- 13
# Resolution (in dpi) of the saved plot.
set$res <- 300

# y-axis label
set$y_axis_label <- "Relative abundance"

# x-axis label
set$x_axis_label <- "Sample"

############### NO NEED FOR CHANGES BELOW ###############
# Estimate Alpha-diversity (Shannon), one row per sample
set$alpha_div_ps_trans <- estimate_richness(ps.trans, measures = "Shannon") %>%
  as_tibble(rownames = "Sample")

# Make physeq object of the top n taxa
set$ps.topnTax <- cuphyr::abundant_tax_physeq(ps.trans, lvl = set$taxlvl,
                                              top = set$top_n,
                                              ignore_na = set$ignore_na)

# Plot
set$my_scale_fill <- my_scale_fill

# One row per Genus and sample (Alias) with the summed relative abundance, the
# sample's NDVI and its Shannon index.
# NOTE(review): the columns 'Genus', 'Alias' and 'ndvi' are hard-coded here, so
# this table ignores 'set$taxlvl' and 'set$x_axis_value'.
set$topntax_tbl <- psmelt(set$ps.topnTax) %>%
  as_tibble() %>%
  left_join(set$alpha_div_ps_trans, by = "Sample") %>%
  select(Genus, Alias, ndvi, Abundance, Shannon) %>%
  filter(Abundance > 0) %>%
  group_by(Genus, Alias, ndvi, Shannon) %>%
  # Drop the grouping: a rank computed inside these one-row groups would be 1
  # for every row
  summarise(Abundance = sum(Abundance), .groups = "drop") %>%
  arrange(ndvi) %>%
  # Rank of the NDVI value across all rows; rows of the same sample share it
  mutate(ndvi_rank = dense_rank(ndvi))

# Stacked bars of the top genera, samples ordered by NDVI
plots$topn_tax_custom <- ggplot(
  set$topntax_tbl,
  aes(x = fct_reorder(Alias, ndvi), y = Abundance, fill = Genus)) +
  geom_col(color = "black") +
  scale_fill_viridis(discrete = TRUE, na.value = "grey") +
  ylab(set$y_axis_label) +
  xlab(set$x_axis_label) +
  theme(strip.background = element_blank(),
        # strip.text = element_text(size = 16),
        # axis.title = element_text(size = 16),
        # legend.text = element_text(size = 14),
        legend.position = "bottom")

# NDVI per sample, same sample order
plots$ndvi_dot_plot <- ggplot(
  set$topntax_tbl, aes(fct_reorder(Alias, ndvi), y = ndvi)) +
  geom_point() +
  theme(strip.background = element_blank(),
        # strip.text = element_text(size = 16),
        # axis.title = element_text(size = 16),
        # legend.text = element_text(size = 14),
        axis.title.x = element_blank()) +
  ylab("NDVI")

# Shannon index per sample, same sample order
plots$shannon_dot_plot <- ggplot(
  set$topntax_tbl, aes(fct_reorder(Alias, ndvi), y = Shannon)) +
  geom_point() +
  theme(strip.background = element_blank(),
        # strip.text = element_text(size = 16),
        # axis.title = element_text(size = 16),
        # legend.text = element_text(size = 14),
        axis.title.x = element_blank()) +
  ylab("Shannon")

# Stack the three panels; the bar plot gets three times the height of a dot plot
plots$combo_topn_custom <- ggarrange(plots$ndvi_dot_plot,
                                     plots$shannon_dot_plot,
                                     plots$topn_tax_custom,
                                     nrow = 3,
                                     heights = c(1, 1, 3),
                                     align = "v")

save_plot(plots$combo_topn_custom, plot_name = "Customized_NDVI_Shannon_plot",
          filetype = tmp$out)
plots$combo_topn_custom

# Per-sample NDVI and Shannon values, tagged with the taxon group of this run
topntax_data <- set$topntax_tbl %>%
  mutate(Taxa = "bacteria") %>%
  ungroup() %>%
  select(Alias, ndvi, Shannon, Taxa) %>%
  distinct()

# The target directory lies outside 'path'; create it when missing so that the
# export does not fail on a fresh checkout
set$topntax_file <- "../topntax_all_taxa/topntax_data_bacteria.csv"
dir.create(dirname(set$topntax_file), showWarnings = FALSE, recursive = TRUE)
write.csv(topntax_data, file = set$topntax_file)


# 'ps_trans_tbl' (melted relative-abundance table) is reused from the data
# preparation step; 'ps.trans' has not changed since, so the slow psmelt() call
# is not repeated here.

# Genera of interest
genera_of_interest <- c(
  "Bacillus", "Paenibacillus", "Pseudarthrobacter", "Bradyrhizobium",
  "Sporosarcina", "Clostridium", "Candidatus Nitrosocosmicus", "Pseudomonas",
  "Rhizobium"
)
#genera_of_interest <- c("Globodera, Meloidogyne, Pristionchus, Rhabditis, Pellioditis, Cephaloboides, Aporcelaimellus, Acrobeloides, Nygolaimus, Cruznemas")

# Taking ps_trans_tbl, grouping by Genus and summarizing (open heart surgery)
# Will give a table with two columns: "Genus" "Genus_total_Abundance"
genus_abundance_tbl <- ps_trans_tbl %>%
  group_by(Genus) %>%
  summarise(Genus_total_Abundance = sum(Abundance))


# If you want per genus and sample
#genus_abundance_tbl_per_sample <- ps_trans_tbl %>%
#  group_by(Genus, Sample) %>%
#  summarise(Genus_Sample_Abundance = sum(Abundance)) %>%
#  filter(Genus %in% genera_of_interest) %>%
#  group_by(Sample)


# To get back all the other info, one way is to fuse back to original table.
# Only the key columns and the new abundance column are joined; 'ndvi' and
# 'Alias' already exist in 'ps_trans_tbl' and would otherwise be duplicated
# with '.x'/'.y' suffixes.
ps_tbl_with_genus_abundance <- left_join(
  ps_trans_tbl,
  genus_abundance_tbl_per_sample %>%
    ungroup() %>%
    select(Genus, Sample, Genus_Sample_Abundance) %>%
    distinct(),
  by = c("Genus", "Sample")
)

library(readxl)

# Can be made more compact by removing ASV+Species info and original abundance
# values and then reducing to unique rows. 'Abundance' has to be dropped as
# well, otherwise the per-ASV rows stay distinct.
genus_abundance_tbl_full_info <- ps_tbl_with_genus_abundance %>%
  select(-OTU, -Species, -Abundance) %>%
  distinct()


write.xlsx(genus_abundance_tbl_per_sample, "genus_abundance_tbl_per_sample.xlsx")

# ndvi <- read_excel("~/Documents/GitHub/metamarte/Data_analysis/2021-patch-metabarcoding/16S/genus_abundance_tbl_per_sample_copy.xlsx")
#
#


# Create properly formatted tibble with columns Sample, ndvi_01 (ndvi rescaled
# as (ndvi + 1) / 2) and one column for each genus containing the sample
# abundances for that genus.
ldf <- data.frame(
  genus_abundance_tbl_per_sample %>%
    pivot_wider(id_cols = c("Sample", "ndvi"),
                names_from = "Genus",
                values_from = "Genus_Sample_Abundance")
)

# Genus columns only, with blanks in the names replaced by dots
ldf_genus_data <- ldf %>% select(!c("Sample", "ndvi"))
colnames(ldf_genus_data) <- gsub(" ", ".", colnames(ldf_genus_data))

ldf <- as_tibble(cbind(
  data.frame(Sample = ldf$Sample, ndvi_01 = (ldf$ndvi + 1) / 2.0),
  ldf_genus_data
))

# Number of samples in which each genus occurs, most widespread genus first
n_samples_by_genus <- sort(colSums(ldf_genus_data > 0), decreasing = TRUE)
keep_n <- 100 # Maximum number of genuses to include in the analysis
# head() returns at most 'keep_n' names, without NA padding when fewer exist
top_n_occurence_genuses <- head(names(n_samples_by_genus), keep_n)

# Feature table (transposed so that genera are rows and samples are columns)
# and the matching metadata holding the covariate
l_genus_ldf <- ldf %>% select(all_of(top_n_occurence_genuses))
l_genus_ldf_transposed <- data.frame(t(l_genus_ldf))
l_meta_ldf <- ldf %>% select("ndvi_01")

# Fit the LinDA model of genus abundance (given as proportions) against the
# rescaled NDVI, without winsorisation and at a significance level of 0.05
l_model <- linda(
  l_genus_ldf_transposed,
  l_meta_ldf,
  formula = "~ ndvi_01",
  feature.dat.type = "proportion",
  is.winsor = FALSE,
  alpha = 0.05
)

# Print model info
l_model
# Show effect size and significance plots
linda.plot(
  l_model,
  variables.plot = c("ndvi_01"),
  alpha = 0.05,
  lfc.cut = 1,
  legend = TRUE
)

l_model_df <- as.data.frame(l_model$output)

# 'colNames' (capital N) is the argument name openxlsx uses for the header row
write.xlsx(l_model_df, file = "supplementary_table_ndvi_regression_16s.xlsx",
           rowNames = TRUE, colNames = TRUE)
sessionInfo()