From 2d3318b52ab5b8fdf0622e5bf866a78376bf0f1c Mon Sep 17 00:00:00 2001
From: Simeon <51403284+simeross@users.noreply.github.com>
Date: Fri, 20 Oct 2023 14:13:07 +0200
Subject: [PATCH] roxygen tags

---
 R/plot_repeat_positions.R            |  32 ++++---
 R/plot_repeat_quantity.R             |  31 +++++--
 R/plot_repeats.R                     |  42 +++++----
 R/plot_variants_per_sample.R         |  95 ++++++++++++--------
 R/quantify_repeats.R                 |  24 +++--
 R/read_and_write_cluster_abundance.R |  60 ++++++++-----
 R/similiarity_to_reference.R         |  38 +++++---
 R/subset_by_clusters.R               |  21 +++--
 R/subset_variant_table.R             |  29 ++++--
 R/test_clustering_thresholds.R       |  41 +++++++--
 R/translate_and_count_stops.R        |  38 ++++++--
 R/variant_classifier.R               | 129 +++++++++++++++------------
 R/veganify_asvcounts.R               |  20 +++--
 R/veganify_generic_wide_tbl.R        |  34 ++++++-
 14 files changed, 428 insertions(+), 206 deletions(-)

diff --git a/R/plot_repeat_positions.R b/R/plot_repeat_positions.R
index 544347f..1c449d7 100644
--- a/R/plot_repeat_positions.R
+++ b/R/plot_repeat_positions.R
@@ -1,13 +1,25 @@
-plot_repeat_positions <- function(repeat_positions) {
-  legend_name <- ""
-  pl <- ggplot(repeat_positions, aes(x = start, xend = end, 
-                                     y = seqname, yend = seqname, 
-                                     color = fragment, size = plot_intensity)) + 
+#' Plot repeat positions
+#'
+#' This function plots repeat positions on a sequence. It takes a data frame
+#' as input which should have columns 'start', 'end', 'seqname', 'fragment',
+#' and 'plot_intensity'. It uses ggplot2 to create the plot.
+#'
+#' @param repeat_positions A data frame with columns 'start', 'end', 'seqname',
+#' 'fragment', and 'plot_intensity'.
+#' @return A ggplot object representing the repeat positions plot.
+#' @import ggplot2
+#' @import viridis
+#' @keywords plot
+plot_repeat_positions <- function(repeat_positions){
+  legend_name <- ''
+  pl <- ggplot(repeat_positions, aes(x = start, xend = end,
+                                     y = seqname, yend = seqname,
+                                     color = fragment, size = plot_intensity)) +
     geom_segment() +
-    guides(size = "none") +
+    guides(size = 'none') +
     scale_color_viridis_d(begin = 0.25, end = 0.8, name = legend_name)  +
-    ylab("Sequence") + xlab("Sequence position (bp)") +
-    theme(legend.position = "top")
-  
+    ylab('Sequence') + xlab('Sequence position (bp)') +
+    theme(legend.position = 'top')
+
   return(pl)
-}
\ No newline at end of file
+}
diff --git a/R/plot_repeat_quantity.R b/R/plot_repeat_quantity.R
index a65b2b1..b38d49d 100644
--- a/R/plot_repeat_quantity.R
+++ b/R/plot_repeat_quantity.R
@@ -1,15 +1,30 @@
+#' Plot Repeat Quantity
+#'
+#' This function creates a plot to visualize the quantity of repeated sequences across different sequences.
+#'
+#' @param quantified_repeats A data frame containing information about quantified repeats.
+#'
+#' @return A ggplot object representing the plot.
+#'
+#' @export
+#'
+#' @examples
+#' quantified_repeats <- data.frame(seqname = c('A', 'B', 'C'),
+#'                                 repeat_count = c(1, 2, 3),
+#'                                 count_type = c('Type1', 'Type2', 'Type3'))
+#' plot_repeat_quantity(quantified_repeats)
 plot_repeat_quantity <- function(quantified_repeats) {
-  legend_name <- ""
-  
+  legend_name <- ''
+
   pl <- ggplot(quantified_repeats, aes(
-    x = repeat_count, y = seqname, fill = count_type)) + 
+    x = repeat_count, y = seqname, fill = count_type)) +
     geom_col(position = position_dodge()) +
     scale_fill_viridis_d(begin = 0.25, end = 0.8, name = legend_name) +
-    ylab("Sequence") + xlab("Repeat count") +
-    theme(legend.position = "top",
+    ylab('Sequence') + xlab('Repeat count') +
+    theme(legend.position = 'top',
           axis.title.y = element_blank(),
           axis.ticks.y = element_blank(),
           axis.text.y = element_blank())
-  
-  return(pl)  
-}
\ No newline at end of file
+
+  return(pl)
+}
diff --git a/R/plot_repeats.R b/R/plot_repeats.R
index dc4cc41..d986420 100644
--- a/R/plot_repeats.R
+++ b/R/plot_repeats.R
@@ -1,27 +1,35 @@
-plot_repeats <- function(sequences = DNAStringSet(), 
-                         repeat_sequence = "GATC"){
+#' Plot short sequence repeats
+#'
+#' This function generates a plot that shows the positions and quantities of repeated sequences in a given set of DNA sequences.
+#'
+#' @param sequences A DNAStringSet object containing the DNA sequences to be analyzed. Default is an empty DNAStringSet.
+#' @param repeat_sequence A character vector specifying the repeat sequence to be searched. Default is 'GATC'.
+#'
+#' @return A ggplot object displaying both positions and quantities of repeated sequences.
+#' @export
+#' @name plot_repeats
+
+plot_repeats <- function(sequences = DNAStringSet(),
+                         repeat_sequence = 'GATC') {
   clus_name <- names(sequences)[[1]]
-  # quantify repeats and 
+
   quant_data <- quantify_repeats(sequences, repeat_sequence) %>%
-    mutate(seqname = fct_reorder(
-      seqname, dplyr::desc(largest_repeat_contig))) %>% 
-    pivot_longer(cols = singlets:largest_repeat_contig, 
-                 names_to = "count_type", values_to = "repeat_count") %>%
-    mutate(count_type = str_replace_all(count_type, "singlets", 
-                                        "single instances"),
-           count_type = str_replace_all(count_type, "largest_repeat_contig", 
-                                        "largest contiguous instance"))
-  
+    mutate(seqname = fct_reorder(seqname, dplyr::desc(largest_repeat_contig))) %>%
+    pivot_longer(cols = singlets:largest_repeat_contig,
+                 names_to = 'count_type', values_to = 'repeat_count') %>%
+    mutate(count_type = str_replace_all(count_type, 'singlets', 'single instances'),
+           count_type = str_replace_all(count_type, 'largest_repeat_contig', 'largest contiguous instance'))
+
   pos_data <- find_repeat_positions(sequences, repeat_sequence) %>%
     mutate(seqname = factor(seqname, levels = levels(quant_data$seqname)))
-  
+
   pos_pl <- plot_repeat_positions(pos_data)
   quant_pl <- plot_repeat_quantity(quant_data)
-  
+
   combo_pl <- ggpubr::ggarrange(pos_pl, quant_pl)
   combo_pl <- ggpubr::annotate_figure(combo_pl, top = ggpubr::text_grob(
-    paste("Repeat", repeat_sequence, "in", clus_name, "cluster"), 
+    paste('Repeat', repeat_sequence, 'in', clus_name, 'cluster'),
     vjust = 0.5, x = 0.01, hjust = 0))
-  
+
   return(combo_pl)
-}
\ No newline at end of file
+}
diff --git a/R/plot_variants_per_sample.R b/R/plot_variants_per_sample.R
index d161dfa..3dc413e 100644
--- a/R/plot_variants_per_sample.R
+++ b/R/plot_variants_per_sample.R
@@ -1,78 +1,101 @@
+#' Plot amplicon sequence variants
+#'
+#' This function plots the amplicon variants per sample using a provided distance clustering table. The plot can be customized by specifying the sequence type and whether hierarchical clustering should be performed. The function also creates a custom color scheme based on the viridis color palette.
+#'
+#' @param distclust_table A tibble containing the distance clustering table (default: mytibble)
+#' @param sequence_type The type of sequence to consider (default: 'bp')
+#' @param hierarchical_clustering Whether to perform hierarchical clustering (default: FALSE)
+#'
+#' @return A ggplot object representing the variants per sample plot
+#'
+#' @import ggplot2
+#' @import dplyr
+#' @importFrom cowplot plot_grid
+#' @importFrom cowplot get_legend
+#' @importFrom viridisLite viridis
+#' @importFrom viridisLite viridisLite
+#' @importFrom gridExtra unit
+#' @importFrom gridExtra margin
+#'
+#' @examples
+#' plot_variants_per_sample()
+#'
+#' @export
 plot_variants_per_sample <- function(distclust_table = mytibble,
-                                     sequence_type = "bp",
+                                     sequence_type = 'bp',
                                      hierarchical_clustering = FALSE){
-  # Make a custom color scheme that assigns viridis 0 (purple) to ref/major, 
+  # Make a custom color scheme that assigns viridis 0 (purple) to ref/major,
   # viridis 1 (yellow) to major/minor
   # and a gradient between 0.7 and 0.3 to the rest.
   custom_colors <- c(viridisLite::viridis(1, begin = 0),
-                     viridisLite::viridis(1, begin = 1), 
+                     viridisLite::viridis(1, begin = 1),
                      viridisLite::viridis(
-                       length(unique(distclust_table$seq_var))-2, 
-                       option = "D", begin = 0.7 , end = 0.3))
-  
-  names(custom_colors) <- distclust_table$seq_var %>% 
-    unique() %>% 
+                       length(unique(distclust_table$seq_var))-2,
+                       option = 'D', begin = 0.7 , end = 0.3))
+
+  names(custom_colors) <- distclust_table$seq_var %>%
+    unique() %>%
     sort()
   if(hierarchical_clustering){
-    for_vegan <- distclust_table %>% 
+    for_vegan <- distclust_table %>%
       select(var_type, seqnames, rel_var_abundance) %>%
-      pivot_wider(id_cols = var_type, names_from = seqnames, 
+      pivot_wider(id_cols = var_type, names_from = seqnames,
                   values_from = rel_var_abundance)
-    
+
     # rearrange x_axis variable according to order in hierarchical cluster
     dat <- dendrogram_hclust(veganify_generic_wide_tbl(for_vegan), seed = 1)
-    
+
     distclust_table <- dplyr::mutate(
       distclust_table, var_type = factor(var_type, levels = dat$labels$label))
   }
-  length_measure <- ifelse(sequence_type != "aa", "seqs_lengths", 
-                           "longest_aa_seq")
-  
+  length_measure <- ifelse(sequence_type != 'aa', 'seqs_lengths',
+                           'longest_aa_seq')
+
   legend_text_master <- function(){
-    element_text(size = 10, margin = margin(t = 0, r = 15, 
-                                            b = 0, l = 0, unit = "pt"))
+    element_text(size = 10, margin = margin(t = 0, r = 15,
+                                            b = 0, l = 0, unit = 'pt'))
   }
-  
+
   # plot
-  pl <- ggplot(distclust_table, aes(x = .data[[length_measure]], 
+  pl <- ggplot(distclust_table, aes(x = .data[[length_measure]],
                                     y = var_type, fill = seq_var,
                                     alpha = rel_var_abundance)) +
     geom_col(position = position_dodge2(padding = 0)) +
     theme_classic() +
-    facet_grid(cols = vars(clus_name), scales = "free") +
+    facet_grid(cols = vars(clus_name), scales = 'free') +
     scale_fill_manual(values = custom_colors) +
     theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
           axis.text.y = element_text(hjust = 0),
           strip.text.x.top = element_text(angle = 90, hjust = 0, vjust = 0.5),
           strip.background = element_blank(),
           axis.title.y = element_blank()) +
-    guides(fill = guide_legend(title = "Sequence variants",
+    guides(fill = guide_legend(title = 'Sequence variants',
                                title.theme = legend_text_master(),
                                label.theme = legend_text_master(),
                                keywidth = 0.5,
                                keyheight = 0.5),
-           alpha = guide_legend(title = "Prevalence", 
+           alpha = guide_legend(title = 'Prevalence',
                                 title.theme = legend_text_master(),
                                 label.theme = legend_text_master(),
                                 keywidth = 0.5,
-                                keyheight = 0.5)) 
-  
-  
+                                keyheight = 0.5))
+
+
   if(hierarchical_clustering){
     d_pl <- plot_dendrogram(distclust_table = distclust_table)
     # extract a legend that is laid out horizontally
-    legend <- get_legend(
-      pl + theme(legend.position = "bottom")# +
-      #guides(fill = guide_legend(nrow = 6))
-    )
-    
-    pl_p <- cowplot::plot_grid(d_pl, pl + 
-                                 theme(legend.position = "none"), 
-                               align = "h", axis = "bt", 
+    legend <- get_legend(pl + theme(legend.position = "bottom")# +
+            #guides(fill = guide_legend(nrow = 6))
+                            )
+
+    pl_p <- cowplot::plot_grid(d_pl, pl +
+                               theme(legend.position = "none"),
+                               align = "h", axis = "bt",
                                rel_widths = c(1, 5), nrow = 1)
-    pl <- cowplot::plot_grid(pl_p, legend, ncol = 1, 
-                             rel_heights = c(1, .15)) + 
+    pl <- cowplot::plot_grid(pl_p, legend, ncol = 1,
+                             rel_heights = c(1, .15)) +
       theme(plot.background = element_rect(fill = "white"))
   }
+
   return(pl)
-}
\ No newline at end of file
+}
diff --git a/R/quantify_repeats.R b/R/quantify_repeats.R
index 37749b7..c7e854a 100644
--- a/R/quantify_repeats.R
+++ b/R/quantify_repeats.R
@@ -1,10 +1,22 @@
-quantify_repeats <- function(sequences = DNAStringSet, 
-                             repeat_sequence = "string") {
+#' Quantify Repeats
+#'
+#' This function quantifies the number of repeats of a given sequence in a set of DNA sequences.
+#'
+#' @param sequences A DNAStringSet object containing the DNA sequences to be analyzed.
+#' @param repeat_sequence A character string specifying the repeat sequence to be quantified.
+#' @return A tibble with the following columns:
+#' \itemize{
+#'   \item \code{seqname}: The name of the sequence.
+#'   \item \code{repeat_seq}: The specified repeat sequence.
+#'   \item \code{singlets}: The number of occurrences of the repeat sequence as singlets in each sequence.
+#'   \item \code{largest_repeat_contig}: The number of contiguous repeats of the repeat sequence in each sequence.
+#' }
+#' @export
+quantify_repeats <- function(sequences = DNAStringSet, repeat_sequence = 'string') {
   singlet_count <- str_count(as.character(sequences), repeat_sequence)
-  rep_count <- find_contiguous_multi_repeats(sequences, repeat_sequence,
-                                             singlet_count)
+  rep_count <- find_contiguous_multi_repeats(sequences, repeat_sequence, singlet_count)
   out <- tibble(seqname = names(sequences), repeat_seq = repeat_sequence,
                 singlets = singlet_count, largest_repeat_contig = rep_count)
-  
+
   return(out)
-}
\ No newline at end of file
+}
diff --git a/R/read_and_write_cluster_abundance.R b/R/read_and_write_cluster_abundance.R
index 0b98c0b..8ac0b24 100644
--- a/R/read_and_write_cluster_abundance.R
+++ b/R/read_and_write_cluster_abundance.R
@@ -1,49 +1,67 @@
+#' Read and Write Cluster Abundance
+#'
+#' This function reads a sequence table, calculates the abundance of sequences belonging to each cluster, and writes the results to a CSV file.
+#'
+#' @param cluster_sequence_list A list of DNAStringSet objects representing the sequences assigned to each cluster.
+#' @param reference_seqs A DNAStringSet object representing a set of reference sequences.
+#' @param seqtab_nochim Name of the file containing the sequence table with no chimeric sequences. Default is 'seqtab_nochim.rds'.
+#' @param outpath Path where the output files will be written. Default is 'path'.
+#'
+#' @return A list of data frames, with each data frame representing the abundance of sequences belonging to each cluster.
+#'
+#' @examples
+#' \dontrun{
+#' # Read and write cluster abundance
+#' read_and_write_cluster_abundance(cluster_sequence_list, reference_seqs, seqtab_nochim = 'seqtab_nochim.rds', outpath = path)
+#' }
+#'
+#' @export
 read_and_write_cluster_abundance <- function(
     cluster_sequence_list = DNAStringSetList,
     reference_seqs = NULL,
-    seqtab_nochim = "seqtab_nochim.rds",
+    seqtab_nochim = 'seqtab_nochim.rds',
     outpath = path) {
-  
-  stab <- readRDS(seqtab_nochim) %>% 
+
+  stab <- readRDS(seqtab_nochim) %>%
     t() %>% as.data.frame() %>%
-    rownames_to_column(var = "seqs")
-  
+    rownames_to_column(var = 'seqs')
+
   per_cluster_abundance <- function(seqs_of_one_cluster = DNAStringSet){
-    seq_tbl <- tibble(seqs = as.data.frame(seqs_of_one_cluster)[[1]], 
+    seq_tbl <- tibble(seqs = as.data.frame(seqs_of_one_cluster)[[1]],
                       ID = names(seqs_of_one_cluster))
-    named_stab <- left_join(seq_tbl, stab, by = "seqs") 
-    
+    named_stab <- left_join(seq_tbl, stab, by = 'seqs')
+
     if(!is.null(reference_seqs)){
-      named_stab <- named_stab %>% 
+      named_stab <- named_stab %>%
         filter(!(ID %in% names(reference_seqs)))
     }
-    
+
     stab_tbl <- named_stab %>%
       select(-seqs) %>%
-      pivot_longer(cols = -ID, names_to = "Sample", values_to = "count")
+      pivot_longer(cols = -ID, names_to = 'Sample', values_to = 'count')
   }
-  
+
   out <- lapply(cluster_sequence_list, per_cluster_abundance)
-  
+
   write_cluster_abundance_tbl <- function(clus_name){
     stab_tbl <- out[[clus_name]]
-    
+
     if(nrow(stab_tbl > 0)) {
-      outpath_subdir <- file.path(outpath, "cluster_seqtabs")
+      outpath_subdir <- file.path(outpath, 'cluster_seqtabs')
       dir.create(outpath_subdir, showWarnings = FALSE)
       path_csv <- file.path(outpath_subdir,
-                            paste0("seqtab_", clus_name, "_cluster.csv"))
+                            paste0('seqtab_', clus_name, '_cluster.csv'))
       stab_wide <- stab_tbl %>%
         pivot_wider(names_from = Sample, values_from = count)
       write_csv(stab_wide, file = path_csv)
-      cat("Abundance of ASVs belonging to the cluster", clus_name, 
-          "written to: \n", path_csv, "\n")
+      cat('Abundance of ASVs belonging to the cluster', clus_name,
+          'written to: \n', path_csv, '\n')
     } else {
       warning(paste(
-        "No table written for cluster", clus_name, 
-        "because no ASVs belong to this cluster"), call. = FALSE)
+        'No table written for cluster', clus_name,
+        'because no ASVs belong to this cluster'), call. = FALSE)
     }
   }
   lapply(names(out), write_cluster_abundance_tbl)
   return(out)
-}
\ No newline at end of file
+}
diff --git a/R/similiarity_to_reference.R b/R/similiarity_to_reference.R
index d9fbf33..f6af472 100644
--- a/R/similiarity_to_reference.R
+++ b/R/similiarity_to_reference.R
@@ -1,13 +1,31 @@
-# parse similiarity matrix to extract similarity to reference
-similiarity_to_reference <- function (seqs = DNAStringSet, 
+#' Parse similarity matrix to extract similarity to reference
+#'
+#' This function takes in a set of sequences and parses a similarity matrix to extract the similarity to a reference sequence. The resulting data is filtered to return only the similarity values for the reference sequence. The function supports parallel processing with multiple cores.
+#'
+#' @param seqs A DNAStringSet object containing the sequences.
+#' @param ncores An integer specifying the number of cores to use for parallel processing. Defaults to 1.
+#'
+#' @export
+#' @importFrom DNAtools alignment_based_distance_matrix
+#' @importFrom dplyr as_tibble filter mutate pivot_longer select
+#' @importFrom magrittr %>%
+#' @importFrom glue if_else
+#' @importFrom stringr str_c
+#' @importFrom Biostrings DNAStringSet
+
+similiarity_to_reference <- function (seqs = DNAStringSet,
                                       ncores = 1) {
   if(length(seqs) > 1){
-    out <- alignment_based_distance_matrix(seqs = seqs, 
-                                           ncores = ncores) %>% 
-      as_tibble(rownames = "query") %>% 
-      pivot_longer(cols = -query, names_to = "seqnames", 
-                   values_to = "sim_to_ref") %>% 
+    out <- alignment_based_distance_matrix(seqs = seqs,
+                                           ncores = ncores) %>%
+      as_tibble(rownames = "query") %>%
+      pivot_longer(cols = -query, names_to = "seqnames",
+                   values_to = "sim_to_ref") %>%
       mutate(seq_var = if_else(query == names(seqs)[1], "ref", "none")) %>%
-      filter(seq_var == "ref") %>% 
-      select(-seq_var, -query)}
-}
\ No newline at end of file
+      filter(seq_var == "ref") %>%
+      select(-seq_var, -query)
+    }
+}
+
+#' @rdname similiarity_to_reference
+#' @keywords internal
diff --git a/R/subset_by_clusters.R b/R/subset_by_clusters.R
index 4cf923b..605cddf 100644
--- a/R/subset_by_clusters.R
+++ b/R/subset_by_clusters.R
@@ -1,18 +1,27 @@
+#' Subset sequences by clusters
+#'
+#' This function subsets sequences based on cluster assignments provided in a cluster table. It can save the resulting sequences to files if specified.
+#'
+#' @param seqs A sequence object containing the sequences to be subsetted.
+#' @param cluster_tbl A data frame or tibble containing the cluster assignments. It should have two columns, 'cluster' and 'seqnames', where 'cluster' contains the cluster numbers and 'seqnames' contains the corresponding sequence names.
+#' @param save_to_file Logical value indicating whether to save the resulting sequences to separate files for each cluster.
+#' @return A list of sequence objects, where each list element corresponds to a cluster and contains the sequences in that cluster
+
 subset_by_clusters <- function(seqs, cluster_tbl, save_to_file = TRUE){
   cluster_seqs <- cluster_tbl %>%
-    select(cluster, seqnames) %>% 
+    select(cluster, seqnames) %>%
     arrange(cluster) %>%
     group_by(cluster) %>%
     group_split(.keep = FALSE)
-  
+
   seqs_one_cluster <- function(cluster){
     seqs[unname(unlist(cluster))]
   }
   out <- lapply(cluster_seqs, seqs_one_cluster)
-  names(out) <- map(lapply(out, names), 1) %>% 
-    unlist() %>% 
+  names(out) <- map(lapply(out, names), 1) %>%
+    unlist() %>%
     unname ()
-  
+
   save_cluster <- function(clusnum){
     clus_save <- file.path(
       clus_save_dir, paste0(names(out)[clusnum], "_cluster.fasta"))
@@ -24,4 +33,4 @@ subset_by_clusters <- function(seqs, cluster_tbl, save_to_file = TRUE){
     lapply(seq_along(out), save_cluster)
   }
   return(out)
-}
\ No newline at end of file
+}
diff --git a/R/subset_variant_table.R b/R/subset_variant_table.R
index dd234d0..32328c7 100644
--- a/R/subset_variant_table.R
+++ b/R/subset_variant_table.R
@@ -1,13 +1,30 @@
+#' Subset a variant table by excluding specific clusters, variants, and samples
+#'
+#' This function allows you to subset a variant table by excluding specific clusters, variants, and samples.
+#'
+#' @param variant_classified_table A tibble containing the variant classified table. Default: mytibble.
+#' @param exclude_clusters A character vector specifying the clusters to exclude. Default: c().
+#' @param exclude_variants A character vector specifying the variants to exclude. Default: c().
+#' @param exclude_samples A character vector specifying the samples to exclude. Default: c().
+#'
+#' @return A tibble containing the subsetted variant table.
+#'
+#' @import dplyr
+#' @importFrom stats setdiff
+#'
+#' @examples
+#' subset_variant_table(mytibble, c("cluster1", "cluster2"), c("variant1", "variant2"), c("sample1", "sample2"))
+
 subset_variant_table <- function(variant_classified_table = mytibble,
                                  exclude_clusters = c(),
                                  exclude_variants = c(),
                                  exclude_samples = c()){
-  
-  variant_classified_table_sub <- variant_classified_table %>% 
+
+  variant_classified_table_sub <- variant_classified_table %>%
     filter(clus_name %in%   setdiff(unique(variant_classified_table$clus_name),
-                                    exclude_clusters)) %>% 
-    filter(var_type %in% setdiff(unique(variant_classified_table$var_type), 
-                                 exclude_variants)) %>% 
+                                    exclude_clusters)) %>%
+    filter(var_type %in% setdiff(unique(variant_classified_table$var_type),
+                                 exclude_variants)) %>%
     filter(sample %in% setdiff(unique(variant_classified_table$sample),
                                exclude_samples))
-}
\ No newline at end of file
+}
diff --git a/R/test_clustering_thresholds.R b/R/test_clustering_thresholds.R
index 74014fc..1e56a2f 100644
--- a/R/test_clustering_thresholds.R
+++ b/R/test_clustering_thresholds.R
@@ -1,7 +1,30 @@
-# Iterate over threshold values
-test_clustering_thresholds <- function(MyDNAstring, step_size, 
+#' Iterate over threshold values
+#'
+#' This function iterates over a range of threshold values and performs clustering
+#' on a DNA string dataset.
+#'
+#' @param MyDNAstring A DNAStringSet object containing the DNA sequences to be clustered.
+#' @param step_size The step size for the threshold values. Default is 0.01.
+#' @param step_max The maximum threshold value. Default is 0.99.
+#' @param ncores The number of cores to use for parallel processing. Default is 1.
+#'
+#' @return A list of clustering results, where each element in the list corresponds to a specific threshold value.
+#'
+#' @import DECIPHER
+#' @import tibble
+#' @import dplyr
+#' @import tidyr
+#'
+#' @examples
+#' # Create a DNAStringSet object
+#' my_sequences <- DNAStringSet(c("ACGT", "AGCT", "ATGC"))
+#'
+#' # Run the clustering function
+#' results <- test_clustering_thresholds(my_sequences, step_size = 0.1, step_max = 0.9, ncores = 2)
+#' @export
+test_clustering_thresholds <- function(MyDNAstring, step_size,
                                        step_max = 0.99, ncores = 1) {
-  
+
   #DECIPHER removed the 'type' argument from IdClusters around v2.24. Currently (Feb23), the function is renamed "Clusterize"
   if(numeric_version(packageVersion("DECIPHER")) < 2.24){
     clus_tbl <- IdClusters(method = "inexact", myXStringSet = MyDNAstring,
@@ -9,18 +32,18 @@ test_clustering_thresholds <- function(MyDNAstring, step_size,
                            verbose = FALSE)
   }else{
     clus_tbl <- Clusterize(MyDNAstring,
-                           cutoff = seq(0, step_max, step_size), 
+                           cutoff = seq(0, step_max, step_size),
                            processors = ncores,
-                           verbose = FALSE)           
+                           verbose = FALSE)
   }
-  
+
   steps_temp <- seq(0, step_max, step_size)
   clus_tbl_list <- as_tibble(clus_tbl, rownames = "seqnames") %>%
     pivot_longer(cols = -seqnames, names_to = "cutoff", values_to = "cluster") %>%
     group_by(cutoff) %>%
     group_split(.keep = FALSE)
-  
-  
+
+
   names(clus_tbl_list) <- steps_temp
   return(clus_tbl_list)
-}
\ No newline at end of file
+}
diff --git a/R/translate_and_count_stops.R b/R/translate_and_count_stops.R
index 12bba83..9380240 100644
--- a/R/translate_and_count_stops.R
+++ b/R/translate_and_count_stops.R
@@ -1,17 +1,39 @@
-# Find nucleotide distance to first stop (half-open reading frames)
+#' Find nucleotide distance to first stop (half-open reading frames)
+#'
+#' This function takes a DNAStringSet as input and finds the nucleotide distance
+#' to the first stop codon in each of the three reading frames.
+#'
+#' @param seqs A DNAStringSet object containing DNA sequences
+#'
+#' @return A data frame with the following columns:
+#'   \itemize{
+#'     \item \code{seqnames}: The name of the DNA sequence
+#'     \item \code{rframe}: The reading frame number (1, 2 or 3)
+#'     \item \code{seq}: The translated peptide sequence
+#'     \item \code{fos_width}: The nucleotide distance to the first stop codon in each reading frame
+#'   }
+#'
+#' @examples
+#' seqs <- DNAStringSet("ATGTCGATAGCCTAGGTCAGTAA")
+#' translate_and_count_stops(seqs)
+#'
+#' @import Biostrings
+#' @import dplyr
+#' @import stringr
+#' @export
 translate_and_count_stops <- function(seqs = DNAStringSet) {
   # Make reading frames and translate to protein
-  asv_rfs <- lapply(1:3, function(pos) 
+  asv_rfs <- lapply(1:3, function(pos)
     subseq(c(seqs), start=pos))
-  
+
   asv_peps <- suppressWarnings(lapply(asv_rfs, translate))
-  
-  peps_tbl <- lapply(asv_peps, as.data.frame) %>% 
-    lapply(rownames_to_column, var = "seqnames") %>% 
+
+  peps_tbl <- lapply(asv_peps, as.data.frame) %>%
+    lapply(rownames_to_column, var = "seqnames") %>%
     bind_rows(.id = "rframe") %>%
     dplyr::rename(seq = x) %>%
     mutate(nostop = str_remove(seq, ".*\\*")) %>%
-    mutate(first_open_start = str_locate(nostop, "M")[, "start"]) %>% 
+    mutate(first_open_start = str_locate(nostop, "M")[, "start"]) %>%
     mutate(fos_width = (str_length(nostop) - first_open_start)*3) %>%
     mutate(fos_width = replace_na(fos_width, 0)) %>%
     select(-nostop, -first_open_start) %>%
@@ -23,4 +45,4 @@ translate_and_count_stops <- function(seqs = DNAStringSet) {
                               ((first_stop - 1)*3),
                               (first_stop * 3)))
   return(peps_tbl)
-}
\ No newline at end of file
+}
diff --git a/R/variant_classifier.R b/R/variant_classifier.R
index b64342d..e4bdd9e 100644
--- a/R/variant_classifier.R
+++ b/R/variant_classifier.R
@@ -1,59 +1,69 @@
+#' Variant Classifier Function
+#'
+#' This function classifies variants based on relative abundance and similarity to a reference sequence.
+#'
+#' @param seqtab_file Path to a sequence table file (default: file.path(path, 'seqtab_nochim.rds'))
+#' @param clustered_sequences A list of DNA sequences containing clustered ASVs (default: myDNAStringSetList)
+#' @param reference_informed Logical value indicating whether the classification should be reference informed (default: FALSE)
+#'
+#' @return A modified master table with variant classifications
+
 # Define a function called variant_classifier with two arguments: seqtab_file and clustered_sequences
 variant_classifier <- function(
     seqtab_file = file.path(path, 'seqtab_nochim.rds'),
     clustered_sequences = myDNAStringSetList){
-  
+
   # Convert seqtab into long format and calculate relative abundance per sample for each ASV
-  seqtab_tbl_long <- clean_seqtab(seqtab_file, output = FALSE) %>% 
-    pivot_longer(cols = where(is.integer), names_to = 'sample', 
-                 values_to = 'asv_counts') %>% 
-    group_by(sample) %>% 
-    mutate(sample_count_sum = sum(asv_counts)) %>% 
-    ungroup() %>% 
-    mutate(rel_abundance = asv_counts/sample_count_sum) %>% 
+  seqtab_tbl_long <- clean_seqtab(seqtab_file, output = FALSE) %>%
+    pivot_longer(cols = where(is.integer), names_to = 'sample',
+                 values_to = 'asv_counts') %>%
+    group_by(sample) %>%
+    mutate(sample_count_sum = sum(asv_counts)) %>%
+    ungroup() %>%
+    mutate(rel_abundance = asv_counts/sample_count_sum) %>%
     filter(sample_count_sum > 0)
-  
+
   # Create a table for clustered sequences and filter out clusters with only one sequence
-  clustab_tbl <- cluster_tbl_named(clustered_sequences) %>% 
-    left_join(tibble(seqnames = unlist(map(clustered_seqs, names)), 
+  clustab_tbl <- cluster_tbl_named(clustered_sequences) %>%
+    left_join(tibble(seqnames = unlist(map(clustered_seqs, names)),
                      seqs = as.character(unlist(clustered_sequences))),
               by = 'seqnames') %>%
-    filter(clus_size > 1) 
-  
+    filter(clus_size > 1)
+
   # If reference informed, add reference info and name variants
   if(reference_informed){
     # Join the clustered sequences table with the long seqtab table by sequence name
-    master_tbl <- left_join(clustab_tbl, seqtab_tbl_long, by = 'seqs') %>% 
+    master_tbl <- left_join(clustab_tbl, seqtab_tbl_long, by = 'seqs') %>%
       filter(seqnames != clus_name)
-    
+
     # Calculate the similarity of each cluster to the reference sequence
-    sim_to_ref_tbl <- map(clustered_seqs, similiarity_to_reference, 
-                          ref = refstrings) %>% 
-      discard(is_null) %>% 
+    sim_to_ref_tbl <- map(clustered_seqs, similiarity_to_reference,
+                          ref = refstrings) %>%
+      discard(is_null) %>%
       bind_rows(.id = 'clus_name') %>%
       unique()
-    
+
     # Join the master table with the reference similarity table
-    master_tbl <- left_join(master_tbl, sim_to_ref_tbl, by = c('clus_name', 
-                                                               'seqnames')) %>% 
+    master_tbl <- left_join(master_tbl, sim_to_ref_tbl, by = c('clus_name',
+                                                               'seqnames')) %>%
       # Add sequence variants and major variant information to the table and group by cluster name
-      mutate(seq_var = if_else(sim_to_ref == 0, 'Reference sequence', 
+      mutate(seq_var = if_else(sim_to_ref == 0, 'Reference sequence',
                                paste0('Sequence variant ', num-2))) %>%
       mutate(nums_not_ref = if_else(sim_to_ref == 0, 99999, as.double(num))) %>%
       group_by(clus_name) %>%
       mutate(maj_var = min(nums_not_ref)) %>%
       ungroup() %>%
       mutate(seq_var = if_else(num == maj_var, 'Major variant', seq_var)) %>%
-      mutate(seq_var = factor(seq_var, levels = c('Reference sequence', 
+      mutate(seq_var = factor(seq_var, levels = c('Reference sequence',
                                                   'Major variant',
-                                                  paste0('Sequence variant ', 
-                                                         1:max(num))))) %>% 
+                                                  paste0('Sequence variant ',
+                                                         1:max(num))))) %>%
       select(-maj_var, -nums_not_ref)
   }
-  
+
   # If not reference informed, join the clustered sequences table with the long seqtab table and add "x" to a new column sim_to_ref
   if(!reference_informed){
-    master_tbl <- left_join(clustab_tbl, seqtab_tbl_long, by = 'seqs') %>% 
+    master_tbl <- left_join(clustab_tbl, seqtab_tbl_long, by = 'seqs') %>%
       # Add sequence variants and major variant information to the table and group by cluster name
       mutate(seq_var = paste0('Sequence variant ', num-2)) %>%
       group_by(clus_name) %>%
@@ -61,53 +71,54 @@ variant_classifier <- function(
       mutate(min_var = min(num)+1) %>%
       ungroup() %>%
       mutate(seq_var = if_else(num == maj_var, 'Major variant', seq_var)) %>%
-      mutate(seq_var = if_else(num == min_var, 'Minor variant', seq_var)) %>% 
+      mutate(seq_var = if_else(num == min_var, 'Minor variant', seq_var)) %>%
       mutate(seq_var = factor(seq_var, levels = c('Major variant',
                                                   'Minor variant',
-                                                  paste0('Sequence variant ', 
-                                                         1:max(num))))) %>% 
+                                                  paste0('Sequence variant ',
+                                                         1:max(num))))) %>%
       select(-maj_var, -min_var)
   }
-  
-  
+
+
   # Call variant types using top 2 most abundant sequences per sample and cluster based on relative abundance
   variant_types <- master_tbl %>%
-    filter(rel_abundance > 0) %>% 
-    group_by(sample, clus_name) %>% 
-    top_n(2, rel_abundance) %>% 
-    ungroup() %>% 
+    filter(rel_abundance > 0) %>%
+    group_by(sample, clus_name) %>%
+    top_n(2, rel_abundance) %>%
+    ungroup() %>%
     select(sample, ASV) %>%
     group_by(sample) %>%# Concatenate the top 2 most abundant sequences per sample into a single string - "profile"
-    mutate(profile = paste0(ASV, collapse = '')) %>% 
-    
+    mutate(profile = paste0(ASV, collapse = '')) %>%
+
     # Remove duplicates
-    ungroup() %>% 
-    select(-ASV) %>% 
+    ungroup() %>%
+    select(-ASV) %>%
     unique()
-  
+
   # Name each variant type (Variant_type1, Variant_type2, etc.)
-  var_type_names <- select(variant_types, -sample) %>% 
-    unique() %>% 
-    mutate(var_type = factor(paste0('Variant_type ', 1:nrow(.)), 
-                             levels = paste0('Variant_type ', 1:nrow(.)))) %>% 
+  var_type_names <- select(variant_types, -sample) %>%
+    unique() %>%
+    mutate(var_type = factor(paste0('Variant_type ', 1:nrow(.)),
+                             levels = paste0('Variant_type ', 1:nrow(.)))) %>%
     # Join variant type names with variant profiles
-    left_join(variant_types, by = 'profile') %>% 
+    left_join(variant_types, by = 'profile') %>%
     select(-profile)
-  
+
   # Add variant type info to the master table and filter out sequences with zero relative abundance
-  master_tbl <- master_tbl %>% 
-    left_join(var_type_names, by = 'sample') %>% 
-    mutate(seqs_lengths = str_length(seqs)) %>% 
+  master_tbl <- master_tbl %>%
+    left_join(var_type_names, by = 'sample') %>%
+    mutate(seqs_lengths = str_length(seqs)) %>%
     group_by(sample, clus_name) %>%
-    mutate(cluster_sum_rel_abu = sum(rel_abundance)) %>% 
-    mutate(rel_var_abundance = rel_abundance/cluster_sum_rel_abu) %>% 
-    top_n(2, rel_abundance) %>% 
-    ungroup() %>% 
+    mutate(cluster_sum_rel_abu = sum(rel_abundance)) %>%
+    mutate(rel_var_abundance = rel_abundance/cluster_sum_rel_abu) %>%
+    top_n(2, rel_abundance) %>%
+    ungroup() %>%
     filter(rel_abundance > 0)
-  
-  aa_info <- find_longest_reading_frames(unlist(unname(clustered_sequences))) %>% 
-    mutate(longest_aa_seq = max_width/3) %>% 
+
+  aa_info <- find_longest_reading_frames(unlist(unname(clustered_sequences))) %>%
+    mutate(longest_aa_seq = max_width/3) %>%
     select(seqnames, longest_aa_seq)
-  
+
   master_tbl <- left_join(master_tbl, aa_info, by = "seqnames")
-}
\ No newline at end of file
+  return(master_tbl)
+}
diff --git a/R/veganify_asvcounts.R b/R/veganify_asvcounts.R
index 69388dd..2070bd4 100644
--- a/R/veganify_asvcounts.R
+++ b/R/veganify_asvcounts.R
@@ -1,10 +1,18 @@
+#' Veganify ASV Counts
+#'
+#' This function takes a cleaned sequence table and converts it to vegan format.
+#'
+#' @param cleaned_seqtab A cleaned sequence table.
+#'
+#' @return A vegan formatted count matrix.
+#' @export
 veganify_asvcounts <- function(cleaned_seqtab = my_cleaned_seqtab){
-  out <- cleaned_seqtab %>% 
-    select(-seqs, -ASV) %>% 
-    as.data.frame() 
+  out <- cleaned_seqtab %>%
+    select(-seqs, -ASV) %>%
+    as.data.frame()
   row.names(out) <- cleaned_seqtab[["ASV"]]
-  out <- out %>% 
-    select(where( ~ is.numeric(.x) && sum(.x) != 0)) %>% 
+  out <- out %>%
+    select(where( ~ is.numeric(.x) && sum(.x) != 0)) %>%
     t()
   return(out)
-}
\ No newline at end of file
+}
diff --git a/R/veganify_generic_wide_tbl.R b/R/veganify_generic_wide_tbl.R
index 9fbe40d..16f9fc9 100644
--- a/R/veganify_generic_wide_tbl.R
+++ b/R/veganify_generic_wide_tbl.R
@@ -1,9 +1,35 @@
-# Makes a vegdist compatible data.frame from a wide tibble that contains the 
-# rownames in the first column and input data for vegdist in all other columns
+#' Makes a vegdist compatible data.frame from a wide tibble that contains the rownames in the first column and input data for vegdist in all other columns
+#'
+#' This function takes a wide tibble with rownames in the first column and input data for vegdist in all other columns, and converts it into a data.frame that is compatible with the vegdist function. It replaces any na values with 0 and sets the rownames of the resulting data.frame to the values in the first column of the input tibble.
+#'
+#' @param data A wide tibble with rownames in the first column and input data for vegdist in all other columns.
+#'
+#' @return A data.frame that is compatible with the vegdist function.
+#'
+#' @importFrom dplyr %>%
+#' @importFrom dplyr replace
+#' @importFrom stats as.data.frame
+#'
+#' @examples
+#' library(tibble)
+#'
+#' # Create a wide tibble
+#' my_wide_tibble <- tibble::tribble(
+#'   ~Name, ~Var1, ~Var2, ~Var3,
+#'   "A",    1,     2,     3,
+#'   "B",   4,    5,     6,
+#'   "C",    7,     8,     9
+#' )
+#'
+#' # Apply the veganify_generic_wide_tbl function
+#' result <- veganify_generic_wide_tbl(data = my_wide_tibble)
+#' result
+#'
+#' @exports
 veganify_generic_wide_tbl <- function(data = my_wide_tibble){
   # Prepare data from the plotting table for vegdist/hclust clustering
-  for_vegan_df <- as.data.frame(data)[-1] %>% 
+  for_vegan_df <- as.data.frame(data)[-1] %>%
     replace(is.na(.), 0)
   row.names(for_vegan_df) <- data[[1]]
   return(for_vegan_df)
-}
\ No newline at end of file
+}
-- 
GitLab