roxygen tags added

f111274c · Simeon · 6941973c · f111274c · f111274c · f111274c
Commit f111274c authored 1 year ago by Simeon
--- a/R/count_clusters.R
+++ b/R/count_clusters.R
+#' Count Clusters
+#'
+#' This function takes a list of cluster tables and returns a tibble
+#' with cluster counts by threshold.
+#'
+#' @param clus_tbl_list A list of cluster tables.
+#'
+#' @return A tibble with cluster counts by threshold.
+#'
+#' @examples
+#'
+#' set.seed(1)
+#' clus_tbl_list <- list(
+#'   data.frame(seqnames = c("chr1", "chr1", "chr2"),
+#'              start = c(10, 50, 100), end = c(20, 60, 200), count = c(5, 7, 10)),
+#'   data.frame(seqnames = c("chr1", "chr1", "chr2"),
+#'              start = c(15, 55, 110), end = c(25, 65, 150), count = c(8, 6, 11))
+#' )
+#' count_clusters(clus_tbl_list)
+#'
+#' @import dplyr
+#' @import purrr
+#' @import tidyr
+#' @export
+
 count_clusters <- function(clus_tbl_list){
-  #remove non-numeric column from clus_tbl_list
-  clus_tbl_num <- map(clus_tbl_list, ~ dplyr::select(., -seqnames))
-  clus_counts <- lapply(clus_tbl_num, max)
-  names(clus_counts) <- names(clus_tbl_list)
-  clus_counts %>% 
-    bind_rows() %>% 
-    pivot_longer(everything()) %>% 
-    mutate(threshold = as.numeric(name)) %>%
-    dplyr::rename(cluster_number = value)
-}
\ No newline at end of file
+
+   # Remove the non-numeric column "seqnames" from each cluster table in the list.
+   clus_tbl_num <- map(clus_tbl_list, ~ dplyr::select(., -seqnames))
+
+   # Create a list of the maximum cluster values for each cluster table in clus_tbl_num using lapply()
+   clus_counts <- lapply(clus_tbl_num, max)
+
+   # Assign the names of the original cluster tables to the values in clus_counts
+   names(clus_counts) <- names(clus_tbl_list)
+
+   # Combine the values in clus_counts into a single data frame using bind_rows()
+   # Transform the data frame from wide to long using pivot_longer(), and create
+   # a new column called "threshold" with the values previously used as column names
+   # Rename the "value" column to "cluster_number" using dplyr::rename()
+   clus_counts %>%
+     bind_rows() %>%
+     pivot_longer(everything()) %>%
+     mutate(threshold = as.numeric(name)) %>%
+     dplyr::rename(cluster_number = value)
+
+   # Return the final data frame with the cluster numbers and their maximum values
+   return(clus_counts)
+}
--- a/R/define_plateau.R
+++ b/R/define_plateau.R
+#' Define a plateau within cluster counts
+#'
+#' This function takes in a tibble of cluster counts and defines
+#' the plateau within the cluster with the largest threshold window.
+#'
+#' @param cluster_counts A tibble of cluster counts with columns for
+#' cluster_number and threshold.
+#'
+#' @return A named vector with the starting and ending thresholds of the
+#' plateau and the corresponding cluster_number.
+#'
+#' @examples
+#' define_plateau(cluster_counts = cluster_counts_df)
+#'
+#' @import dplyr
+#' @export
+# The following code defines a function called "define_plateau"
 define_plateau <- function(cluster_counts){
+  # "cluster_counts" is a tibble of cluster counts passed as a parameter to the function
+  # "clus_plateau" filters the cluster counts by selecting only those with cluster_number greater than or equal to 2
+  # then group the remaining data by cluster_number and calculate the maximum and minimum threshold per cluster_number using the "summarise" function.
+  # This is done to exclude a potentially long tail of single cluster outcomes at high thresholds.
+  # Calculate the difference between the maximum and minimum threshold for each cluster and store in "thresh_win"
+  # "arrange" function arranges the rows in descending order of "thresh_win" and selects the first row using "slice_head"
+  # "clus_plateau" now contains the "plateau_start", "plateau_end", and "cluster_number" values for the selected cluster
  clus_plateau <- cluster_counts %>%
-    filter(cluster_number >= 2) %>% 
-    group_by(cluster_number) %>% 
-    summarise(max_thresh = max(threshold), min_thresh = min(threshold)) %>% 
-    mutate(thresh_win = max_thresh-min_thresh) %>% 
-    arrange(desc(thresh_win)) %>% 
-    slice_head(n = 1) %>% 
+    filter(cluster_number >= 2)%>%
+    group_by(cluster_number) %>%
+    summarise(max_thresh = max(threshold), min_thresh = min(threshold)) %>%
+    mutate(thresh_win = max_thresh-min_thresh) %>%
+    arrange(desc(thresh_win)) %>%
+    slice_head(n = 1) %>%
    select(min_thresh, max_thresh, cluster_number) %>%
    unlist() %>% unname()
-  names(clus_plateau) <- c("plateau_start", "plateau_end", "cluster_number")
+  # This line assigns the names "plateau_start", "plateau_end", and "cluster_number" to each of the values in "clus_plateau".
+  names(clus_plateau) <- c('plateau_start', 'plateau_end', 'cluster_number')
  return(clus_plateau)
-}
\ No newline at end of file
+}
--- a/R/dendrogram_hclust.R
+++ b/R/dendrogram_hclust.R
-dendrogram_hclust <- function(data = veganized_tibble, seed = 1, ...){
+#' Generate dendrogram data from numerical data
+#'
+#' This function generates a dendrogram from a hierarchical clustering of a dataset.
+#'
+#' @param data A tibble or data frame suited for vegdist().
+#' @param seed An integer that sets the seed for the random number generator. Default is 1.
+#' @param ... Additional arguments passed to `vegdist`.
+#'
+#' @return A `ggdendro::dendro_data` object, containing data for plotting the dendrogram.
+#'
+#' @import ggdendro
+#' @import vegan
+#' @import stats
+#'
+#' @examples
+#' # Generate dendrogram with default parameters
+#' dendrogram_hclust()
+#'
+#' # Generate dendrogram with custom data and distance metric
+#' daisy_dist <- daisy(iris[,-5], metric = "gower")
+#' dendrogram_hclust(daisy_dist)
+#'
+#' @export
+
+dendrogram_hclust <- function(data = veganized_tibble, seed = 1, ...) {
  require(ggdendro)
  set.seed(seed)
-  
+
  # make hierarchical cluster from vegdist matrix and extract data for plotting
-  dat <- vegdist(data, ...) %>% 
-    hclust() %>% 
-    as.dendrogram() %>% 
-    ggdendro::dendro_data(type = "rectangle")
+  dat <- vegdist(data, ...) %>%
+    hclust() %>%
+    as.dendrogram() %>%
+    ggdendro::dendro_data(type = 'rectangle')
+
  return(dat)
-}
\ No newline at end of file
+}
--- a/R/export_longest_reading_frame.R
+++ b/R/export_longest_reading_frame.R
-export_longest_reading_frame <- function(clustered_reading_frames_tbl = tbl, 
-                                         seqs = myDNAStringSet,
-                                         outpath = path){
+#' Export Longest Reading Frame
+#'
+#' Export the longest reading frames from a DNA sequence set to a fasta file containing amino acid sequences.
+#'
+#' @param clustered_reading_frames_tbl tibble of clustered reading frames
+#' @param seqs DNAStringSet object with DNA sequences
+#' @param outpath character string of output directory path
+#' @param verbose logical indicating whether to print progress updates
+#'
+#' @return Messages indicating completion of exporting the longest reading frames to a fasta file format containing amino acid sequences.
+#'
+#' @examples
+#' export_longest_reading_frame(clustered_reading_frames_tbl, myDNAStringSet, myDirPath, TRUE)
+#'
+#' @import Biostrings
+#' @import dplyr
+#' @import tidyr
+#' @import utils
+#'
+#' @export
+# Define a function that exports the longest reading frames
+export_longest_reading_frame <- function(clustered_reading_frames_tbl = tbl, # function argument for clustered_reading_frame table
+                                          seqs = myDNAStringSet, # function argument for DNA sequence set
+                                          outpath = path, # function argument for output file path
+                                          verbose = FALSE){ # function argument for verbosity
+
+  # check if the output path exists, create it if it doesn't
  if(!dir.exists(file.path(outpath))){
    dir.create(outpath)
  }
-  
+
+  # translate the DNA sequences, count stop codons, and add a column for reading frame as a factor
  peps_tbl <- translate_and_count_stops(seqs) %>%
    mutate(rframe = as.factor(rframe))
-  
+
+  # extract the unique cluster names from the clustered_reading_frames_tbl table
  clus_name <- clustered_reading_frames_tbl %>%
    select(cluster_name) %>% unique() %>% unlist() %>% unname()
-  
+
+  # join the clustered_reading_frames_tbl table and the peps_tbl table by shared columns
  temp_seq_tbl <- left_join(clustered_reading_frames_tbl, peps_tbl,
-                            by = c("seqnames", "rframe", "fos_width", 
-                                   "first_stop", "stop_count", "hrf_width"))
+                            by = c('seqnames', 'rframe', 'fos_width',
+                                   'first_stop', 'stop_count', 'hrf_width'))
+
+  # create an amino acid sequence set from the 'seq' column of the temp_seq_tbl table
  temp_aa_seqs <- AAStringSet(x = as.character(
-    temp_seq_tbl["seq"] %>% unlist() %>% unname()))
-  names(temp_aa_seqs) <- as.character(temp_seq_tbl["seqnames"] %>% unlist() %>% 
+    temp_seq_tbl['seq'] %>% unlist() %>% unname()))
+
+  # assign names to the sequences in the temp_aa_seqs sequence set
+  names(temp_aa_seqs) <- as.character(temp_seq_tbl['seqnames'] %>% unlist() %>%
                                        unname())
-  Biostrings::writeXStringSet(temp_aa_seqs, 
+
+  # write the amino acid sequences to a fasta file
+  Biostrings::writeXStringSet(temp_aa_seqs,
                              filepath = file.path(outpath, paste0(
-                                clus_name, "_cluster_longest_AA_seqs.fa")))
-  return(paste("AA sequences for cluster", clus_name, "written to file", 
-               file.path(outpath, paste0(
-                 clus_name, "_cluster_longest_AA_seqs.fa"))
-  ))
-  
-}
\ No newline at end of file
+                                clus_name, '_cluster_longest_AA_seqs.fa')))
+
+  # return a message confirming the export and file path if verbose is true
+  if(verbose){
+    return(paste('AA sequences for cluster', clus_name, 'written to file',
+                 file.path(outpath, paste0(
+                   clus_name, '_cluster_longest_AA_seqs.fa'))
+    ))
+  }else{
+    return(paste('AA sequences saved to file'))
+  }
+}
--- a/R/find_contiguous_multi_repeats.R
+++ b/R/find_contiguous_multi_repeats.R
-## Repeat counter ----
+#' Find contiguous multi-repeats in a set of sequences
+#'
+#' This function finds contiguous multi-repeats in a set of sequences,
+#' where the repeat sequence is made up of n consecutive copies of a given string.
+#'
+#' @param sequences A DNAStringSet or AAStringSet object containing the sequences
+#'        to search for multi-repeats (default: DNAStringSet)
+#' @param repeat_sequence A character string representing the repeat sequence to search
+#'        for (default: 'string')
+#' @param singlet_count An integer or vector of integers specifying the number
+#'        of single occurrences of the repeat, or the maximum expected number of
+#'        single occurrences (default: NULL)
+#'
+#' @return A numeric vector containing the repeat count for each input sequence
+#'         (i.e., the number of contiguous multi-repeats found)
+#'
+#' @export
+#'
+#' @examples
+#' # Create example DNAStringSet
+#' sequences <- DNAStringSet(c("ATCGATATCGATCGATCG", "GCTAGCTAGCTAGCTAGCTA"))
+#' find_contiguous_multi_repeats(sequences, 'AT', 3)
+#'
+#' # Expected output: c(2, 1)
+#'
+#' @import stringr
+#' @import Biostrings
+#'
+#' @keywords sequence, repeats
+#'
 find_contiguous_multi_repeats <- function(sequences = DNAStringSet,
-                                          repeat_sequence = "string",
-                                          singlet_count) {
-  
-  
+                                          repeat_sequence = 'string',
+                                          singlet_count = 100) {
+
  # Make vector of zeros for minimum repeat count, make 1 if singlet was found
  repeat_count <- replicate(length(sequences), 0)
  repeat_count[singlet_count > 0] <- 1
-  
+
  # Iterate n to the highest observed singlet count, overwrite repeat count if
  # exactly one n-mer of repeat is found in the sequence
  for (n in seq(1, max(singlet_count))) {
-    contig_rep <- paste(replicate(n, repeat_sequence), collapse = "")
+    contig_rep <- paste(replicate(n, repeat_sequence), collapse = '')
    contig_rep_count <- str_count(as.character(sequences), contig_rep)
-    repeat_count[contig_rep_count < singlet_count & contig_rep_count > 0] <- n
+    repeat_count[contig_rep_count < singlet_count & contig_rep_count > 0] <-n
  }
  return(repeat_count)
-}
\ No newline at end of file
+}
--- a/R/find_longest_hrf.R
+++ b/R/find_longest_hrf.R
+#' Find longest half-open reading frame
+#'
+#' This function takes a DNAStringSet as input and returns the longest half-open
+#' reading frames as a data table. Wrapper function around translate_and_count_stops()
+#'
+#' @param seqs A DNAStringSet object with nucleotide sequences.
+#'
+#' @return A data table with the longest open reading frames.
+#'
+#' @examples
+#' find_longest_hrf(seqs)
+#'
+#' @import dplyr
+#'
+#' @export
 find_longest_hrf <- function(seqs = DNAStringSet){
  peps_tbl <- translate_and_count_stops(seqs)
  hrfs <- select(peps_tbl, -seq)
  return(hrfs)
-}
\ No newline at end of file
+}
--- a/R/find_longest_orf.R
+++ b/R/find_longest_orf.R
-find_longest_orf <-function(seqs = DNAStringSet) {
+#' Find the longest ORF in a DNA sequence set
+#'
+#' This function identifies the longest open reading frame (ORF) in a DNA sequence set.
+#' Wrapper around \code{\link{findORFs}}
+#'
+#' @param seqs A DNAStringSet object containing the DNA sequences to search for ORFs.
+#'
+#' @return A tibble containing the start and end positions, strand, and length of the longest ORF in each sequence.
+#'
+#' @import Biostrings
+#' @importFrom GenomicRanges GRanges
+#' @import tibble
+#' @import dplyr
+#' @export
+#'
+#' @examples
+#' seqs <- DNAStringSet(c("ATGAGTTCGAAATGGCGTTGAA", "GGGGGCTCGAGCTAGC"))
+#' find_longest_orf(seqs)
+#'
+#' @seealso \code{\link{findORFs}}
+#'
+
+find_longest_orf <- function(seqs = DNAStringSet) {
+  # Find ORFs in the sequences, return longest ORF, and convert to a vector
  orfs <- findORFs(seqs, longestORF = TRUE, startCodon = startDefinition(6)) %>%
    unlist(use.names = TRUE)
-  orfs <- GRanges(seqnames = names(seqs)[as.integer(names(orfs))], 
-                  ranges(orfs), strand = "+") %>%
+  # Convert the ORFs to a GRanges object
+  orfs <- GRanges(seqnames = names(seqs)[as.integer(names(orfs))],
+                  ranges(orfs), strand = '+') %>%
+    # Convert to a tibble and add a column for reading frame
    as_tibble() %>%
    mutate(rframe = ((start -1) %% 3 ) + 1) %>%
+    # Rename the width column to orf_width
    dplyr::rename(orf_width = width)
+  # Return the ORFs
  return(orfs)
-}
\ No newline at end of file
+}
--- a/R/find_longest_reading_frames.R
+++ b/R/find_longest_reading_frames.R
+#' Reading frame finder (longest orf or hrf)
+#'
+#' This function finds the longest reading frames in a set of DNA sequences.
+#' It combines the results from finding the longest open reading frames (orfs)
+#' and the longest half reading frames (hrfs) and determines the reading frame
+#' with the biggest possible translation.
+#'
+#' @param seqs A DNAStringSet object containing the input DNA sequences.
+#'
+#' @return A data frame containing the longest reading frames for each sequence.
+#' The data frame includes the sequence names, reading frame, and the width of the reading frame.
+#'
+#' @import dplyr, tidyr
+#'
+#' @export
 ## Reading frame finder (longest orf or hrf)
 find_longest_reading_frames <- function(seqs = myDNAStringSet){
  orfs <- find_longest_orf(seqs)
  hrfs <- find_longest_hrf(seqs)
-  # Combine orf and hrf tables and find the reading frame with the biggest 
+  # Combine orf and hrf tables and find the reading frame with the biggest
  # possible translation
  orf_and_hrf <- full_join(orfs, hrfs, by = join_by(seqnames, rframe)) %>%
    mutate(orf_width = replace_na(orf_width, 0)) %>%
@@ -13,12 +28,12 @@ find_longest_reading_frames <- function(seqs = myDNAStringSet){
                             "Five-open (no start)", max_type)) %>%
    mutate(max_type = ifelse(fos_width > hrf_width & max_type == "open",
                             "Three-open (no stop)", max_type))
-  
+
  max_rf <- orf_and_hrf %>%
    group_by(seqnames) %>%
    slice_max(max_width, n = 1, with_ties = FALSE) %>%
    arrange(factor(seqnames, levels = names(seqs))) %>%
    mutate(rframe = factor(rframe, levels = c("1", "2", "3")))
-  
+
  return(max_rf)
-}
\ No newline at end of file
+}
--- a/R/find_repeat_positions.R
+++ b/R/find_repeat_positions.R
-find_repeat_positions <- function(sequences = DNAStringSet, 
-                                  repeat_sequence = "string"){
-  repeat_positions <- str_locate_all(as.character(sequences), repeat_sequence)
-  names(repeat_positions) <- names(sequences)
+#' Find Repeat Positions
+#'
+#' This function finds the positions of a given repeat sequence within a set of DNA sequences. It returns a data frame with the sequence name, start and end positions, and information used in plotting downstream.
+#'
+#' @param sequences A DNAStringSet object containing the DNA sequences to search for repeats.
+#' @param repeat_sequence A string specifying the repeat sequence to search for.
+#'
+#' @return A data frame with columns: seqname, start, end, fragment, and plot_intensity.
+#'
+#' @import stringr
+#' @import dplyr
+#' @import tibble
+#'
+#' @export
+#'
+#' @examples
+#' sequences <- DNAStringSet(c("AGTCAGT",
+#'                             "ACGTAGT",
+#'                             "AGTCGAT"))
+#' find_repeat_positions(sequences, "AGT")
+find_repeat_positions <- function(sequences = DNAStringSet, repeat_sequence = 'string'){
+
+  repeat_positions <- str_locate_all(as.character(sequences), repeat_sequence)names(repeat_positions) <- names(sequences)
  repeat_positions <- lapply(repeat_positions, as_tibble) %>%
-    bind_rows(.id = "seqname") %>%
+    bind_rows(.id = 'seqname') %>%
    mutate(fragment = repeat_sequence)
-  whole_sequence_positions = tibble("seqname" = names(sequences),
-                                    "start" = replicate(length(sequences), 1),
-                                    "end" = str_length(sequences),
-                                    "fragment" = replicate(
-                                      length(sequences), "Full sequence"))
+
+  whole_sequence_positions = tibble('seqname' = names(sequences),
+                                    'start' = replicate(length(sequences), 1),
+                                    'end' = str_length(sequences),
+                                    'fragment' = replicate(length(sequences), 'Full sequence'))
+
  out <- bind_rows(repeat_positions, whole_sequence_positions) %>%
    mutate(plot_intensity = ifelse(fragment == repeat_sequence, 4, 3)) %>%
    mutate(seqname = factor(seqname, levels = names(sequences))) %>%
-    mutate(fragment = factor(fragment, levels = c("Full sequence", 
-                                                  repeat_sequence)))
+    mutate(fragment = factor(fragment, levels = c('Full sequence', repeat_sequence)))
+
  return(out)
-}
\ No newline at end of file
+}
--- a/R/kmer_based_distance_matrix.R
+++ b/R/kmer_based_distance_matrix.R
-# Convert to bin format and get distance matrix using kmers
-kmer_based_distance_matrix <- function (seqs = DNAStringSet) {
+#' Convert to bin format and get distance matrix using kmers
+#'
+#' This function takes in a DNAStringSet and converts it to a bin format using ape::as.DNAbin function. It then calculates the distance matrix using the kdistance function from XXXX and returns it as a matrix.
+#'
+#' @param seqs DNAStringSet containing the DNA sequences
+#'
+#' @return A distance matrix in bin format.
+#'
+#' @importFrom ape as.DNAbin
+#'
+#' @importFrom Biostrings DNAStringSet
+#'
+#' @export
+#'
+kmer_based_distance_matrix <- function (seqs) {
  seqbins <- ape::as.DNAbin(seqs)
  as.matrix(kdistance(seqbins))
-}
\ No newline at end of file
+}
--- a/R/meshclustR.R
+++ b/R/meshclustR.R
-#write temporary file to cluster
-meshclustR <- function(seqs = MyDNAStringSet, 
-                       meshclust_bin = meshclust, filepath = path){
-  temp_file <- file.path(filepath, "cluster_me.fa")
+#' Write temporary file to cluster using MeshclustR
+#'
+#' This function writes a temporary file to perform a clustering analysis on a set of DNA sequences.
+#' The clustering is done using the \href{https://github.com/BioinformaticsToolsmith/MeShClust}{Meshclust commandline} tool.
+#'
+#' James, Benjamin T. et al. (2018),
+#' MeShClust: an intelligent tool for clustering DNA sequences.
+#' Nucleic Acids Research, gky315.
+#'
+#' @param seqs A DNAStringSet object containing the DNA sequences to be clustered.
+#' @param meshclust_bin The path to the Meshclust executable (run
+#' `which meshclust` on commandline). Default is set to "meshclust".
+#' @param filepath The path to the directory where the temporary and log
+#' files will be saved.
+#' @return A data frame with information regarding the clustering analysis.
+#'
+#' @examples
+#' meshclustR(seqs = MyDNAStringSet, meshclust_bin = meshclust, filepath = path)
+#' @import Biostrings
+#' @import readr
+#' @import magrittr
+#' @import stringr
+#' @importFrom tools file_path_sans_ext
+#' @export
+
+meshclustR <- function(seqs = MyDNAStringSet,
+                       meshclust_bin = meshclust,
+                       filepath = path){
+
+  #create temporary file path
+  temp_file <- file.path(filepath, 'cluster_me.fa')
  writeXStringSet(seqs, filepath = temp_file)
+
+  #create output file name
  out_file <- file.path(filepath, paste0(
-    tools::file_path_sans_ext(basename(temp_file)),".txt"))
-  
-  ## Meshclust
-  system2(meshclust, args = c("-d", temp_file,
-                              "-o", out_file))
-  
-  stable_cluster <- read_delim(out_file, delim = "\t", 
-                               col_names = c("cluster", "seqnames", 
-                                             "identity_with_center", "cluster_class"),
-                               col_types = "fcdc") %>% 
-    mutate(seqnames = str_remove(seqnames, ">"))
-  #clean up temp file
+    tools::file_path_sans_ext(basename(temp_file)),'.txt'))
+
+  #run Meshclust
+  system2(meshclust_bin, args = c('-d', temp_file,
+                                  '-o', out_file))
+
+  #read output file and parse
+  stable_cluster <- read_delim(out_file, delim = '\t',
+                               col_names = c('cluster', 'seqnames',
+                                             'identity_with_center', 'cluster_class'),
+                               col_types = 'fcdc') %>%
+    mutate(seqnames = str_remove(seqnames, '>'))
+
+  #clean up temp file and output file
  file.remove(temp_file, out_file)
+
+  #return data frame
  return(stable_cluster)
-}
\ No newline at end of file
+}
--- a/R/pivot_cluster_tbl_wider.R
+++ b/R/pivot_cluster_tbl_wider.R
-# Make wide table of clusters (listing all sequences in each cluster)
+#' Make wide table of clusters (listing all sequences in each cluster)
+#'
+#' @param cluster_tbl A data frame containing the cluster and sequence information
+#'
+#' @return A wide table of clusters with all sequences in each cluster listed
+#'
+#' @import dplyr
+#' @import tidyr
+#' @export
+#'
 pivot_cluster_tbl_wider <- function(cluster_tbl) {
  # First: get names of each cluster
  cluster_names <- cluster_tbl %>%
    dplyr::rename(clus_name = seqnames) %>%
    group_by(cluster) %>%
    slice_head(n = 1)
-  
  # Second: combine names with clusters and reshape tbls
-  cluster_tbl <- left_join(stable_cluster, cluster_names, by = "cluster") %>%
+  cluster_tbl <- left_join(stable_cluster, cluster_names, by = 'cluster') %>%
    group_by(cluster) %>%
-    mutate(num = row_number()) %>% 
+    mutate(num = row_number()) %>%
    pivot_wider(names_from = clus_name, values_from = seqnames, id_cols = num)
-}
\ No newline at end of file
+}
--- a/R/plot_abundance_per_sample.R
+++ b/R/plot_abundance_per_sample.R
-# Specifically intended to plot sequence abundance per sampled within
-# plot_cluster_overview from tbl of sequence abundance
+#' This function creates a plot of sequence abundance per sample from a table of sequence abundance.
+#'
+#' @param tbl_of_abundance A table containing sequence abundance data.
+#' @return A plot displaying sequence abundance per sample.
+#' @import ggplot2
+#' @import scales
+#' @examples
+#' tbl_of_abundance <- data.frame(ID = c("ASV1", "ASV2", "ASV3", "ASV4"),
+#'                                 Sample = c("Sample1", "Sample1", "Sample2", "Sample2"),
+#'                                 count = c(10, 5, 6, 12))
+#' plot_abundance_per_sample(tbl_of_abundance)
+#'
+#' @export
+#' Specifically intended to plot sequence abundance per sampled within plot_cluster_overview from tbl of sequence abundance
 plot_abundance_per_sample <- function(tbl_of_abundance) {
-  ggplot(tbl_of_abundance, aes(y = ID, x = Sample, fill = count/1000)) + 
-    geom_tile() + theme(legend.position = "none",
+  ggplot(tbl_of_abundance, aes(y = ID, x = Sample, fill = count/1000)) +
+    geom_tile() + theme(legend.position = 'none',
                        axis.title = element_blank(),
                        axis.text = element_blank(),
                        axis.ticks = element_blank())  +
-    ggtitle("ASV counts per sample") +
+    ggtitle('ASV counts per sample') +
    scale_fill_viridis_c() +
-    labs(fill = "ASV counts (in thousands)")
-}
\ No newline at end of file
+    labs(fill = 'ASV counts (in thousands)')
+}
--- a/R/plot_abundance_sums_per_sequence.R
+++ b/R/plot_abundance_sums_per_sequence.R
-# Specifically intended to plot total sequence abundance within 
-# plot_cluster_overview from tbl of sequence abundance sums
+#' Plot total sequence abundance within plot_cluster_overview from tbl of sequence abundance sums
+#'
+#' @param tbl_of_sums A table of sequence abundance sums
+#' @return A ggplot object of the total ASV counts and sequence IDs
+#' @examples
+#' tbl_of_sums <- data.frame(ID = c("ASV_001", "ASV_002", "ASV_003"), sum_count = c(1000, 2000, 3000))
+#' plot_abundance_sums_per_sequence(tbl_of_sums)
+#'
+#' @import ggplot2
+#'
+#' @export
 plot_abundance_sums_per_sequence <- function(tbl_of_sums) {
-  ggplot(tbl_of_sums, aes(y = ID, x = sum_count/1000)) + 
-    geom_col() + theme(axis.text.x = element_text(angle = 90,
-                                                  size = 15,
-                                                  vjust = 0.5),
-                       legend.position = "none",
-                       axis.title = element_blank(),
-                       axis.text.y = element_blank(),
-                       axis.ticks.y = element_blank()) +
-    ggtitle("Total ASV counts (in thousands)")
-}
\ No newline at end of file
+  ggplot(tbl_of_sums, aes(y = ID, x = sum_count / 1000)) +
+    geom_col() +
+    theme(axis.text.x = element_text(angle = 90, size = 15, vjust = 0.5),
+          legend.position = 'none',
+          axis.title = element_blank(),
+          axis.text.y = element_blank(),
+          axis.ticks.y = element_blank())+
+    ggtitle('Total ASV counts (in thousands)')
+}
--- a/R/plot_asv_nmds.R
+++ b/R/plot_asv_nmds.R
+#' Create NMDS plot with ggplot2 and centroids
+#'
+#' This function creates an NMDS plot using ggplot2 and can optionally add centroids to the plot.
+#'
+#' @param asv_nmds A data frame created from NMDS analysis with columns "MDS1" and "MDS2".
+#' @param color_by A string indicating which column to use for coloring points on the plot. Default: 'String'.
+#' @param centroids A logical value indicating whether or not to add centroids to the plot. Default: FALSE.
+#'
+#' @return A ggplot2 object containing the NMDS plot.
+#'
+#' @examples
+#' # Assume nmds_df has been created through NMDS analysis
+#' plot_asv_nmds(asv_nmds = nmds_df, color_by = 'Sample', centroids = TRUE)
+#'
+#' @import ggplot2
+#' @importFrom stats aggregate
+#'
+#' @export
 plot_asv_nmds <- function(asv_nmds = my_asv_nmds,
-                          color_by = "String",
+                          color_by = 'String',
                          centroids = FALSE){
+
  # Create NMDS plot with ggplot2 and centroids
-  pl <- ggplot(asv_nmds, aes(x=MDS1, y=MDS2, color= .data[[color_by]])) +
+
+  # Create ggplot object
+  pl <-ggplot(asv_nmds, aes(x=MDS1, y=MDS2, color= .data[[color_by]])) +
    geom_point() +
    theme_classic() +
-    labs(title="NMDS Plot", x="NMDS 1", y="NMDS 2")
+    labs(title='NMDS Plot', x='NMDS 1', y='NMDS 2')
+
+  # If centroids parameter is TRUE, add centroids to the plot
  if(centroids){
-    # Add sample groups to nmds_df
-    nmds_df$Group <- factor(gsub("[0-9]", "", rownames(asv_table)))
+    # Add sample groups to asv_nmds
+    asv_nmds$Group <- factor(gsub('[0-9]', '', rownames(asv_table)))
    # Calculate group centroids
-    centroids_df <- aggregate(nmds_df[, 1:2], by=list(Group=nmds_df$Group), mean)
-    
-    pl <- pl + 
-      geom_point(data=centroids_df, aes(x=NMDS1, y=NMDS2), 
-                 color="black", size=3, shape=21, fill="white")
+    centroids_df <- aggregate(asv_nmds[, 1:2], by=list(Group=asv_nmds$Group), mean)
+
+    # Add centroids to plot
+    pl <- pl +
+      geom_point(data=centroids_df, aes(x=NMDS1, y=NMDS2),
+                 color='black', size=3, shape=21, fill='white')
  }
+
+  # Return ggplot object
  return(pl)
-}
\ No newline at end of file
+}
--- a/R/plot_cluster_dendrogram.R
+++ b/R/plot_cluster_dendrogram.R
-# Generic plotting of tree with title "Dendrogram"
+#' Generic plotting of UPGMA tree using ggtree
+#'
+#' This function plots a dendrogram using the ggtree package.
+#'
+#' @param upgma_tree An object of class 'phylo' representing the tree.
+#'
+#' @import ggtree
+#' @import ggplot2
+#'
+#' @return A dendrogram plot.
+#'
+#' @examples
+#' tree <- ape::read.tree("tree.txt")
+#' plot_cluster_dendrogram(tree)
+#'
+#' @export
 plot_cluster_dendrogram <- function(upgma_tree) {
  ggtree::ggtree(upgma_tree) + ggtree::geom_tiplab(as_ylab = TRUE) +
-    ggtitle("Dendrogram") +
+    ggtitle('Dendrogram') +
    theme(axis.text = element_text(size = 15))
-}
\ No newline at end of file
+}