diff --git a/R/count_clusters.R b/R/count_clusters.R
index a3597ec7daa035748857f83c2d95f6f959fb02db..a0884b4cb2b6277e75e82cf114049402f73a82de 100644
--- a/R/count_clusters.R
+++ b/R/count_clusters.R
@@ -1,11 +1,49 @@
+#' Count Clusters
+#'
+#' This function takes a list of cluster tables and returns a tibble
+#' with cluster counts by threshold.
+#'
+#' @param clus_tbl_list A list of cluster tables.
+#'
+#' @return A tibble with cluster counts by threshold.
+#'
+#' @examples
+#'
+#' set.seed(1)
+#' clus_tbl_list <- list(
+#' data.frame(seqnames = c("chr1", "chr1", "chr2"),
+#' start = c(10, 50, 100), end = c(20, 60, 200), count = c(5, 7, 10)),
+#' data.frame(seqnames = c("chr1", "chr1", "chr2"),
+#' start = c(15, 55, 110), end = c(25, 65, 150), count = c(8, 6, 11))
+#' )
+#' count_clusters(clus_tbl_list)
+#'
+#' @import dplyr
+#' @import purrr
+#' @import tidyr
+#' @export
+
count_clusters <- function(clus_tbl_list){
- #remove non-numeric column from clus_tbl_list
- clus_tbl_num <- map(clus_tbl_list, ~ dplyr::select(., -seqnames))
- clus_counts <- lapply(clus_tbl_num, max)
- names(clus_counts) <- names(clus_tbl_list)
- clus_counts %>%
- bind_rows() %>%
- pivot_longer(everything()) %>%
- mutate(threshold = as.numeric(name)) %>%
- dplyr::rename(cluster_number = value)
-}
\ No newline at end of file
+
+ # Remove the non-numeric column "seqnames" from each cluster table in the list.
+ clus_tbl_num <- map(clus_tbl_list, ~ dplyr::select(., -seqnames))
+
+ # Create a list of the maximum cluster values for each cluster table in clus_tbl_num using lapply()
+ clus_counts <- lapply(clus_tbl_num, max)
+
+ # Assign the names of the original cluster tables to the values in clus_counts
+ names(clus_counts) <- names(clus_tbl_list)
+
+ # Combine the values in clus_counts into a single data frame using bind_rows()
+ # Transform the data frame from wide to long using pivot_longer(), and create
+ # a new column called "threshold" with the values previously used as column names
+ # Rename the "value" column to "cluster_number" using dplyr::rename()
+ clus_counts %>%
+ bind_rows() %>%
+ pivot_longer(everything()) %>%
+ mutate(threshold = as.numeric(name)) %>%
+ dplyr::rename(cluster_number = value)
+
+ # Return the final data frame with the cluster numbers and their maximum values
+ return(clus_counts)
+}
diff --git a/R/define_plateau.R b/R/define_plateau.R
index 1653312b75c81e3cfaa2ff7d1f687f6cb89c07a6..98534ebc203c36698c767a2613058021669a6f16 100644
--- a/R/define_plateau.R
+++ b/R/define_plateau.R
@@ -1,13 +1,38 @@
+#' Define a plateau within cluster counts
+#'
+#' This function takes in a tibble of cluster counts and defines
+#' the plateau within the cluster with the largest threshold window.
+#'
+#' @param cluster_counts A tibble of cluster counts with columns for
+#' cluster_number and threshold.
+#'
+#' @return A named vector with the starting and ending thresholds of the
+#' plateau and the corresponding cluster_number.
+#'
+#' @examples
+#' define_plateau(cluster_counts = cluster_counts_df)
+#'
+#' @import dplyr
+#' @export
+# The following code defines a function called "define_plateau"
define_plateau <- function(cluster_counts){
+ # "cluster_counts" is a tibble of cluster counts passed as a parameter to the function
+ # "clus_plateau" filters the cluster counts by selecting only those with cluster_number greater than or equal to 2
+ # then group the remaining data by cluster_number and calculate the maximum and minimum threshold per cluster_number using the "summarise" function.
+ # This is done to exclude a potentially long tail of single cluster outcomes at high thresholds.
+ # Calculate the difference between the maximum and minimum threshold for each cluster and store in "thresh_win"
+ # "arrange" function arranges the rows in descending order of "thresh_win" and selects the first row using "slice_head"
+ # "clus_plateau" now contains the "plateau_start", "plateau_end", and "cluster_number" values for the selected cluster
clus_plateau <- cluster_counts %>%
- filter(cluster_number >= 2) %>%
- group_by(cluster_number) %>%
- summarise(max_thresh = max(threshold), min_thresh = min(threshold)) %>%
- mutate(thresh_win = max_thresh-min_thresh) %>%
- arrange(desc(thresh_win)) %>%
- slice_head(n = 1) %>%
+ filter(cluster_number >= 2)%>%
+ group_by(cluster_number) %>%
+ summarise(max_thresh = max(threshold), min_thresh = min(threshold)) %>%
+ mutate(thresh_win = max_thresh-min_thresh) %>%
+ arrange(desc(thresh_win)) %>%
+ slice_head(n = 1) %>%
select(min_thresh, max_thresh, cluster_number) %>%
unlist() %>% unname()
- names(clus_plateau) <- c("plateau_start", "plateau_end", "cluster_number")
+ # This line assigns the names "plateau_start", "plateau_end", and "cluster_number" to each of the values in "clus_plateau".
+ names(clus_plateau) <- c('plateau_start', 'plateau_end', 'cluster_number')
return(clus_plateau)
-}
\ No newline at end of file
+}
diff --git a/R/dendrogram_hclust.R b/R/dendrogram_hclust.R
index 08c8f993c2cdc7aa5198a5caed7c44a4cde013d8..87848ca1e628c653892c936a62ce21e0cbf3e911 100644
--- a/R/dendrogram_hclust.R
+++ b/R/dendrogram_hclust.R
@@ -1,11 +1,36 @@
-dendrogram_hclust <- function(data = veganized_tibble, seed = 1, ...){
+#' Generate dendrogram data from numerical data
+#'
+#' This function generates a dendrogram from a hierarchical clustering of a dataset.
+#'
+#' @param data A tibble or data frame suited for vegdist().
+#' @param seed An integer that sets the seed for the random number generator. Default is 1.
+#' @param ... Additional arguments passed to `vegdist`.
+#'
+#' @return A `ggdendro::dendro_data` object, containing data for plotting the dendrogram.
+#'
+#' @import ggdendro
+#' @import vegan
+#' @import stats
+#'
+#' @examples
+#' # Generate dendrogram with default parameters
+#' dendrogram_hclust()
+#'
+#' # Generate dendrogram with custom data and distance metric
+#' daisy_dist <- daisy(iris[,-5], metric = "gower")
+#' dendrogram_hclust(daisy_dist)
+#'
+#' @export
+
+dendrogram_hclust <- function(data = veganized_tibble, seed = 1, ...) {
require(ggdendro)
set.seed(seed)
-
+
# make hierarchical cluster from vegdist matrix and extract data for plotting
- dat <- vegdist(data, ...) %>%
- hclust() %>%
- as.dendrogram() %>%
- ggdendro::dendro_data(type = "rectangle")
+ dat <- vegdist(data, ...) %>%
+ hclust() %>%
+ as.dendrogram() %>%
+ ggdendro::dendro_data(type = 'rectangle')
+
return(dat)
-}
\ No newline at end of file
+}
diff --git a/R/export_longest_reading_frame.R b/R/export_longest_reading_frame.R
index 52908568cca74c4c6dfb65f1448fd09161438172..64e3d6e975b7e575254609bc19af35775f30d061 100644
--- a/R/export_longest_reading_frame.R
+++ b/R/export_longest_reading_frame.R
@@ -1,29 +1,67 @@
-export_longest_reading_frame <- function(clustered_reading_frames_tbl = tbl,
- seqs = myDNAStringSet,
- outpath = path){
+#' Export Longest Reading Frame
+#'
+#' Export the longest reading frames from a DNA sequence set to a fasta file containing amino acid sequences.
+#'
+#' @param clustered_reading_frames_tbl tibble of clustered reading frames
+#' @param seqs DNAStringSet object with DNA sequences
+#' @param outpath character string of output directory path
+#' @param verbose logical indicating whether to print progress updates
+#'
+#' @return Messages indicating completion of exporting the longest reading frames to a fasta file format containing amino acid sequences.
+#'
+#' @examples
+#' export_longest_reading_frame(clustered_reading_frames_tbl, myDNAStringSet, myDirPath, TRUE)
+#'
+#' @import Biostrings
+#' @import dplyr
+#' @import tidyr
+#' @import utils
+#'
+#' @export
+# Define a function that exports the longest reading frames
+export_longest_reading_frame <- function(clustered_reading_frames_tbl = tbl, # function argument for clustered_reading_frame table
+ seqs = myDNAStringSet, # function argument for DNA sequence set
+ outpath = path, # function argument for output file path
+ verbose = FALSE){ # function argument for verbosity
+
+ # check if the output path exists, create it if it doesn't
if(!dir.exists(file.path(outpath))){
dir.create(outpath)
}
-
+
+ # translate the DNA sequences, count stop codons, and add a column for reading frame as a factor
peps_tbl <- translate_and_count_stops(seqs) %>%
mutate(rframe = as.factor(rframe))
-
+
+ # extract the unique cluster names from the clustered_reading_frames_tbl table
clus_name <- clustered_reading_frames_tbl %>%
select(cluster_name) %>% unique() %>% unlist() %>% unname()
-
+
+ # join the clustered_reading_frames_tbl table and the peps_tbl table by shared columns
temp_seq_tbl <- left_join(clustered_reading_frames_tbl, peps_tbl,
- by = c("seqnames", "rframe", "fos_width",
- "first_stop", "stop_count", "hrf_width"))
+ by = c('seqnames', 'rframe', 'fos_width',
+ 'first_stop', 'stop_count', 'hrf_width'))
+
+ # create an amino acid sequence set from the 'seq' column of the temp_seq_tbl table
temp_aa_seqs <- AAStringSet(x = as.character(
- temp_seq_tbl["seq"] %>% unlist() %>% unname()))
- names(temp_aa_seqs) <- as.character(temp_seq_tbl["seqnames"] %>% unlist() %>%
+ temp_seq_tbl['seq'] %>% unlist() %>% unname()))
+
+ # assign names to the sequences in the temp_aa_seqs sequence set
+ names(temp_aa_seqs) <- as.character(temp_seq_tbl['seqnames'] %>% unlist() %>%
unname())
- Biostrings::writeXStringSet(temp_aa_seqs,
+
+ # write the amino acid sequences to a fasta file
+ Biostrings::writeXStringSet(temp_aa_seqs,
filepath = file.path(outpath, paste0(
- clus_name, "_cluster_longest_AA_seqs.fa")))
- return(paste("AA sequences for cluster", clus_name, "written to file",
- file.path(outpath, paste0(
- clus_name, "_cluster_longest_AA_seqs.fa"))
- ))
-
-}
\ No newline at end of file
+ clus_name, '_cluster_longest_AA_seqs.fa')))
+
+ # return a message confirming the export and file path if verbose is true
+ if(verbose){
+ return(paste('AA sequences for cluster', clus_name, 'written to file',
+ file.path(outpath, paste0(
+ clus_name, '_cluster_longest_AA_seqs.fa'))
+ ))
+ }else{
+ return(paste('AA sequences saved to file'))
+ }
+}
diff --git a/R/find_contiguous_multi_repeats.R b/R/find_contiguous_multi_repeats.R
index 8eb10970cfeb82cae4630c5718dd43459c489c9c..7661b0930a33640f588e808cc54d950a7c7e1cc8 100644
--- a/R/find_contiguous_multi_repeats.R
+++ b/R/find_contiguous_multi_repeats.R
@@ -1,19 +1,47 @@
-## Repeat counter ----
+#' Find contiguous multi-repeats in a set of sequences
+#'
+#' This function finds contiguous multi-repeats in a set of sequences,
+#' where the repeat sequence is made up of n consecutive copies of a given string.
+#'
+#' @param sequences A DNAStringSet or AAStringSet object containing the sequences
+#' to search for multi-repeats (default: DNAStringSet)
+#' @param repeat_sequence A character string representing the repeat sequence to search
+#' for (default: 'string')
+#' @param singlet_count An integer or vector of integers specifying the number
+#' of single occurrences of the repeat, or the maximum expected number of
+#' single occurrences (default: NULL)
+#'
+#' @return A numeric vector containing the repeat count for each input sequence
+#' (i.e., the number of contiguous multi-repeats found)
+#'
+#' @export
+#'
+#' @examples
+#' # Create example DNAStringSet
+#' sequences <- DNAStringSet(c("ATCGATATCGATCGATCG", "GCTAGCTAGCTAGCTAGCTA"))
+#' find_contiguous_multi_repeats(sequences, 'AT', 3)
+#'
+#' # Expected output: c(2, 1)
+#'
+#' @import stringr
+#' @import Biostrings
+#'
+#' @keywords sequence, repeats
+#'
find_contiguous_multi_repeats <- function(sequences = DNAStringSet,
- repeat_sequence = "string",
- singlet_count) {
-
-
+ repeat_sequence = 'string',
+ singlet_count = 100) {
+
# Make vector of zeros for minimum repeat count, make 1 if singlet was found
repeat_count <- replicate(length(sequences), 0)
repeat_count[singlet_count > 0] <- 1
-
+
# Iterate n to the highest observed singlet count, overwrite repeat count if
# exactly one n-mer of repeat is found in the sequence
for (n in seq(1, max(singlet_count))) {
- contig_rep <- paste(replicate(n, repeat_sequence), collapse = "")
+ contig_rep <- paste(replicate(n, repeat_sequence), collapse = '')
contig_rep_count <- str_count(as.character(sequences), contig_rep)
- repeat_count[contig_rep_count < singlet_count & contig_rep_count > 0] <- n
+ repeat_count[contig_rep_count < singlet_count & contig_rep_count > 0] <-n
}
return(repeat_count)
-}
\ No newline at end of file
+}
diff --git a/R/find_longest_hrf.R b/R/find_longest_hrf.R
index cb2e43ca238aaff6f0ec32058df06cbed7dc3118..c5d5da9ea72115114c8fcd983e48689165f60a1a 100644
--- a/R/find_longest_hrf.R
+++ b/R/find_longest_hrf.R
@@ -1,5 +1,20 @@
+#' Find longest half-open reading frame
+#'
+#' This function takes a DNAStringSet as input and returns the longest half-open
+#' reading frames as a data table. Wrapper function around translate_and_count_stops()
+#'
+#' @param seqs A DNAStringSet object with nucleotide sequences.
+#'
+#' @return A data table with the longest open reading frames.
+#'
+#' @examples
+#' find_longest_hrf(seqs)
+#'
+#' @import dplyr
+#'
+#' @export
find_longest_hrf <- function(seqs = DNAStringSet){
peps_tbl <- translate_and_count_stops(seqs)
hrfs <- select(peps_tbl, -seq)
return(hrfs)
-}
\ No newline at end of file
+}
diff --git a/R/find_longest_orf.R b/R/find_longest_orf.R
index 121593e542375878e919a75175806e62025aab46..1e5fdee18d3e94b0356959ffd8376e8884cb0ca3 100644
--- a/R/find_longest_orf.R
+++ b/R/find_longest_orf.R
@@ -1,10 +1,37 @@
-find_longest_orf <-function(seqs = DNAStringSet) {
+#' Find the longest ORF in a DNA sequence set
+#'
+#' This function identifies the longest open reading frame (ORF) in a DNA sequence set.
+#' Wrapper around \code{\link{findORFs}}
+#'
+#' @param seqs A DNAStringSet object containing the DNA sequences to search for ORFs.
+#'
+#' @return A tibble containing the start and end positions, strand, and length of the longest ORF in each sequence.
+#'
+#' @import Biostrings
+#' @importFrom GenomicRanges GRanges
+#' @import tibble
+#' @import dplyr
+#' @export
+#'
+#' @examples
+#' seqs <- DNAStringSet(c("ATGAGTTCGAAATGGCGTTGAA", "GGGGGCTCGAGCTAGC"))
+#' find_longest_orf(seqs)
+#'
+#' @seealso \code{\link{findORFs}}
+#'
+
+find_longest_orf <- function(seqs = DNAStringSet) {
+ # Find ORFs in the sequences, return longest ORF, and convert to a vector
orfs <- findORFs(seqs, longestORF = TRUE, startCodon = startDefinition(6)) %>%
unlist(use.names = TRUE)
- orfs <- GRanges(seqnames = names(seqs)[as.integer(names(orfs))],
- ranges(orfs), strand = "+") %>%
+ # Convert the ORFs to a GRanges object
+ orfs <- GRanges(seqnames = names(seqs)[as.integer(names(orfs))],
+ ranges(orfs), strand = '+') %>%
+ # Convert to a tibble and add a column for reading frame
as_tibble() %>%
mutate(rframe = ((start -1) %% 3 ) + 1) %>%
+ # Rename the width column to orf_width
dplyr::rename(orf_width = width)
+ # Return the ORFs
return(orfs)
-}
\ No newline at end of file
+}
diff --git a/R/find_longest_reading_frames.R b/R/find_longest_reading_frames.R
index 526d329dfc8901099a8b4d81039b4444ef8f79c7..de34c89a96f7ebf61372d27cb87c12b943c66431 100644
--- a/R/find_longest_reading_frames.R
+++ b/R/find_longest_reading_frames.R
@@ -1,8 +1,23 @@
+#' Reading frame finder (longest orf or hrf)
+#'
+#' This function finds the longest reading frames in a set of DNA sequences.
+#' It combines the results from finding the longest open reading frames (orfs)
+#' and the longest half reading frames (hrfs) and determines the reading frame
+#' with the biggest possible translation.
+#'
+#' @param seqs A DNAStringSet object containing the input DNA sequences.
+#'
+#' @return A data frame containing the longest reading frames for each sequence.
+#' The data frame includes the sequence names, reading frame, and the width of the reading frame.
+#'
+#' @import dplyr, tidyr
+#'
+#' @export
## Reading frame finder (longest orf or hrf)
find_longest_reading_frames <- function(seqs = myDNAStringSet){
orfs <- find_longest_orf(seqs)
hrfs <- find_longest_hrf(seqs)
- # Combine orf and hrf tables and find the reading frame with the biggest
+ # Combine orf and hrf tables and find the reading frame with the biggest
# possible translation
orf_and_hrf <- full_join(orfs, hrfs, by = join_by(seqnames, rframe)) %>%
mutate(orf_width = replace_na(orf_width, 0)) %>%
@@ -13,12 +28,12 @@ find_longest_reading_frames <- function(seqs = myDNAStringSet){
"Five-open (no start)", max_type)) %>%
mutate(max_type = ifelse(fos_width > hrf_width & max_type == "open",
"Three-open (no stop)", max_type))
-
+
max_rf <- orf_and_hrf %>%
group_by(seqnames) %>%
slice_max(max_width, n = 1, with_ties = FALSE) %>%
arrange(factor(seqnames, levels = names(seqs))) %>%
mutate(rframe = factor(rframe, levels = c("1", "2", "3")))
-
+
return(max_rf)
-}
\ No newline at end of file
+}
diff --git a/R/find_repeat_positions.R b/R/find_repeat_positions.R
index 18c5d652edbad6fa080f151970235a7b6ecf227a..603fa53a0218ba0c791dd676d3b2171784fceed1 100644
--- a/R/find_repeat_positions.R
+++ b/R/find_repeat_positions.R
@@ -1,19 +1,39 @@
-find_repeat_positions <- function(sequences = DNAStringSet,
- repeat_sequence = "string"){
- repeat_positions <- str_locate_all(as.character(sequences), repeat_sequence)
- names(repeat_positions) <- names(sequences)
+#' Find Repeat Positions
+#'
+#' This function finds the positions of a given repeat sequence within a set of DNA sequences. It returns a data frame with the sequence name, start and end positions, and information used in plotting downstream.
+#'
+#' @param sequences A DNAStringSet object containing the DNA sequences to search for repeats.
+#' @param repeat_sequence A string specifying the repeat sequence to search for.
+#'
+#' @return A data frame with columns: seqname, start, end, fragment, and plot_intensity.
+#'
+#' @import stringr
+#' @import dplyr
+#' @import tibble
+#'
+#' @export
+#'
+#' @examples
+#' sequences <- DNAStringSet(c("AGTCAGT",
+#' "ACGTAGT",
+#' "AGTCGAT"))
+#' find_repeat_positions(sequences, "AGT")
+find_repeat_positions <- function(sequences = DNAStringSet, repeat_sequence = 'string'){
+
+ repeat_positions <- str_locate_all(as.character(sequences), repeat_sequence)names(repeat_positions) <- names(sequences)
repeat_positions <- lapply(repeat_positions, as_tibble) %>%
- bind_rows(.id = "seqname") %>%
+ bind_rows(.id = 'seqname') %>%
mutate(fragment = repeat_sequence)
- whole_sequence_positions = tibble("seqname" = names(sequences),
- "start" = replicate(length(sequences), 1),
- "end" = str_length(sequences),
- "fragment" = replicate(
- length(sequences), "Full sequence"))
+
+ whole_sequence_positions = tibble('seqname' = names(sequences),
+ 'start' = replicate(length(sequences), 1),
+ 'end' = str_length(sequences),
+ 'fragment' = replicate(length(sequences), 'Full sequence'))
+
out <- bind_rows(repeat_positions, whole_sequence_positions) %>%
mutate(plot_intensity = ifelse(fragment == repeat_sequence, 4, 3)) %>%
mutate(seqname = factor(seqname, levels = names(sequences))) %>%
- mutate(fragment = factor(fragment, levels = c("Full sequence",
- repeat_sequence)))
+ mutate(fragment = factor(fragment, levels = c('Full sequence', repeat_sequence)))
+
return(out)
-}
\ No newline at end of file
+}
diff --git a/R/kmer_based_distance_matrix.R b/R/kmer_based_distance_matrix.R
index 3d445324ecdb40d2554d11fb8e67210c036e2c64..4a9737074cba1a5174db678daa62e5f030ffd7a7 100644
--- a/R/kmer_based_distance_matrix.R
+++ b/R/kmer_based_distance_matrix.R
@@ -1,5 +1,18 @@
-# Convert to bin format and get distance matrix using kmers
-kmer_based_distance_matrix <- function (seqs = DNAStringSet) {
+#' Convert to bin format and get distance matrix using kmers
+#'
+#' This function takes in a DNAStringSet and converts it to a bin format using ape::as.DNAbin function. It then calculates the distance matrix using the kdistance function from XXXX and returns it as a matrix.
+#'
+#' @param seqs DNAStringSet containing the DNA sequences
+#'
+#' @return A distance matrix in bin format.
+#'
+#' @importFrom ape as.DNAbin
+#'
+#' @importFrom Biostrings DNAStringSet
+#'
+#' @export
+#'
+kmer_based_distance_matrix <- function (seqs) {
seqbins <- ape::as.DNAbin(seqs)
as.matrix(kdistance(seqbins))
-}
\ No newline at end of file
+}
diff --git a/R/meshclustR.R b/R/meshclustR.R
index d5d41ecb4fa4c99224dde4904d68912e6c199769..6b2517cbffd117435c26ffed86f964ecdfe96f2b 100644
--- a/R/meshclustR.R
+++ b/R/meshclustR.R
@@ -1,21 +1,54 @@
-#write temporary file to cluster
-meshclustR <- function(seqs = MyDNAStringSet,
- meshclust_bin = meshclust, filepath = path){
- temp_file <- file.path(filepath, "cluster_me.fa")
+#' Write temporary file to cluster using MeshclustR
+#'
+#' This function writes a temporary file to perform a clustering analysis on a set of DNA sequences.
+#' The clustering is done using the \href{https://github.com/BioinformaticsToolsmith/MeShClust}{Meshclust commandline} tool.
+#'
+#' James, Benjamin T. et al. (2018),
+#' MeShClust: an intelligent tool for clustering DNA sequences.
+#' Nucleic Acids Research, gky315.
+#'
+#' @param seqs A DNAStringSet object containing the DNA sequences to be clustered.
+#' @param meshclust_bin The path to the Meshclust executable (run
+#' `which meshclust` on commandline). Default is set to "meshclust".
+#' @param filepath The path to the directory where the temporary and log
+#' files will be saved.
+#' @return A data frame with information regarding the clustering analysis.
+#'
+#' @examples
+#' meshclustR(seqs = MyDNAStringSet, meshclust_bin = meshclust, filepath = path)
+#' @import Biostrings
+#' @import readr
+#' @import magrittr
+#' @import stringr
+#' @importFrom tools file_path_sans_ext
+#' @export
+
+meshclustR <- function(seqs = MyDNAStringSet,
+ meshclust_bin = meshclust,
+ filepath = path){
+
+ #create temporary file path
+ temp_file <- file.path(filepath, 'cluster_me.fa')
writeXStringSet(seqs, filepath = temp_file)
+
+ #create output file name
out_file <- file.path(filepath, paste0(
- tools::file_path_sans_ext(basename(temp_file)),".txt"))
-
- ## Meshclust
- system2(meshclust, args = c("-d", temp_file,
- "-o", out_file))
-
- stable_cluster <- read_delim(out_file, delim = "\t",
- col_names = c("cluster", "seqnames",
- "identity_with_center", "cluster_class"),
- col_types = "fcdc") %>%
- mutate(seqnames = str_remove(seqnames, ">"))
- #clean up temp file
+ tools::file_path_sans_ext(basename(temp_file)),'.txt'))
+
+ #run Meshclust
+ system2(meshclust_bin, args = c('-d', temp_file,
+ '-o', out_file))
+
+ #read output file and parse
+ stable_cluster <- read_delim(out_file, delim = '\t',
+ col_names = c('cluster', 'seqnames',
+ 'identity_with_center', 'cluster_class'),
+ col_types = 'fcdc') %>%
+ mutate(seqnames = str_remove(seqnames, '>'))
+
+ #clean up temp file and output file
file.remove(temp_file, out_file)
+
+ #return data frame
return(stable_cluster)
-}
\ No newline at end of file
+}
diff --git a/R/pivot_cluster_tbl_wider.R b/R/pivot_cluster_tbl_wider.R
index aad01e26bb8ef3b614178177ab9a73654934e182..c42839631f180074205ed76eda6af93ba9a06cca 100644
--- a/R/pivot_cluster_tbl_wider.R
+++ b/R/pivot_cluster_tbl_wider.R
@@ -1,14 +1,22 @@
-# Make wide table of clusters (listing all sequences in each cluster)
+#' Make wide table of clusters (listing all sequences in each cluster)
+#'
+#' @param cluster_tbl A data frame containing the cluster and sequence information
+#'
+#' @return A wide table of clusters with all sequences in each cluster listed
+#'
+#' @import dplyr
+#' @import tidyr
+#' @export
+#'
pivot_cluster_tbl_wider <- function(cluster_tbl) {
# First: get names of each cluster
cluster_names <- cluster_tbl %>%
dplyr::rename(clus_name = seqnames) %>%
group_by(cluster) %>%
slice_head(n = 1)
-
# Second: combine names with clusters and reshape tbls
- cluster_tbl <- left_join(stable_cluster, cluster_names, by = "cluster") %>%
+ cluster_tbl <- left_join(stable_cluster, cluster_names, by = 'cluster') %>%
group_by(cluster) %>%
- mutate(num = row_number()) %>%
+ mutate(num = row_number()) %>%
pivot_wider(names_from = clus_name, values_from = seqnames, id_cols = num)
-}
\ No newline at end of file
+}
diff --git a/R/plot_abundance_per_sample.R b/R/plot_abundance_per_sample.R
index 3b2aa744e818ad9b6610763a2ff90cb03035529a..48dfee3fe230590b13ace238db1558e7717a4fa1 100644
--- a/R/plot_abundance_per_sample.R
+++ b/R/plot_abundance_per_sample.R
@@ -1,12 +1,24 @@
-# Specifically intended to plot sequence abundance per sampled within
-# plot_cluster_overview from tbl of sequence abundance
+#' This function creates a plot of sequence abundance per sample from a table of sequence abundance.
+#'
+#' @param tbl_of_abundance A table containing sequence abundance data.
+#' @return A plot displaying sequence abundance per sample.
+#' @import ggplot2
+#' @import scales
+#' @examples
+#' tbl_of_abundance <- data.frame(ID = c("ASV1", "ASV2", "ASV3", "ASV4"),
+#' Sample = c("Sample1", "Sample1", "Sample2", "Sample2"),
+#' count = c(10, 5, 6, 12))
+#' plot_abundance_per_sample(tbl_of_abundance)
+#'
+#' @export
+#' Specifically intended to plot sequence abundance per sampled within plot_cluster_overview from tbl of sequence abundance
plot_abundance_per_sample <- function(tbl_of_abundance) {
- ggplot(tbl_of_abundance, aes(y = ID, x = Sample, fill = count/1000)) +
- geom_tile() + theme(legend.position = "none",
+ ggplot(tbl_of_abundance, aes(y = ID, x = Sample, fill = count/1000)) +
+ geom_tile() + theme(legend.position = 'none',
axis.title = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank()) +
- ggtitle("ASV counts per sample") +
+ ggtitle('ASV counts per sample') +
scale_fill_viridis_c() +
- labs(fill = "ASV counts (in thousands)")
-}
\ No newline at end of file
+ labs(fill = 'ASV counts (in thousands)')
+}
diff --git a/R/plot_abundance_sums_per_sequence.R b/R/plot_abundance_sums_per_sequence.R
index feb1a2cd4eda27fdb5d8b3107344ad7151e78ae1..f871d709486cd467511d6a4307b79ca4508c6c66 100644
--- a/R/plot_abundance_sums_per_sequence.R
+++ b/R/plot_abundance_sums_per_sequence.R
@@ -1,13 +1,21 @@
-# Specifically intended to plot total sequence abundance within
-# plot_cluster_overview from tbl of sequence abundance sums
+#' Plot total sequence abundance within plot_cluster_overview from tbl of sequence abundance sums
+#'
+#' @param tbl_of_sums A table of sequence abundance sums
+#' @return A ggplot object of the total ASV counts and sequence IDs
+#' @examples
+#' tbl_of_sums <- data.frame(ID = c("ASV_001", "ASV_002", "ASV_003"), sum_count = c(1000, 2000, 3000))
+#' plot_abundance_sums_per_sequence(tbl_of_sums)
+#'
+#' @import ggplot2
+#'
+#' @export
plot_abundance_sums_per_sequence <- function(tbl_of_sums) {
- ggplot(tbl_of_sums, aes(y = ID, x = sum_count/1000)) +
- geom_col() + theme(axis.text.x = element_text(angle = 90,
- size = 15,
- vjust = 0.5),
- legend.position = "none",
- axis.title = element_blank(),
- axis.text.y = element_blank(),
- axis.ticks.y = element_blank()) +
- ggtitle("Total ASV counts (in thousands)")
-}
\ No newline at end of file
+ ggplot(tbl_of_sums, aes(y = ID, x = sum_count / 1000)) +
+ geom_col() +
+ theme(axis.text.x = element_text(angle = 90, size = 15, vjust = 0.5),
+ legend.position = 'none',
+ axis.title = element_blank(),
+ axis.text.y = element_blank(),
+ axis.ticks.y = element_blank())+
+ ggtitle('Total ASV counts (in thousands)')
+}
diff --git a/R/plot_asv_nmds.R b/R/plot_asv_nmds.R
index 7c1391b0e65658be8735d351e9725236827f87c6..1b62a215dfe6cce35ef5ff4bc60cb3dc05b79ea8 100644
--- a/R/plot_asv_nmds.R
+++ b/R/plot_asv_nmds.R
@@ -1,20 +1,46 @@
+#' Create NMDS plot with ggplot2 and centroids
+#'
+#' This function creates an NMDS plot using ggplot2 and can optionally add centroids to the plot.
+#'
+#' @param asv_nmds A data frame created from NMDS analysis with columns "MDS1" and "MDS2".
+#' @param color_by A string indicating which column to use for coloring points on the plot. Default: 'String'.
+#' @param centroids A logical value indicating whether or not to add centroids to the plot. Default: FALSE.
+#'
+#' @return A ggplot2 object containing the NMDS plot.
+#'
+#' @examples
+#' # Assume nmds_df has been created through NMDS analysis
+#' plot_asv_nmds(asv_nmds = nmds_df, color_by = 'Sample', centroids = TRUE)
+#'
+#' @import ggplot2
+#' @importFrom stats aggregate
+#'
+#' @export
plot_asv_nmds <- function(asv_nmds = my_asv_nmds,
- color_by = "String",
+ color_by = 'String',
centroids = FALSE){
+
# Create NMDS plot with ggplot2 and centroids
- pl <- ggplot(asv_nmds, aes(x=MDS1, y=MDS2, color= .data[[color_by]])) +
+
+ # Create ggplot object
+ pl <-ggplot(asv_nmds, aes(x=MDS1, y=MDS2, color= .data[[color_by]])) +
geom_point() +
theme_classic() +
- labs(title="NMDS Plot", x="NMDS 1", y="NMDS 2")
+ labs(title='NMDS Plot', x='NMDS 1', y='NMDS 2')
+
+ # If centroids parameter is TRUE, add centroids to the plot
if(centroids){
- # Add sample groups to nmds_df
- nmds_df$Group <- factor(gsub("[0-9]", "", rownames(asv_table)))
+ # Add sample groups to asv_nmds
+ asv_nmds$Group <- factor(gsub('[0-9]', '', rownames(asv_table)))
# Calculate group centroids
- centroids_df <- aggregate(nmds_df[, 1:2], by=list(Group=nmds_df$Group), mean)
-
- pl <- pl +
- geom_point(data=centroids_df, aes(x=NMDS1, y=NMDS2),
- color="black", size=3, shape=21, fill="white")
+ centroids_df <- aggregate(asv_nmds[, 1:2], by=list(Group=asv_nmds$Group), mean)
+
+ # Add centroids to plot
+ pl <- pl +
+ geom_point(data=centroids_df, aes(x=NMDS1, y=NMDS2),
+ color='black', size=3, shape=21, fill='white')
}
+
+ # Return ggplot object
return(pl)
-}
\ No newline at end of file
+}
diff --git a/R/plot_cluster_dendrogram.R b/R/plot_cluster_dendrogram.R
index 1362fbd88cc84268edbff136887cb678494e6e37..adc21f717188200cc58bf81342d8f577cac994e4 100644
--- a/R/plot_cluster_dendrogram.R
+++ b/R/plot_cluster_dendrogram.R
@@ -1,6 +1,21 @@
-# Generic plotting of tree with title "Dendrogram"
+#' Generic plotting of UPGMA tree using ggtree
+#'
+#' This function plots a dendrogram using the ggtree package.
+#'
+#' @param upgma_tree An object of class 'phylo' representing the tree.
+#'
+#' @import ggtree
+#' @import ggplot2
+#'
+#' @return A dendrogram plot.
+#'
+#' @examples
+#' tree <- ape::read.tree("tree.txt")
+#' plot_cluster_dendrogram(tree)
+#'
+#' @export
plot_cluster_dendrogram <- function(upgma_tree) {
ggtree::ggtree(upgma_tree) + ggtree::geom_tiplab(as_ylab = TRUE) +
- ggtitle("Dendrogram") +
+ ggtitle('Dendrogram') +
theme(axis.text = element_text(size = 15))
-}
\ No newline at end of file
+}