diff --git a/R/count_clusters.R b/R/count_clusters.R index a3597ec7daa035748857f83c2d95f6f959fb02db..a0884b4cb2b6277e75e82cf114049402f73a82de 100644 --- a/R/count_clusters.R +++ b/R/count_clusters.R @@ -1,11 +1,49 @@ +#' Count Clusters +#' +#' This function takes a list of cluster tables and returns a tibble +#' with cluster counts by threshold. +#' +#' @param clus_tbl_list A list of cluster tables. +#' +#' @return A tibble with cluster counts by threshold. +#' +#' @examples +#' +#' set.seed(1) +#' clus_tbl_list <- list( +#' data.frame(seqnames = c("chr1", "chr1", "chr2"), +#' start = c(10, 50, 100), end = c(20, 60, 200), count = c(5, 7, 10)), +#' data.frame(seqnames = c("chr1", "chr1", "chr2"), +#' start = c(15, 55, 110), end = c(25, 65, 150), count = c(8, 6, 11)) +#' ) +#' count_clusters(clus_tbl_list) +#' +#' @import dplyr +#' @import purrr +#' @import tidyr +#' @export + count_clusters <- function(clus_tbl_list){ - #remove non-numeric column from clus_tbl_list - clus_tbl_num <- map(clus_tbl_list, ~ dplyr::select(., -seqnames)) - clus_counts <- lapply(clus_tbl_num, max) - names(clus_counts) <- names(clus_tbl_list) - clus_counts %>% - bind_rows() %>% - pivot_longer(everything()) %>% - mutate(threshold = as.numeric(name)) %>% - dplyr::rename(cluster_number = value) -} \ No newline at end of file + + # Remove the non-numeric column "seqnames" from each cluster table in the list. + clus_tbl_num <- map(clus_tbl_list, ~ dplyr::select(., -seqnames)) + + # Create a list of the maximum cluster values for each cluster table in clus_tbl_num using lapply() + clus_counts <- lapply(clus_tbl_num, max) + + # Assign the names of the original cluster tables to the values in clus_counts + names(clus_counts) <- names(clus_tbl_list) + + # Combine the values in clus_counts into a single data frame using bind_rows() + # Transform the data frame from wide to long using pivot_longer(), and create + # a new column called "threshold" with the values previously used as column names + # Rename the "value" column to "cluster_number" using dplyr::rename() + clus_counts %>% + bind_rows() %>% + pivot_longer(everything()) %>% + mutate(threshold = as.numeric(name)) %>% + dplyr::rename(cluster_number = value) + + # Return the final data frame with the cluster numbers and their maximum values + return(clus_counts) +} diff --git a/R/define_plateau.R b/R/define_plateau.R index 1653312b75c81e3cfaa2ff7d1f687f6cb89c07a6..98534ebc203c36698c767a2613058021669a6f16 100644 --- a/R/define_plateau.R +++ b/R/define_plateau.R @@ -1,13 +1,38 @@ +#' Define a plateau within cluster counts +#' +#' This function takes in a tibble of cluster counts and defines +#' the plateau within the cluster with the largest threshold window. +#' +#' @param cluster_counts A tibble of cluster counts with columns for +#' cluster_number and threshold. +#' +#' @return A named vector with the starting and ending thresholds of the +#' plateau and the corresponding cluster_number. +#' +#' @examples +#' define_plateau(cluster_counts = cluster_counts_df) +#' +#' @import dplyr +#' @export +# The following code defines a function called "define_plateau" define_plateau <- function(cluster_counts){ + # "cluster_counts" is a tibble of cluster counts passed as a parameter to the function + # "clus_plateau" filters the cluster counts by selecting only those with cluster_number greater than or equal to 2 + # then group the remaining data by cluster_number and calculate the maximum and minimum threshold per cluster_number using the "summarise" function. + # This is done to exclude a potentially long tail of single cluster outcomes at high thresholds. + # Calculate the difference between the maximum and minimum threshold for each cluster and store in "thresh_win" + # "arrange" function arranges the rows in descending order of "thresh_win" and selects the first row using "slice_head" + # "clus_plateau" now contains the "plateau_start", "plateau_end", and "cluster_number" values for the selected cluster clus_plateau <- cluster_counts %>% - filter(cluster_number >= 2) %>% - group_by(cluster_number) %>% - summarise(max_thresh = max(threshold), min_thresh = min(threshold)) %>% - mutate(thresh_win = max_thresh-min_thresh) %>% - arrange(desc(thresh_win)) %>% - slice_head(n = 1) %>% + filter(cluster_number >= 2)%>% + group_by(cluster_number) %>% + summarise(max_thresh = max(threshold), min_thresh = min(threshold)) %>% + mutate(thresh_win = max_thresh-min_thresh) %>% + arrange(desc(thresh_win)) %>% + slice_head(n = 1) %>% select(min_thresh, max_thresh, cluster_number) %>% unlist() %>% unname() - names(clus_plateau) <- c("plateau_start", "plateau_end", "cluster_number") + # This line assigns the names "plateau_start", "plateau_end", and "cluster_number" to each of the values in "clus_plateau". + names(clus_plateau) <- c('plateau_start', 'plateau_end', 'cluster_number') return(clus_plateau) -} \ No newline at end of file +} diff --git a/R/dendrogram_hclust.R b/R/dendrogram_hclust.R index 08c8f993c2cdc7aa5198a5caed7c44a4cde013d8..87848ca1e628c653892c936a62ce21e0cbf3e911 100644 --- a/R/dendrogram_hclust.R +++ b/R/dendrogram_hclust.R @@ -1,11 +1,36 @@ -dendrogram_hclust <- function(data = veganized_tibble, seed = 1, ...){ +#' Generate dendrogram data from numerical data +#' +#' This function generates a dendrogram from a hierarchical clustering of a dataset. +#' +#' @param data A tibble or data frame suited for vegdist(). +#' @param seed An integer that sets the seed for the random number generator. Default is 1. +#' @param ... Additional arguments passed to `vegdist`. +#' +#' @return A `ggdendro::dendro_data` object, containing data for plotting the dendrogram. +#' +#' @import ggdendro +#' @import vegan +#' @import stats +#' +#' @examples +#' # Generate dendrogram with default parameters +#' dendrogram_hclust() +#' +#' # Generate dendrogram with custom data and distance metric +#' daisy_dist <- daisy(iris[,-5], metric = "gower") +#' dendrogram_hclust(daisy_dist) +#' +#' @export + +dendrogram_hclust <- function(data = veganized_tibble, seed = 1, ...) { require(ggdendro) set.seed(seed) - + # make hierarchical cluster from vegdist matrix and extract data for plotting - dat <- vegdist(data, ...) %>% - hclust() %>% - as.dendrogram() %>% - ggdendro::dendro_data(type = "rectangle") + dat <- vegdist(data, ...) %>% + hclust() %>% + as.dendrogram() %>% + ggdendro::dendro_data(type = 'rectangle') + return(dat) -} \ No newline at end of file +} diff --git a/R/export_longest_reading_frame.R b/R/export_longest_reading_frame.R index 52908568cca74c4c6dfb65f1448fd09161438172..64e3d6e975b7e575254609bc19af35775f30d061 100644 --- a/R/export_longest_reading_frame.R +++ b/R/export_longest_reading_frame.R @@ -1,29 +1,67 @@ -export_longest_reading_frame <- function(clustered_reading_frames_tbl = tbl, - seqs = myDNAStringSet, - outpath = path){ +#' Export Longest Reading Frame +#' +#' Export the longest reading frames from a DNA sequence set to a fasta file containing amino acid sequences. +#' +#' @param clustered_reading_frames_tbl tibble of clustered reading frames +#' @param seqs DNAStringSet object with DNA sequences +#' @param outpath character string of output directory path +#' @param verbose logical indicating whether to print progress updates +#' +#' @return Messages indicating completion of exporting the longest reading frames to a fasta file format containing amino acid sequences. +#' +#' @examples +#' export_longest_reading_frame(clustered_reading_frames_tbl, myDNAStringSet, myDirPath, TRUE) +#' +#' @import Biostrings +#' @import dplyr +#' @import tidyr +#' @import utils +#' +#' @export +# Define a function that exports the longest reading frames +export_longest_reading_frame <- function(clustered_reading_frames_tbl = tbl, # function argument for clustered_reading_frame table + seqs = myDNAStringSet, # function argument for DNA sequence set + outpath = path, # function argument for output file path + verbose = FALSE){ # function argument for verbosity + + # check if the output path exists, create it if it doesn't if(!dir.exists(file.path(outpath))){ dir.create(outpath) } - + + # translate the DNA sequences, count stop codons, and add a column for reading frame as a factor peps_tbl <- translate_and_count_stops(seqs) %>% mutate(rframe = as.factor(rframe)) - + + # extract the unique cluster names from the clustered_reading_frames_tbl table clus_name <- clustered_reading_frames_tbl %>% select(cluster_name) %>% unique() %>% unlist() %>% unname() - + + # join the clustered_reading_frames_tbl table and the peps_tbl table by shared columns temp_seq_tbl <- left_join(clustered_reading_frames_tbl, peps_tbl, - by = c("seqnames", "rframe", "fos_width", - "first_stop", "stop_count", "hrf_width")) + by = c('seqnames', 'rframe', 'fos_width', + 'first_stop', 'stop_count', 'hrf_width')) + + # create an amino acid sequence set from the 'seq' column of the temp_seq_tbl table temp_aa_seqs <- AAStringSet(x = as.character( - temp_seq_tbl["seq"] %>% unlist() %>% unname())) - names(temp_aa_seqs) <- as.character(temp_seq_tbl["seqnames"] %>% unlist() %>% + temp_seq_tbl['seq'] %>% unlist() %>% unname())) + + # assign names to the sequences in the temp_aa_seqs sequence set + names(temp_aa_seqs) <- as.character(temp_seq_tbl['seqnames'] %>% unlist() %>% unname()) - Biostrings::writeXStringSet(temp_aa_seqs, + + # write the amino acid sequences to a fasta file + Biostrings::writeXStringSet(temp_aa_seqs, filepath = file.path(outpath, paste0( - clus_name, "_cluster_longest_AA_seqs.fa"))) - return(paste("AA sequences for cluster", clus_name, "written to file", - file.path(outpath, paste0( - clus_name, "_cluster_longest_AA_seqs.fa")) - )) - -} \ No newline at end of file + clus_name, '_cluster_longest_AA_seqs.fa'))) + + # return a message confirming the export and file path if verbose is true + if(verbose){ + return(paste('AA sequences for cluster', clus_name, 'written to file', + file.path(outpath, paste0( + clus_name, '_cluster_longest_AA_seqs.fa')) + )) + }else{ + return(paste('AA sequences saved to file')) + } +} diff --git a/R/find_contiguous_multi_repeats.R b/R/find_contiguous_multi_repeats.R index 8eb10970cfeb82cae4630c5718dd43459c489c9c..7661b0930a33640f588e808cc54d950a7c7e1cc8 100644 --- a/R/find_contiguous_multi_repeats.R +++ b/R/find_contiguous_multi_repeats.R @@ -1,19 +1,47 @@ -## Repeat counter ---- +#' Find contiguous multi-repeats in a set of sequences +#' +#' This function finds contiguous multi-repeats in a set of sequences, +#' where the repeat sequence is made up of n consecutive copies of a given string. +#' +#' @param sequences A DNAStringSet or AAStringSet object containing the sequences +#' to search for multi-repeats (default: DNAStringSet) +#' @param repeat_sequence A character string representing the repeat sequence to search +#' for (default: 'string') +#' @param singlet_count An integer or vector of integers specifying the number +#' of single occurrences of the repeat, or the maximum expected number of +#' single occurrences (default: NULL) +#' +#' @return A numeric vector containing the repeat count for each input sequence +#' (i.e., the number of contiguous multi-repeats found) +#' +#' @export +#' +#' @examples +#' # Create example DNAStringSet +#' sequences <- DNAStringSet(c("ATCGATATCGATCGATCG", "GCTAGCTAGCTAGCTAGCTA")) +#' find_contiguous_multi_repeats(sequences, 'AT', 3) +#' +#' # Expected output: c(2, 1) +#' +#' @import stringr +#' @import Biostrings +#' +#' @keywords sequence, repeats +#' find_contiguous_multi_repeats <- function(sequences = DNAStringSet, - repeat_sequence = "string", - singlet_count) { - - + repeat_sequence = 'string', + singlet_count = 100) { + # Make vector of zeros for minimum repeat count, make 1 if singlet was found repeat_count <- replicate(length(sequences), 0) repeat_count[singlet_count > 0] <- 1 - + # Iterate n to the highest observed singlet count, overwrite repeat count if # exactly one n-mer of repeat is found in the sequence for (n in seq(1, max(singlet_count))) { - contig_rep <- paste(replicate(n, repeat_sequence), collapse = "") + contig_rep <- paste(replicate(n, repeat_sequence), collapse = '') contig_rep_count <- str_count(as.character(sequences), contig_rep) - repeat_count[contig_rep_count < singlet_count & contig_rep_count > 0] <- n + repeat_count[contig_rep_count < singlet_count & contig_rep_count > 0] <-n } return(repeat_count) -} \ No newline at end of file +} diff --git a/R/find_longest_hrf.R b/R/find_longest_hrf.R index cb2e43ca238aaff6f0ec32058df06cbed7dc3118..c5d5da9ea72115114c8fcd983e48689165f60a1a 100644 --- a/R/find_longest_hrf.R +++ b/R/find_longest_hrf.R @@ -1,5 +1,20 @@ +#' Find longest half-open reading frame +#' +#' This function takes a DNAStringSet as input and returns the longest half-open +#' reading frames as a data table. Wrapper function around translate_and_count_stops() +#' +#' @param seqs A DNAStringSet object with nucleotide sequences. +#' +#' @return A data table with the longest open reading frames. +#' +#' @examples +#' find_longest_hrf(seqs) +#' +#' @import dplyr +#' +#' @export find_longest_hrf <- function(seqs = DNAStringSet){ peps_tbl <- translate_and_count_stops(seqs) hrfs <- select(peps_tbl, -seq) return(hrfs) -} \ No newline at end of file +} diff --git a/R/find_longest_orf.R b/R/find_longest_orf.R index 121593e542375878e919a75175806e62025aab46..1e5fdee18d3e94b0356959ffd8376e8884cb0ca3 100644 --- a/R/find_longest_orf.R +++ b/R/find_longest_orf.R @@ -1,10 +1,37 @@ -find_longest_orf <-function(seqs = DNAStringSet) { +#' Find the longest ORF in a DNA sequence set +#' +#' This function identifies the longest open reading frame (ORF) in a DNA sequence set. +#' Wrapper around \code{\link{findORFs}} +#' +#' @param seqs A DNAStringSet object containing the DNA sequences to search for ORFs. +#' +#' @return A tibble containing the start and end positions, strand, and length of the longest ORF in each sequence. +#' +#' @import Biostrings +#' @importFrom GenomicRanges GRanges +#' @import tibble +#' @import dplyr +#' @export +#' +#' @examples +#' seqs <- DNAStringSet(c("ATGAGTTCGAAATGGCGTTGAA", "GGGGGCTCGAGCTAGC")) +#' find_longest_orf(seqs) +#' +#' @seealso \code{\link{findORFs}} +#' + +find_longest_orf <- function(seqs = DNAStringSet) { + # Find ORFs in the sequences, return longest ORF, and convert to a vector orfs <- findORFs(seqs, longestORF = TRUE, startCodon = startDefinition(6)) %>% unlist(use.names = TRUE) - orfs <- GRanges(seqnames = names(seqs)[as.integer(names(orfs))], - ranges(orfs), strand = "+") %>% + # Convert the ORFs to a GRanges object + orfs <- GRanges(seqnames = names(seqs)[as.integer(names(orfs))], + ranges(orfs), strand = '+') %>% + # Convert to a tibble and add a column for reading frame as_tibble() %>% mutate(rframe = ((start -1) %% 3 ) + 1) %>% + # Rename the width column to orf_width dplyr::rename(orf_width = width) + # Return the ORFs return(orfs) -} \ No newline at end of file +} diff --git a/R/find_longest_reading_frames.R b/R/find_longest_reading_frames.R index 526d329dfc8901099a8b4d81039b4444ef8f79c7..de34c89a96f7ebf61372d27cb87c12b943c66431 100644 --- a/R/find_longest_reading_frames.R +++ b/R/find_longest_reading_frames.R @@ -1,8 +1,23 @@ +#' Reading frame finder (longest orf or hrf) +#' +#' This function finds the longest reading frames in a set of DNA sequences. +#' It combines the results from finding the longest open reading frames (orfs) +#' and the longest half reading frames (hrfs) and determines the reading frame +#' with the biggest possible translation. +#' +#' @param seqs A DNAStringSet object containing the input DNA sequences. +#' +#' @return A data frame containing the longest reading frames for each sequence. +#' The data frame includes the sequence names, reading frame, and the width of the reading frame. +#' +#' @import dplyr, tidyr +#' +#' @export ## Reading frame finder (longest orf or hrf) find_longest_reading_frames <- function(seqs = myDNAStringSet){ orfs <- find_longest_orf(seqs) hrfs <- find_longest_hrf(seqs) - # Combine orf and hrf tables and find the reading frame with the biggest + # Combine orf and hrf tables and find the reading frame with the biggest # possible translation orf_and_hrf <- full_join(orfs, hrfs, by = join_by(seqnames, rframe)) %>% mutate(orf_width = replace_na(orf_width, 0)) %>% @@ -13,12 +28,12 @@ find_longest_reading_frames <- function(seqs = myDNAStringSet){ "Five-open (no start)", max_type)) %>% mutate(max_type = ifelse(fos_width > hrf_width & max_type == "open", "Three-open (no stop)", max_type)) - + max_rf <- orf_and_hrf %>% group_by(seqnames) %>% slice_max(max_width, n = 1, with_ties = FALSE) %>% arrange(factor(seqnames, levels = names(seqs))) %>% mutate(rframe = factor(rframe, levels = c("1", "2", "3"))) - + return(max_rf) -} \ No newline at end of file +} diff --git a/R/find_repeat_positions.R b/R/find_repeat_positions.R index 18c5d652edbad6fa080f151970235a7b6ecf227a..603fa53a0218ba0c791dd676d3b2171784fceed1 100644 --- a/R/find_repeat_positions.R +++ b/R/find_repeat_positions.R @@ -1,19 +1,39 @@ -find_repeat_positions <- function(sequences = DNAStringSet, - repeat_sequence = "string"){ - repeat_positions <- str_locate_all(as.character(sequences), repeat_sequence) - names(repeat_positions) <- names(sequences) +#' Find Repeat Positions +#' +#' This function finds the positions of a given repeat sequence within a set of DNA sequences. It returns a data frame with the sequence name, start and end positions, and information used in plotting downstream. +#' +#' @param sequences A DNAStringSet object containing the DNA sequences to search for repeats. +#' @param repeat_sequence A string specifying the repeat sequence to search for. +#' +#' @return A data frame with columns: seqname, start, end, fragment, and plot_intensity. +#' +#' @import stringr +#' @import dplyr +#' @import tibble +#' +#' @export +#' +#' @examples +#' sequences <- DNAStringSet(c("AGTCAGT", +#' "ACGTAGT", +#' "AGTCGAT")) +#' find_repeat_positions(sequences, "AGT") +find_repeat_positions <- function(sequences = DNAStringSet, repeat_sequence = 'string'){ + + repeat_positions <- str_locate_all(as.character(sequences), repeat_sequence)names(repeat_positions) <- names(sequences) repeat_positions <- lapply(repeat_positions, as_tibble) %>% - bind_rows(.id = "seqname") %>% + bind_rows(.id = 'seqname') %>% mutate(fragment = repeat_sequence) - whole_sequence_positions = tibble("seqname" = names(sequences), - "start" = replicate(length(sequences), 1), - "end" = str_length(sequences), - "fragment" = replicate( - length(sequences), "Full sequence")) + + whole_sequence_positions = tibble('seqname' = names(sequences), + 'start' = replicate(length(sequences), 1), + 'end' = str_length(sequences), + 'fragment' = replicate(length(sequences), 'Full sequence')) + out <- bind_rows(repeat_positions, whole_sequence_positions) %>% mutate(plot_intensity = ifelse(fragment == repeat_sequence, 4, 3)) %>% mutate(seqname = factor(seqname, levels = names(sequences))) %>% - mutate(fragment = factor(fragment, levels = c("Full sequence", - repeat_sequence))) + mutate(fragment = factor(fragment, levels = c('Full sequence', repeat_sequence))) + return(out) -} \ No newline at end of file +} diff --git a/R/kmer_based_distance_matrix.R b/R/kmer_based_distance_matrix.R index 3d445324ecdb40d2554d11fb8e67210c036e2c64..4a9737074cba1a5174db678daa62e5f030ffd7a7 100644 --- a/R/kmer_based_distance_matrix.R +++ b/R/kmer_based_distance_matrix.R @@ -1,5 +1,18 @@ -# Convert to bin format and get distance matrix using kmers -kmer_based_distance_matrix <- function (seqs = DNAStringSet) { +#' Convert to bin format and get distance matrix using kmers +#' +#' This function takes in a DNAStringSet and converts it to a bin format using ape::as.DNAbin function. It then calculates the distance matrix using the kdistance function from XXXX and returns it as a matrix. +#' +#' @param seqs DNAStringSet containing the DNA sequences +#' +#' @return A distance matrix in bin format. +#' +#' @importFrom ape as.DNAbin +#' +#' @importFrom Biostrings DNAStringSet +#' +#' @export +#' +kmer_based_distance_matrix <- function (seqs) { seqbins <- ape::as.DNAbin(seqs) as.matrix(kdistance(seqbins)) -} \ No newline at end of file +} diff --git a/R/meshclustR.R b/R/meshclustR.R index d5d41ecb4fa4c99224dde4904d68912e6c199769..6b2517cbffd117435c26ffed86f964ecdfe96f2b 100644 --- a/R/meshclustR.R +++ b/R/meshclustR.R @@ -1,21 +1,54 @@ -#write temporary file to cluster -meshclustR <- function(seqs = MyDNAStringSet, - meshclust_bin = meshclust, filepath = path){ - temp_file <- file.path(filepath, "cluster_me.fa") +#' Write temporary file to cluster using MeshclustR +#' +#' This function writes a temporary file to perform a clustering analysis on a set of DNA sequences. +#' The clustering is done using the \href{https://github.com/BioinformaticsToolsmith/MeShClust}{Meshclust commandline} tool. +#' +#' James, Benjamin T. et al. (2018), +#' MeShClust: an intelligent tool for clustering DNA sequences. +#' Nucleic Acids Research, gky315. +#' +#' @param seqs A DNAStringSet object containing the DNA sequences to be clustered. +#' @param meshclust_bin The path to the Meshclust executable (run +#' `which meshclust` on commandline). Default is set to "meshclust". +#' @param filepath The path to the directory where the temporary and log +#' files will be saved. +#' @return A data frame with information regarding the clustering analysis. +#' +#' @examples +#' meshclustR(seqs = MyDNAStringSet, meshclust_bin = meshclust, filepath = path) +#' @import Biostrings +#' @import readr +#' @import magrittr +#' @import stringr +#' @importFrom tools file_path_sans_ext +#' @export + +meshclustR <- function(seqs = MyDNAStringSet, + meshclust_bin = meshclust, + filepath = path){ + + #create temporary file path + temp_file <- file.path(filepath, 'cluster_me.fa') writeXStringSet(seqs, filepath = temp_file) + + #create output file name out_file <- file.path(filepath, paste0( - tools::file_path_sans_ext(basename(temp_file)),".txt")) - - ## Meshclust - system2(meshclust, args = c("-d", temp_file, - "-o", out_file)) - - stable_cluster <- read_delim(out_file, delim = "\t", - col_names = c("cluster", "seqnames", - "identity_with_center", "cluster_class"), - col_types = "fcdc") %>% - mutate(seqnames = str_remove(seqnames, ">")) - #clean up temp file + tools::file_path_sans_ext(basename(temp_file)),'.txt')) + + #run Meshclust + system2(meshclust_bin, args = c('-d', temp_file, + '-o', out_file)) + + #read output file and parse + stable_cluster <- read_delim(out_file, delim = '\t', + col_names = c('cluster', 'seqnames', + 'identity_with_center', 'cluster_class'), + col_types = 'fcdc') %>% + mutate(seqnames = str_remove(seqnames, '>')) + + #clean up temp file and output file file.remove(temp_file, out_file) + + #return data frame return(stable_cluster) -} \ No newline at end of file +} diff --git a/R/pivot_cluster_tbl_wider.R b/R/pivot_cluster_tbl_wider.R index aad01e26bb8ef3b614178177ab9a73654934e182..c42839631f180074205ed76eda6af93ba9a06cca 100644 --- a/R/pivot_cluster_tbl_wider.R +++ b/R/pivot_cluster_tbl_wider.R @@ -1,14 +1,22 @@ -# Make wide table of clusters (listing all sequences in each cluster) +#' Make wide table of clusters (listing all sequences in each cluster) +#' +#' @param cluster_tbl A data frame containing the cluster and sequence information +#' +#' @return A wide table of clusters with all sequences in each cluster listed +#' +#' @import dplyr +#' @import tidyr +#' @export +#' pivot_cluster_tbl_wider <- function(cluster_tbl) { # First: get names of each cluster cluster_names <- cluster_tbl %>% dplyr::rename(clus_name = seqnames) %>% group_by(cluster) %>% slice_head(n = 1) - # Second: combine names with clusters and reshape tbls - cluster_tbl <- left_join(stable_cluster, cluster_names, by = "cluster") %>% + cluster_tbl <- left_join(stable_cluster, cluster_names, by = 'cluster') %>% group_by(cluster) %>% - mutate(num = row_number()) %>% + mutate(num = row_number()) %>% pivot_wider(names_from = clus_name, values_from = seqnames, id_cols = num) -} \ No newline at end of file +} diff --git a/R/plot_abundance_per_sample.R b/R/plot_abundance_per_sample.R index 3b2aa744e818ad9b6610763a2ff90cb03035529a..48dfee3fe230590b13ace238db1558e7717a4fa1 100644 --- a/R/plot_abundance_per_sample.R +++ b/R/plot_abundance_per_sample.R @@ -1,12 +1,24 @@ -# Specifically intended to plot sequence abundance per sampled within -# plot_cluster_overview from tbl of sequence abundance +#' This function creates a plot of sequence abundance per sample from a table of sequence abundance. +#' +#' @param tbl_of_abundance A table containing sequence abundance data. +#' @return A plot displaying sequence abundance per sample. +#' @import ggplot2 +#' @import scales +#' @examples +#' tbl_of_abundance <- data.frame(ID = c("ASV1", "ASV2", "ASV3", "ASV4"), +#' Sample = c("Sample1", "Sample1", "Sample2", "Sample2"), +#' count = c(10, 5, 6, 12)) +#' plot_abundance_per_sample(tbl_of_abundance) +#' +#' @export +#' Specifically intended to plot sequence abundance per sampled within plot_cluster_overview from tbl of sequence abundance plot_abundance_per_sample <- function(tbl_of_abundance) { - ggplot(tbl_of_abundance, aes(y = ID, x = Sample, fill = count/1000)) + - geom_tile() + theme(legend.position = "none", + ggplot(tbl_of_abundance, aes(y = ID, x = Sample, fill = count/1000)) + + geom_tile() + theme(legend.position = 'none', axis.title = element_blank(), axis.text = element_blank(), axis.ticks = element_blank()) + - ggtitle("ASV counts per sample") + + ggtitle('ASV counts per sample') + scale_fill_viridis_c() + - labs(fill = "ASV counts (in thousands)") -} \ No newline at end of file + labs(fill = 'ASV counts (in thousands)') +} diff --git a/R/plot_abundance_sums_per_sequence.R b/R/plot_abundance_sums_per_sequence.R index feb1a2cd4eda27fdb5d8b3107344ad7151e78ae1..f871d709486cd467511d6a4307b79ca4508c6c66 100644 --- a/R/plot_abundance_sums_per_sequence.R +++ b/R/plot_abundance_sums_per_sequence.R @@ -1,13 +1,21 @@ -# Specifically intended to plot total sequence abundance within -# plot_cluster_overview from tbl of sequence abundance sums +#' Plot total sequence abundance within plot_cluster_overview from tbl of sequence abundance sums +#' +#' @param tbl_of_sums A table of sequence abundance sums +#' @return A ggplot object of the total ASV counts and sequence IDs +#' @examples +#' tbl_of_sums <- data.frame(ID = c("ASV_001", "ASV_002", "ASV_003"), sum_count = c(1000, 2000, 3000)) +#' plot_abundance_sums_per_sequence(tbl_of_sums) +#' +#' @import ggplot2 +#' +#' @export plot_abundance_sums_per_sequence <- function(tbl_of_sums) { - ggplot(tbl_of_sums, aes(y = ID, x = sum_count/1000)) + - geom_col() + theme(axis.text.x = element_text(angle = 90, - size = 15, - vjust = 0.5), - legend.position = "none", - axis.title = element_blank(), - axis.text.y = element_blank(), - axis.ticks.y = element_blank()) + - ggtitle("Total ASV counts (in thousands)") -} \ No newline at end of file + ggplot(tbl_of_sums, aes(y = ID, x = sum_count / 1000)) + + geom_col() + + theme(axis.text.x = element_text(angle = 90, size = 15, vjust = 0.5), + legend.position = 'none', + axis.title = element_blank(), + axis.text.y = element_blank(), + axis.ticks.y = element_blank())+ + ggtitle('Total ASV counts (in thousands)') +} diff --git a/R/plot_asv_nmds.R b/R/plot_asv_nmds.R index 7c1391b0e65658be8735d351e9725236827f87c6..1b62a215dfe6cce35ef5ff4bc60cb3dc05b79ea8 100644 --- a/R/plot_asv_nmds.R +++ b/R/plot_asv_nmds.R @@ -1,20 +1,46 @@ +#' Create NMDS plot with ggplot2 and centroids +#' +#' This function creates an NMDS plot using ggplot2 and can optionally add centroids to the plot. +#' +#' @param asv_nmds A data frame created from NMDS analysis with columns "MDS1" and "MDS2". +#' @param color_by A string indicating which column to use for coloring points on the plot. Default: 'String'. +#' @param centroids A logical value indicating whether or not to add centroids to the plot. Default: FALSE. +#' +#' @return A ggplot2 object containing the NMDS plot. +#' +#' @examples +#' # Assume nmds_df has been created through NMDS analysis +#' plot_asv_nmds(asv_nmds = nmds_df, color_by = 'Sample', centroids = TRUE) +#' +#' @import ggplot2 +#' @importFrom stats aggregate +#' +#' @export plot_asv_nmds <- function(asv_nmds = my_asv_nmds, - color_by = "String", + color_by = 'String', centroids = FALSE){ + # Create NMDS plot with ggplot2 and centroids - pl <- ggplot(asv_nmds, aes(x=MDS1, y=MDS2, color= .data[[color_by]])) + + + # Create ggplot object + pl <-ggplot(asv_nmds, aes(x=MDS1, y=MDS2, color= .data[[color_by]])) + geom_point() + theme_classic() + - labs(title="NMDS Plot", x="NMDS 1", y="NMDS 2") + labs(title='NMDS Plot', x='NMDS 1', y='NMDS 2') + + # If centroids parameter is TRUE, add centroids to the plot if(centroids){ - # Add sample groups to nmds_df - nmds_df$Group <- factor(gsub("[0-9]", "", rownames(asv_table))) + # Add sample groups to asv_nmds + asv_nmds$Group <- factor(gsub('[0-9]', '', rownames(asv_table))) # Calculate group centroids - centroids_df <- aggregate(nmds_df[, 1:2], by=list(Group=nmds_df$Group), mean) - - pl <- pl + - geom_point(data=centroids_df, aes(x=NMDS1, y=NMDS2), - color="black", size=3, shape=21, fill="white") + centroids_df <- aggregate(asv_nmds[, 1:2], by=list(Group=asv_nmds$Group), mean) + + # Add centroids to plot + pl <- pl + + geom_point(data=centroids_df, aes(x=NMDS1, y=NMDS2), + color='black', size=3, shape=21, fill='white') } + + # Return ggplot object return(pl) -} \ No newline at end of file +} diff --git a/R/plot_cluster_dendrogram.R b/R/plot_cluster_dendrogram.R index 1362fbd88cc84268edbff136887cb678494e6e37..adc21f717188200cc58bf81342d8f577cac994e4 100644 --- a/R/plot_cluster_dendrogram.R +++ b/R/plot_cluster_dendrogram.R @@ -1,6 +1,21 @@ -# Generic plotting of tree with title "Dendrogram" +#' Generic plotting of UPGMA tree using ggtree +#' +#' This function plots a dendrogram using the ggtree package. +#' +#' @param upgma_tree An object of class 'phylo' representing the tree. +#' +#' @import ggtree +#' @import ggplot2 +#' +#' @return A dendrogram plot. +#' +#' @examples +#' tree <- ape::read.tree("tree.txt") +#' plot_cluster_dendrogram(tree) +#' +#' @export plot_cluster_dendrogram <- function(upgma_tree) { ggtree::ggtree(upgma_tree) + ggtree::geom_tiplab(as_ylab = TRUE) + - ggtitle("Dendrogram") + + ggtitle('Dendrogram') + theme(axis.text = element_text(size = 15)) -} \ No newline at end of file +}