Skip to content
Snippets Groups Projects
Commit f111274c authored by Simeon's avatar Simeon
Browse files

roxygen tags added

parent 6941973c
No related branches found
No related tags found
No related merge requests found
#' Count Clusters
#'
#' This function takes a list of cluster tables and returns a tibble
#' with cluster counts by threshold.
#'
#' @param clus_tbl_list A list of cluster tables.
#'
#' @return A tibble with cluster counts by threshold.
#'
#' @examples
#'
#' set.seed(1)
#' clus_tbl_list <- list(
#' data.frame(seqnames = c("chr1", "chr1", "chr2"),
#' start = c(10, 50, 100), end = c(20, 60, 200), count = c(5, 7, 10)),
#' data.frame(seqnames = c("chr1", "chr1", "chr2"),
#' start = c(15, 55, 110), end = c(25, 65, 150), count = c(8, 6, 11))
#' )
#' count_clusters(clus_tbl_list)
#'
#' @import dplyr
#' @import purrr
#' @import tidyr
#' @export
count_clusters <- function(clus_tbl_list){
#remove non-numeric column from clus_tbl_list
clus_tbl_num <- map(clus_tbl_list, ~ dplyr::select(., -seqnames))
clus_counts <- lapply(clus_tbl_num, max)
names(clus_counts) <- names(clus_tbl_list)
clus_counts %>%
bind_rows() %>%
pivot_longer(everything()) %>%
mutate(threshold = as.numeric(name)) %>%
dplyr::rename(cluster_number = value)
}
\ No newline at end of file
# Remove the non-numeric column "seqnames" from each cluster table in the list.
clus_tbl_num <- map(clus_tbl_list, ~ dplyr::select(., -seqnames))
# Create a list of the maximum cluster values for each cluster table in clus_tbl_num using lapply()
clus_counts <- lapply(clus_tbl_num, max)
# Assign the names of the original cluster tables to the values in clus_counts
names(clus_counts) <- names(clus_tbl_list)
# Combine the values in clus_counts into a single data frame using bind_rows()
# Transform the data frame from wide to long using pivot_longer(), and create
# a new column called "threshold" with the values previously used as column names
# Rename the "value" column to "cluster_number" using dplyr::rename()
clus_counts %>%
bind_rows() %>%
pivot_longer(everything()) %>%
mutate(threshold = as.numeric(name)) %>%
dplyr::rename(cluster_number = value)
# Return the final data frame with the cluster numbers and their maximum values
return(clus_counts)
}
#' Define a plateau within cluster counts
#'
#' This function takes in a tibble of cluster counts and defines
#' the plateau within the cluster with the largest threshold window.
#'
#' @param cluster_counts A tibble of cluster counts with columns for
#' cluster_number and threshold.
#'
#' @return A named vector with the starting and ending thresholds of the
#' plateau and the corresponding cluster_number.
#'
#' @examples
#' define_plateau(cluster_counts = cluster_counts_df)
#'
#' @import dplyr
#' @export
# The following code defines a function called "define_plateau"
define_plateau <- function(cluster_counts){
# "cluster_counts" is a tibble of cluster counts passed as a parameter to the function
# "clus_plateau" filters the cluster counts by selecting only those with cluster_number greater than or equal to 2
# then group the remaining data by cluster_number and calculate the maximum and minimum threshold per cluster_number using the "summarise" function.
# This is done to exclude a potentially long tail of single cluster outcomes at high thresholds.
# Calculate the difference between the maximum and minimum threshold for each cluster and store in "thresh_win"
# "arrange" function arranges the rows in descending order of "thresh_win" and selects the first row using "slice_head"
# "clus_plateau" now contains the "plateau_start", "plateau_end", and "cluster_number" values for the selected cluster
clus_plateau <- cluster_counts %>%
filter(cluster_number >= 2) %>%
group_by(cluster_number) %>%
summarise(max_thresh = max(threshold), min_thresh = min(threshold)) %>%
mutate(thresh_win = max_thresh-min_thresh) %>%
arrange(desc(thresh_win)) %>%
slice_head(n = 1) %>%
filter(cluster_number >= 2)%>%
group_by(cluster_number) %>%
summarise(max_thresh = max(threshold), min_thresh = min(threshold)) %>%
mutate(thresh_win = max_thresh-min_thresh) %>%
arrange(desc(thresh_win)) %>%
slice_head(n = 1) %>%
select(min_thresh, max_thresh, cluster_number) %>%
unlist() %>% unname()
names(clus_plateau) <- c("plateau_start", "plateau_end", "cluster_number")
# This line assigns the names "plateau_start", "plateau_end", and "cluster_number" to each of the values in "clus_plateau".
names(clus_plateau) <- c('plateau_start', 'plateau_end', 'cluster_number')
return(clus_plateau)
}
\ No newline at end of file
}
dendrogram_hclust <- function(data = veganized_tibble, seed = 1, ...){
#' Generate dendrogram data from numerical data
#'
#' This function generates a dendrogram from a hierarchical clustering of a dataset.
#'
#' @param data A tibble or data frame suited for vegdist().
#' @param seed An integer that sets the seed for the random number generator. Default is 1.
#' @param ... Additional arguments passed to `vegdist`.
#'
#' @return A `ggdendro::dendro_data` object, containing data for plotting the dendrogram.
#'
#' @import ggdendro
#' @import vegan
#' @import stats
#'
#' @examples
#' # Generate dendrogram with default parameters
#' dendrogram_hclust()
#'
#' # Generate dendrogram with custom data and distance metric
#' daisy_dist <- daisy(iris[,-5], metric = "gower")
#' dendrogram_hclust(daisy_dist)
#'
#' @export
dendrogram_hclust <- function(data = veganized_tibble, seed = 1, ...) {
require(ggdendro)
set.seed(seed)
# make hierarchical cluster from vegdist matrix and extract data for plotting
dat <- vegdist(data, ...) %>%
hclust() %>%
as.dendrogram() %>%
ggdendro::dendro_data(type = "rectangle")
dat <- vegdist(data, ...) %>%
hclust() %>%
as.dendrogram() %>%
ggdendro::dendro_data(type = 'rectangle')
return(dat)
}
\ No newline at end of file
}
export_longest_reading_frame <- function(clustered_reading_frames_tbl = tbl,
seqs = myDNAStringSet,
outpath = path){
#' Export Longest Reading Frame
#'
#' Export the longest reading frames from a DNA sequence set to a fasta file containing amino acid sequences.
#'
#' @param clustered_reading_frames_tbl tibble of clustered reading frames
#' @param seqs DNAStringSet object with DNA sequences
#' @param outpath character string of output directory path
#' @param verbose logical indicating whether to print progress updates
#'
#' @return Messages indicating completion of exporting the longest reading frames to a fasta file format containing amino acid sequences.
#'
#' @examples
#' export_longest_reading_frame(clustered_reading_frames_tbl, myDNAStringSet, myDirPath, TRUE)
#'
#' @import Biostrings
#' @import dplyr
#' @import tidyr
#' @import utils
#'
#' @export
# Define a function that exports the longest reading frames
export_longest_reading_frame <- function(clustered_reading_frames_tbl = tbl, # function argument for clustered_reading_frame table
seqs = myDNAStringSet, # function argument for DNA sequence set
outpath = path, # function argument for output file path
verbose = FALSE){ # function argument for verbosity
# check if the output path exists, create it if it doesn't
if(!dir.exists(file.path(outpath))){
dir.create(outpath)
}
# translate the DNA sequences, count stop codons, and add a column for reading frame as a factor
peps_tbl <- translate_and_count_stops(seqs) %>%
mutate(rframe = as.factor(rframe))
# extract the unique cluster names from the clustered_reading_frames_tbl table
clus_name <- clustered_reading_frames_tbl %>%
select(cluster_name) %>% unique() %>% unlist() %>% unname()
# join the clustered_reading_frames_tbl table and the peps_tbl table by shared columns
temp_seq_tbl <- left_join(clustered_reading_frames_tbl, peps_tbl,
by = c("seqnames", "rframe", "fos_width",
"first_stop", "stop_count", "hrf_width"))
by = c('seqnames', 'rframe', 'fos_width',
'first_stop', 'stop_count', 'hrf_width'))
# create an amino acid sequence set from the 'seq' column of the temp_seq_tbl table
temp_aa_seqs <- AAStringSet(x = as.character(
temp_seq_tbl["seq"] %>% unlist() %>% unname()))
names(temp_aa_seqs) <- as.character(temp_seq_tbl["seqnames"] %>% unlist() %>%
temp_seq_tbl['seq'] %>% unlist() %>% unname()))
# assign names to the sequences in the temp_aa_seqs sequence set
names(temp_aa_seqs) <- as.character(temp_seq_tbl['seqnames'] %>% unlist() %>%
unname())
Biostrings::writeXStringSet(temp_aa_seqs,
# write the amino acid sequences to a fasta file
Biostrings::writeXStringSet(temp_aa_seqs,
filepath = file.path(outpath, paste0(
clus_name, "_cluster_longest_AA_seqs.fa")))
return(paste("AA sequences for cluster", clus_name, "written to file",
file.path(outpath, paste0(
clus_name, "_cluster_longest_AA_seqs.fa"))
))
}
\ No newline at end of file
clus_name, '_cluster_longest_AA_seqs.fa')))
# return a message confirming the export and file path if verbose is true
if(verbose){
return(paste('AA sequences for cluster', clus_name, 'written to file',
file.path(outpath, paste0(
clus_name, '_cluster_longest_AA_seqs.fa'))
))
}else{
return(paste('AA sequences saved to file'))
}
}
## Repeat counter ----
#' Find contiguous multi-repeats in a set of sequences
#'
#' This function finds contiguous multi-repeats in a set of sequences,
#' where the repeat sequence is made up of n consecutive copies of a given string.
#'
#' @param sequences A DNAStringSet or AAStringSet object containing the sequences
#' to search for multi-repeats (default: DNAStringSet)
#' @param repeat_sequence A character string representing the repeat sequence to search
#' for (default: 'string')
#' @param singlet_count An integer or vector of integers specifying the number
#' of single occurrences of the repeat, or the maximum expected number of
#' single occurrences (default: NULL)
#'
#' @return A numeric vector containing the repeat count for each input sequence
#' (i.e., the number of contiguous multi-repeats found)
#'
#' @export
#'
#' @examples
#' # Create example DNAStringSet
#' sequences <- DNAStringSet(c("ATCGATATCGATCGATCG", "GCTAGCTAGCTAGCTAGCTA"))
#' find_contiguous_multi_repeats(sequences, 'AT', 3)
#'
#' # Expected output: c(2, 1)
#'
#' @import stringr
#' @import Biostrings
#'
#' @keywords sequence, repeats
#'
find_contiguous_multi_repeats <- function(sequences = DNAStringSet,
repeat_sequence = "string",
singlet_count) {
repeat_sequence = 'string',
singlet_count = 100) {
# Make vector of zeros for minimum repeat count, make 1 if singlet was found
repeat_count <- replicate(length(sequences), 0)
repeat_count[singlet_count > 0] <- 1
# Iterate n to the highest observed singlet count, overwrite repeat count if
# exactly one n-mer of repeat is found in the sequence
for (n in seq(1, max(singlet_count))) {
contig_rep <- paste(replicate(n, repeat_sequence), collapse = "")
contig_rep <- paste(replicate(n, repeat_sequence), collapse = '')
contig_rep_count <- str_count(as.character(sequences), contig_rep)
repeat_count[contig_rep_count < singlet_count & contig_rep_count > 0] <- n
repeat_count[contig_rep_count < singlet_count & contig_rep_count > 0] <-n
}
return(repeat_count)
}
\ No newline at end of file
}
#' Find longest half-open reading frame
#'
#' This function takes a DNAStringSet as input and returns the longest half-open
#' reading frames as a data table. Wrapper function around translate_and_count_stops()
#'
#' @param seqs A DNAStringSet object with nucleotide sequences.
#'
#' @return A data table with the longest open reading frames.
#'
#' @examples
#' find_longest_hrf(seqs)
#'
#' @import dplyr
#'
#' @export
find_longest_hrf <- function(seqs = DNAStringSet){
peps_tbl <- translate_and_count_stops(seqs)
hrfs <- select(peps_tbl, -seq)
return(hrfs)
}
\ No newline at end of file
}
find_longest_orf <-function(seqs = DNAStringSet) {
#' Find the longest ORF in a DNA sequence set
#'
#' This function identifies the longest open reading frame (ORF) in a DNA sequence set.
#' Wrapper around \code{\link{findORFs}}
#'
#' @param seqs A DNAStringSet object containing the DNA sequences to search for ORFs.
#'
#' @return A tibble containing the start and end positions, strand, and length of the longest ORF in each sequence.
#'
#' @import Biostrings
#' @importFrom GenomicRanges GRanges
#' @import tibble
#' @import dplyr
#' @export
#'
#' @examples
#' seqs <- DNAStringSet(c("ATGAGTTCGAAATGGCGTTGAA", "GGGGGCTCGAGCTAGC"))
#' find_longest_orf(seqs)
#'
#' @seealso \code{\link{findORFs}}
#'
find_longest_orf <- function(seqs = DNAStringSet) {
# Find ORFs in the sequences, return longest ORF, and convert to a vector
orfs <- findORFs(seqs, longestORF = TRUE, startCodon = startDefinition(6)) %>%
unlist(use.names = TRUE)
orfs <- GRanges(seqnames = names(seqs)[as.integer(names(orfs))],
ranges(orfs), strand = "+") %>%
# Convert the ORFs to a GRanges object
orfs <- GRanges(seqnames = names(seqs)[as.integer(names(orfs))],
ranges(orfs), strand = '+') %>%
# Convert to a tibble and add a column for reading frame
as_tibble() %>%
mutate(rframe = ((start -1) %% 3 ) + 1) %>%
# Rename the width column to orf_width
dplyr::rename(orf_width = width)
# Return the ORFs
return(orfs)
}
\ No newline at end of file
}
#' Reading frame finder (longest orf or hrf)
#'
#' This function finds the longest reading frames in a set of DNA sequences.
#' It combines the results from finding the longest open reading frames (orfs)
#' and the longest half reading frames (hrfs) and determines the reading frame
#' with the biggest possible translation.
#'
#' @param seqs A DNAStringSet object containing the input DNA sequences.
#'
#' @return A data frame containing the longest reading frames for each sequence.
#' The data frame includes the sequence names, reading frame, and the width of the reading frame.
#'
#' @import dplyr, tidyr
#'
#' @export
## Reading frame finder (longest orf or hrf)
find_longest_reading_frames <- function(seqs = myDNAStringSet){
orfs <- find_longest_orf(seqs)
hrfs <- find_longest_hrf(seqs)
# Combine orf and hrf tables and find the reading frame with the biggest
# Combine orf and hrf tables and find the reading frame with the biggest
# possible translation
orf_and_hrf <- full_join(orfs, hrfs, by = join_by(seqnames, rframe)) %>%
mutate(orf_width = replace_na(orf_width, 0)) %>%
......@@ -13,12 +28,12 @@ find_longest_reading_frames <- function(seqs = myDNAStringSet){
"Five-open (no start)", max_type)) %>%
mutate(max_type = ifelse(fos_width > hrf_width & max_type == "open",
"Three-open (no stop)", max_type))
max_rf <- orf_and_hrf %>%
group_by(seqnames) %>%
slice_max(max_width, n = 1, with_ties = FALSE) %>%
arrange(factor(seqnames, levels = names(seqs))) %>%
mutate(rframe = factor(rframe, levels = c("1", "2", "3")))
return(max_rf)
}
\ No newline at end of file
}
find_repeat_positions <- function(sequences = DNAStringSet,
repeat_sequence = "string"){
repeat_positions <- str_locate_all(as.character(sequences), repeat_sequence)
names(repeat_positions) <- names(sequences)
#' Find Repeat Positions
#'
#' This function finds the positions of a given repeat sequence within a set of DNA sequences. It returns a data frame with the sequence name, start and end positions, and information used in plotting downstream.
#'
#' @param sequences A DNAStringSet object containing the DNA sequences to search for repeats.
#' @param repeat_sequence A string specifying the repeat sequence to search for.
#'
#' @return A data frame with columns: seqname, start, end, fragment, and plot_intensity.
#'
#' @import stringr
#' @import dplyr
#' @import tibble
#'
#' @export
#'
#' @examples
#' sequences <- DNAStringSet(c("AGTCAGT",
#' "ACGTAGT",
#' "AGTCGAT"))
#' find_repeat_positions(sequences, "AGT")
find_repeat_positions <- function(sequences = DNAStringSet, repeat_sequence = 'string'){
repeat_positions <- str_locate_all(as.character(sequences), repeat_sequence)names(repeat_positions) <- names(sequences)
repeat_positions <- lapply(repeat_positions, as_tibble) %>%
bind_rows(.id = "seqname") %>%
bind_rows(.id = 'seqname') %>%
mutate(fragment = repeat_sequence)
whole_sequence_positions = tibble("seqname" = names(sequences),
"start" = replicate(length(sequences), 1),
"end" = str_length(sequences),
"fragment" = replicate(
length(sequences), "Full sequence"))
whole_sequence_positions = tibble('seqname' = names(sequences),
'start' = replicate(length(sequences), 1),
'end' = str_length(sequences),
'fragment' = replicate(length(sequences), 'Full sequence'))
out <- bind_rows(repeat_positions, whole_sequence_positions) %>%
mutate(plot_intensity = ifelse(fragment == repeat_sequence, 4, 3)) %>%
mutate(seqname = factor(seqname, levels = names(sequences))) %>%
mutate(fragment = factor(fragment, levels = c("Full sequence",
repeat_sequence)))
mutate(fragment = factor(fragment, levels = c('Full sequence', repeat_sequence)))
return(out)
}
\ No newline at end of file
}
# Convert to bin format and get distance matrix using kmers
kmer_based_distance_matrix <- function (seqs = DNAStringSet) {
#' Convert to bin format and get distance matrix using kmers
#'
#' This function takes in a DNAStringSet and converts it to a bin format using ape::as.DNAbin function. It then calculates the distance matrix using the kdistance function from XXXX and returns it as a matrix.
#'
#' @param seqs DNAStringSet containing the DNA sequences
#'
#' @return A distance matrix in bin format.
#'
#' @importFrom ape as.DNAbin
#'
#' @importFrom Biostrings DNAStringSet
#'
#' @export
#'
kmer_based_distance_matrix <- function (seqs) {
seqbins <- ape::as.DNAbin(seqs)
as.matrix(kdistance(seqbins))
}
\ No newline at end of file
}
#write temporary file to cluster
meshclustR <- function(seqs = MyDNAStringSet,
meshclust_bin = meshclust, filepath = path){
temp_file <- file.path(filepath, "cluster_me.fa")
#' Write temporary file to cluster using MeshclustR
#'
#' This function writes a temporary file to perform a clustering analysis on a set of DNA sequences.
#' The clustering is done using the \href{https://github.com/BioinformaticsToolsmith/MeShClust}{Meshclust commandline} tool.
#'
#' James, Benjamin T. et al. (2018),
#' MeShClust: an intelligent tool for clustering DNA sequences.
#' Nucleic Acids Research, gky315.
#'
#' @param seqs A DNAStringSet object containing the DNA sequences to be clustered.
#' @param meshclust_bin The path to the Meshclust executable (run
#' `which meshclust` on commandline). Default is set to "meshclust".
#' @param filepath The path to the directory where the temporary and log
#' files will be saved.
#' @return A data frame with information regarding the clustering analysis.
#'
#' @examples
#' meshclustR(seqs = MyDNAStringSet, meshclust_bin = meshclust, filepath = path)
#' @import Biostrings
#' @import readr
#' @import magrittr
#' @import stringr
#' @importFrom tools file_path_sans_ext
#' @export
meshclustR <- function(seqs = MyDNAStringSet,
meshclust_bin = meshclust,
filepath = path){
#create temporary file path
temp_file <- file.path(filepath, 'cluster_me.fa')
writeXStringSet(seqs, filepath = temp_file)
#create output file name
out_file <- file.path(filepath, paste0(
tools::file_path_sans_ext(basename(temp_file)),".txt"))
## Meshclust
system2(meshclust, args = c("-d", temp_file,
"-o", out_file))
stable_cluster <- read_delim(out_file, delim = "\t",
col_names = c("cluster", "seqnames",
"identity_with_center", "cluster_class"),
col_types = "fcdc") %>%
mutate(seqnames = str_remove(seqnames, ">"))
#clean up temp file
tools::file_path_sans_ext(basename(temp_file)),'.txt'))
#run Meshclust
system2(meshclust_bin, args = c('-d', temp_file,
'-o', out_file))
#read output file and parse
stable_cluster <- read_delim(out_file, delim = '\t',
col_names = c('cluster', 'seqnames',
'identity_with_center', 'cluster_class'),
col_types = 'fcdc') %>%
mutate(seqnames = str_remove(seqnames, '>'))
#clean up temp file and output file
file.remove(temp_file, out_file)
#return data frame
return(stable_cluster)
}
\ No newline at end of file
}
# Make wide table of clusters (listing all sequences in each cluster)
#' Make wide table of clusters (listing all sequences in each cluster)
#'
#' @param cluster_tbl A data frame containing the cluster and sequence information
#'
#' @return A wide table of clusters with all sequences in each cluster listed
#'
#' @import dplyr
#' @import tidyr
#' @export
#'
pivot_cluster_tbl_wider <- function(cluster_tbl) {
# First: get names of each cluster
cluster_names <- cluster_tbl %>%
dplyr::rename(clus_name = seqnames) %>%
group_by(cluster) %>%
slice_head(n = 1)
# Second: combine names with clusters and reshape tbls
cluster_tbl <- left_join(stable_cluster, cluster_names, by = "cluster") %>%
cluster_tbl <- left_join(stable_cluster, cluster_names, by = 'cluster') %>%
group_by(cluster) %>%
mutate(num = row_number()) %>%
mutate(num = row_number()) %>%
pivot_wider(names_from = clus_name, values_from = seqnames, id_cols = num)
}
\ No newline at end of file
}
# Specifically intended to plot sequence abundance per sampled within
# plot_cluster_overview from tbl of sequence abundance
#' This function creates a plot of sequence abundance per sample from a table of sequence abundance.
#'
#' @param tbl_of_abundance A table containing sequence abundance data.
#' @return A plot displaying sequence abundance per sample.
#' @import ggplot2
#' @import scales
#' @examples
#' tbl_of_abundance <- data.frame(ID = c("ASV1", "ASV2", "ASV3", "ASV4"),
#' Sample = c("Sample1", "Sample1", "Sample2", "Sample2"),
#' count = c(10, 5, 6, 12))
#' plot_abundance_per_sample(tbl_of_abundance)
#'
#' @export
#' Specifically intended to plot sequence abundance per sampled within plot_cluster_overview from tbl of sequence abundance
plot_abundance_per_sample <- function(tbl_of_abundance) {
ggplot(tbl_of_abundance, aes(y = ID, x = Sample, fill = count/1000)) +
geom_tile() + theme(legend.position = "none",
ggplot(tbl_of_abundance, aes(y = ID, x = Sample, fill = count/1000)) +
geom_tile() + theme(legend.position = 'none',
axis.title = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank()) +
ggtitle("ASV counts per sample") +
ggtitle('ASV counts per sample') +
scale_fill_viridis_c() +
labs(fill = "ASV counts (in thousands)")
}
\ No newline at end of file
labs(fill = 'ASV counts (in thousands)')
}
# Specifically intended to plot total sequence abundance within
# plot_cluster_overview from tbl of sequence abundance sums
#' Plot total sequence abundance within plot_cluster_overview from tbl of sequence abundance sums
#'
#' @param tbl_of_sums A table of sequence abundance sums
#' @return A ggplot object of the total ASV counts and sequence IDs
#' @examples
#' tbl_of_sums <- data.frame(ID = c("ASV_001", "ASV_002", "ASV_003"), sum_count = c(1000, 2000, 3000))
#' plot_abundance_sums_per_sequence(tbl_of_sums)
#'
#' @import ggplot2
#'
#' @export
plot_abundance_sums_per_sequence <- function(tbl_of_sums) {
ggplot(tbl_of_sums, aes(y = ID, x = sum_count/1000)) +
geom_col() + theme(axis.text.x = element_text(angle = 90,
size = 15,
vjust = 0.5),
legend.position = "none",
axis.title = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()) +
ggtitle("Total ASV counts (in thousands)")
}
\ No newline at end of file
ggplot(tbl_of_sums, aes(y = ID, x = sum_count / 1000)) +
geom_col() +
theme(axis.text.x = element_text(angle = 90, size = 15, vjust = 0.5),
legend.position = 'none',
axis.title = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank())+
ggtitle('Total ASV counts (in thousands)')
}
#' Create NMDS plot with ggplot2 and centroids
#'
#' This function creates an NMDS plot using ggplot2 and can optionally add centroids to the plot.
#'
#' @param asv_nmds A data frame created from NMDS analysis with columns "MDS1" and "MDS2".
#' @param color_by A string indicating which column to use for coloring points on the plot. Default: 'String'.
#' @param centroids A logical value indicating whether or not to add centroids to the plot. Default: FALSE.
#'
#' @return A ggplot2 object containing the NMDS plot.
#'
#' @examples
#' # Assume nmds_df has been created through NMDS analysis
#' plot_asv_nmds(asv_nmds = nmds_df, color_by = 'Sample', centroids = TRUE)
#'
#' @import ggplot2
#' @importFrom stats aggregate
#'
#' @export
plot_asv_nmds <- function(asv_nmds = my_asv_nmds,
color_by = "String",
color_by = 'String',
centroids = FALSE){
# Create NMDS plot with ggplot2 and centroids
pl <- ggplot(asv_nmds, aes(x=MDS1, y=MDS2, color= .data[[color_by]])) +
# Create ggplot object
pl <-ggplot(asv_nmds, aes(x=MDS1, y=MDS2, color= .data[[color_by]])) +
geom_point() +
theme_classic() +
labs(title="NMDS Plot", x="NMDS 1", y="NMDS 2")
labs(title='NMDS Plot', x='NMDS 1', y='NMDS 2')
# If centroids parameter is TRUE, add centroids to the plot
if(centroids){
# Add sample groups to nmds_df
nmds_df$Group <- factor(gsub("[0-9]", "", rownames(asv_table)))
# Add sample groups to asv_nmds
asv_nmds$Group <- factor(gsub('[0-9]', '', rownames(asv_table)))
# Calculate group centroids
centroids_df <- aggregate(nmds_df[, 1:2], by=list(Group=nmds_df$Group), mean)
pl <- pl +
geom_point(data=centroids_df, aes(x=NMDS1, y=NMDS2),
color="black", size=3, shape=21, fill="white")
centroids_df <- aggregate(asv_nmds[, 1:2], by=list(Group=asv_nmds$Group), mean)
# Add centroids to plot
pl <- pl +
geom_point(data=centroids_df, aes(x=NMDS1, y=NMDS2),
color='black', size=3, shape=21, fill='white')
}
# Return ggplot object
return(pl)
}
\ No newline at end of file
}
# Generic plotting of tree with title "Dendrogram"
#' Generic plotting of UPGMA tree using ggtree
#'
#' This function plots a dendrogram using the ggtree package.
#'
#' @param upgma_tree An object of class 'phylo' representing the tree.
#'
#' @import ggtree
#' @import ggplot2
#'
#' @return A dendrogram plot.
#'
#' @examples
#' tree <- ape::read.tree("tree.txt")
#' plot_cluster_dendrogram(tree)
#'
#' @export
plot_cluster_dendrogram <- function(upgma_tree) {
ggtree::ggtree(upgma_tree) + ggtree::geom_tiplab(as_ylab = TRUE) +
ggtitle("Dendrogram") +
ggtitle('Dendrogram') +
theme(axis.text = element_text(size = 15))
}
\ No newline at end of file
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment