Skip to content
Snippets Groups Projects
Commit f111274c authored by Simeon's avatar Simeon
Browse files

roxygen tags added

parent 6941973c
Branches
No related tags found
No related merge requests found
#' Count Clusters
#'
#' This function takes a list of cluster tables and returns a tibble
#' with cluster counts by threshold.
#'
#' @param clus_tbl_list A list of cluster tables.
#'
#' @return A tibble with cluster counts by threshold.
#'
#' @examples
#'
#' set.seed(1)
#' clus_tbl_list <- list(
#' data.frame(seqnames = c("chr1", "chr1", "chr2"),
#' start = c(10, 50, 100), end = c(20, 60, 200), count = c(5, 7, 10)),
#' data.frame(seqnames = c("chr1", "chr1", "chr2"),
#' start = c(15, 55, 110), end = c(25, 65, 150), count = c(8, 6, 11))
#' )
#' count_clusters(clus_tbl_list)
#'
#' @import dplyr
#' @import purrr
#' @import tidyr
#' @export
count_clusters <- function(clus_tbl_list){
#remove non-numeric column from clus_tbl_list
clus_tbl_num <- map(clus_tbl_list, ~ dplyr::select(., -seqnames))
clus_counts <- lapply(clus_tbl_num, max)
names(clus_counts) <- names(clus_tbl_list)
clus_counts %>%
bind_rows() %>%
pivot_longer(everything()) %>%
mutate(threshold = as.numeric(name)) %>%
dplyr::rename(cluster_number = value)
}
\ No newline at end of file
# Remove the non-numeric column "seqnames" from each cluster table in the list.
clus_tbl_num <- map(clus_tbl_list, ~ dplyr::select(., -seqnames))
# Create a list of the maximum cluster values for each cluster table in clus_tbl_num using lapply()
clus_counts <- lapply(clus_tbl_num, max)
# Assign the names of the original cluster tables to the values in clus_counts
names(clus_counts) <- names(clus_tbl_list)
# Combine the values in clus_counts into a single data frame using bind_rows()
# Transform the data frame from wide to long using pivot_longer(), and create
# a new column called "threshold" with the values previously used as column names
# Rename the "value" column to "cluster_number" using dplyr::rename()
clus_counts %>%
bind_rows() %>%
pivot_longer(everything()) %>%
mutate(threshold = as.numeric(name)) %>%
dplyr::rename(cluster_number = value)
# Return the final data frame with the cluster numbers and their maximum values
return(clus_counts)
}
#' Define a plateau within cluster counts
#'
#' This function takes in a tibble of cluster counts and defines
#' the plateau within the cluster with the largest threshold window.
#'
#' @param cluster_counts A tibble of cluster counts with columns for
#' cluster_number and threshold.
#'
#' @return A named vector with the starting and ending thresholds of the
#' plateau and the corresponding cluster_number.
#'
#' @examples
#' define_plateau(cluster_counts = cluster_counts_df)
#'
#' @import dplyr
#' @export
# The following code defines a function called "define_plateau"
define_plateau <- function(cluster_counts){
# "cluster_counts" is a tibble of cluster counts passed as a parameter to the function
# "clus_plateau" filters the cluster counts by selecting only those with cluster_number greater than or equal to 2
# then group the remaining data by cluster_number and calculate the maximum and minimum threshold per cluster_number using the "summarise" function.
# This is done to exclude a potentially long tail of single cluster outcomes at high thresholds.
# Calculate the difference between the maximum and minimum threshold for each cluster and store in "thresh_win"
# "arrange" function arranges the rows in descending order of "thresh_win" and selects the first row using "slice_head"
# "clus_plateau" now contains the "plateau_start", "plateau_end", and "cluster_number" values for the selected cluster
clus_plateau <- cluster_counts %>%
filter(cluster_number >= 2) %>%
group_by(cluster_number) %>%
summarise(max_thresh = max(threshold), min_thresh = min(threshold)) %>%
mutate(thresh_win = max_thresh-min_thresh) %>%
arrange(desc(thresh_win)) %>%
slice_head(n = 1) %>%
filter(cluster_number >= 2)%>%
group_by(cluster_number) %>%
summarise(max_thresh = max(threshold), min_thresh = min(threshold)) %>%
mutate(thresh_win = max_thresh-min_thresh) %>%
arrange(desc(thresh_win)) %>%
slice_head(n = 1) %>%
select(min_thresh, max_thresh, cluster_number) %>%
unlist() %>% unname()
names(clus_plateau) <- c("plateau_start", "plateau_end", "cluster_number")
# This line assigns the names "plateau_start", "plateau_end", and "cluster_number" to each of the values in "clus_plateau".
names(clus_plateau) <- c('plateau_start', 'plateau_end', 'cluster_number')
return(clus_plateau)
}
\ No newline at end of file
}
dendrogram_hclust <- function(data = veganized_tibble, seed = 1, ...){
#' Generate dendrogram data from numerical data
#'
#' This function generates a dendrogram from a hierarchical clustering of a dataset.
#'
#' @param data A tibble or data frame suited for vegdist().
#' @param seed An integer that sets the seed for the random number generator. Default is 1.
#' @param ... Additional arguments passed to `vegdist`.
#'
#' @return A `ggdendro::dendro_data` object, containing data for plotting the dendrogram.
#'
#' @import ggdendro
#' @import vegan
#' @import stats
#'
#' @examples
#' # Generate dendrogram with default parameters
#' dendrogram_hclust()
#'
#' # Generate dendrogram with custom data and distance metric
#' daisy_dist <- daisy(iris[,-5], metric = "gower")
#' dendrogram_hclust(daisy_dist)
#'
#' @export
dendrogram_hclust <- function(data = veganized_tibble, seed = 1, ...) {
require(ggdendro)
set.seed(seed)
# make hierarchical cluster from vegdist matrix and extract data for plotting
dat <- vegdist(data, ...) %>%
hclust() %>%
as.dendrogram() %>%
ggdendro::dendro_data(type = "rectangle")
dat <- vegdist(data, ...) %>%
hclust() %>%
as.dendrogram() %>%
ggdendro::dendro_data(type = 'rectangle')
return(dat)
}
\ No newline at end of file
}
export_longest_reading_frame <- function(clustered_reading_frames_tbl = tbl,
seqs = myDNAStringSet,
outpath = path){
#' Export Longest Reading Frame
#'
#' Export the longest reading frames from a DNA sequence set to a fasta file containing amino acid sequences.
#'
#' @param clustered_reading_frames_tbl tibble of clustered reading frames
#' @param seqs DNAStringSet object with DNA sequences
#' @param outpath character string of output directory path
#' @param verbose logical indicating whether to print progress updates
#'
#' @return Messages indicating completion of exporting the longest reading frames to a fasta file format containing amino acid sequences.
#'
#' @examples
#' export_longest_reading_frame(clustered_reading_frames_tbl, myDNAStringSet, myDirPath, TRUE)
#'
#' @import Biostrings
#' @import dplyr
#' @import tidyr
#' @import utils
#'
#' @export
# Define a function that exports the longest reading frames
export_longest_reading_frame <- function(clustered_reading_frames_tbl = tbl, # function argument for clustered_reading_frame table
seqs = myDNAStringSet, # function argument for DNA sequence set
outpath = path, # function argument for output file path
verbose = FALSE){ # function argument for verbosity
# check if the output path exists, create it if it doesn't
if(!dir.exists(file.path(outpath))){
dir.create(outpath)
}
# translate the DNA sequences, count stop codons, and add a column for reading frame as a factor
peps_tbl <- translate_and_count_stops(seqs) %>%
mutate(rframe = as.factor(rframe))
# extract the unique cluster names from the clustered_reading_frames_tbl table
clus_name <- clustered_reading_frames_tbl %>%
select(cluster_name) %>% unique() %>% unlist() %>% unname()
# join the clustered_reading_frames_tbl table and the peps_tbl table by shared columns
temp_seq_tbl <- left_join(clustered_reading_frames_tbl, peps_tbl,
by = c("seqnames", "rframe", "fos_width",
"first_stop", "stop_count", "hrf_width"))
by = c('seqnames', 'rframe', 'fos_width',
'first_stop', 'stop_count', 'hrf_width'))
# create an amino acid sequence set from the 'seq' column of the temp_seq_tbl table
temp_aa_seqs <- AAStringSet(x = as.character(
temp_seq_tbl["seq"] %>% unlist() %>% unname()))
names(temp_aa_seqs) <- as.character(temp_seq_tbl["seqnames"] %>% unlist() %>%
temp_seq_tbl['seq'] %>% unlist() %>% unname()))
# assign names to the sequences in the temp_aa_seqs sequence set
names(temp_aa_seqs) <- as.character(temp_seq_tbl['seqnames'] %>% unlist() %>%
unname())
Biostrings::writeXStringSet(temp_aa_seqs,
# write the amino acid sequences to a fasta file
Biostrings::writeXStringSet(temp_aa_seqs,
filepath = file.path(outpath, paste0(
clus_name, "_cluster_longest_AA_seqs.fa")))
return(paste("AA sequences for cluster", clus_name, "written to file",
file.path(outpath, paste0(
clus_name, "_cluster_longest_AA_seqs.fa"))
))
}
\ No newline at end of file
clus_name, '_cluster_longest_AA_seqs.fa')))
# return a message confirming the export and file path if verbose is true
if(verbose){
return(paste('AA sequences for cluster', clus_name, 'written to file',
file.path(outpath, paste0(
clus_name, '_cluster_longest_AA_seqs.fa'))
))
}else{
return(paste('AA sequences saved to file'))
}
}
## Repeat counter ----
#' Find contiguous multi-repeats in a set of sequences
#'
#' This function finds contiguous multi-repeats in a set of sequences,
#' where the repeat sequence is made up of n consecutive copies of a given string.
#'
#' @param sequences A DNAStringSet or AAStringSet object containing the sequences
#' to search for multi-repeats (default: DNAStringSet)
#' @param repeat_sequence A character string representing the repeat sequence to search
#' for (default: 'string')
#' @param singlet_count An integer or vector of integers specifying the number
#' of single occurrences of the repeat, or the maximum expected number of
#' single occurrences (default: NULL)
#'
#' @return A numeric vector containing the repeat count for each input sequence
#' (i.e., the number of contiguous multi-repeats found)
#'
#' @export
#'
#' @examples
#' # Create example DNAStringSet
#' sequences <- DNAStringSet(c("ATCGATATCGATCGATCG", "GCTAGCTAGCTAGCTAGCTA"))
#' find_contiguous_multi_repeats(sequences, 'AT', 3)
#'
#' # Expected output: c(2, 1)
#'
#' @import stringr
#' @import Biostrings
#'
#' @keywords sequence, repeats
#'
find_contiguous_multi_repeats <- function(sequences = DNAStringSet,
repeat_sequence = "string",
singlet_count) {
repeat_sequence = 'string',
singlet_count = 100) {
# Make vector of zeros for minimum repeat count, make 1 if singlet was found
repeat_count <- replicate(length(sequences), 0)
repeat_count[singlet_count > 0] <- 1
# Iterate n to the highest observed singlet count, overwrite repeat count if
# exactly one n-mer of repeat is found in the sequence
for (n in seq(1, max(singlet_count))) {
contig_rep <- paste(replicate(n, repeat_sequence), collapse = "")
contig_rep <- paste(replicate(n, repeat_sequence), collapse = '')
contig_rep_count <- str_count(as.character(sequences), contig_rep)
repeat_count[contig_rep_count < singlet_count & contig_rep_count > 0] <- n
repeat_count[contig_rep_count < singlet_count & contig_rep_count > 0] <-n
}
return(repeat_count)
}
\ No newline at end of file
}
#' Find longest half-open reading frame
#'
#' This function takes a DNAStringSet as input and returns the longest half-open
#' reading frames as a data table. Wrapper function around translate_and_count_stops()
#'
#' @param seqs A DNAStringSet object with nucleotide sequences.
#'
#' @return A data table with the longest open reading frames.
#'
#' @examples
#' find_longest_hrf(seqs)
#'
#' @import dplyr
#'
#' @export
find_longest_hrf <- function(seqs = DNAStringSet){
peps_tbl <- translate_and_count_stops(seqs)
hrfs <- select(peps_tbl, -seq)
return(hrfs)
}
\ No newline at end of file
}
find_longest_orf <-function(seqs = DNAStringSet) {
#' Find the longest ORF in a DNA sequence set
#'
#' This function identifies the longest open reading frame (ORF) in a DNA sequence set.
#' Wrapper around \code{\link{findORFs}}
#'
#' @param seqs A DNAStringSet object containing the DNA sequences to search for ORFs.
#'
#' @return A tibble containing the start and end positions, strand, and length of the longest ORF in each sequence.
#'
#' @import Biostrings
#' @importFrom GenomicRanges GRanges
#' @import tibble
#' @import dplyr
#' @export
#'
#' @examples
#' seqs <- DNAStringSet(c("ATGAGTTCGAAATGGCGTTGAA", "GGGGGCTCGAGCTAGC"))
#' find_longest_orf(seqs)
#'
#' @seealso \code{\link{findORFs}}
#'
find_longest_orf <- function(seqs = DNAStringSet) {
# Find ORFs in the sequences, return longest ORF, and convert to a vector
orfs <- findORFs(seqs, longestORF = TRUE, startCodon = startDefinition(6)) %>%
unlist(use.names = TRUE)
orfs <- GRanges(seqnames = names(seqs)[as.integer(names(orfs))],
ranges(orfs), strand = "+") %>%
# Convert the ORFs to a GRanges object
orfs <- GRanges(seqnames = names(seqs)[as.integer(names(orfs))],
ranges(orfs), strand = '+') %>%
# Convert to a tibble and add a column for reading frame
as_tibble() %>%
mutate(rframe = ((start -1) %% 3 ) + 1) %>%
# Rename the width column to orf_width
dplyr::rename(orf_width = width)
# Return the ORFs
return(orfs)
}
\ No newline at end of file
}
#' Reading frame finder (longest orf or hrf)
#'
#' This function finds the longest reading frames in a set of DNA sequences.
#' It combines the results from finding the longest open reading frames (orfs)
#' and the longest half reading frames (hrfs) and determines the reading frame
#' with the biggest possible translation.
#'
#' @param seqs A DNAStringSet object containing the input DNA sequences.
#'
#' @return A data frame containing the longest reading frames for each sequence.
#' The data frame includes the sequence names, reading frame, and the width of the reading frame.
#'
#' @import dplyr, tidyr
#'
#' @export
## Reading frame finder (longest orf or hrf)
find_longest_reading_frames <- function(seqs = myDNAStringSet){
orfs <- find_longest_orf(seqs)
hrfs <- find_longest_hrf(seqs)
# Combine orf and hrf tables and find the reading frame with the biggest
# Combine orf and hrf tables and find the reading frame with the biggest
# possible translation
orf_and_hrf <- full_join(orfs, hrfs, by = join_by(seqnames, rframe)) %>%
mutate(orf_width = replace_na(orf_width, 0)) %>%
......@@ -13,12 +28,12 @@ find_longest_reading_frames <- function(seqs = myDNAStringSet){
"Five-open (no start)", max_type)) %>%
mutate(max_type = ifelse(fos_width > hrf_width & max_type == "open",
"Three-open (no stop)", max_type))
max_rf <- orf_and_hrf %>%
group_by(seqnames) %>%
slice_max(max_width, n = 1, with_ties = FALSE) %>%
arrange(factor(seqnames, levels = names(seqs))) %>%
mutate(rframe = factor(rframe, levels = c("1", "2", "3")))
return(max_rf)
}
\ No newline at end of file
}
find_repeat_positions <- function(sequences = DNAStringSet,
repeat_sequence = "string"){
repeat_positions <- str_locate_all(as.character(sequences), repeat_sequence)
names(repeat_positions) <- names(sequences)
#' Find Repeat Positions
#'
#' This function finds the positions of a given repeat sequence within a set of DNA sequences. It returns a data frame with the sequence name, start and end positions, and information used in plotting downstream.
#'
#' @param sequences A DNAStringSet object containing the DNA sequences to search for repeats.
#' @param repeat_sequence A string specifying the repeat sequence to search for.
#'
#' @return A data frame with columns: seqname, start, end, fragment, and plot_intensity.
#'
#' @import stringr
#' @import dplyr
#' @import tibble
#'
#' @export
#'
#' @examples
#' sequences <- DNAStringSet(c("AGTCAGT",
#' "ACGTAGT",
#' "AGTCGAT"))
#' find_repeat_positions(sequences, "AGT")
find_repeat_positions <- function(sequences = DNAStringSet, repeat_sequence = 'string'){
repeat_positions <- str_locate_all(as.character(sequences), repeat_sequence)names(repeat_positions) <- names(sequences)
repeat_positions <- lapply(repeat_positions, as_tibble) %>%
bind_rows(.id = "seqname") %>%
bind_rows(.id = 'seqname') %>%
mutate(fragment = repeat_sequence)
whole_sequence_positions = tibble("seqname" = names(sequences),
"start" = replicate(length(sequences), 1),
"end" = str_length(sequences),
"fragment" = replicate(
length(sequences), "Full sequence"))
whole_sequence_positions = tibble('seqname' = names(sequences),
'start' = replicate(length(sequences), 1),
'end' = str_length(sequences),
'fragment' = replicate(length(sequences), 'Full sequence'))
out <- bind_rows(repeat_positions, whole_sequence_positions) %>%
mutate(plot_intensity = ifelse(fragment == repeat_sequence, 4, 3)) %>%
mutate(seqname = factor(seqname, levels = names(sequences))) %>%
mutate(fragment = factor(fragment, levels = c("Full sequence",
repeat_sequence)))
mutate(fragment = factor(fragment, levels = c('Full sequence', repeat_sequence)))
return(out)
}
\ No newline at end of file
}
# Convert to bin format and get distance matrix using kmers
kmer_based_distance_matrix <- function (seqs = DNAStringSet) {
#' Convert to bin format and get distance matrix using kmers
#'
#' This function takes in a DNAStringSet and converts it to a bin format using ape::as.DNAbin function. It then calculates the distance matrix using the kdistance function from XXXX and returns it as a matrix.
#'
#' @param seqs DNAStringSet containing the DNA sequences
#'
#' @return A distance matrix in bin format.
#'
#' @importFrom ape as.DNAbin
#'
#' @importFrom Biostrings DNAStringSet
#'
#' @export
#'
kmer_based_distance_matrix <- function (seqs) {
seqbins <- ape::as.DNAbin(seqs)
as.matrix(kdistance(seqbins))
}
\ No newline at end of file
}
#write temporary file to cluster
meshclustR <- function(seqs = MyDNAStringSet,
meshclust_bin = meshclust, filepath = path){
temp_file <- file.path(filepath, "cluster_me.fa")
#' Write temporary file to cluster using MeshclustR
#'
#' This function writes a temporary file to perform a clustering analysis on a set of DNA sequences.
#' The clustering is done using the \href{https://github.com/BioinformaticsToolsmith/MeShClust}{Meshclust commandline} tool.
#'
#' James, Benjamin T. et al. (2018),
#' MeShClust: an intelligent tool for clustering DNA sequences.
#' Nucleic Acids Research, gky315.
#'
#' @param seqs A DNAStringSet object containing the DNA sequences to be clustered.
#' @param meshclust_bin The path to the Meshclust executable (run
#' `which meshclust` on commandline). Default is set to "meshclust".
#' @param filepath The path to the directory where the temporary and log
#' files will be saved.
#' @return A data frame with information regarding the clustering analysis.
#'
#' @examples
#' meshclustR(seqs = MyDNAStringSet, meshclust_bin = meshclust, filepath = path)
#' @import Biostrings
#' @import readr
#' @import magrittr
#' @import stringr
#' @importFrom tools file_path_sans_ext
#' @export
meshclustR <- function(seqs = MyDNAStringSet,
meshclust_bin = meshclust,
filepath = path){
#create temporary file path
temp_file <- file.path(filepath, 'cluster_me.fa')
writeXStringSet(seqs, filepath = temp_file)
#create output file name
out_file <- file.path(filepath, paste0(
tools::file_path_sans_ext(basename(temp_file)),".txt"))
## Meshclust
system2(meshclust, args = c("-d", temp_file,
"-o", out_file))
stable_cluster <- read_delim(out_file, delim = "\t",
col_names = c("cluster", "seqnames",
"identity_with_center", "cluster_class"),
col_types = "fcdc") %>%
mutate(seqnames = str_remove(seqnames, ">"))
#clean up temp file
tools::file_path_sans_ext(basename(temp_file)),'.txt'))
#run Meshclust
system2(meshclust_bin, args = c('-d', temp_file,
'-o', out_file))
#read output file and parse
stable_cluster <- read_delim(out_file, delim = '\t',
col_names = c('cluster', 'seqnames',
'identity_with_center', 'cluster_class'),
col_types = 'fcdc') %>%
mutate(seqnames = str_remove(seqnames, '>'))
#clean up temp file and output file
file.remove(temp_file, out_file)
#return data frame
return(stable_cluster)
}
\ No newline at end of file
}
# Make wide table of clusters (listing all sequences in each cluster)
#' Make wide table of clusters (listing all sequences in each cluster)
#'
#' @param cluster_tbl A data frame containing the cluster and sequence information
#'
#' @return A wide table of clusters with all sequences in each cluster listed
#'
#' @import dplyr
#' @import tidyr
#' @export
#'
pivot_cluster_tbl_wider <- function(cluster_tbl) {
# First: get names of each cluster
cluster_names <- cluster_tbl %>%
dplyr::rename(clus_name = seqnames) %>%
group_by(cluster) %>%
slice_head(n = 1)
# Second: combine names with clusters and reshape tbls
cluster_tbl <- left_join(stable_cluster, cluster_names, by = "cluster") %>%
cluster_tbl <- left_join(stable_cluster, cluster_names, by = 'cluster') %>%
group_by(cluster) %>%
mutate(num = row_number()) %>%
mutate(num = row_number()) %>%
pivot_wider(names_from = clus_name, values_from = seqnames, id_cols = num)
}
\ No newline at end of file
}
# Specifically intended to plot sequence abundance per sampled within
# plot_cluster_overview from tbl of sequence abundance
#' This function creates a plot of sequence abundance per sample from a table of sequence abundance.
#'
#' @param tbl_of_abundance A table containing sequence abundance data.
#' @return A plot displaying sequence abundance per sample.
#' @import ggplot2
#' @import scales
#' @examples
#' tbl_of_abundance <- data.frame(ID = c("ASV1", "ASV2", "ASV3", "ASV4"),
#' Sample = c("Sample1", "Sample1", "Sample2", "Sample2"),
#' count = c(10, 5, 6, 12))
#' plot_abundance_per_sample(tbl_of_abundance)
#'
#' @export
#' Specifically intended to plot sequence abundance per sampled within plot_cluster_overview from tbl of sequence abundance
plot_abundance_per_sample <- function(tbl_of_abundance) {
ggplot(tbl_of_abundance, aes(y = ID, x = Sample, fill = count/1000)) +
geom_tile() + theme(legend.position = "none",
ggplot(tbl_of_abundance, aes(y = ID, x = Sample, fill = count/1000)) +
geom_tile() + theme(legend.position = 'none',
axis.title = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank()) +
ggtitle("ASV counts per sample") +
ggtitle('ASV counts per sample') +
scale_fill_viridis_c() +
labs(fill = "ASV counts (in thousands)")
}
\ No newline at end of file
labs(fill = 'ASV counts (in thousands)')
}
# Specifically intended to plot total sequence abundance within
# plot_cluster_overview from tbl of sequence abundance sums
#' Plot total sequence abundance within plot_cluster_overview from tbl of sequence abundance sums
#'
#' @param tbl_of_sums A table of sequence abundance sums
#' @return A ggplot object of the total ASV counts and sequence IDs
#' @examples
#' tbl_of_sums <- data.frame(ID = c("ASV_001", "ASV_002", "ASV_003"), sum_count = c(1000, 2000, 3000))
#' plot_abundance_sums_per_sequence(tbl_of_sums)
#'
#' @import ggplot2
#'
#' @export
plot_abundance_sums_per_sequence <- function(tbl_of_sums) {
ggplot(tbl_of_sums, aes(y = ID, x = sum_count/1000)) +
geom_col() + theme(axis.text.x = element_text(angle = 90,
size = 15,
vjust = 0.5),
legend.position = "none",
axis.title = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank()) +
ggtitle("Total ASV counts (in thousands)")
}
\ No newline at end of file
ggplot(tbl_of_sums, aes(y = ID, x = sum_count / 1000)) +
geom_col() +
theme(axis.text.x = element_text(angle = 90, size = 15, vjust = 0.5),
legend.position = 'none',
axis.title = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank())+
ggtitle('Total ASV counts (in thousands)')
}
#' Create NMDS plot with ggplot2 and centroids
#'
#' This function creates an NMDS plot using ggplot2 and can optionally add centroids to the plot.
#'
#' @param asv_nmds A data frame created from NMDS analysis with columns "MDS1" and "MDS2".
#' @param color_by A string indicating which column to use for coloring points on the plot. Default: 'String'.
#' @param centroids A logical value indicating whether or not to add centroids to the plot. Default: FALSE.
#'
#' @return A ggplot2 object containing the NMDS plot.
#'
#' @examples
#' # Assume nmds_df has been created through NMDS analysis
#' plot_asv_nmds(asv_nmds = nmds_df, color_by = 'Sample', centroids = TRUE)
#'
#' @import ggplot2
#' @importFrom stats aggregate
#'
#' @export
plot_asv_nmds <- function(asv_nmds = my_asv_nmds,
color_by = "String",
color_by = 'String',
centroids = FALSE){
# Create NMDS plot with ggplot2 and centroids
pl <- ggplot(asv_nmds, aes(x=MDS1, y=MDS2, color= .data[[color_by]])) +
# Create ggplot object
pl <-ggplot(asv_nmds, aes(x=MDS1, y=MDS2, color= .data[[color_by]])) +
geom_point() +
theme_classic() +
labs(title="NMDS Plot", x="NMDS 1", y="NMDS 2")
labs(title='NMDS Plot', x='NMDS 1', y='NMDS 2')
# If centroids parameter is TRUE, add centroids to the plot
if(centroids){
# Add sample groups to nmds_df
nmds_df$Group <- factor(gsub("[0-9]", "", rownames(asv_table)))
# Add sample groups to asv_nmds
asv_nmds$Group <- factor(gsub('[0-9]', '', rownames(asv_table)))
# Calculate group centroids
centroids_df <- aggregate(nmds_df[, 1:2], by=list(Group=nmds_df$Group), mean)
pl <- pl +
geom_point(data=centroids_df, aes(x=NMDS1, y=NMDS2),
color="black", size=3, shape=21, fill="white")
centroids_df <- aggregate(asv_nmds[, 1:2], by=list(Group=asv_nmds$Group), mean)
# Add centroids to plot
pl <- pl +
geom_point(data=centroids_df, aes(x=NMDS1, y=NMDS2),
color='black', size=3, shape=21, fill='white')
}
# Return ggplot object
return(pl)
}
\ No newline at end of file
}
# Generic plotting of tree with title "Dendrogram"
#' Generic plotting of UPGMA tree using ggtree
#'
#' This function plots a dendrogram using the ggtree package.
#'
#' @param upgma_tree An object of class 'phylo' representing the tree.
#'
#' @import ggtree
#' @import ggplot2
#'
#' @return A dendrogram plot.
#'
#' @examples
#' tree <- ape::read.tree("tree.txt")
#' plot_cluster_dendrogram(tree)
#'
#' @export
plot_cluster_dendrogram <- function(upgma_tree) {
ggtree::ggtree(upgma_tree) + ggtree::geom_tiplab(as_ylab = TRUE) +
ggtitle("Dendrogram") +
ggtitle('Dendrogram') +
theme(axis.text = element_text(size = 15))
}
\ No newline at end of file
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment