Skip to content
Snippets Groups Projects
Commit 6941973c authored by Simeon's avatar Simeon
Browse files

roxygen tags

parent ead693c0
Branches
No related tags found
No related merge requests found
# UPGMA tree from DNAStringSet object in named sequence list
#' UPGMA tree from DNAStringSet object in named sequence list
#'
#' This function generates a UPGMA tree from a DNAStringSet object in a named sequence list.
#'
#' @param cluster The name of the cluster to generate the UPGMA tree from
#' @param sequence_list A named list where each element is a \code{DNAStringSet} object containing DNA sequences
#' @return A UPGMA tree object
#' @import utils
#' @importFrom DECIPHER AlignSeqs
#' @importFrom phangorn phyDat
#' @importFrom phangorn dist.ml
#' @importFrom phangorn upgma
#' @export
# Define function to align a cluster of sequences and generate a UPGMA tree
align_and_generate_upgma <- function(cluster, sequence_list) {
# Use AlignSeqs function to align the sequences in the given cluster
alig <- AlignSeqs(sequence_list[[cluster]], verbose = FALSE)
tree <- phangorn::phyDat(as(alig, "matrix"), type = "DNA") %>%
# Convert the aligned sequences to a matrix of DNA data and calculate the distance matrix using maximum likelihood
# Then use the upgma function to generate the tree
tree <- phangorn::phyDat(as(alig, 'matrix'), type = 'DNA') %>%
phangorn::dist.ml() %>%
phangorn::upgma()
# Return the generated UPGMA tree
return(tree)
}
\ No newline at end of file
}
# Align seqs and get distance matrix using DECIPHER, multithreading supported
alignment_based_distance_matrix <- function (seqs = DNAStringSet,
#' Generate an alignment-based distance matrix from a set of DNA sequences
#'
#' This function uses the DECIPHER package to align a set of DNA sequences and generate a distance matrix based on the alignments. The alignments can be performed using multiple processors for faster execution. This function requires the DECIPHER package to be installed.
#'
#' @param seqs A DNAStringSet object containing the DNA sequences to be used for the generation of the distance matrix.
#' @param ncores An integer indicating the number of processors to be used for the alignment and generation of the distance matrix. The default value is 1.
#'
#' @return A matrix object containing the distance scores calculated based on the alignments of the input sequences.
#'
#' @import DECIPHER
#' @import Biostrings
#'
#' @examples
#' data(smallexample)
#' dna_sequences <- DNAStringSet(smallexample)
#' alignment_based_distance_matrix(dna_sequences, ncores = 2)
#'
#' @export
alignment_based_distance_matrix <- function(seqs = DNAStringSet,
ncores = 1) {
# Align the sequences using the specified number of processors
seqs_alig <- AlignSeqs(seqs, processors = ncores, verbose = FALSE)
# Generate the distance matrix from the aligned sequences using the specified number of processors
DistanceMatrix(seqs_alig, verbose = FALSE, processors = ncores)
}
\ No newline at end of file
}
calc_asv_nmds <- function(cleaned_seqtab = my_cleaned_seqtab, seed = 1905, ...){
#' Calculate NMDS using ASV counts
#'
#' This function calculates NMDS (non-metric multidimensional scaling) using ASV counts data.
#' It converts ASV counts to numeric data, performs NMDS analysis on the data,
#' formats the NMDS results as a tibble, and combines the NMDS results and tibble into a list object.
#'
#' @param cleaned_seqtab A data frame of cleaned ASV counts data (from clean_seqtab())
#' @param seed An optional integer specifying the seed for the random number generator (default: 1905)
#' @param ... Additional arguments passed to the `metaMDS` function from the vegan package
#'
#' @return A list object with results including NMDS results and NMDS tibble
#' @export
#'
#' @examples
#' data("my_cleaned_seqtab")
#' calc_asv_nmds(my_cleaned_seqtab)
calc_asv_nmds <- function(cleaned_seqtab = my_cleaned_seqtab, seed = 1905, ...) {
# Extract numeric asvs
num_asv <- veganify_asvcounts(cleaned_seqtab)
# Calculate NMDS
set.seed(seed)
nmds <- metaMDS(num_asv, "bray", ...)
nmds_tbl <- as_tibble(nmds$points, rownames = "sample")
out <- list(nmds, nmds_tbl)
names(out) <- c("NMDS", "NMDS tibble")
return(out)
}
\ No newline at end of file
num_asv <- veganify_asvcounts(cleaned_seqtab) # Convert ASV counts to numeric data
## Calculate NMDS
set.seed(seed) # Set seed for reproducibility
nmds <- metaMDS(num_asv, 'bray', ...) # Perform NMDS analysis on the data
nmds_tbl <- as_tibble(nmds$points, rownames = 'sample') # Format NMDS results as a tibble
out <- list(nmds, nmds_tbl) # Combine NMDS results and tibble into a list object
names(out) <- c('NMDS', 'NMDS tibble') # Give names to the list elements
return(out) # Return the list object with results
}
clean_seqtab <- function(file = "seqtab_nochim.rds",
ASV_sequences = asvstrings,
output = TRUE){
clean_seqtab <- function(file = 'seqtab_nochim.rds',
ASV_sequences = asvstrings, # Specify a default value for 'ASV_sequences' if none given
output = TRUE){ # Specify a default value for 'output' if none given
# Read in an RDS file and save it as 'st'
st <- readRDS(file)
st_clean <- st %>%
t() %>%
tibble::as_tibble() %>%
# Transpose 'st', convert to tibble, add column 'seqs' to it
# Note the use of the pipe operator '%>%'
st_clean <- st %>%
t() %>%
tibble::as_tibble() %>%
dplyr::mutate(seqs = colnames(st))
st_clean <- tibble(seqs = as.character(ASV_sequences),
ASV = names(ASV_sequences)) %>%
left_join(st_clean, by = "seqs")
# Create a tibble with column 'seqs' containing ASV sequences as character strings
# and another column 'ASV' containing their names. Then left join with 'st_clean'.
st_clean <- tibble(seqs = as.character(ASV_sequences),
ASV = names(ASV_sequences)) %>%
left_join(st_clean, by = 'seqs')
# If 'output' is TRUE, write a csv filecontaining the cleaned sequence table in the same directory as the original file
# Note the use of file.path()
if(output){
readr::write_csv(st_clean, file.path(dirname(file), "clean_seqtab.csv"))
readr::write_csv(st_clean, file.path(dirname(file), 'clean_seqtab.csv'))
}
# Return the cleaned sequence table
return(st_clean)
}
\ No newline at end of file
}
# Include cluster info in reading frame tbl
#' Cluster Longest Reading Frames
#'
#' This function takes two inputs: clustered_sequences (which should be a \code{\link{DNAStringSetList}}) and a reading_frame_tbl from \code{\link{find_longest_reading_frames()}}
#'
#' @param clustered_sequences A list of DNA sequences that have been clustered, e.g. using \code{\link{meshclustR()}}
#' @param reading_frame_tbl A tibble object with columns "seqnames," "strand," "start," "end," and "width"
#' @return A tibble object with the same column structure as the \code{\link{reading_frame_tbl}}, but with an additional column for "cluster_name"
#' @export
#'
#' @examples
#' clustered_sequences <- DNAStringSetList(c("ATGTCGTA", "CGTACTG", "AGTGTCAGT"))
#' reading_frame_tbl <- data.frame(seqnames=c("seq1","seq2"), strand=c("+","-"), start=c(1,3), end=c(6,11), width=c(6,9))
#' cluster_longest_reading_frames(clustered_sequences=clustered_sequences, reading_frame_tbl=reading_frame_tbl)
#'
#' @import dplyr
#' @import Biostrings
#'
# Define a function that clusters DNA sequences and determines their longest reading frame
cluster_longest_reading_frames <- function(
clustered_sequences = DNAStringSetList, reading_frame_tbl = tbl) {
seqnames_clus_order <- map(clustered_sequences, names) %>%
unlist() %>%
clustered_sequences = DNAStringSetList, # A variable that holds a list of DNA sequences that have been clustered
reading_frame_tbl = tbl) { # A variable that holds a table of reading frames
# Create a vector of sequence names in the order they appear in the clustered_sequences variable
seqnames_clus_order <- map(clustered_sequences, names) %>%
unlist() %>%
unname()
# Create a data frame with a column that holds the group names of the DNA sequences
group_names_clus_order <- as.data.frame(
DNAStringSetList(clustered_sequences))[["group_name"]]
DNAStringSetList(clustered_sequences))[['group_name']]
# Combine the reading frame table with the vector of sequence names and group names
# to get the maximum reading frame for each DNA sequence cluster
clus_max_rf <- left_join(reading_frame_tbl,
tibble(seqnames = seqnames_clus_order,
cluster_name = group_names_clus_order),
by = "seqnames")
cluster_name =group_names_clus_order),
by = 'seqnames')
# Return a table with the maximum reading frame for each DNA sequence cluster
return(clus_max_rf)
}
\ No newline at end of file
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment