Skip to content
Snippets Groups Projects
Commit 2d3318b5 authored by Simeon's avatar Simeon
Browse files

roxygen tags

parent 741ea9da
Branches
No related tags found
No related merge requests found
plot_repeat_positions <- function(repeat_positions) {
legend_name <- ""
pl <- ggplot(repeat_positions, aes(x = start, xend = end,
y = seqname, yend = seqname,
color = fragment, size = plot_intensity)) +
#' Plot repeat positions
#'
#' This function plots repeat positions on a sequence. It takes a data frame
#' as input which should have columns 'start', 'end', 'seqname', 'fragment',
#' and 'plot_intensity'. It uses ggplot2 to create the plot.
#'
#' @param repeat_positions A data frame with columns 'start', 'end', 'seqname',
#' 'fragment', and 'plot_intensity'.
#' @return A ggplot object representing the repeat positions plot.
#' @import ggplot2
#' @import viridis
#' @keywords plot
plot_repeat_positions <- function(repeat_positions){
legend_name <- ''
pl <- ggplot(repeat_positions, aes(x = start, xend = end,
y = seqname, yend = seqname,
color = fragment, size = plot_intensity)) +
geom_segment() +
guides(size = "none") +
guides(size = 'none') +
scale_color_viridis_d(begin = 0.25, end = 0.8, name = legend_name) +
ylab("Sequence") + xlab("Sequence position (bp)") +
theme(legend.position = "top")
ylab('Sequence') + xlab('Sequence position (bp)') +
theme(legend.position = 'top')
return(pl)
}
\ No newline at end of file
}
#' Plot Repeat Quantity
#'
#' This function creates a plot to visualize the quantity of repeated sequences across different sequences.
#'
#' @param quantified_repeats A data frame containing information about quantified repeats.
#'
#' @return A ggplot object representing the plot.
#'
#' @export
#'
#' @examples
#' quantified_repeats <- data.frame(seqname = c('A', 'B', 'C'),
#' repeat_count = c(1, 2, 3),
#' count_type = c('Type1', 'Type2', 'Type3'))
#' plot_repeat_quantity(quantified_repeats)
plot_repeat_quantity <- function(quantified_repeats) {
legend_name <- ""
legend_name <- ''
pl <- ggplot(quantified_repeats, aes(
x = repeat_count, y = seqname, fill = count_type)) +
x = repeat_count, y = seqname, fill = count_type)) +
geom_col(position = position_dodge()) +
scale_fill_viridis_d(begin = 0.25, end = 0.8, name = legend_name) +
ylab("Sequence") + xlab("Repeat count") +
theme(legend.position = "top",
ylab('Sequence') + xlab('Repeat count') +
theme(legend.position = 'top',
axis.title.y = element_blank(),
axis.ticks.y = element_blank(),
axis.text.y = element_blank())
return(pl)
}
\ No newline at end of file
return(pl)
}
plot_repeats <- function(sequences = DNAStringSet(),
repeat_sequence = "GATC"){
#' Plot short sequence repeats
#'
#' This function generates a plot that shows the positions and quantities of repeated sequences in a given set of DNA sequences.
#'
#' @param sequences A DNAStringSet object containing the DNA sequences to be analyzed. Default is an empty DNAStringSet.
#' @param repeat_sequence A character vector specifying the repeat sequence to be searched. Default is 'GATC'.
#'
#' @return A ggplot object displaying both positions and quantities of repeated sequences.
#' @export
#' @name plot_repeats
plot_repeats <- function(sequences = DNAStringSet(),
repeat_sequence = 'GATC') {
clus_name <- names(sequences)[[1]]
# quantify repeats and
quant_data <- quantify_repeats(sequences, repeat_sequence) %>%
mutate(seqname = fct_reorder(
seqname, dplyr::desc(largest_repeat_contig))) %>%
pivot_longer(cols = singlets:largest_repeat_contig,
names_to = "count_type", values_to = "repeat_count") %>%
mutate(count_type = str_replace_all(count_type, "singlets",
"single instances"),
count_type = str_replace_all(count_type, "largest_repeat_contig",
"largest contiguous instance"))
mutate(seqname = fct_reorder(seqname, dplyr::desc(largest_repeat_contig))) %>%
pivot_longer(cols = singlets:largest_repeat_contig,
names_to = 'count_type', values_to = 'repeat_count') %>%
mutate(count_type = str_replace_all(count_type, 'singlets', 'single instances'),
count_type = str_replace_all(count_type, 'largest_repeat_contig', 'largest contiguous instance'))
pos_data <- find_repeat_positions(sequences, repeat_sequence) %>%
mutate(seqname = factor(seqname, levels = levels(quant_data$seqname)))
pos_pl <- plot_repeat_positions(pos_data)
quant_pl <- plot_repeat_quantity(quant_data)
combo_pl <- ggpubr::ggarrange(pos_pl, quant_pl)
combo_pl <- ggpubr::annotate_figure(combo_pl, top = ggpubr::text_grob(
paste("Repeat", repeat_sequence, "in", clus_name, "cluster"),
paste('Repeat', repeat_sequence, 'in', clus_name, 'cluster'),
vjust = 0.5, x = 0.01, hjust = 0))
return(combo_pl)
}
\ No newline at end of file
}
#' Plot amplicon sequence variants
#'
#' This function plots the amplicon variants per sample using a provided distance clustering table. The plot can be customized by specifying the sequence type and whether hierarchical clustering should be performed. The function also creates a custom color scheme based on the viridis color palette.
#'
#' @param distclust_table A tibble containing the distance clustering table (default: mytibble)
#' @param sequence_type The type of sequence to consider (default: 'bp')
#' @param hierarchical_clustering Whether to perform hierarchical clustering (default: FALSE)
#'
#' @return A ggplot object representing the variants per sample plot
#'
#' @import ggplot2
#' @import dplyr
#' @importFrom cowplot plot_grid
#' @importFrom cowplot get_legend
#' @importFrom viridisLite viridis
#' @importFrom viridisLite viridisLite
#' @importFrom gridExtra unit
#' @importFrom gridExtra margin
#'
#' @examples
#' plot_variants_per_sample()
#'
#' @export
plot_variants_per_sample <- function(distclust_table = mytibble,
sequence_type = "bp",
sequence_type = 'bp',
hierarchical_clustering = FALSE){
# Make a custom color scheme that assigns viridis 0 (purple) to ref/major,
# Make a custom color scheme that assigns viridis 0 (purple) to ref/major,
# viridis 1 (yellow) to major/minor
# and a gradient between 0.7 and 0.3 to the rest.
custom_colors <- c(viridisLite::viridis(1, begin = 0),
viridisLite::viridis(1, begin = 1),
viridisLite::viridis(1, begin = 1),
viridisLite::viridis(
length(unique(distclust_table$seq_var))-2,
option = "D", begin = 0.7 , end = 0.3))
names(custom_colors) <- distclust_table$seq_var %>%
unique() %>%
length(unique(distclust_table$seq_var))-2,
option = 'D', begin = 0.7 , end = 0.3))
names(custom_colors) <- distclust_table$seq_var %>%
unique() %>%
sort()
if(hierarchical_clustering){
for_vegan <- distclust_table %>%
for_vegan <- distclust_table %>%
select(var_type, seqnames, rel_var_abundance) %>%
pivot_wider(id_cols = var_type, names_from = seqnames,
pivot_wider(id_cols = var_type, names_from = seqnames,
values_from = rel_var_abundance)
# rearrange x_axis variable according to order in hierarchical cluster
dat <- dendrogram_hclust(veganify_generic_wide_tbl(for_vegan), seed = 1)
distclust_table <- dplyr::mutate(
distclust_table, var_type = factor(var_type, levels = dat$labels$label))
}
length_measure <- ifelse(sequence_type != "aa", "seqs_lengths",
"longest_aa_seq")
length_measure <- ifelse(sequence_type != 'aa', 'seqs_lengths',
'longest_aa_seq')
legend_text_master <- function(){
element_text(size = 10, margin = margin(t = 0, r = 15,
b = 0, l = 0, unit = "pt"))
element_text(size = 10, margin = margin(t = 0, r = 15,
b = 0, l = 0, unit = 'pt'))
}
# plot
pl <- ggplot(distclust_table, aes(x = .data[[length_measure]],
pl <- ggplot(distclust_table, aes(x = .data[[length_measure]],
y = var_type, fill = seq_var,
alpha = rel_var_abundance)) +
geom_col(position = position_dodge2(padding = 0)) +
theme_classic() +
facet_grid(cols = vars(clus_name), scales = "free") +
facet_grid(cols = vars(clus_name), scales = 'free') +
scale_fill_manual(values = custom_colors) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
axis.text.y = element_text(hjust = 0),
strip.text.x.top = element_text(angle = 90, hjust = 0, vjust = 0.5),
strip.background = element_blank(),
axis.title.y = element_blank()) +
guides(fill = guide_legend(title = "Sequence variants",
guides(fill = guide_legend(title = 'Sequence variants',
title.theme = legend_text_master(),
label.theme = legend_text_master(),
keywidth = 0.5,
keyheight = 0.5),
alpha = guide_legend(title = "Prevalence",
alpha = guide_legend(title = 'Prevalence',
title.theme = legend_text_master(),
label.theme = legend_text_master(),
keywidth = 0.5,
keyheight = 0.5))
keyheight = 0.5))
if(hierarchical_clustering){
d_pl <- plot_dendrogram(distclust_table = distclust_table)
# extract a legend that is laid out horizontally
legend <- get_legend(
pl + theme(legend.position = "bottom")# +
#guides(fill = guide_legend(nrow = 6))
)
pl_p <- cowplot::plot_grid(d_pl, pl +
theme(legend.position = "none"),
align = "h", axis = "bt",
legend <- get_legend(pl + theme(legend.position = "bottom")# +
#guides(fill = guide_legend(nrow = 6))
)
pl_p <- cowplot::plot_grid(d_pl, pl +
theme(legend.position = "none"),
align = "h", axis = "bt",
rel_widths = c(1, 5), nrow = 1)
pl <- cowplot::plot_grid(pl_p, legend, ncol = 1,
rel_heights = c(1, .15)) +
pl <- cowplot::plot_grid(pl_p, legend, ncol = 1,
rel_heights = c(1, .15)) +
theme(plot.background = element_rect(fill = "white"))
}
return(pl)
}
\ No newline at end of file
}
quantify_repeats <- function(sequences = DNAStringSet,
repeat_sequence = "string") {
#' Quantify Repeats
#'
#' This function quantifies the number of repeats of a given sequence in a set of DNA sequences.
#'
#' @param sequences A DNAStringSet object containing the DNA sequences to be analyzed.
#' @param repeat_sequence A character string specifying the repeat sequence to be quantified.
#' @return A tibble with the following columns:
#' \itemize{
#' \item \code{seqname}: The name of the sequence.
#' \item \code{repeat_seq}: The specified repeat sequence.
#' \item \code{singlets}: The number of occurrences of the repeat sequence as singlets in each sequence.
#' \item \code{largest_repeat_contig}: The number of contiguous repeats of the repeat sequence in each sequence.
#' }
#' @export
quantify_repeats <- function(sequences = DNAStringSet, repeat_sequence = 'string') {
singlet_count <- str_count(as.character(sequences), repeat_sequence)
rep_count <- find_contiguous_multi_repeats(sequences, repeat_sequence,
singlet_count)
rep_count <- find_contiguous_multi_repeats(sequences, repeat_sequence, singlet_count)
out <- tibble(seqname = names(sequences), repeat_seq = repeat_sequence,
singlets = singlet_count, largest_repeat_contig = rep_count)
return(out)
}
\ No newline at end of file
}
#' Read and Write Cluster Abundance
#'
#' This function reads a sequence table, calculates the abundance of sequences belonging to each cluster, and writes the results to a CSV file.
#'
#' @param cluster_sequence_list A list of DNAStringSet objects representing the sequences assigned to each cluster.
#' @param reference_seqs A DNAStringSet object representing a set of reference sequences.
#' @param seqtab_nochim Name of the file containing the sequence table with no chimeric sequences. Default is 'seqtab_nochim.rds'.
#' @param outpath Path where the output files will be written. Default is 'path'.
#'
#' @return A list of data frames, with each data frame representing the abundance of sequences belonging to each cluster.
#'
#' @examples
#' \dontrun{
#' # Read and write cluster abundance
#' read_and_write_cluster_abundance(cluster_sequence_list, reference_seqs, seqtab_nochim = 'seqtab_nochim.rds', outpath = path)
#' }
#'
#' @export
read_and_write_cluster_abundance <- function(
cluster_sequence_list = DNAStringSetList,
reference_seqs = NULL,
seqtab_nochim = "seqtab_nochim.rds",
seqtab_nochim = 'seqtab_nochim.rds',
outpath = path) {
stab <- readRDS(seqtab_nochim) %>%
stab <- readRDS(seqtab_nochim) %>%
t() %>% as.data.frame() %>%
rownames_to_column(var = "seqs")
rownames_to_column(var = 'seqs')
per_cluster_abundance <- function(seqs_of_one_cluster = DNAStringSet){
seq_tbl <- tibble(seqs = as.data.frame(seqs_of_one_cluster)[[1]],
seq_tbl <- tibble(seqs = as.data.frame(seqs_of_one_cluster)[[1]],
ID = names(seqs_of_one_cluster))
named_stab <- left_join(seq_tbl, stab, by = "seqs")
named_stab <- left_join(seq_tbl, stab, by = 'seqs')
if(!is.null(reference_seqs)){
named_stab <- named_stab %>%
named_stab <- named_stab %>%
filter(!(ID %in% names(reference_seqs)))
}
stab_tbl <- named_stab %>%
select(-seqs) %>%
pivot_longer(cols = -ID, names_to = "Sample", values_to = "count")
pivot_longer(cols = -ID, names_to = 'Sample', values_to = 'count')
}
out <- lapply(cluster_sequence_list, per_cluster_abundance)
write_cluster_abundance_tbl <- function(clus_name){
stab_tbl <- out[[clus_name]]
if(nrow(stab_tbl > 0)) {
outpath_subdir <- file.path(outpath, "cluster_seqtabs")
outpath_subdir <- file.path(outpath, 'cluster_seqtabs')
dir.create(outpath_subdir, showWarnings = FALSE)
path_csv <- file.path(outpath_subdir,
paste0("seqtab_", clus_name, "_cluster.csv"))
paste0('seqtab_', clus_name, '_cluster.csv'))
stab_wide <- stab_tbl %>%
pivot_wider(names_from = Sample, values_from = count)
write_csv(stab_wide, file = path_csv)
cat("Abundance of ASVs belonging to the cluster", clus_name,
"written to: \n", path_csv, "\n")
cat('Abundance of ASVs belonging to the cluster', clus_name,
'written to: \n', path_csv, '\n')
} else {
warning(paste(
"No table written for cluster", clus_name,
"because no ASVs belong to this cluster"), call. = FALSE)
'No table written for cluster', clus_name,
'because no ASVs belong to this cluster'), call. = FALSE)
}
}
lapply(names(out), write_cluster_abundance_tbl)
return(out)
}
\ No newline at end of file
}
# parse similiarity matrix to extract similarity to reference
similiarity_to_reference <- function (seqs = DNAStringSet,
#' Parse similarity matrix to extract similarity to reference
#'
#' This function takes in a set of sequences and parses a similarity matrix to extract the similarity to a reference sequence. The resulting data is filtered to return only the similarity values for the reference sequence. The function supports parallel processing with multiple cores.
#'
#' @param seqs A DNAStringSet object containing the sequences.
#' @param ncores An integer specifying the number of cores to use for parallel processing. Defaults to 1.
#'
#' @export
#' @importFrom DNAtools alignment_based_distance_matrix
#' @importFrom dplyr as_tibble filter mutate pivot_longer select
#' @importFrom magrittr %>%
#' @importFrom glue if_else
#' @importFrom stringr str_c
#' @importFrom Biostrings DNAStringSet
similiarity_to_reference <- function (seqs = DNAStringSet,
ncores = 1) {
if(length(seqs) > 1){
out <- alignment_based_distance_matrix(seqs = seqs,
ncores = ncores) %>%
as_tibble(rownames = "query") %>%
pivot_longer(cols = -query, names_to = "seqnames",
values_to = "sim_to_ref") %>%
out <- alignment_based_distance_matrix(seqs = seqs,
ncores = ncores) %>%
as_tibble(rownames = "query") %>%
pivot_longer(cols = -query, names_to = "seqnames",
values_to = "sim_to_ref") %>%
mutate(seq_var = if_else(query == names(seqs)[1], "ref", "none")) %>%
filter(seq_var == "ref") %>%
select(-seq_var, -query)}
}
\ No newline at end of file
filter(seq_var == "ref") %>%
select(-seq_var, -query)
}
}
#' @rdname similiarity_to_reference
#' @keywords internal
#' Subset sequences by clusters
#'
#' This function subsets sequences based on cluster assignments provided in a cluster table. It can save the resulting sequences to files if specified.
#'
#' @param seqs A sequence object containing the sequences to be subsetted.
#' @param cluster_tbl A data frame or tibble containing the cluster assignments. It should have two columns, 'cluster' and 'seqnames', where 'cluster' contains the cluster numbers and 'seqnames' contains the corresponding sequence names.
#' @param save_to_file Logical value indicating whether to save the resulting sequences to separate files for each cluster.
#' @return A list of sequence objects, where each list element corresponds to a cluster and contains the sequences in that cluster
subset_by_clusters <- function(seqs, cluster_tbl, save_to_file = TRUE){
cluster_seqs <- cluster_tbl %>%
select(cluster, seqnames) %>%
select(cluster, seqnames) %>%
arrange(cluster) %>%
group_by(cluster) %>%
group_split(.keep = FALSE)
seqs_one_cluster <- function(cluster){
seqs[unname(unlist(cluster))]
}
out <- lapply(cluster_seqs, seqs_one_cluster)
names(out) <- map(lapply(out, names), 1) %>%
unlist() %>%
names(out) <- map(lapply(out, names), 1) %>%
unlist() %>%
unname ()
save_cluster <- function(clusnum){
clus_save <- file.path(
clus_save_dir, paste0(names(out)[clusnum], "_cluster.fasta"))
......@@ -24,4 +33,4 @@ subset_by_clusters <- function(seqs, cluster_tbl, save_to_file = TRUE){
lapply(seq_along(out), save_cluster)
}
return(out)
}
\ No newline at end of file
}
#' Subset a variant table by excluding specific clusters, variants, and samples
#'
#' This function allows you to subset a variant table by excluding specific clusters, variants, and samples.
#'
#' @param variant_classified_table A tibble containing the variant classified table. Default: mytibble.
#' @param exclude_clusters A character vector specifying the clusters to exclude. Default: c().
#' @param exclude_variants A character vector specifying the variants to exclude. Default: c().
#' @param exclude_samples A character vector specifying the samples to exclude. Default: c().
#'
#' @return A tibble containing the subsetted variant table.
#'
#' @import dplyr
#' @importFrom stats setdiff
#'
#' @examples
#' subset_variant_table(mytibble, c("cluster1", "cluster2"), c("variant1", "variant2"), c("sample1", "sample2"))
subset_variant_table <- function(variant_classified_table = mytibble,
exclude_clusters = c(),
exclude_variants = c(),
exclude_samples = c()){
variant_classified_table_sub <- variant_classified_table %>%
variant_classified_table_sub <- variant_classified_table %>%
filter(clus_name %in% setdiff(unique(variant_classified_table$clus_name),
exclude_clusters)) %>%
filter(var_type %in% setdiff(unique(variant_classified_table$var_type),
exclude_variants)) %>%
exclude_clusters)) %>%
filter(var_type %in% setdiff(unique(variant_classified_table$var_type),
exclude_variants)) %>%
filter(sample %in% setdiff(unique(variant_classified_table$sample),
exclude_samples))
}
\ No newline at end of file
}
# Iterate over threshold values
test_clustering_thresholds <- function(MyDNAstring, step_size,
#' Iterate over threshold values
#'
#' This function iterates over a range of threshold values and performs clustering
#' on a DNA string dataset.
#'
#' @param MyDNAstring A DNAStringSet object containing the DNA sequences to be clustered.
#' @param step_size The step size for the threshold values. Default is 0.01.
#' @param step_max The maximum threshold value. Default is 0.99.
#' @param ncores The number of cores to use for parallel processing. Default is 1.
#'
#' @return A list of clustering results, where each element in the list corresponds to a specific threshold value.
#'
#' @import DECIPHER
#' @import tibble
#' @import dplyr
#' @import tidyr
#'
#' @examples
#' # Create a DNAStringSet object
#' my_sequences <- DNAStringSet(c("ACGT", "AGCT", "ATGC"))
#'
#' # Run the clustering function
#' results <- test_clustering_thresholds(my_sequences, step_size = 0.1, step_max = 0.9, ncores = 2)
#' @export
test_clustering_thresholds <- function(MyDNAstring, step_size,
step_max = 0.99, ncores = 1) {
#DECIPHER removed the 'type' argument from IdClusters around v2.24. Currently (Feb23), the function is renamed "Clusterize"
if(numeric_version(packageVersion("DECIPHER")) < 2.24){
clus_tbl <- IdClusters(method = "inexact", myXStringSet = MyDNAstring,
......@@ -9,18 +32,18 @@ test_clustering_thresholds <- function(MyDNAstring, step_size,
verbose = FALSE)
}else{
clus_tbl <- Clusterize(MyDNAstring,
cutoff = seq(0, step_max, step_size),
cutoff = seq(0, step_max, step_size),
processors = ncores,
verbose = FALSE)
verbose = FALSE)
}
steps_temp <- seq(0, step_max, step_size)
clus_tbl_list <- as_tibble(clus_tbl, rownames = "seqnames") %>%
pivot_longer(cols = -seqnames, names_to = "cutoff", values_to = "cluster") %>%
group_by(cutoff) %>%
group_split(.keep = FALSE)
names(clus_tbl_list) <- steps_temp
return(clus_tbl_list)
}
\ No newline at end of file
}
# Find nucleotide distance to first stop (half-open reading frames)
#' Find nucleotide distance to first stop (half-open reading frames)
#'
#' This function takes a DNAStringSet as input and finds the nucleotide distance
#' to the first stop codon in each of the three reading frames.
#'
#' @param seqs A DNAStringSet object containing DNA sequences
#'
#' @return A data frame with the following columns:
#' \itemize{
#' \item \code{seqnames}: The name of the DNA sequence
#' \item \code{rframe}: The reading frame number (1, 2 or 3)
#' \item \code{seq}: The translated peptide sequence
#' \item \code{fos_width}: The nucleotide distance to the first stop codon in each reading frame
#' }
#'
#' @examples
#' seqs <- DNAStringSet("ATGTCGATAGCCTAGGTCAGTAA")
#' translate_and_count_stops(seqs)
#'
#' @import Biostrings
#' @import dplyr
#' @import stringr
#' @export
translate_and_count_stops <- function(seqs = DNAStringSet) {
# Make reading frames and translate to protein
asv_rfs <- lapply(1:3, function(pos)
asv_rfs <- lapply(1:3, function(pos)
subseq(c(seqs), start=pos))
asv_peps <- suppressWarnings(lapply(asv_rfs, translate))
peps_tbl <- lapply(asv_peps, as.data.frame) %>%
lapply(rownames_to_column, var = "seqnames") %>%
peps_tbl <- lapply(asv_peps, as.data.frame) %>%
lapply(rownames_to_column, var = "seqnames") %>%
bind_rows(.id = "rframe") %>%
dplyr::rename(seq = x) %>%
mutate(nostop = str_remove(seq, ".*\\*")) %>%
mutate(first_open_start = str_locate(nostop, "M")[, "start"]) %>%
mutate(first_open_start = str_locate(nostop, "M")[, "start"]) %>%
mutate(fos_width = (str_length(nostop) - first_open_start)*3) %>%
mutate(fos_width = replace_na(fos_width, 0)) %>%
select(-nostop, -first_open_start) %>%
......@@ -23,4 +45,4 @@ translate_and_count_stops <- function(seqs = DNAStringSet) {
((first_stop - 1)*3),
(first_stop * 3)))
return(peps_tbl)
}
\ No newline at end of file
}
#' Variant Classifier Function
#'
#' This function classifies variants based on relative abundance and similarity to a reference sequence.
#'
#' @param seqtab_file Path to a sequence table file (default: file.path(path, 'seqtab_nochim.rds'))
#' @param clustered_sequences A list of DNA sequences containing clustered ASVs (default: myDNAStringSetList)
#' @param reference_informed Logical value indicating whether the classification should be reference informed (default: FALSE)
#'
#' @return A modified master table with variant classifications
# Define a function called variant_classifier with two arguments: seqtab_file and clustered_sequences
variant_classifier <- function(
seqtab_file = file.path(path, 'seqtab_nochim.rds'),
clustered_sequences = myDNAStringSetList){
# Convert seqtab into long format and calculate relative abundance per sample for each ASV
seqtab_tbl_long <- clean_seqtab(seqtab_file, output = FALSE) %>%
pivot_longer(cols = where(is.integer), names_to = 'sample',
values_to = 'asv_counts') %>%
group_by(sample) %>%
mutate(sample_count_sum = sum(asv_counts)) %>%
ungroup() %>%
mutate(rel_abundance = asv_counts/sample_count_sum) %>%
seqtab_tbl_long <- clean_seqtab(seqtab_file, output = FALSE) %>%
pivot_longer(cols = where(is.integer), names_to = 'sample',
values_to = 'asv_counts') %>%
group_by(sample) %>%
mutate(sample_count_sum = sum(asv_counts)) %>%
ungroup() %>%
mutate(rel_abundance = asv_counts/sample_count_sum) %>%
filter(sample_count_sum > 0)
# Create a table for clustered sequences and filter out clusters with only one sequence
clustab_tbl <- cluster_tbl_named(clustered_sequences) %>%
left_join(tibble(seqnames = unlist(map(clustered_seqs, names)),
clustab_tbl <- cluster_tbl_named(clustered_sequences) %>%
left_join(tibble(seqnames = unlist(map(clustered_seqs, names)),
seqs = as.character(unlist(clustered_sequences))),
by = 'seqnames') %>%
filter(clus_size > 1)
filter(clus_size > 1)
# If reference informed, add reference info and name variants
if(reference_informed){
# Join the clustered sequences table with the long seqtab table by sequence name
master_tbl <- left_join(clustab_tbl, seqtab_tbl_long, by = 'seqs') %>%
master_tbl <- left_join(clustab_tbl, seqtab_tbl_long, by = 'seqs') %>%
filter(seqnames != clus_name)
# Calculate the similarity of each cluster to the reference sequence
sim_to_ref_tbl <- map(clustered_seqs, similiarity_to_reference,
ref = refstrings) %>%
discard(is_null) %>%
sim_to_ref_tbl <- map(clustered_seqs, similiarity_to_reference,
ref = refstrings) %>%
discard(is_null) %>%
bind_rows(.id = 'clus_name') %>%
unique()
# Join the master table with the reference similarity table
master_tbl <- left_join(master_tbl, sim_to_ref_tbl, by = c('clus_name',
'seqnames')) %>%
master_tbl <- left_join(master_tbl, sim_to_ref_tbl, by = c('clus_name',
'seqnames')) %>%
# Add sequence variants and major variant information to the table and group by cluster name
mutate(seq_var = if_else(sim_to_ref == 0, 'Reference sequence',
mutate(seq_var = if_else(sim_to_ref == 0, 'Reference sequence',
paste0('Sequence variant ', num-2))) %>%
mutate(nums_not_ref = if_else(sim_to_ref == 0, 99999, as.double(num))) %>%
group_by(clus_name) %>%
mutate(maj_var = min(nums_not_ref)) %>%
ungroup() %>%
mutate(seq_var = if_else(num == maj_var, 'Major variant', seq_var)) %>%
mutate(seq_var = factor(seq_var, levels = c('Reference sequence',
mutate(seq_var = factor(seq_var, levels = c('Reference sequence',
'Major variant',
paste0('Sequence variant ',
1:max(num))))) %>%
paste0('Sequence variant ',
1:max(num))))) %>%
select(-maj_var, -nums_not_ref)
}
# If not reference informed, join the clustered sequences table with the long seqtab table and add "x" to a new column sim_to_ref
if(!reference_informed){
master_tbl <- left_join(clustab_tbl, seqtab_tbl_long, by = 'seqs') %>%
master_tbl <- left_join(clustab_tbl, seqtab_tbl_long, by = 'seqs') %>%
# Add sequence variants and major variant information to the table and group by cluster name
mutate(seq_var = paste0('Sequence variant ', num-2)) %>%
group_by(clus_name) %>%
......@@ -61,53 +71,54 @@ variant_classifier <- function(
mutate(min_var = min(num)+1) %>%
ungroup() %>%
mutate(seq_var = if_else(num == maj_var, 'Major variant', seq_var)) %>%
mutate(seq_var = if_else(num == min_var, 'Minor variant', seq_var)) %>%
mutate(seq_var = if_else(num == min_var, 'Minor variant', seq_var)) %>%
mutate(seq_var = factor(seq_var, levels = c('Major variant',
'Minor variant',
paste0('Sequence variant ',
1:max(num))))) %>%
paste0('Sequence variant ',
1:max(num))))) %>%
select(-maj_var, -min_var)
}
# Call variant types using top 2 most abundant sequences per sample and cluster based on relative abundance
variant_types <- master_tbl %>%
filter(rel_abundance > 0) %>%
group_by(sample, clus_name) %>%
top_n(2, rel_abundance) %>%
ungroup() %>%
filter(rel_abundance > 0) %>%
group_by(sample, clus_name) %>%
top_n(2, rel_abundance) %>%
ungroup() %>%
select(sample, ASV) %>%
group_by(sample) %>%# Concatenate the top 2 most abundant sequences per sample into a single string - "profile"
mutate(profile = paste0(ASV, collapse = '')) %>%
mutate(profile = paste0(ASV, collapse = '')) %>%
# Remove duplicates
ungroup() %>%
select(-ASV) %>%
ungroup() %>%
select(-ASV) %>%
unique()
# Name each variant type (Variant_type1, Variant_type2, etc.)
var_type_names <- select(variant_types, -sample) %>%
unique() %>%
mutate(var_type = factor(paste0('Variant_type ', 1:nrow(.)),
levels = paste0('Variant_type ', 1:nrow(.)))) %>%
var_type_names <- select(variant_types, -sample) %>%
unique() %>%
mutate(var_type = factor(paste0('Variant_type ', 1:nrow(.)),
levels = paste0('Variant_type ', 1:nrow(.)))) %>%
# Join variant type names with variant profiles
left_join(variant_types, by = 'profile') %>%
left_join(variant_types, by = 'profile') %>%
select(-profile)
# Add variant type info to the master table and filter out sequences with zero relative abundance
master_tbl <- master_tbl %>%
left_join(var_type_names, by = 'sample') %>%
mutate(seqs_lengths = str_length(seqs)) %>%
master_tbl <- master_tbl %>%
left_join(var_type_names, by = 'sample') %>%
mutate(seqs_lengths = str_length(seqs)) %>%
group_by(sample, clus_name) %>%
mutate(cluster_sum_rel_abu = sum(rel_abundance)) %>%
mutate(rel_var_abundance = rel_abundance/cluster_sum_rel_abu) %>%
top_n(2, rel_abundance) %>%
ungroup() %>%
mutate(cluster_sum_rel_abu = sum(rel_abundance)) %>%
mutate(rel_var_abundance = rel_abundance/cluster_sum_rel_abu) %>%
top_n(2, rel_abundance) %>%
ungroup() %>%
filter(rel_abundance > 0)
aa_info <- find_longest_reading_frames(unlist(unname(clustered_sequences))) %>%
mutate(longest_aa_seq = max_width/3) %>%
aa_info <- find_longest_reading_frames(unlist(unname(clustered_sequences))) %>%
mutate(longest_aa_seq = max_width/3) %>%
select(seqnames, longest_aa_seq)
master_tbl <- left_join(master_tbl, aa_info, by = "seqnames")
}
\ No newline at end of file
return(master_tbl)
}
#' Veganify ASV Counts
#'
#' This function takes a cleaned sequence table and converts it to vegan format.
#'
#' @param cleaned_seqtab A cleaned sequence table.
#'
#' @return A vegan formatted count matrix.
#' @export
veganify_asvcounts <- function(cleaned_seqtab = my_cleaned_seqtab){
out <- cleaned_seqtab %>%
select(-seqs, -ASV) %>%
as.data.frame()
out <- cleaned_seqtab %>%
select(-seqs, -ASV) %>%
as.data.frame()
row.names(out) <- cleaned_seqtab[["ASV"]]
out <- out %>%
select(where( ~ is.numeric(.x) && sum(.x) != 0)) %>%
out <- out %>%
select(where( ~ is.numeric(.x) && sum(.x) != 0)) %>%
t()
return(out)
}
\ No newline at end of file
}
# Makes a vegdist compatible data.frame from a wide tibble that contains the
# rownames in the first column and input data for vegdist in all other columns
#' Makes a vegdist compatible data.frame from a wide tibble that contains the rownames in the first column and input data for vegdist in all other columns
#'
#' This function takes a wide tibble with rownames in the first column and input data for vegdist in all other columns, and converts it into a data.frame that is compatible with the vegdist function. It replaces any na values with 0 and sets the rownames of the resulting data.frame to the values in the first column of the input tibble.
#'
#' @param data A wide tibble with rownames in the first column and input data for vegdist in all other columns.
#'
#' @return A data.frame that is compatible with the vegdist function.
#'
#' @importFrom dplyr %>%
#' @importFrom dplyr replace
#' @importFrom stats as.data.frame
#'
#' @examples
#' library(tibble)
#'
#' # Create a wide tibble
#' my_wide_tibble <- tibble::tribble(
#' ~Name, ~Var1, ~Var2, ~Var3,
#' "A", 1, 2, 3,
#' "B", 4, 5, 6,
#' "C", 7, 8, 9
#' )
#'
#' # Apply the veganify_generic_wide_tbl function
#' result <- veganify_generic_wide_tbl(data = my_wide_tibble)
#' result
#'
#' @exports
veganify_generic_wide_tbl <- function(data = my_wide_tibble){
# Prepare data from the plotting table for vegdist/hclust clustering
for_vegan_df <- as.data.frame(data)[-1] %>%
for_vegan_df <- as.data.frame(data)[-1] %>%
replace(is.na(.), 0)
row.names(for_vegan_df) <- data[[1]]
return(for_vegan_df)
}
\ No newline at end of file
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment