From 2d3318b52ab5b8fdf0622e5bf866a78376bf0f1c Mon Sep 17 00:00:00 2001 From: Simeon <51403284+simeross@users.noreply.github.com> Date: Fri, 20 Oct 2023 14:13:07 +0200 Subject: [PATCH] roxygen tags --- R/plot_repeat_positions.R | 32 ++++--- R/plot_repeat_quantity.R | 31 +++++-- R/plot_repeats.R | 42 +++++---- R/plot_variants_per_sample.R | 95 ++++++++++++-------- R/quantify_repeats.R | 24 +++-- R/read_and_write_cluster_abundance.R | 60 ++++++++----- R/similiarity_to_reference.R | 38 +++++--- R/subset_by_clusters.R | 21 +++-- R/subset_variant_table.R | 29 ++++-- R/test_clustering_thresholds.R | 41 +++++++-- R/translate_and_count_stops.R | 38 ++++++-- R/variant_classifier.R | 129 +++++++++++++++------------ R/veganify_asvcounts.R | 20 +++-- R/veganify_generic_wide_tbl.R | 34 ++++++- 14 files changed, 428 insertions(+), 206 deletions(-) diff --git a/R/plot_repeat_positions.R b/R/plot_repeat_positions.R index 544347f..1c449d7 100644 --- a/R/plot_repeat_positions.R +++ b/R/plot_repeat_positions.R @@ -1,13 +1,25 @@ -plot_repeat_positions <- function(repeat_positions) { - legend_name <- "" - pl <- ggplot(repeat_positions, aes(x = start, xend = end, - y = seqname, yend = seqname, - color = fragment, size = plot_intensity)) + +#' Plot repeat positions +#' +#' This function plots repeat positions on a sequence. It takes a data frame +#' as input which should have columns 'start', 'end', 'seqname', 'fragment', +#' and 'plot_intensity'. It uses ggplot2 to create the plot. +#' +#' @param repeat_positions A data frame with columns 'start', 'end', 'seqname', +#' 'fragment', and 'plot_intensity'. +#' @return A ggplot object representing the repeat positions plot. +#' @import ggplot2 +#' @import viridis +#' @keywords plot +plot_repeat_positions <- function(repeat_positions){ + legend_name <- '' + pl <- ggplot(repeat_positions, aes(x = start, xend = end, + y = seqname, yend = seqname, + color = fragment, size = plot_intensity)) + geom_segment() + - guides(size = "none") + + guides(size = 'none') + scale_color_viridis_d(begin = 0.25, end = 0.8, name = legend_name) + - ylab("Sequence") + xlab("Sequence position (bp)") + - theme(legend.position = "top") - + ylab('Sequence') + xlab('Sequence position (bp)') + + theme(legend.position = 'top') + return(pl) -} \ No newline at end of file +} diff --git a/R/plot_repeat_quantity.R b/R/plot_repeat_quantity.R index a65b2b1..b38d49d 100644 --- a/R/plot_repeat_quantity.R +++ b/R/plot_repeat_quantity.R @@ -1,15 +1,30 @@ +#' Plot Repeat Quantity +#' +#' This function creates a plot to visualize the quantity of repeated sequences across different sequences. +#' +#' @param quantified_repeats A data frame containing information about quantified repeats. +#' +#' @return A ggplot object representing the plot. +#' +#' @export +#' +#' @examples +#' quantified_repeats <- data.frame(seqname = c('A', 'B', 'C'), +#' repeat_count = c(1, 2, 3), +#' count_type = c('Type1', 'Type2', 'Type3')) +#' plot_repeat_quantity(quantified_repeats) plot_repeat_quantity <- function(quantified_repeats) { - legend_name <- "" - + legend_name <- '' + pl <- ggplot(quantified_repeats, aes( - x = repeat_count, y = seqname, fill = count_type)) + + x = repeat_count, y = seqname, fill = count_type)) + geom_col(position = position_dodge()) + scale_fill_viridis_d(begin = 0.25, end = 0.8, name = legend_name) + - ylab("Sequence") + xlab("Repeat count") + - theme(legend.position = "top", + ylab('Sequence') + xlab('Repeat count') + + theme(legend.position = 'top', axis.title.y = element_blank(), axis.ticks.y = element_blank(), axis.text.y = element_blank()) - - return(pl) -} \ No newline at end of file + + return(pl) +} diff --git a/R/plot_repeats.R b/R/plot_repeats.R index dc4cc41..d986420 100644 --- a/R/plot_repeats.R +++ b/R/plot_repeats.R @@ -1,27 +1,35 @@ -plot_repeats <- function(sequences = DNAStringSet(), - repeat_sequence = "GATC"){ +#' Plot short sequence repeats +#' +#' This function generates a plot that shows the positions and quantities of repeated sequences in a given set of DNA sequences. +#' +#' @param sequences A DNAStringSet object containing the DNA sequences to be analyzed. Default is an empty DNAStringSet. +#' @param repeat_sequence A character vector specifying the repeat sequence to be searched. Default is 'GATC'. +#' +#' @return A ggplot object displaying both positions and quantities of repeated sequences. +#' @export +#' @name plot_repeats + +plot_repeats <- function(sequences = DNAStringSet(), + repeat_sequence = 'GATC') { clus_name <- names(sequences)[[1]] - # quantify repeats and + quant_data <- quantify_repeats(sequences, repeat_sequence) %>% - mutate(seqname = fct_reorder( - seqname, dplyr::desc(largest_repeat_contig))) %>% - pivot_longer(cols = singlets:largest_repeat_contig, - names_to = "count_type", values_to = "repeat_count") %>% - mutate(count_type = str_replace_all(count_type, "singlets", - "single instances"), - count_type = str_replace_all(count_type, "largest_repeat_contig", - "largest contiguous instance")) - + mutate(seqname = fct_reorder(seqname, dplyr::desc(largest_repeat_contig))) %>% + pivot_longer(cols = singlets:largest_repeat_contig, + names_to = 'count_type', values_to = 'repeat_count') %>% + mutate(count_type = str_replace_all(count_type, 'singlets', 'single instances'), + count_type = str_replace_all(count_type, 'largest_repeat_contig', 'largest contiguous instance')) + pos_data <- find_repeat_positions(sequences, repeat_sequence) %>% mutate(seqname = factor(seqname, levels = levels(quant_data$seqname))) - + pos_pl <- plot_repeat_positions(pos_data) quant_pl <- plot_repeat_quantity(quant_data) - + combo_pl <- ggpubr::ggarrange(pos_pl, quant_pl) combo_pl <- ggpubr::annotate_figure(combo_pl, top = ggpubr::text_grob( - paste("Repeat", repeat_sequence, "in", clus_name, "cluster"), + paste('Repeat', repeat_sequence, 'in', clus_name, 'cluster'), vjust = 0.5, x = 0.01, hjust = 0)) - + return(combo_pl) -} \ No newline at end of file +} diff --git a/R/plot_variants_per_sample.R b/R/plot_variants_per_sample.R index d161dfa..3dc413e 100644 --- a/R/plot_variants_per_sample.R +++ b/R/plot_variants_per_sample.R @@ -1,78 +1,101 @@ +#' Plot amplicon sequence variants +#' +#' This function plots the amplicon variants per sample using a provided distance clustering table. The plot can be customized by specifying the sequence type and whether hierarchical clustering should be performed. The function also creates a custom color scheme based on the viridis color palette. +#' +#' @param distclust_table A tibble containing the distance clustering table (default: mytibble) +#' @param sequence_type The type of sequence to consider (default: 'bp') +#' @param hierarchical_clustering Whether to perform hierarchical clustering (default: FALSE) +#' +#' @return A ggplot object representing the variants per sample plot +#' +#' @import ggplot2 +#' @import dplyr +#' @importFrom cowplot plot_grid +#' @importFrom cowplot get_legend +#' @importFrom viridisLite viridis +#' @importFrom viridisLite viridisLite +#' @importFrom gridExtra unit +#' @importFrom gridExtra margin +#' +#' @examples +#' plot_variants_per_sample() +#' +#' @export plot_variants_per_sample <- function(distclust_table = mytibble, - sequence_type = "bp", + sequence_type = 'bp', hierarchical_clustering = FALSE){ - # Make a custom color scheme that assigns viridis 0 (purple) to ref/major, + # Make a custom color scheme that assigns viridis 0 (purple) to ref/major, # viridis 1 (yellow) to major/minor # and a gradient between 0.7 and 0.3 to the rest. custom_colors <- c(viridisLite::viridis(1, begin = 0), - viridisLite::viridis(1, begin = 1), + viridisLite::viridis(1, begin = 1), viridisLite::viridis( - length(unique(distclust_table$seq_var))-2, - option = "D", begin = 0.7 , end = 0.3)) - - names(custom_colors) <- distclust_table$seq_var %>% - unique() %>% + length(unique(distclust_table$seq_var))-2, + option = 'D', begin = 0.7 , end = 0.3)) + + names(custom_colors) <- distclust_table$seq_var %>% + unique() %>% sort() if(hierarchical_clustering){ - for_vegan <- distclust_table %>% + for_vegan <- distclust_table %>% select(var_type, seqnames, rel_var_abundance) %>% - pivot_wider(id_cols = var_type, names_from = seqnames, + pivot_wider(id_cols = var_type, names_from = seqnames, values_from = rel_var_abundance) - + # rearrange x_axis variable according to order in hierarchical cluster dat <- dendrogram_hclust(veganify_generic_wide_tbl(for_vegan), seed = 1) - + distclust_table <- dplyr::mutate( distclust_table, var_type = factor(var_type, levels = dat$labels$label)) } - length_measure <- ifelse(sequence_type != "aa", "seqs_lengths", - "longest_aa_seq") - + length_measure <- ifelse(sequence_type != 'aa', 'seqs_lengths', + 'longest_aa_seq') + legend_text_master <- function(){ - element_text(size = 10, margin = margin(t = 0, r = 15, - b = 0, l = 0, unit = "pt")) + element_text(size = 10, margin = margin(t = 0, r = 15, + b = 0, l = 0, unit = 'pt')) } - + # plot - pl <- ggplot(distclust_table, aes(x = .data[[length_measure]], + pl <- ggplot(distclust_table, aes(x = .data[[length_measure]], y = var_type, fill = seq_var, alpha = rel_var_abundance)) + geom_col(position = position_dodge2(padding = 0)) + theme_classic() + - facet_grid(cols = vars(clus_name), scales = "free") + + facet_grid(cols = vars(clus_name), scales = 'free') + scale_fill_manual(values = custom_colors) + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), axis.text.y = element_text(hjust = 0), strip.text.x.top = element_text(angle = 90, hjust = 0, vjust = 0.5), strip.background = element_blank(), axis.title.y = element_blank()) + - guides(fill = guide_legend(title = "Sequence variants", + guides(fill = guide_legend(title = 'Sequence variants', title.theme = legend_text_master(), label.theme = legend_text_master(), keywidth = 0.5, keyheight = 0.5), - alpha = guide_legend(title = "Prevalence", + alpha = guide_legend(title = 'Prevalence', title.theme = legend_text_master(), label.theme = legend_text_master(), keywidth = 0.5, - keyheight = 0.5)) - - + keyheight = 0.5)) + + if(hierarchical_clustering){ d_pl <- plot_dendrogram(distclust_table = distclust_table) # extract a legend that is laid out horizontally - legend <- get_legend( - pl + theme(legend.position = "bottom")# + - #guides(fill = guide_legend(nrow = 6)) - ) - - pl_p <- cowplot::plot_grid(d_pl, pl + - theme(legend.position = "none"), - align = "h", axis = "bt", + legend <- get_legend(pl + theme(legend.position = "bottom")# + + #guides(fill = guide_legend(nrow = 6)) + ) + + pl_p <- cowplot::plot_grid(d_pl, pl + + theme(legend.position = "none"), + align = "h", axis = "bt", rel_widths = c(1, 5), nrow = 1) - pl <- cowplot::plot_grid(pl_p, legend, ncol = 1, - rel_heights = c(1, .15)) + + pl <- cowplot::plot_grid(pl_p, legend, ncol = 1, + rel_heights = c(1, .15)) + theme(plot.background = element_rect(fill = "white")) } + return(pl) -} \ No newline at end of file +} diff --git a/R/quantify_repeats.R b/R/quantify_repeats.R index 37749b7..c7e854a 100644 --- a/R/quantify_repeats.R +++ b/R/quantify_repeats.R @@ -1,10 +1,22 @@ -quantify_repeats <- function(sequences = DNAStringSet, - repeat_sequence = "string") { +#' Quantify Repeats +#' +#' This function quantifies the number of repeats of a given sequence in a set of DNA sequences. +#' +#' @param sequences A DNAStringSet object containing the DNA sequences to be analyzed. +#' @param repeat_sequence A character string specifying the repeat sequence to be quantified. +#' @return A tibble with the following columns: +#' \itemize{ +#' \item \code{seqname}: The name of the sequence. +#' \item \code{repeat_seq}: The specified repeat sequence. +#' \item \code{singlets}: The number of occurrences of the repeat sequence as singlets in each sequence. +#' \item \code{largest_repeat_contig}: The number of contiguous repeats of the repeat sequence in each sequence. +#' } +#' @export +quantify_repeats <- function(sequences = DNAStringSet, repeat_sequence = 'string') { singlet_count <- str_count(as.character(sequences), repeat_sequence) - rep_count <- find_contiguous_multi_repeats(sequences, repeat_sequence, - singlet_count) + rep_count <- find_contiguous_multi_repeats(sequences, repeat_sequence, singlet_count) out <- tibble(seqname = names(sequences), repeat_seq = repeat_sequence, singlets = singlet_count, largest_repeat_contig = rep_count) - + return(out) -} \ No newline at end of file +} diff --git a/R/read_and_write_cluster_abundance.R b/R/read_and_write_cluster_abundance.R index 0b98c0b..8ac0b24 100644 --- a/R/read_and_write_cluster_abundance.R +++ b/R/read_and_write_cluster_abundance.R @@ -1,49 +1,67 @@ +#' Read and Write Cluster Abundance +#' +#' This function reads a sequence table, calculates the abundance of sequences belonging to each cluster, and writes the results to a CSV file. +#' +#' @param cluster_sequence_list A list of DNAStringSet objects representing the sequences assigned to each cluster. +#' @param reference_seqs A DNAStringSet object representing a set of reference sequences. +#' @param seqtab_nochim Name of the file containing the sequence table with no chimeric sequences. Default is 'seqtab_nochim.rds'. +#' @param outpath Path where the output files will be written. Default is 'path'. +#' +#' @return A list of data frames, with each data frame representing the abundance of sequences belonging to each cluster. +#' +#' @examples +#' \dontrun{ +#' # Read and write cluster abundance +#' read_and_write_cluster_abundance(cluster_sequence_list, reference_seqs, seqtab_nochim = 'seqtab_nochim.rds', outpath = path) +#' } +#' +#' @export read_and_write_cluster_abundance <- function( cluster_sequence_list = DNAStringSetList, reference_seqs = NULL, - seqtab_nochim = "seqtab_nochim.rds", + seqtab_nochim = 'seqtab_nochim.rds', outpath = path) { - - stab <- readRDS(seqtab_nochim) %>% + + stab <- readRDS(seqtab_nochim) %>% t() %>% as.data.frame() %>% - rownames_to_column(var = "seqs") - + rownames_to_column(var = 'seqs') + per_cluster_abundance <- function(seqs_of_one_cluster = DNAStringSet){ - seq_tbl <- tibble(seqs = as.data.frame(seqs_of_one_cluster)[[1]], + seq_tbl <- tibble(seqs = as.data.frame(seqs_of_one_cluster)[[1]], ID = names(seqs_of_one_cluster)) - named_stab <- left_join(seq_tbl, stab, by = "seqs") - + named_stab <- left_join(seq_tbl, stab, by = 'seqs') + if(!is.null(reference_seqs)){ - named_stab <- named_stab %>% + named_stab <- named_stab %>% filter(!(ID %in% names(reference_seqs))) } - + stab_tbl <- named_stab %>% select(-seqs) %>% - pivot_longer(cols = -ID, names_to = "Sample", values_to = "count") + pivot_longer(cols = -ID, names_to = 'Sample', values_to = 'count') } - + out <- lapply(cluster_sequence_list, per_cluster_abundance) - + write_cluster_abundance_tbl <- function(clus_name){ stab_tbl <- out[[clus_name]] - + if(nrow(stab_tbl > 0)) { - outpath_subdir <- file.path(outpath, "cluster_seqtabs") + outpath_subdir <- file.path(outpath, 'cluster_seqtabs') dir.create(outpath_subdir, showWarnings = FALSE) path_csv <- file.path(outpath_subdir, - paste0("seqtab_", clus_name, "_cluster.csv")) + paste0('seqtab_', clus_name, '_cluster.csv')) stab_wide <- stab_tbl %>% pivot_wider(names_from = Sample, values_from = count) write_csv(stab_wide, file = path_csv) - cat("Abundance of ASVs belonging to the cluster", clus_name, - "written to: \n", path_csv, "\n") + cat('Abundance of ASVs belonging to the cluster', clus_name, + 'written to: \n', path_csv, '\n') } else { warning(paste( - "No table written for cluster", clus_name, - "because no ASVs belong to this cluster"), call. = FALSE) + 'No table written for cluster', clus_name, + 'because no ASVs belong to this cluster'), call. = FALSE) } } lapply(names(out), write_cluster_abundance_tbl) return(out) -} \ No newline at end of file +} diff --git a/R/similiarity_to_reference.R b/R/similiarity_to_reference.R index d9fbf33..f6af472 100644 --- a/R/similiarity_to_reference.R +++ b/R/similiarity_to_reference.R @@ -1,13 +1,31 @@ -# parse similiarity matrix to extract similarity to reference -similiarity_to_reference <- function (seqs = DNAStringSet, +#' Parse similarity matrix to extract similarity to reference +#' +#' This function takes in a set of sequences and parses a similarity matrix to extract the similarity to a reference sequence. The resulting data is filtered to return only the similarity values for the reference sequence. The function supports parallel processing with multiple cores. +#' +#' @param seqs A DNAStringSet object containing the sequences. +#' @param ncores An integer specifying the number of cores to use for parallel processing. Defaults to 1. +#' +#' @export +#' @importFrom DNAtools alignment_based_distance_matrix +#' @importFrom dplyr as_tibble filter mutate pivot_longer select +#' @importFrom magrittr %>% +#' @importFrom glue if_else +#' @importFrom stringr str_c +#' @importFrom Biostrings DNAStringSet + +similiarity_to_reference <- function (seqs = DNAStringSet, ncores = 1) { if(length(seqs) > 1){ - out <- alignment_based_distance_matrix(seqs = seqs, - ncores = ncores) %>% - as_tibble(rownames = "query") %>% - pivot_longer(cols = -query, names_to = "seqnames", - values_to = "sim_to_ref") %>% + out <- alignment_based_distance_matrix(seqs = seqs, + ncores = ncores) %>% + as_tibble(rownames = "query") %>% + pivot_longer(cols = -query, names_to = "seqnames", + values_to = "sim_to_ref") %>% mutate(seq_var = if_else(query == names(seqs)[1], "ref", "none")) %>% - filter(seq_var == "ref") %>% - select(-seq_var, -query)} -} \ No newline at end of file + filter(seq_var == "ref") %>% + select(-seq_var, -query) + } +} + +#' @rdname similiarity_to_reference +#' @keywords internal diff --git a/R/subset_by_clusters.R b/R/subset_by_clusters.R index 4cf923b..605cddf 100644 --- a/R/subset_by_clusters.R +++ b/R/subset_by_clusters.R @@ -1,18 +1,27 @@ +#' Subset sequences by clusters +#' +#' This function subsets sequences based on cluster assignments provided in a cluster table. It can save the resulting sequences to files if specified. +#' +#' @param seqs A sequence object containing the sequences to be subsetted. +#' @param cluster_tbl A data frame or tibble containing the cluster assignments. It should have two columns, 'cluster' and 'seqnames', where 'cluster' contains the cluster numbers and 'seqnames' contains the corresponding sequence names. +#' @param save_to_file Logical value indicating whether to save the resulting sequences to separate files for each cluster. +#' @return A list of sequence objects, where each list element corresponds to a cluster and contains the sequences in that cluster + subset_by_clusters <- function(seqs, cluster_tbl, save_to_file = TRUE){ cluster_seqs <- cluster_tbl %>% - select(cluster, seqnames) %>% + select(cluster, seqnames) %>% arrange(cluster) %>% group_by(cluster) %>% group_split(.keep = FALSE) - + seqs_one_cluster <- function(cluster){ seqs[unname(unlist(cluster))] } out <- lapply(cluster_seqs, seqs_one_cluster) - names(out) <- map(lapply(out, names), 1) %>% - unlist() %>% + names(out) <- map(lapply(out, names), 1) %>% + unlist() %>% unname () - + save_cluster <- function(clusnum){ clus_save <- file.path( clus_save_dir, paste0(names(out)[clusnum], "_cluster.fasta")) @@ -24,4 +33,4 @@ subset_by_clusters <- function(seqs, cluster_tbl, save_to_file = TRUE){ lapply(seq_along(out), save_cluster) } return(out) -} \ No newline at end of file +} diff --git a/R/subset_variant_table.R b/R/subset_variant_table.R index dd234d0..32328c7 100644 --- a/R/subset_variant_table.R +++ b/R/subset_variant_table.R @@ -1,13 +1,30 @@ +#' Subset a variant table by excluding specific clusters, variants, and samples +#' +#' This function allows you to subset a variant table by excluding specific clusters, variants, and samples. +#' +#' @param variant_classified_table A tibble containing the variant classified table. Default: mytibble. +#' @param exclude_clusters A character vector specifying the clusters to exclude. Default: c(). +#' @param exclude_variants A character vector specifying the variants to exclude. Default: c(). +#' @param exclude_samples A character vector specifying the samples to exclude. Default: c(). +#' +#' @return A tibble containing the subsetted variant table. +#' +#' @import dplyr +#' @importFrom stats setdiff +#' +#' @examples +#' subset_variant_table(mytibble, c("cluster1", "cluster2"), c("variant1", "variant2"), c("sample1", "sample2")) + subset_variant_table <- function(variant_classified_table = mytibble, exclude_clusters = c(), exclude_variants = c(), exclude_samples = c()){ - - variant_classified_table_sub <- variant_classified_table %>% + + variant_classified_table_sub <- variant_classified_table %>% filter(clus_name %in% setdiff(unique(variant_classified_table$clus_name), - exclude_clusters)) %>% - filter(var_type %in% setdiff(unique(variant_classified_table$var_type), - exclude_variants)) %>% + exclude_clusters)) %>% + filter(var_type %in% setdiff(unique(variant_classified_table$var_type), + exclude_variants)) %>% filter(sample %in% setdiff(unique(variant_classified_table$sample), exclude_samples)) -} \ No newline at end of file +} diff --git a/R/test_clustering_thresholds.R b/R/test_clustering_thresholds.R index 74014fc..1e56a2f 100644 --- a/R/test_clustering_thresholds.R +++ b/R/test_clustering_thresholds.R @@ -1,7 +1,30 @@ -# Iterate over threshold values -test_clustering_thresholds <- function(MyDNAstring, step_size, +#' Iterate over threshold values +#' +#' This function iterates over a range of threshold values and performs clustering +#' on a DNA string dataset. +#' +#' @param MyDNAstring A DNAStringSet object containing the DNA sequences to be clustered. +#' @param step_size The step size for the threshold values. Default is 0.01. +#' @param step_max The maximum threshold value. Default is 0.99. +#' @param ncores The number of cores to use for parallel processing. Default is 1. +#' +#' @return A list of clustering results, where each element in the list corresponds to a specific threshold value. +#' +#' @import DECIPHER +#' @import tibble +#' @import dplyr +#' @import tidyr +#' +#' @examples +#' # Create a DNAStringSet object +#' my_sequences <- DNAStringSet(c("ACGT", "AGCT", "ATGC")) +#' +#' # Run the clustering function +#' results <- test_clustering_thresholds(my_sequences, step_size = 0.1, step_max = 0.9, ncores = 2) +#' @export +test_clustering_thresholds <- function(MyDNAstring, step_size, step_max = 0.99, ncores = 1) { - + #DECIPHER removed the 'type' argument from IdClusters around v2.24. Currently (Feb23), the function is renamed "Clusterize" if(numeric_version(packageVersion("DECIPHER")) < 2.24){ clus_tbl <- IdClusters(method = "inexact", myXStringSet = MyDNAstring, @@ -9,18 +32,18 @@ test_clustering_thresholds <- function(MyDNAstring, step_size, verbose = FALSE) }else{ clus_tbl <- Clusterize(MyDNAstring, - cutoff = seq(0, step_max, step_size), + cutoff = seq(0, step_max, step_size), processors = ncores, - verbose = FALSE) + verbose = FALSE) } - + steps_temp <- seq(0, step_max, step_size) clus_tbl_list <- as_tibble(clus_tbl, rownames = "seqnames") %>% pivot_longer(cols = -seqnames, names_to = "cutoff", values_to = "cluster") %>% group_by(cutoff) %>% group_split(.keep = FALSE) - - + + names(clus_tbl_list) <- steps_temp return(clus_tbl_list) -} \ No newline at end of file +} diff --git a/R/translate_and_count_stops.R b/R/translate_and_count_stops.R index 12bba83..9380240 100644 --- a/R/translate_and_count_stops.R +++ b/R/translate_and_count_stops.R @@ -1,17 +1,39 @@ -# Find nucleotide distance to first stop (half-open reading frames) +#' Find nucleotide distance to first stop (half-open reading frames) +#' +#' This function takes a DNAStringSet as input and finds the nucleotide distance +#' to the first stop codon in each of the three reading frames. +#' +#' @param seqs A DNAStringSet object containing DNA sequences +#' +#' @return A data frame with the following columns: +#' \itemize{ +#' \item \code{seqnames}: The name of the DNA sequence +#' \item \code{rframe}: The reading frame number (1, 2 or 3) +#' \item \code{seq}: The translated peptide sequence +#' \item \code{fos_width}: The nucleotide distance to the first stop codon in each reading frame +#' } +#' +#' @examples +#' seqs <- DNAStringSet("ATGTCGATAGCCTAGGTCAGTAA") +#' translate_and_count_stops(seqs) +#' +#' @import Biostrings +#' @import dplyr +#' @import stringr +#' @export translate_and_count_stops <- function(seqs = DNAStringSet) { # Make reading frames and translate to protein - asv_rfs <- lapply(1:3, function(pos) + asv_rfs <- lapply(1:3, function(pos) subseq(c(seqs), start=pos)) - + asv_peps <- suppressWarnings(lapply(asv_rfs, translate)) - - peps_tbl <- lapply(asv_peps, as.data.frame) %>% - lapply(rownames_to_column, var = "seqnames") %>% + + peps_tbl <- lapply(asv_peps, as.data.frame) %>% + lapply(rownames_to_column, var = "seqnames") %>% bind_rows(.id = "rframe") %>% dplyr::rename(seq = x) %>% mutate(nostop = str_remove(seq, ".*\\*")) %>% - mutate(first_open_start = str_locate(nostop, "M")[, "start"]) %>% + mutate(first_open_start = str_locate(nostop, "M")[, "start"]) %>% mutate(fos_width = (str_length(nostop) - first_open_start)*3) %>% mutate(fos_width = replace_na(fos_width, 0)) %>% select(-nostop, -first_open_start) %>% @@ -23,4 +45,4 @@ translate_and_count_stops <- function(seqs = DNAStringSet) { ((first_stop - 1)*3), (first_stop * 3))) return(peps_tbl) -} \ No newline at end of file +} diff --git a/R/variant_classifier.R b/R/variant_classifier.R index b64342d..e4bdd9e 100644 --- a/R/variant_classifier.R +++ b/R/variant_classifier.R @@ -1,59 +1,69 @@ +#' Variant Classifier Function +#' +#' This function classifies variants based on relative abundance and similarity to a reference sequence. +#' +#' @param seqtab_file Path to a sequence table file (default: file.path(path, 'seqtab_nochim.rds')) +#' @param clustered_sequences A list of DNA sequences containing clustered ASVs (default: myDNAStringSetList) +#' @param reference_informed Logical value indicating whether the classification should be reference informed (default: FALSE) +#' +#' @return A modified master table with variant classifications + # Define a function called variant_classifier with two arguments: seqtab_file and clustered_sequences variant_classifier <- function( seqtab_file = file.path(path, 'seqtab_nochim.rds'), clustered_sequences = myDNAStringSetList){ - + # Convert seqtab into long format and calculate relative abundance per sample for each ASV - seqtab_tbl_long <- clean_seqtab(seqtab_file, output = FALSE) %>% - pivot_longer(cols = where(is.integer), names_to = 'sample', - values_to = 'asv_counts') %>% - group_by(sample) %>% - mutate(sample_count_sum = sum(asv_counts)) %>% - ungroup() %>% - mutate(rel_abundance = asv_counts/sample_count_sum) %>% + seqtab_tbl_long <- clean_seqtab(seqtab_file, output = FALSE) %>% + pivot_longer(cols = where(is.integer), names_to = 'sample', + values_to = 'asv_counts') %>% + group_by(sample) %>% + mutate(sample_count_sum = sum(asv_counts)) %>% + ungroup() %>% + mutate(rel_abundance = asv_counts/sample_count_sum) %>% filter(sample_count_sum > 0) - + # Create a table for clustered sequences and filter out clusters with only one sequence - clustab_tbl <- cluster_tbl_named(clustered_sequences) %>% - left_join(tibble(seqnames = unlist(map(clustered_seqs, names)), + clustab_tbl <- cluster_tbl_named(clustered_sequences) %>% + left_join(tibble(seqnames = unlist(map(clustered_seqs, names)), seqs = as.character(unlist(clustered_sequences))), by = 'seqnames') %>% - filter(clus_size > 1) - + filter(clus_size > 1) + # If reference informed, add reference info and name variants if(reference_informed){ # Join the clustered sequences table with the long seqtab table by sequence name - master_tbl <- left_join(clustab_tbl, seqtab_tbl_long, by = 'seqs') %>% + master_tbl <- left_join(clustab_tbl, seqtab_tbl_long, by = 'seqs') %>% filter(seqnames != clus_name) - + # Calculate the similarity of each cluster to the reference sequence - sim_to_ref_tbl <- map(clustered_seqs, similiarity_to_reference, - ref = refstrings) %>% - discard(is_null) %>% + sim_to_ref_tbl <- map(clustered_seqs, similiarity_to_reference, + ref = refstrings) %>% + discard(is_null) %>% bind_rows(.id = 'clus_name') %>% unique() - + # Join the master table with the reference similarity table - master_tbl <- left_join(master_tbl, sim_to_ref_tbl, by = c('clus_name', - 'seqnames')) %>% + master_tbl <- left_join(master_tbl, sim_to_ref_tbl, by = c('clus_name', + 'seqnames')) %>% # Add sequence variants and major variant information to the table and group by cluster name - mutate(seq_var = if_else(sim_to_ref == 0, 'Reference sequence', + mutate(seq_var = if_else(sim_to_ref == 0, 'Reference sequence', paste0('Sequence variant ', num-2))) %>% mutate(nums_not_ref = if_else(sim_to_ref == 0, 99999, as.double(num))) %>% group_by(clus_name) %>% mutate(maj_var = min(nums_not_ref)) %>% ungroup() %>% mutate(seq_var = if_else(num == maj_var, 'Major variant', seq_var)) %>% - mutate(seq_var = factor(seq_var, levels = c('Reference sequence', + mutate(seq_var = factor(seq_var, levels = c('Reference sequence', 'Major variant', - paste0('Sequence variant ', - 1:max(num))))) %>% + paste0('Sequence variant ', + 1:max(num))))) %>% select(-maj_var, -nums_not_ref) } - + # If not reference informed, join the clustered sequences table with the long seqtab table and add "x" to a new column sim_to_ref if(!reference_informed){ - master_tbl <- left_join(clustab_tbl, seqtab_tbl_long, by = 'seqs') %>% + master_tbl <- left_join(clustab_tbl, seqtab_tbl_long, by = 'seqs') %>% # Add sequence variants and major variant information to the table and group by cluster name mutate(seq_var = paste0('Sequence variant ', num-2)) %>% group_by(clus_name) %>% @@ -61,53 +71,54 @@ variant_classifier <- function( mutate(min_var = min(num)+1) %>% ungroup() %>% mutate(seq_var = if_else(num == maj_var, 'Major variant', seq_var)) %>% - mutate(seq_var = if_else(num == min_var, 'Minor variant', seq_var)) %>% + mutate(seq_var = if_else(num == min_var, 'Minor variant', seq_var)) %>% mutate(seq_var = factor(seq_var, levels = c('Major variant', 'Minor variant', - paste0('Sequence variant ', - 1:max(num))))) %>% + paste0('Sequence variant ', + 1:max(num))))) %>% select(-maj_var, -min_var) } - - + + # Call variant types using top 2 most abundant sequences per sample and cluster based on relative abundance variant_types <- master_tbl %>% - filter(rel_abundance > 0) %>% - group_by(sample, clus_name) %>% - top_n(2, rel_abundance) %>% - ungroup() %>% + filter(rel_abundance > 0) %>% + group_by(sample, clus_name) %>% + top_n(2, rel_abundance) %>% + ungroup() %>% select(sample, ASV) %>% group_by(sample) %>%# Concatenate the top 2 most abundant sequences per sample into a single string - "profile" - mutate(profile = paste0(ASV, collapse = '')) %>% - + mutate(profile = paste0(ASV, collapse = '')) %>% + # Remove duplicates - ungroup() %>% - select(-ASV) %>% + ungroup() %>% + select(-ASV) %>% unique() - + # Name each variant type (Variant_type1, Variant_type2, etc.) - var_type_names <- select(variant_types, -sample) %>% - unique() %>% - mutate(var_type = factor(paste0('Variant_type ', 1:nrow(.)), - levels = paste0('Variant_type ', 1:nrow(.)))) %>% + var_type_names <- select(variant_types, -sample) %>% + unique() %>% + mutate(var_type = factor(paste0('Variant_type ', 1:nrow(.)), + levels = paste0('Variant_type ', 1:nrow(.)))) %>% # Join variant type names with variant profiles - left_join(variant_types, by = 'profile') %>% + left_join(variant_types, by = 'profile') %>% select(-profile) - + # Add variant type info to the master table and filter out sequences with zero relative abundance - master_tbl <- master_tbl %>% - left_join(var_type_names, by = 'sample') %>% - mutate(seqs_lengths = str_length(seqs)) %>% + master_tbl <- master_tbl %>% + left_join(var_type_names, by = 'sample') %>% + mutate(seqs_lengths = str_length(seqs)) %>% group_by(sample, clus_name) %>% - mutate(cluster_sum_rel_abu = sum(rel_abundance)) %>% - mutate(rel_var_abundance = rel_abundance/cluster_sum_rel_abu) %>% - top_n(2, rel_abundance) %>% - ungroup() %>% + mutate(cluster_sum_rel_abu = sum(rel_abundance)) %>% + mutate(rel_var_abundance = rel_abundance/cluster_sum_rel_abu) %>% + top_n(2, rel_abundance) %>% + ungroup() %>% filter(rel_abundance > 0) - - aa_info <- find_longest_reading_frames(unlist(unname(clustered_sequences))) %>% - mutate(longest_aa_seq = max_width/3) %>% + + aa_info <- find_longest_reading_frames(unlist(unname(clustered_sequences))) %>% + mutate(longest_aa_seq = max_width/3) %>% select(seqnames, longest_aa_seq) - + master_tbl <- left_join(master_tbl, aa_info, by = "seqnames") -} \ No newline at end of file + return(master_tbl) +} diff --git a/R/veganify_asvcounts.R b/R/veganify_asvcounts.R index 69388dd..2070bd4 100644 --- a/R/veganify_asvcounts.R +++ b/R/veganify_asvcounts.R @@ -1,10 +1,18 @@ +#' Veganify ASV Counts +#' +#' This function takes a cleaned sequence table and converts it to vegan format. +#' +#' @param cleaned_seqtab A cleaned sequence table. +#' +#' @return A vegan formatted count matrix. +#' @export veganify_asvcounts <- function(cleaned_seqtab = my_cleaned_seqtab){ - out <- cleaned_seqtab %>% - select(-seqs, -ASV) %>% - as.data.frame() + out <- cleaned_seqtab %>% + select(-seqs, -ASV) %>% + as.data.frame() row.names(out) <- cleaned_seqtab[["ASV"]] - out <- out %>% - select(where( ~ is.numeric(.x) && sum(.x) != 0)) %>% + out <- out %>% + select(where( ~ is.numeric(.x) && sum(.x) != 0)) %>% t() return(out) -} \ No newline at end of file +} diff --git a/R/veganify_generic_wide_tbl.R b/R/veganify_generic_wide_tbl.R index 9fbe40d..16f9fc9 100644 --- a/R/veganify_generic_wide_tbl.R +++ b/R/veganify_generic_wide_tbl.R @@ -1,9 +1,35 @@ -# Makes a vegdist compatible data.frame from a wide tibble that contains the -# rownames in the first column and input data for vegdist in all other columns +#' Makes a vegdist compatible data.frame from a wide tibble that contains the rownames in the first column and input data for vegdist in all other columns +#' +#' This function takes a wide tibble with rownames in the first column and input data for vegdist in all other columns, and converts it into a data.frame that is compatible with the vegdist function. It replaces any na values with 0 and sets the rownames of the resulting data.frame to the values in the first column of the input tibble. +#' +#' @param data A wide tibble with rownames in the first column and input data for vegdist in all other columns. +#' +#' @return A data.frame that is compatible with the vegdist function. +#' +#' @importFrom dplyr %>% +#' @importFrom dplyr replace +#' @importFrom stats as.data.frame +#' +#' @examples +#' library(tibble) +#' +#' # Create a wide tibble +#' my_wide_tibble <- tibble::tribble( +#' ~Name, ~Var1, ~Var2, ~Var3, +#' "A", 1, 2, 3, +#' "B", 4, 5, 6, +#' "C", 7, 8, 9 +#' ) +#' +#' # Apply the veganify_generic_wide_tbl function +#' result <- veganify_generic_wide_tbl(data = my_wide_tibble) +#' result +#' +#' @exports veganify_generic_wide_tbl <- function(data = my_wide_tibble){ # Prepare data from the plotting table for vegdist/hclust clustering - for_vegan_df <- as.data.frame(data)[-1] %>% + for_vegan_df <- as.data.frame(data)[-1] %>% replace(is.na(.), 0) row.names(for_vegan_df) <- data[[1]] return(for_vegan_df) -} \ No newline at end of file +} -- GitLab