From 534b09380736f0721c8280b7f96dd703cd73712b Mon Sep 17 00:00:00 2001 From: Simeon <51403284+simeross@users.noreply.github.com> Date: Tue, 31 Oct 2023 11:18:10 +0100 Subject: [PATCH] Update variant_classifier.R --- R/variant_classifier.R | 58 +++++++++++++++++++++++++++--------------- 1 file changed, 37 insertions(+), 21 deletions(-) diff --git a/R/variant_classifier.R b/R/variant_classifier.R index 87baece..29a5944 100644 --- a/R/variant_classifier.R +++ b/R/variant_classifier.R @@ -15,26 +15,27 @@ variant_classifier <- function( seqtab_file = file.path(path, 'seqtab_nochim.rds'), clustered_sequences = myDNAStringSetList){ - # Convert seqtab into long format and calculate relative abundance per sample for each ASV - seqtab_tbl_long <- clean_seqtab(seqtab_file, output = FALSE) %>% - pivot_longer(cols = where(is.integer), names_to = 'sample', - values_to = 'asv_counts') %>% - group_by(sample) %>% - mutate(sample_count_sum = sum(asv_counts)) %>% - ungroup() %>% - mutate(rel_abundance = asv_counts/sample_count_sum) %>% - dplyr::filter(sample_count_sum > 0) - - # Create a table for clustered sequences and filter out clusters with only one sequence - clustab_tbl <- cluster_tbl_named(clustered_sequences) %>% - left_join(tibble(seqnames = unlist(map(clustered_sequences, names)), - seqs = as.character(unlist(Biostrings::DNAStringSetList( - clustered_sequences)))), - by = 'seqnames') %>% - dplyr::filter(clus_size > 1) - - # If reference informed, add reference info and name variants + # If reference informed, add reference info and name variants if(reference_informed){ + # Convert seqtab into long format and calculate relative abundance per sample for each ASV + seqtab_tbl_long <- clean_seqtab(seqtab_file, output = FALSE) %>% + pivot_longer(cols = where(is.integer), names_to = 'sample', + values_to = 'asv_counts') %>% + dplyr::filter(!(ASV %in% names(refstrings))) %>% + group_by(sample) %>% + mutate(sample_count_sum = sum(asv_counts)) %>% + ungroup() %>% + mutate(rel_abundance = asv_counts/sample_count_sum) %>% + dplyr::filter(sample_count_sum > 0) + + # Create a table for clustered sequences and filter out clusters with only one sequence + clustab_tbl <- cluster_tbl_named(clustered_sequences) %>% + left_join(tibble(seqnames = unlist(map(clustered_sequences, names)), + seqs = as.character(unlist(Biostrings::DNAStringSetList( + clustered_sequences)))), + by = 'seqnames') %>% + dplyr::filter(clus_size > 1) %>% + dplyr::filter(!(seqnames %in% names(refstrings))) # Join the clustered sequences table with the long seqtab table by sequence name master_tbl <- left_join(clustab_tbl, seqtab_tbl_long, by = 'seqs') %>% dplyr::filter(seqnames != clus_name) @@ -66,6 +67,23 @@ variant_classifier <- function( # If not reference informed, join the clustered sequences table with the long seqtab table and add "x" to a new column sim_to_ref if(!reference_informed){ + # Convert seqtab into long format and calculate relative abundance per sample for each ASV + seqtab_tbl_long <- clean_seqtab(seqtab_file, output = FALSE) %>% + pivot_longer(cols = where(is.integer), names_to = 'sample', + values_to = 'asv_counts') %>% + group_by(sample) %>% + mutate(sample_count_sum = sum(asv_counts)) %>% + ungroup() %>% + mutate(rel_abundance = asv_counts/sample_count_sum) %>% + dplyr::filter(sample_count_sum > 0) + + # Create a table for clustered sequences and filter out clusters with only one sequence + clustab_tbl <- cluster_tbl_named(clustered_sequences) %>% + left_join(tibble(seqnames = unlist(map(clustered_sequences, names)), + seqs = as.character(unlist(Biostrings::DNAStringSetList( + clustered_sequences)))), + by = 'seqnames') %>% + dplyr::filter(clus_size > 1) master_tbl <- left_join(clustab_tbl, seqtab_tbl_long, by = 'seqs') %>% # Add sequence variants and major variant information to the table and group by cluster name mutate(seq_var = paste0('Sequence variant ', num-2)) %>% @@ -92,8 +110,6 @@ variant_classifier <- function( select(sample, ASV) %>% group_by(sample) %>%# Concatenate the top 2 most abundant sequences per sample into a single string - "profile" mutate(profile = paste0(ASV, collapse = '')) %>% - - # Remove duplicates ungroup() %>% select(-ASV) %>% unique() -- GitLab