Update variant_classifier.R

534b0938 · Simeon · 7223ad02 · 534b0938
Commit 534b0938 authored 1 year ago by Simeon
--- a/R/variant_classifier.R
+++ b/R/variant_classifier.R
@@ -15,26 +15,27 @@ variant_classifier <- function(
    seqtab_file = file.path(path, 'seqtab_nochim.rds'),
    clustered_sequences = myDNAStringSetList){

-  # Convert seqtab into long format and calculate relative abundance per sample for each ASV
-  seqtab_tbl_long <- clean_seqtab(seqtab_file, output = FALSE) %>%
-    pivot_longer(cols = where(is.integer), names_to = 'sample',
-                 values_to = 'asv_counts') %>%
-    group_by(sample) %>%
-    mutate(sample_count_sum = sum(asv_counts)) %>%
-    ungroup() %>%
-    mutate(rel_abundance = asv_counts/sample_count_sum) %>%
-    dplyr::filter(sample_count_sum > 0)
-
-  # Create a table for clustered sequences and filter out clusters with only one sequence
-  clustab_tbl <- cluster_tbl_named(clustered_sequences) %>%
-    left_join(tibble(seqnames = unlist(map(clustered_sequences, names)),
-                     seqs = as.character(unlist(Biostrings::DNAStringSetList(
-                       clustered_sequences)))),
-              by = 'seqnames') %>%
-    dplyr::filter(clus_size > 1)
-
-  # If reference informed, add reference info and name variants
+    # If reference informed, add reference info and name variants
  if(reference_informed){
+    # Convert seqtab into long format and calculate relative abundance per sample for each ASV
+    seqtab_tbl_long <- clean_seqtab(seqtab_file, output = FALSE) %>%
+      pivot_longer(cols = where(is.integer), names_to = 'sample',
+                   values_to = 'asv_counts') %>%
+      dplyr::filter(!(ASV %in% names(refstrings))) %>%
+      group_by(sample) %>%
+      mutate(sample_count_sum = sum(asv_counts)) %>%
+      ungroup() %>%
+      mutate(rel_abundance = asv_counts/sample_count_sum) %>%
+      dplyr::filter(sample_count_sum > 0)
+
+    # Create a table for clustered sequences and filter out clusters with only one sequence
+    clustab_tbl <- cluster_tbl_named(clustered_sequences) %>%
+      left_join(tibble(seqnames = unlist(map(clustered_sequences, names)),
+                       seqs = as.character(unlist(Biostrings::DNAStringSetList(
+                         clustered_sequences)))),
+                by = 'seqnames') %>%
+      dplyr::filter(clus_size > 1) %>%
+      dplyr::filter(!(seqnames %in% names(refstrings)))
    # Join the clustered sequences table with the long seqtab table by sequence name
    master_tbl <- left_join(clustab_tbl, seqtab_tbl_long, by = 'seqs') %>%
      dplyr::filter(seqnames != clus_name)
@@ -66,6 +67,23 @@ variant_classifier <- function(

  # If not reference informed, join the clustered sequences table with the long seqtab table and add "x" to a new column sim_to_ref
  if(!reference_informed){
+    # Convert seqtab into long format and calculate relative abundance per sample for each ASV
+    seqtab_tbl_long <- clean_seqtab(seqtab_file, output = FALSE) %>%
+      pivot_longer(cols = where(is.integer), names_to = 'sample',
+                   values_to = 'asv_counts') %>%
+      group_by(sample) %>%
+      mutate(sample_count_sum = sum(asv_counts)) %>%
+      ungroup() %>%
+      mutate(rel_abundance = asv_counts/sample_count_sum) %>%
+      dplyr::filter(sample_count_sum > 0)
+
+    # Create a table for clustered sequences and filter out clusters with only one sequence
+    clustab_tbl <- cluster_tbl_named(clustered_sequences) %>%
+      left_join(tibble(seqnames = unlist(map(clustered_sequences, names)),
+                       seqs = as.character(unlist(Biostrings::DNAStringSetList(
+                         clustered_sequences)))),
+                by = 'seqnames') %>%
+      dplyr::filter(clus_size > 1)
    master_tbl <- left_join(clustab_tbl, seqtab_tbl_long, by = 'seqs') %>%
      # Add sequence variants and major variant information to the table and group by cluster name
      mutate(seq_var = paste0('Sequence variant ', num-2)) %>%
@@ -92,8 +110,6 @@ variant_classifier <- function(
    select(sample, ASV) %>%
    group_by(sample) %>%# Concatenate the top 2 most abundant sequences per sample into a single string - "profile"
    mutate(profile = paste0(ASV, collapse = '')) %>%
-
-    # Remove duplicates
    ungroup() %>%
    select(-ASV) %>%
    unique()