Skip to content
Snippets Groups Projects
Commit 534b0938 authored by Simeon's avatar Simeon
Browse files

Update variant_classifier.R

parent 7223ad02
Branches
No related tags found
No related merge requests found
......@@ -15,26 +15,27 @@ variant_classifier <- function(
seqtab_file = file.path(path, 'seqtab_nochim.rds'),
clustered_sequences = myDNAStringSetList){
# Convert seqtab into long format and calculate relative abundance per sample for each ASV
seqtab_tbl_long <- clean_seqtab(seqtab_file, output = FALSE) %>%
pivot_longer(cols = where(is.integer), names_to = 'sample',
values_to = 'asv_counts') %>%
group_by(sample) %>%
mutate(sample_count_sum = sum(asv_counts)) %>%
ungroup() %>%
mutate(rel_abundance = asv_counts/sample_count_sum) %>%
dplyr::filter(sample_count_sum > 0)
# Create a table for clustered sequences and filter out clusters with only one sequence
clustab_tbl <- cluster_tbl_named(clustered_sequences) %>%
left_join(tibble(seqnames = unlist(map(clustered_sequences, names)),
seqs = as.character(unlist(Biostrings::DNAStringSetList(
clustered_sequences)))),
by = 'seqnames') %>%
dplyr::filter(clus_size > 1)
# If reference informed, add reference info and name variants
# If reference informed, add reference info and name variants
if(reference_informed){
# Convert seqtab into long format and calculate relative abundance per sample for each ASV
seqtab_tbl_long <- clean_seqtab(seqtab_file, output = FALSE) %>%
pivot_longer(cols = where(is.integer), names_to = 'sample',
values_to = 'asv_counts') %>%
dplyr::filter(!(ASV %in% names(refstrings))) %>%
group_by(sample) %>%
mutate(sample_count_sum = sum(asv_counts)) %>%
ungroup() %>%
mutate(rel_abundance = asv_counts/sample_count_sum) %>%
dplyr::filter(sample_count_sum > 0)
# Create a table for clustered sequences and filter out clusters with only one sequence
clustab_tbl <- cluster_tbl_named(clustered_sequences) %>%
left_join(tibble(seqnames = unlist(map(clustered_sequences, names)),
seqs = as.character(unlist(Biostrings::DNAStringSetList(
clustered_sequences)))),
by = 'seqnames') %>%
dplyr::filter(clus_size > 1) %>%
dplyr::filter(!(seqnames %in% names(refstrings)))
# Join the clustered sequences table with the long seqtab table by sequence name
master_tbl <- left_join(clustab_tbl, seqtab_tbl_long, by = 'seqs') %>%
dplyr::filter(seqnames != clus_name)
......@@ -66,6 +67,23 @@ variant_classifier <- function(
# If not reference informed, join the clustered sequences table with the long seqtab table and add "x" to a new column sim_to_ref
if(!reference_informed){
# Convert seqtab into long format and calculate relative abundance per sample for each ASV
seqtab_tbl_long <- clean_seqtab(seqtab_file, output = FALSE) %>%
pivot_longer(cols = where(is.integer), names_to = 'sample',
values_to = 'asv_counts') %>%
group_by(sample) %>%
mutate(sample_count_sum = sum(asv_counts)) %>%
ungroup() %>%
mutate(rel_abundance = asv_counts/sample_count_sum) %>%
dplyr::filter(sample_count_sum > 0)
# Create a table for clustered sequences and filter out clusters with only one sequence
clustab_tbl <- cluster_tbl_named(clustered_sequences) %>%
left_join(tibble(seqnames = unlist(map(clustered_sequences, names)),
seqs = as.character(unlist(Biostrings::DNAStringSetList(
clustered_sequences)))),
by = 'seqnames') %>%
dplyr::filter(clus_size > 1)
master_tbl <- left_join(clustab_tbl, seqtab_tbl_long, by = 'seqs') %>%
# Add sequence variants and major variant information to the table and group by cluster name
mutate(seq_var = paste0('Sequence variant ', num-2)) %>%
......@@ -92,8 +110,6 @@ variant_classifier <- function(
select(sample, ASV) %>%
group_by(sample) %>%# Concatenate the top 2 most abundant sequences per sample into a single string - "profile"
mutate(profile = paste0(ASV, collapse = '')) %>%
# Remove duplicates
ungroup() %>%
select(-ASV) %>%
unique()
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment