title: "Mummer-like plotting"
author: "Simeon Rossmann"
date: "2/18/2021"
output: pdf_document
urlcolor: blue

## Script description

This script was used to parse and plot NUCmer alignments and is based heavily on code from a blog post by Dr. Jean Monlong. It is implemented in an RMarkdown document; read more about RMarkdown here.

This code documents an analysis that was conducted as part of the publication "Genomic signatures and insights into host niche adaptation of the entomopathogenic fungus Metarhizium humberi" by Iwanicki et al. (2021). Attached packages and their versions are listed at the bottom of the document.

```{r}
knitr::opts_chunk$set(echo = TRUE)
#install.packages('tidyverse')
library(tidyverse)
#if (!requireNamespace("BiocManager", quietly = TRUE))
#  install.packages("BiocManager")
# BiocManager::install("GenomicRanges")
library(GenomicRanges)
```

## Read in data

Nucmer alignments (delta files) were generated on the command line from the reference and query scaffold FASTA files as follows:

```bash
# ref_fasta: 'Mhumberi_scaffolds_final.fasta'
# query_fasta: scaffold file of the respective query genome
nucmer -p nucmer_align ref_fasta query_fasta
```
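The R code below assumes that each genome's delta file sits in its own subdirectory under a common parent folder. The layout sketched here is only an illustration (the genome names are made up); the parts that actually matter are the `_with_proteins/` parent directory and the `nucmer_align.delta` file name used in `list.files()` and `str_remove()` below.

```
Genomes_with_proteins/
  GenomeA/nucmer_align.delta
  GenomeB/nucmer_align.delta
  ...
```

With such a layout, `names_delta_files` evaluates to the per-genome directory names (here `GenomeA`, `GenomeB`).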
```{r}
# CHANGE ME to the correct location/naming pattern of the Nucmer files
list_delta_files <- list.files("Genomes_with_proteins",
                               pattern = "nucmer_align.delta", 
                               recursive = TRUE, full.names = TRUE)

# Clean up names to use them in list and output naming
names_delta_files <- str_remove(list_delta_files, ".*_with_proteins/") %>% 
  str_remove("/nucmer_.*")

# Name list with genomes
names(list_delta_files) <- names_delta_files

# Function to read/parse one delta file
readDelta <- function(deltafile){
  lines <- scan(deltafile, 'a', sep = '\n', quiet = TRUE)
  # Drop the first line (paths of the two input fasta files)
  lines <- lines[-1]
  lines.l <- strsplit(lines, ' ')
  lines.len <- lapply(lines.l, length) %>% as.numeric
  # Drop single-field lines (the NUCMER tag, indel offsets and terminating 0)
  lines.l <- lines.l[lines.len != 1]
  lines.len <- lines.len[lines.len != 1]
  # Four-field lines are alignment headers ('>rid qid rlen qlen');
  # record, for each line, the header it belongs to
  head.pos <- which(lines.len == 4)
  head.id <- rep(head.pos, c(head.pos[-1], length(lines.l) + 1) - head.pos)
  # Seven-field lines hold the alignment coordinates and error counts
  mat <- matrix(as.numeric(unlist(lines.l[lines.len == 7])), 7)
  res <- as.data.frame(t(mat[1:5,]))
  colnames(res) <- c('rs','re','qs','qe','error')
  res$qid <- unlist(lapply(lines.l[head.id[lines.len == 7]], '[', 2))
  res$rid <- unlist(lapply(lines.l[head.id[lines.len == 7]], '[', 1)) %>% 
    gsub('^>', '', .)
  # Query end < query start means the alignment is on the reverse strand
  res$strand <- ifelse(res$qe - res$qs > 0, '+', '-')
  # Genome name derived from the directory containing the delta file
  res$genome <- str_remove(deltafile, ".*_with_proteins/") %>% 
                str_remove("/nucmer_.*")
  res
}

# Iterate all delta files through read/parse function
mumgp <- lapply(list_delta_files, readDelta)
# Name list
names(mumgp) <- names_delta_files
# Check first entry in list
head(mumgp[[1]])
```
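For orientation, the sketch below shows roughly what `readDelta` expects inside a delta file (sequence IDs and numbers are made up). The first line (the two input fasta paths) is dropped, single-field lines (the NUCMER tag, indel offsets and the terminating 0) are filtered out, four-field lines starting with `>` are alignment headers, and seven-field lines carry the alignment coordinates and error counts that end up in the result table.

```
/path/to/ref.fasta /path/to/query.fasta
NUCMER
>ref_scaffold_1 query_contig_7 1500000 82000
10500 92000 1 81500 35 35 0
-12
0
>ref_scaffold_1 query_contig_3 1500000 64000
...
```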

## Contig filter

Comment of the original author:

```
# FILTER CONTIGS WITH POOR ALIGNMENTS:
# For now, I filter contigs simply based on the size of 
# the aligned segment. I keep only contigs with at least 
# one aligned segment larger than a minimum size. Smaller 
# alignment in these contigs are kept if in the same range 
# as the large aligned segments. Eventually, I could 
# also filter segment based on the number/proportion of errors.
```

SWR: This code was rewritten because it did not deliver the expected results and I could not quite understand the flanks as they were implemented; for multiple short contigs it seemed to deliver erroneous output. The new code keeps contigs with at least one segment larger than the minimum size, and all segments in the same area, as the original code did (with > replaced by >= and vice versa). It also removes segments in the same region as a larger segment if they are smaller than 'noiseminl'.

```{r}
filterMum <- function(df, minl=1000, flanks=0, noiseminl = 100){
  # Query/reference pairs with at least one segment of >= minl aligned bases,
  # and the query range spanned by those large segments (plus optional flanks)
  coord = df %>% 
    filter(abs(re - rs) >= minl) %>% 
    group_by(qid, rid) %>%
    summarize(qsL = min(qs) - flanks, 
              qeL = max(qe) + flanks, 
              rs = median(rs)) %>%
    ungroup %>% 
    arrange(desc(rs)) %>%
    mutate(qid = factor(qid, levels = unique(qid))) %>% 
    select(-rs)
  # Keep only segments inside those query ranges and at least noiseminl long
  merge(df, coord) %>% filter(qs >= qsL, 
                              qe <= qeL, 
                              abs(qe-qs) >= noiseminl) %>%
    mutate(qid = factor(qid, levels = levels(coord$qid))) %>% 
    select(-qsL, -qeL)
}

mumgp_filt <- lapply(mumgp, filterMum)
names(mumgp_filt) <- names_delta_files

head(mumgp_filt[[1]])
tail(mumgp_filt[[1]])
```
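If the defaults (`minl = 1000`, `flanks = 0`, `noiseminl = 100`) are too permissive or too strict for a particular assembly, the thresholds can be passed through `lapply()`; the values below are hypothetical and only illustrate the call.

```r
# Hypothetical stricter filtering: require at least one segment of >= 5 kb per
# contig/reference pair and drop co-located segments shorter than 500 bp
mumgp_filt_strict <- lapply(mumgp, filterMum, minl = 5000, noiseminl = 500)
names(mumgp_filt_strict) <- names_delta_files
```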

## Plot

```{r}
# CHANGE ME to the directory for the plot outputs
outpath <- "Contig_sorting"
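# Optional: ggsave() cannot write into a directory that does not exist, so the
# output directory can be created once beforehand, e.g.:
# dir.create(outpath, recursive = TRUE, showWarnings = FALSE)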

# GRAPH
# Non-diagonalized plot; use it to check the filtered alignments if necessary (not saved to file here)
filt_check <- ggplot(mumgp_filt[[1]], 
                     aes(x = rs, xend = re, y = qs, 
                         yend = qe, colour = strand)) + 
  geom_segment() +
  geom_point(alpha = .5) + 
  facet_grid(qid~rid, scales = 'free', space = "free", switch = 'y') +
  theme_void() + 
  theme(strip.text.y = element_blank(),
        axis.ticks = element_blank(),
        axis.text = element_blank(),
        legend.position = c(.99, .01), legend.justification = c(1, 0),
        strip.background = element_blank()) +
  xlab('reference sequence') + 
  ylab('assembly') + 
  scale_colour_brewer(palette = 'Set1')
```
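`filt_check` is only assigned above; to actually inspect it, print the object in an interactive session (or save it with the optional `ggsave()` call at the end of the document).

```r
# Display the filter-check plot in the current graphics device
filt_check
```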

```{r}
# Comment of the original author:
# DIAGONALIZE
# For each contig, I compute the major strand (strand with 
# most bases aligned) and flip if necessary. The contigs are
# also ordered based on the reference region with most bases 
# and the weighted means of the start position in this matched 
# reference region.
diagMum <- function(df){
  ## Find best qid order
  rid.o = df %>% 
    group_by(qid, rid) %>% 
    summarize(base = sum(abs(qe - qs)),
              rs = weighted.mean(rs, abs(qe - qs))) %>%
    ungroup %>% 
    arrange(desc(base)) %>% 
    group_by(qid) %>% 
    do(head(., 1)) %>%
    ungroup %>% 
    arrange(desc(rid), desc(rs)) %>%
    mutate(qid = factor(qid, levels = unique(qid)))
  ## Find best qid strand
  major.strand = df %>% group_by(qid) %>%
    summarize(major.strand = ifelse(
      sum(sign(qe - qs) * abs(qe - qs)) > 0, '+', '-'),
              maxQ = max(c(qe, qs)))
  merge(df, major.strand) %>% 
    mutate(qs = ifelse(major.strand == '-', maxQ-qs, qs),
           qe = ifelse(major.strand == '-', maxQ-qe, qe),
           qid = factor(qid, levels = levels(rid.o$qid)))
}

# Iterate all filtered files over diagonalization function
mumgp_filt_diag <- lapply(mumgp_filt, diagMum)
# Name list
names(mumgp_filt_diag) <- names_delta_files
```
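As a quick sanity check of the strand flipping, `diagMum()` can be run on a small hand-made table; the values below are made up purely for illustration. The contig aligns only on the '-' strand, so in the output its query coordinates are mirrored around the contig's maximum coordinate (`maxQ`), after which `qs`/`qe` read in the forward direction.

```r
# Toy input: one query contig with two reverse-strand segments (made-up values)
toy <- data.frame(rs = c(1, 600),    re = c(500, 1100),
                  qs = c(500, 1100), qe = c(1, 601),
                  qid = "contig_1",  rid = "scaffold_1",
                  strand = "-")
diagMum(toy)
```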

```{r}
# Plotting function, takes diagonalized table and plots it (large output)
plot_diag <- function(diagonal_mum){
  genome <- unique(diagonal_mum$genome)
  y <- ggplot(diagonal_mum, 
              aes(x = rs, xend = re, y = qs, yend = qe, 
                  colour = strand, size = 3)) +
    geom_segment() + 
    geom_point(alpha = .5, size = 3) + 
    theme_void() + 
    facet_grid(qid~rid, scales = 'free', space = "free", switch = 'y') +
    theme(strip.background = element_blank(),
          strip.text = element_blank(),
          axis.ticks = element_blank(),
          axis.text.x = element_blank(),
          legend.position = "none",
          panel.spacing = unit(0, "npc")) +
    xlab('reference sequence') + 
    ylab('assembly') + 
    scale_colour_brewer(palette = 'Set1')
  out_fp <- file.path(outpath,
                      paste0(genome, ".pdf"))
  ggsave(filename = out_fp, y, width = 100, height = 100, units = "cm", limitsize = FALSE)
}

# Plot all filtered and diagonalized alignments
lapply(mumgp_filt_diag, plot_diag)
```

```{r}
# Optional saving of filter check (the output file name within outpath is arbitrary)
#ggsave(filename = file.path(outpath, "filter_check.pdf"), filt_check,
#       width = 100, height = 100, units = "cm", limitsize = FALSE)
sessionInfo()
```