title: "Mummer-like plotting"
author: "Simeon Rossmann"
date: "2/18/2021"
output: pdf_document
urlcolor: blue

## Script description

This script was used to parse and plot NUCmer alignments and is based heavily on code from a blog post by Dr. Jean Monlong. It is implemented in an RMarkdown document; read more about RMarkdown here.

This code documents an analysis that was conducted as part of the publication "Genomic signatures and insights into host niche adaptation of the entomopathogenic fungus Metarhizium humberi" by Iwanicki et al. (2021). Attached packages and their versions are listed at the bottom of the document.

```{r}
knitr::opts_chunk$set(echo = TRUE)
#install.packages('tidyverse')
library(tidyverse)
#if (!requireNamespace("BiocManager", quietly = TRUE))
#  install.packages("BiocManager")
# BiocManager::install("GenomicRanges")
library(GenomicRanges)
```

## Read in data

Nucmer alignments (delta files) were generated on the command line from the reference and query scaffold FASTA files as follows:

```bash
# ref_fasta: 'Mhumberi_scaffolds_final.fasta'
# query_fasta: scaffold file of the respective query genome
nucmer -p nucmer_align ref_fasta query_fasta
```
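The R code below assumes that each genome's delta file sits in its own subdirectory under a common parent folder. The layout sketched here is only an illustration (the genome names are made up); the parts that actually matter are the `_with_proteins/` parent directory and the `nucmer_align.delta` file name used in `list.files()` and `str_remove()` below.

```
Genomes_with_proteins/
  GenomeA/nucmer_align.delta
  GenomeB/nucmer_align.delta
  ...
```

With such a layout, `names_delta_files` evaluates to the per-genome directory names (here `GenomeA`, `GenomeB`).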
```{r}
# CHANGE ME to the correct location/naming pattern of the Nucmer files
list_delta_files <- list.files("Genomes_with_proteins",
                               pattern = "nucmer_align.delta", 
                               recursive = TRUE, full.names = TRUE)

# Clean up names to use them in list and output naming
names_delta_files <- str_remove(list_delta_files, ".*_with_proteins/") %>% 
  str_remove("/nucmer_.*")

# Name list with genomes
names(list_delta_files) <- names_delta_files

# Function to read/parse one delta file
readDelta <- function(deltafile){
  lines <- scan(deltafile, 'a', sep = '\n', quiet = TRUE)
  # Drop the first line (paths of the two input fasta files)
  lines <- lines[-1]
  lines.l <- strsplit(lines, ' ')
  lines.len <- lapply(lines.l, length) %>% as.numeric
  # Drop single-field lines (the NUCMER tag, indel offsets and terminating 0)
  lines.l <- lines.l[lines.len != 1]
  lines.len <- lines.len[lines.len != 1]
  # Four-field lines are alignment headers ('>rid qid rlen qlen');
  # record, for each line, the header it belongs to
  head.pos <- which(lines.len == 4)
  head.id <- rep(head.pos, c(head.pos[-1], length(lines.l) + 1) - head.pos)
  # Seven-field lines hold the alignment coordinates and error counts
  mat <- matrix(as.numeric(unlist(lines.l[lines.len == 7])), 7)
  res <- as.data.frame(t(mat[1:5,]))
  colnames(res) <- c('rs','re','qs','qe','error')
  res$qid <- unlist(lapply(lines.l[head.id[lines.len == 7]], '[', 2))
  res$rid <- unlist(lapply(lines.l[head.id[lines.len == 7]], '[', 1)) %>% 
    gsub('^>', '', .)
  # Query end < query start means the alignment is on the reverse strand
  res$strand <- ifelse(res$qe - res$qs > 0, '+', '-')
  # Genome name derived from the directory containing the delta file
  res$genome <- str_remove(deltafile, ".*_with_proteins/") %>% 
                str_remove("/nucmer_.*")
  res
}

# Iterate all delta files through read/parse function
mumgp <- lapply(list_delta_files, readDelta)
# Name list
names(mumgp) <- names_delta_files
# Check first entry in list
head(mumgp[[1]])
```
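For orientation, the sketch below shows roughly what `readDelta` expects inside a delta file (sequence IDs and numbers are made up). The first line (the two input fasta paths) is dropped, single-field lines (the NUCMER tag, indel offsets and the terminating 0) are filtered out, four-field lines starting with `>` are alignment headers, and seven-field lines carry the alignment coordinates and error counts that end up in the result table.

```
/path/to/ref.fasta /path/to/query.fasta
NUCMER
>ref_scaffold_1 query_contig_7 1500000 82000
10500 92000 1 81500 35 35 0
-12
0
>ref_scaffold_1 query_contig_3 1500000 64000
...
```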

## Contig filter

Comment of the original author:

```
# FILTER CONTIGS WITH POOR ALIGNMENTS:
# For now, I filter contigs simply based on the size of 
# the aligned segment. I keep only contigs with at least 
# one aligned segment larger than a minimum size. Smaller 
# alignment in these contigs are kept if in the same range 
# as the large aligned segments. Eventually, I could 
# also filter segment based on the number/proportion of errors.
```

SWR: This code was rewritten because it did not deliver the expected results and I could not quite understand the flanks as they were implemented; for multiple short contigs it seemed to deliver erroneous output. The new code keeps contigs with at least one segment larger than the minimum size, and all segments in the same area, as the original code did (with > replaced by >= and vice versa). It also removes segments in the same region as a larger segment if they are smaller than 'noiseminl'.

```{r}
filterMum <- function(df, minl=1000, flanks=0, noiseminl = 100){
  # Query/reference pairs with at least one segment of >= minl aligned bases,
  # and the query range spanned by those large segments (plus optional flanks)
  coord = df %>% 
    filter(abs(re - rs) >= minl) %>% 
    group_by(qid, rid) %>%
    summarize(qsL = min(qs) - flanks, 
              qeL = max(qe) + flanks, 
              rs = median(rs)) %>%
    ungroup %>% 
    arrange(desc(rs)) %>%
    mutate(qid = factor(qid, levels = unique(qid))) %>% 
    select(-rs)
  # Keep only segments inside those query ranges and at least noiseminl long
  merge(df, coord) %>% filter(qs >= qsL, 
                              qe <= qeL, 
                              abs(qe-qs) >= noiseminl) %>%
    mutate(qid = factor(qid, levels = levels(coord$qid))) %>% 
    select(-qsL, -qeL)
}

mumgp_filt <- lapply(mumgp, filterMum)
names(mumgp_filt) <- names_delta_files

head(mumgp_filt[[1]])
tail(mumgp_filt[[1]])
```
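If the defaults (`minl = 1000`, `flanks = 0`, `noiseminl = 100`) are too permissive or too strict for a particular assembly, the thresholds can be passed through `lapply()`; the values below are hypothetical and only illustrate the call.

```r
# Hypothetical stricter filtering: require at least one segment of >= 5 kb per
# contig/reference pair and drop co-located segments shorter than 500 bp
mumgp_filt_strict <- lapply(mumgp, filterMum, minl = 5000, noiseminl = 500)
names(mumgp_filt_strict) <- names_delta_files
```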

## Plot

```{r}
# CHANGE ME to the directory for the plot outputs
outpath <- "Contig_sorting"
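# Optional: ggsave() cannot write into a directory that does not exist, so the
# output directory can be created once beforehand, e.g.:
# dir.create(outpath, recursive = TRUE, showWarnings = FALSE)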

# GRAPH
# Non-diagonalized plot; use it to check the filtered alignments if necessary (not saved to file here)
filt_check <- ggplot(mumgp_filt[[1]], 
                     aes(x = rs, xend = re, y = qs, 
                         yend = qe, colour = strand)) + 
  geom_segment() +
  geom_point(alpha = .5) + 
  facet_grid(qid~rid, scales = 'free', space = "free", switch = 'y') +
  theme_void() + 
  theme(strip.text.y = element_blank(),
        axis.ticks = element_blank(),
        axis.text = element_blank(),
        legend.position = c(.99, .01), legend.justification = c(1, 0),
        strip.background = element_blank()) +
  xlab('reference sequence') + 
  ylab('assembly') + 
  scale_colour_brewer(palette = 'Set1')
```
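`filt_check` is only assigned above; to actually inspect it, print the object in an interactive session (or save it with the optional `ggsave()` call at the end of the document).

```r
# Display the filter-check plot in the current graphics device
filt_check
```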

```{r}
# Comment of the original author:
# DIAGONALIZE
# For each contig, I compute the major strand (strand with 
# most bases aligned) and flip if necessary. The contigs are
# also ordered based on the reference region with most bases 
# and the weighted means of the start position in this matched 
# reference region.
diagMum <- function(df){
  ## Find best qid order
  rid.o = df %>% 
    group_by(qid, rid) %>% 
    summarize(base = sum(abs(qe - qs)),
              rs = weighted.mean(rs, abs(qe - qs))) %>%
    ungroup %>% 
    arrange(desc(base)) %>% 
    group_by(qid) %>% 
    do(head(., 1)) %>%
    ungroup %>% 
    arrange(desc(rid), desc(rs)) %>%
    mutate(qid = factor(qid, levels = unique(qid)))
  ## Find best qid strand
  major.strand = df %>% group_by(qid) %>%
    summarize(major.strand = ifelse(
      sum(sign(qe - qs) * abs(qe - qs)) > 0, '+', '-'),
              maxQ = max(c(qe, qs)))
  merge(df, major.strand) %>% 
    mutate(qs = ifelse(major.strand == '-', maxQ-qs, qs),
           qe = ifelse(major.strand == '-', maxQ-qe, qe),
           qid = factor(qid, levels = levels(rid.o$qid)))
}

# Iterate all filtered files over diagonalization function
mumgp_filt_diag <- lapply(mumgp_filt, diagMum)
# Name list
names(mumgp_filt_diag) <- names_delta_files
```
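As a quick sanity check of the strand flipping, `diagMum()` can be run on a small hand-made table; the values below are made up purely for illustration. The contig aligns only on the '-' strand, so in the output its query coordinates are mirrored around the contig's maximum coordinate (`maxQ`), after which `qs`/`qe` read in the forward direction.

```r
# Toy input: one query contig with two reverse-strand segments (made-up values)
toy <- data.frame(rs = c(1, 600),    re = c(500, 1100),
                  qs = c(500, 1100), qe = c(1, 601),
                  qid = "contig_1",  rid = "scaffold_1",
                  strand = "-")
diagMum(toy)
```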

```{r}
# Plotting function, takes diagonalized table and plots it (large output)
plot_diag <- function(diagonal_mum){
  genome <- unique(diagonal_mum$genome)
  y <- ggplot(diagonal_mum, 
              aes(x = rs, xend = re, y = qs, yend = qe, 
                  colour = strand, size = 3)) +
    geom_segment() + 
    geom_point(alpha = .5, size = 3) + 
    theme_void() + 
    facet_grid(qid~rid, scales = 'free', space = "free", switch = 'y') +
    theme(strip.background = element_blank(),
          strip.text = element_blank(),
          axis.ticks = element_blank(),
          axis.text.x = element_blank(),
          legend.position = "none",
          panel.spacing = unit(0, "npc")) +
    xlab('reference sequence') + 
    ylab('assembly') + 
    scale_colour_brewer(palette = 'Set1')
  out_fp <- file.path(outpath,
                      paste0(genome, ".pdf"))
  ggsave(filename = out_fp, y, width = 100, height = 100, units = "cm", limitsize = FALSE)
}

# Plot all filtered and diagonalized alignments
lapply(mumgp_filt_diag, plot_diag)
```

```{r}
# Optional saving of filter check (the output file name within outpath is arbitrary)
#ggsave(filename = file.path(outpath, "filter_check.pdf"), filt_check,
#       width = 100, height = 100, units = "cm", limitsize = FALSE)
sessionInfo()
```