Commit 0f4345c5 authored by Simeon

Adding files

parent 8beccafe
"reads.in" "reads.out"
"RyiharPL.bc1005--bc1033.fastq" 13824 9695
"RyiharPL.bc1005--bc1035.fastq" 23835 16968
"RyiharPL.bc1005--bc1044.fastq" 14757 10629
"RyiharPL.bc1005--bc1045.fastq" 12891 7273
"RyiharPL.bc1005--bc1054.fastq" 11046 7627
"RyiharPL.bc1005--bc1056.fastq" 12967 9360
"RyiharPL.bc1005--bc1057.fastq" 14551 10423
"RyiharPL.bc1005--bc1059.fastq" 33570 22456
"RyiharPL.bc1005--bc1060.fastq" 19392 14366
"RyiharPL.bc1005--bc1062.fastq" 12263 9141
"RyiharPL.bc1005--bc1065.fastq" 20513 15241
"RyiharPL.bc1007--bc1033.fastq" 12565 9360
"RyiharPL.bc1007--bc1035.fastq" 10401 7657
"RyiharPL.bc1007--bc1044.fastq" 11176 8314
"RyiharPL.bc1007--bc1045.fastq" 29178 21657
"RyiharPL.bc1007--bc1054.fastq" 10456 7710
"RyiharPL.bc1007--bc1056.fastq" 12872 9502
"RyiharPL.bc1007--bc1057.fastq" 9885 7282
"RyiharPL.bc1007--bc1059.fastq" 13815 10125
"RyiharPL.bc1007--bc1060.fastq" 13278 9778
"RyiharPL.bc1007--bc1062.fastq" 18308 13457
"RyiharPL.bc1007--bc1065.fastq" 11757 8660
"RyiharPL.bc1007--bc1075.fastq" 16043 11795
"RyiharPL.bc1008--bc1033.fastq" 17563 12393
"RyiharPL.bc1008--bc1035.fastq" 10545 7739
"RyiharPL.bc1008--bc1044.fastq" 7166 5256
"RyiharPL.bc1008--bc1045.fastq" 7482 5506
"RyiharPL.bc1008--bc1054.fastq" 20096 14690
"RyiharPL.bc1008--bc1056.fastq" 18370 13041
"RyiharPL.bc1008--bc1057.fastq" 14518 10539
"RyiharPL.bc1008--bc1059.fastq" 24420 14399
"RyiharPL.bc1008--bc1060.fastq" 22511 14538
"RyiharPL.bc1008--bc1062.fastq" 7682 5727
"RyiharPL.bc1008--bc1065.fastq" 3637 2722
"RyiharPL.bc1008--bc1075.fastq" 27745 16261
"RyiharPL.bc1012--bc1078.fastq" 26860 16005
"RyiharPL.bc1012--bc1079.fastq" 10998 8007
"RyiharPL.bc1012--bc1084.fastq" 10778 7422
"RyiharPL.bc1012--bc1085.fastq" 23982 17429
"RyiharPL.bc1012--bc1093.fastq" 16991 12355
"RyiharPL.bc1012--bc1094.fastq" 4051 2966
"RyiharPL.bc1012--bc1096.fastq" 17784 11563
"RyiharPL.bc1012--bc1102.fastq" 12865 9091
"RyiharPL.bc1012--bc1106.fastq" 15994 11706
"RyiharPL.bc1012--bc1108.fastq" 14837 11090
"RyiharPL.bc1015--bc1077.fastq" 34552 20842
"RyiharPL.bc1015--bc1078.fastq" 32348 20315
"RyiharPL.bc1015--bc1079.fastq" 20332 14694
"RyiharPL.bc1015--bc1082.fastq" 21291 14538
"RyiharPL.bc1015--bc1085.fastq" 9453 6344
"RyiharPL.bc1015--bc1093.fastq" 18189 13512
"RyiharPL.bc1015--bc1094.fastq" 13938 9975
"RyiharPL.bc1015--bc1096.fastq" 12251 9054
"RyiharPL.bc1015--bc1102.fastq" 11878 8817
"RyiharPL.bc1015--bc1106.fastq" 5932 4460
"RyiharPL.bc1015--bc1108.fastq" 10720 7885
"RyiharPL.bc1020--bc1077.fastq" 39022 27300
"RyiharPL.bc1020--bc1078.fastq" 39090 27344
"RyiharPL.bc1020--bc1082.fastq" 9700 6822
"RyiharPL.bc1020--bc1084.fastq" 10677 7626
"RyiharPL.bc1020--bc1085.fastq" 16304 12066
"RyiharPL.bc1020--bc1093.fastq" 11945 8881
"RyiharPL.bc1020--bc1094.fastq" 21487 14727
"RyiharPL.bc1020--bc1096.fastq" 16867 12064
"RyiharPL.bc1020--bc1102.fastq" 11018 7784
"RyiharPL.bc1022--bc1086.fastq" 15229 10950
"RyiharPL.bc1022--bc1087.fastq" 25195 18735
"RyiharPL.bc1022--bc1088.fastq" 15572 11731
"RyiharPL.bc1022--bc1091.fastq" 8371 6035
"RyiharPL.bc1022--bc1095.fastq" 18887 13833
"RyiharPL.bc1022--bc1097.fastq" 13639 9722
"RyiharPL.bc1022--bc1103.fastq" 10421 6978
"RyiharPL.bc1022--bc1107.fastq" 8084 5994
"RyiharPL.bc1022--bc1109.fastq" 17239 13003
"RyiharPL.bc1022--bc1114.fastq" 5825 4492
"RyiharPL.bc1022--bc1117.fastq" 10714 8103
"RyiharPL.bc1022--bc1118.fastq" 15243 11556
"RyiharPL.bc1024--bc1086.fastq" 13662 9385
"RyiharPL.bc1024--bc1087.fastq" 18268 12385
"RyiharPL.bc1024--bc1088.fastq" 10518 8025
"RyiharPL.bc1024--bc1091.fastq" 6666 5078
"RyiharPL.bc1024--bc1095.fastq" 10099 7576
"RyiharPL.bc1024--bc1103.fastq" 10381 7761
"RyiharPL.bc1024--bc1107.fastq" 17255 12938
"RyiharPL.bc1024--bc1109.fastq" 30110 22101
"RyiharPL.bc1024--bc1114.fastq" 11814 8846
"RyiharPL.bc1024--bc1117.fastq" 11370 8545
"RyiharPL.bc1024--bc1118.fastq" 21389 15876
"RyiharPL.bc1026--bc1121.fastq" 4361 3894
"RyiharPL.bc1026--bc1124.fastq" 4616 4038
"RyiharPL.bc1026--bc1126.fastq" 3523 3076
"RyiharPL.bc1026--bc1139.fastq" 3747 3285
"RyiharPL.bc1026--bc1147.fastq" 4382 3842
"RyiharPL.bc1026--bc1149.fastq" 3892 3443
"RyiharPL.bc1026--bc1154.fastq" 6489 5738
"RyiharPL.bc1026--bc1157.fastq" 4853 4244
"input" "filtered" "denoisedF" "nonchim" "percent_retained"
"RyiharPL.bc1005--bc1033" 13824 9695 8450 6217 44.9725115740741
"RyiharPL.bc1005--bc1035" 23835 16968 15372 11808 49.5405915670233
"RyiharPL.bc1005--bc1044" 14757 10629 7108 7096 48.0856542657722
"RyiharPL.bc1005--bc1045" 12891 7273 6013 5935 46.0398727794585
"RyiharPL.bc1005--bc1054" 11046 7627 6626 5492 49.7193554227775
"RyiharPL.bc1005--bc1056" 12967 9360 7599 7001 53.9908999768643
"RyiharPL.bc1005--bc1057" 14551 10423 8484 8181 56.2229400041234
"RyiharPL.bc1005--bc1059" 33570 22456 19168 16381 48.7965445338099
"RyiharPL.bc1005--bc1060" 19392 14366 13988 13988 72.1328382838284
"RyiharPL.bc1005--bc1062" 12263 9141 9051 9051 73.8073880779581
"RyiharPL.bc1005--bc1065" 20513 15241 14818 14818 72.2371179252182
"RyiharPL.bc1007--bc1033" 12565 9360 9164 9164 72.9327497015519
"RyiharPL.bc1007--bc1035" 10401 7657 7412 7412 71.2623786174406
"RyiharPL.bc1007--bc1044" 11176 8314 8235 8235 73.684681460272
"RyiharPL.bc1007--bc1045" 29178 21657 20963 20963 71.8452258550963
"RyiharPL.bc1007--bc1054" 10456 7710 6732 6732 64.3840856924254
"RyiharPL.bc1007--bc1056" 12872 9502 8503 8503 66.0581106277191
"RyiharPL.bc1007--bc1057" 9885 7282 6422 6422 64.9671219018715
"RyiharPL.bc1007--bc1059" 13815 10125 9009 9009 65.2117263843648
"RyiharPL.bc1007--bc1060" 13278 9778 8785 8785 66.1620726012954
"RyiharPL.bc1007--bc1062" 18308 13457 12202 12202 66.6484596897531
"RyiharPL.bc1007--bc1065" 11757 8660 7704 7704 65.5269201326869
"RyiharPL.bc1007--bc1075" 16043 11795 9442 9442 58.8543289908371
"RyiharPL.bc1008--bc1033" 17563 12393 11139 6870 39.1163240904174
"RyiharPL.bc1008--bc1035" 10545 7739 6533 6468 61.3371266002845
"RyiharPL.bc1008--bc1044" 7166 5256 4459 4459 62.2243929667876
"RyiharPL.bc1008--bc1045" 7482 5506 4881 4881 65.2365677626303
"RyiharPL.bc1008--bc1054" 20096 14690 12220 12026 59.8427547770701
"RyiharPL.bc1008--bc1056" 18370 13041 10891 10381 56.5106151333696
"RyiharPL.bc1008--bc1057" 14518 10539 8999 8877 61.1447857831657
"RyiharPL.bc1008--bc1059" 24420 14399 11394 9313 38.1367731367731
"RyiharPL.bc1008--bc1060" 22511 14538 11870 10838 48.1453511616543
"RyiharPL.bc1008--bc1062" 7682 5727 5286 5281 68.7451184587347
"RyiharPL.bc1008--bc1065" 3637 2722 2686 2686 73.8520758867198
"RyiharPL.bc1008--bc1075" 27745 16261 12240 10584 38.1474139484592
"RyiharPL.bc1012--bc1078" 26860 16005 10910 9927 36.9583023082651
"RyiharPL.bc1012--bc1079" 10998 8007 7245 7120 64.7390434624477
"RyiharPL.bc1012--bc1084" 10778 7422 6694 5958 55.2792725923177
"RyiharPL.bc1012--bc1085" 23982 17429 13726 13401 55.8794095571679
"RyiharPL.bc1012--bc1093" 16991 12355 9678 9519 56.0237772938615
"RyiharPL.bc1012--bc1094" 4051 2966 2385 2362 58.3065909651938
"RyiharPL.bc1012--bc1096" 17784 11563 9897 9816 55.195681511471
"RyiharPL.bc1012--bc1102" 12865 9091 8419 6206 48.2394092499028
"RyiharPL.bc1012--bc1106" 15994 11706 8614 8367 52.3133675128173
"RyiharPL.bc1012--bc1108" 14837 11090 8703 8411 56.6893576868639
"RyiharPL.bc1015--bc1077" 34552 20842 16217 15601 45.1522343134985
"RyiharPL.bc1015--bc1078" 32348 20315 17785 11580 35.7981946333622
"RyiharPL.bc1015--bc1079" 20332 14694 12926 11965 58.8481211882746
"RyiharPL.bc1015--bc1082" 21291 14538 12011 11397 53.5296604198957
"RyiharPL.bc1015--bc1085" 9453 6344 5404 5119 54.1521210197821
"RyiharPL.bc1015--bc1093" 18189 13512 11813 11813 64.9458463906757
"RyiharPL.bc1015--bc1094" 13938 9975 7862 7630 54.7424307648156
"RyiharPL.bc1015--bc1096" 12251 9054 7803 7683 62.7132478981308
"RyiharPL.bc1015--bc1102" 11878 8817 7845 7737 65.1372284896447
"RyiharPL.bc1015--bc1106" 5932 4460 4165 4165 70.2124072825354
"RyiharPL.bc1015--bc1108" 10720 7885 6888 6813 63.5541044776119
"RyiharPL.bc1020--bc1077" 39022 27300 22501 21453 54.9766798216391
"RyiharPL.bc1020--bc1078" 39090 27344 23349 22352 57.1808646712714
"RyiharPL.bc1020--bc1082" 9700 6822 6248 5219 53.8041237113402
"RyiharPL.bc1020--bc1084" 10677 7626 6564 6518 61.047110611595
"RyiharPL.bc1020--bc1085" 16304 12066 10119 9794 60.0711481844946
"RyiharPL.bc1020--bc1093" 11945 8881 7473 7360 61.6157388028464
"RyiharPL.bc1020--bc1094" 21487 14727 12321 11787 54.8564248150044
"RyiharPL.bc1020--bc1096" 16867 12064 10297 9992 59.2399359696449
"RyiharPL.bc1020--bc1102" 11018 7784 6569 6555 59.4935559992739
"RyiharPL.bc1022--bc1086" 15229 10950 10016 8518 55.932759866045
"RyiharPL.bc1022--bc1087" 25195 18735 12518 12518 49.6844612026196
"RyiharPL.bc1022--bc1088" 15572 11731 10730 10730 68.9057282301567
"RyiharPL.bc1022--bc1091" 8371 6035 4307 4307 51.4514394934894
"RyiharPL.bc1022--bc1095" 18887 13833 9527 9518 50.3944512098269
"RyiharPL.bc1022--bc1097" 13639 9722 8664 8513 62.4165994574382
"RyiharPL.bc1022--bc1103" 10421 6978 6328 5750 55.1770463487189
"RyiharPL.bc1022--bc1107" 8084 5994 5214 5214 64.4977733795151
"RyiharPL.bc1022--bc1109" 17239 13003 10390 10390 60.2703173037879
"RyiharPL.bc1022--bc1114" 5825 4492 3612 3612 62.0085836909871
"RyiharPL.bc1022--bc1117" 10714 8103 6495 6495 60.621616576442
"RyiharPL.bc1022--bc1118" 15243 11556 9257 9257 60.7295151872991
"RyiharPL.bc1024--bc1086" 13662 9385 8736 7290 53.3596837944664
"RyiharPL.bc1024--bc1087" 18268 12385 11544 9692 53.0545215677688
"RyiharPL.bc1024--bc1088" 10518 8025 7396 7396 70.3175508651835
"RyiharPL.bc1024--bc1091" 6666 5078 4689 4689 70.3420342034203
"RyiharPL.bc1024--bc1095" 10099 7576 6922 6922 68.5414397465096
"RyiharPL.bc1024--bc1103" 10381 7761 7217 7217 69.5212407282535
"RyiharPL.bc1024--bc1107" 17255 12938 11500 11500 66.6473485946102
"RyiharPL.bc1024--bc1109" 30110 22101 15162 15162 50.355363666556
"RyiharPL.bc1024--bc1114" 11814 8846 7919 7919 67.0306416116472
"RyiharPL.bc1024--bc1117" 11370 8545 7618 7618 67.0008795074758
"RyiharPL.bc1024--bc1118" 21389 15876 10663 10663 49.8527280377764
"RyiharPL.bc1026--bc1121" 4361 3894 3894 3894 89.291446915845
"RyiharPL.bc1026--bc1124" 4616 4038 4037 4037 87.4566724436742
"RyiharPL.bc1026--bc1126" 3523 3076 3075 3075 87.2835651433438
"RyiharPL.bc1026--bc1139" 3747 3285 3282 3282 87.5900720576461
"RyiharPL.bc1026--bc1147" 4382 3842 3841 3841 87.6540392514833
"RyiharPL.bc1026--bc1149" 3892 3443 3442 3442 88.4378211716341
"RyiharPL.bc1026--bc1154" 6489 5738 5730 5730 88.3032824780398
"RyiharPL.bc1026--bc1157" 4853 4244 4242 4242 87.4098495775809
---
title: "dada2 microbiome pipeline v.4.3"
author: "Simeon Rossmann, Marie L. Davey"
date: "16.12.2019"
output:
  pdf_document: default
  html_document: default
last_modified: "Simeon Rossmann (SWR)"
date_modified: "20.11.2020"
version: 4.3
---
This is an R Markdown document for the analysis of MiSeq amplicon sequencing data. To customise the pipeline, search 'CHANGE ME' to find the settings you should adjust according to your data. All parameters requiring changes are located within the first 4 chunks. After the fourth chunk has been run, you can simply click 'Run' > 'Run all chunks below' to process the data.
Comment SWR: A new chunk was added to allow for quick reanalysis of taxonomy and pooled analysis of multiple samples after error inference. Small changes were made in chunk 4 (additionally exporting seqtab.RDP) to facilitate this. It is therefore not recommended to 'Run all chunks below' from chunk 4 in this version.
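A minimal sketch of what such a pooled reanalysis can look like (not the added chunk itself): assuming two runs were processed separately and their sequence tables saved as RDS files, they can be merged and taken through chimera removal again. The file paths below are placeholders.
```{r, eval=FALSE}
# Hedged sketch only: pool saved sequence tables from two runs and redo
# chimera removal; the RDS paths are placeholders, not this repository's files.
library(dada2)
st1 <- readRDS("run1/dada2_output_files/seqtab.rds")
st2 <- readRDS("run2/dada2_output_files/seqtab.rds")
st_all <- mergeSequenceTables(st1, st2)            # pool samples from both runs
st_all_nochim <- removeBimeraDenovo(st_all, method = "consensus",
                                    multithread = TRUE, verbose = TRUE)
```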
## Changelog
### Ampseq variant
This variant is geared towards sequencing runs with multiple primers. It takes a list of primers as input for the cutadapt processing instead of a single primer pair.
### v4 update
Comment SWR: Two more chunks were added to integrate [LULU](https://www.nature.com/articles/s41467-017-01312-x) post-processing into this pipeline. The first of the two new chunks is a shell script chunk that generates a match list (the most similar ASVs for each ASV) using vsearch. The match list can also be generated by other means (e.g. BLAST). The second of the new chunks is the core LULU chunk, which takes the seqtab_nochim file from the dada2 analysis and the match list and performs the LULU post-processing with default parameters. After this, a new seqtab_nochim, ASV fasta and RDP taxonomy (optional) are generated in a lulu subdirectory.
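A minimal sketch of that LULU step, assuming the `lulu` R package is installed and a vsearch match list has already been generated; the object and file names (`asv_tab`, `match_list.txt`) are placeholders rather than the pipeline's own variables.
```{r, eval=FALSE}
# Hedged sketch of LULU curation on a dada2 ASV table.
# A match list can be generated beforehand with, for example:
#   vsearch --usearch_global ASVs.fasta --db ASVs.fasta --self --id 0.84 \
#           --userout match_list.txt --userfields query+target+id
library(lulu)
# LULU expects ASVs as rows and samples as columns, with ASV IDs as row names
# that match the names used in the fasta file / match list
asv_tab <- as.data.frame(t(readRDS("seqtab_nochim.rds")))
rownames(asv_tab) <- paste0("ASV", seq_len(nrow(asv_tab)))
match_list <- read.table("match_list.txt", header = FALSE,
                         col.names = c("query", "target", "identity"))
curated <- lulu(asv_tab, match_list)        # default parameters, as in the text above
curated_tab <- curated$curated_table        # post-processed ASV table
```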
#### v4.3
This update addresses inconsistencies in code usage between authors that were the likely cause of occasional warning messages and errors when creating files and directories in-function on some machines/setups. These problems occurred inconsistently and were not reproduced but seemed to be fixed by cleaner file path encoding.
- minor code format fixes:
  - file path concatenation cleaned up by replacing all uses of `paste0()` for file paths with `file.path()` and removing leading and trailing slashes `/`
  - replaced all instances of in-code object assignment using `=` with `<-` to clearly distinguish internal objects from user-assignable objects and argument assignment in functions; objects assigned with `=` are intended for user input, while objects assigned with `<-` are not to be manipulated directly by users (see the short sketch after this list)
- other format fixes:
  - improved cosmetics when knitting by manually adjusting line breaks in code chunks
  - added spaces around `=` and after `,` to conform with best practices
  - more consistent line spacing in code chunks
  - better text formatting in the Markdown text portions
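A short sketch illustrating both conventions; the paths and values are placeholders, not this project's settings.
```{r, eval=FALSE}
# user-assignable settings use '=' (as in the setup chunk); placeholders only
path = "/home/user/data"
marker = "DADA2_test"
# pre-v4.3 style, prone to doubled or missing slashes:
outpath_old <- paste0(path, "/", marker, "/dada2_output_files/")
# v4.3 style: platform-safe separators, no leading or trailing slashes;
# internal objects use '<-'
outpath <- file.path(path, marker, "dada2_output_files")
```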
#### v4.2
- improved cutadapt step with quality control and fwd primer removal; can be run through R in a separate chunk
- filterAndTrim options expanded with trimLeft to remove the first n 5' bases regardless of content (see the sketch below)
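A minimal sketch of the trimLeft option mentioned above; the file names and the value 20 are placeholders, not settings from this pipeline.
```{r, eval=FALSE}
library(dada2)
# remove the first 20 bases of every read regardless of content, on top of
# the usual quality filters (placeholder files and values)
filterAndTrim("sample.fastq.gz", "filtered/sample_filt.fastq.gz",
              trimLeft = 20, maxEE = 2, truncQ = 3,
              compress = TRUE, multithread = TRUE)
```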
#### v4.1
- fixed an error when attempting to plot empty files
- fixed an error when attempting to sample more files than available for plotting
## Setup R Environment
Comment SWR: If the OS is Linux, this chunk now passes the path and marker variables to the shell script chunk, making it unnecessary to specify this separately below and avoiding issues with misspelled variables. It is possible that this also works with Mac OS, but this was not included since no Mac machine was available for testing.
```{r setup, eval=FALSE}
############set paths and marker name
# CHANGE ME to the directory (ABSOLUTE FILEPATH e.g. on Linux /home/usr/...)
# containing the fastq files (can be zipped).
path = "/home/simeon/Documents/PacBio/Barcoded_samples"
#CHANGE ME to the AmpSeq project identifier, will be used for folder naming.
marker = c("DADA2_test")
#set knitr chunk options
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(eval = FALSE)
knitr::opts_chunk$set(tidy = TRUE)
knitr::opts_chunk$set(root.dir = file.path(path))
#load libraries for R session
library(dada2)
library(phyloseq)
library(ggplot2)
library(Biostrings)
library(grid)
library(gridExtra)
library(ShortRead)
library(tidyverse)
# Get info on the operating system. If Linux, make 'path' and 'marker' available
# globally, if not, warn that the script needs additional attention.
os <- Sys.info()['sysname']
if (os == "Linux") {
  Sys.setenv(projectpath = path)
  Sys.setenv(marker = marker)
} else {
  cat(os, "Please run this script on Linux,
it is no longer tested and optimized for Windows")
}
```
# PacBio pre-processing
This is an experimental chunk to test pre-processing of HiFi PacBio samples for DADA2.
```{r PacBio-pre-processing-and-QC}
#### NO FURTHER CHANGES NEEDED IN THIS CHUNK ####
#list of primers (implemented later)
#pacbio_fwd_primer_fasta <- "/home/simeon/Documents/AmpSeq/Test_Katie/primers_fwd.fasta"
#pacbio_rev_primer_fasta <- "/home/simeon/Documents/AmpSeq/Test_Katie/primers_rev.fasta"
#Check if Marker folder exists, otherwise create it
outpath <- file.path(path, marker)
dir.create(outpath)
dir.create(file.path(outpath, "primer_trim"))
dir.create(file.path(outpath, "quality_trim"))
dir.create(file.path(outpath, "dada2_output_files"))
#list all fastq files (zipped or not) in path
pacbio_fastqs <- list.files(path, pattern="fastq.*", full.names=TRUE)
#read primers
#fwds <- Biostrings::readDNAStringSet(pacbio_fwd_primer_fasta)
#revs <- Biostrings::readDNAStringSet(pacbio_rev_primer_fasta)
#no_primers_path <- file.path(outpath, "noprimers", basename(pacbio_fastqs))
#no_primer_reads <- dada2::removePrimers(pacbio_fastqs, no_primers_path, )
lens.fn <- lapply(pacbio_fastqs, function(fn) nchar(dada2::getSequences(fn)))
lens <- do.call(c, lens.fn)
hist(lens, 100)
# SWR: Make a list of fastqs for quality plotting that excludes very small
# files (default is < 10000 bytes), which may cause problems in quality
# plotting and exit the chunk with an error
min_file_size <- function(files, minsize = 10000){
  info <- file.info(files)
  index_bigs <- which(info$size > minsize)
  files[index_bigs]
}
pacbio_fastqs_big <- min_file_size(pacbio_fastqs)
#sample_number <- if (length(pacbio_fastqs_big) < 10) length(pacbio_fastqs_big) else 10
#plts_trim <- lapply(sample(pacbio_fastqs_big, sample_number),
# dada2::plotQualityProfile)
#m_plts <- marrangeGrob(plts_trim, nrow = 2, ncol = 1)
#ggsave(file.path(outpath, "qual_plots.pdf"),
# m_plts, width = 20, height = 26, unit = "cm", dpi = 300)
#print(plts_trim)
```
## Set parameters for dada2
This chunk contains several options for dada2 that will be used in the processing of the trimmed reads. The settings are saved to a file with a time stamp.
```{r, set_params, eval=FALSE}
# CHANGE ME: use "TRUE" to plot quality profiles for each sample, "FALSE" to
# speed up the analysis by skipping this step (plotting takes some time), or
# "SUB" to plot a subset of 10 samples
plotQC = "FALSE"
# CHANGE ME according to the quality of the sequencing run. This determines the
# maximum expected errors for the reads during the filtering step. A reasonable
# value for PacBio HiFi reads is 2.
my_maxEEf = 2
# CHANGE ME according to the quality of the sequencing run. This is a PHRED score
# quality threshold - all sequences will be truncated at the first base with a
# quality score below this value
my_truncQ = 3
# CHANGE ME according to the marker used - this is the minimum length of the
# reads after trimming
my_minLen = 1000
# SWR: CHANGE ME to TRUE and give a value (number of bases) to collapse ASVs
# with an overlap of 'Collapse_minOverlap' (recommended to be the length of the
# shortest ASV that is identical to a longer one). This is only recommended
# in cases where a number of ASVs occur that differ in length but are
# otherwise identical. If this occurs frequently, it may indicate a primer
# trimming issue.
Collapse_overlapping = FALSE
Collapse_minOverlap = 200
#### NO FURTHER CHANGES NEEDED IN THIS CHUNK ####
# DO NOT CHANGE for AmpSeq, unless you are sure what you're doing!
my_minBootstrap = 80
assign_taxonomy = "FALSE"
tax_database = "none"
```
## Processing chunk
#### DO NOT ALTER SCRIPT BEYOND THIS POINT!!!
Comment SWR: This chunk now passes the output directory to a global variable on Linux, so that the shell script chunk that generates the match list can access it.
```{r, dada2, eval=FALSE}
# Write parameter file
parameters <- paste(
  paste("Run on", format(Sys.time(), "%d/%m/%y %H:%M")),
  paste("Path: ", path), "Samples:",
  paste(c(sapply(strsplit(basename(sort(
    list.files(file.path(path, marker),
               pattern = "fastq.gz",
               full.names = TRUE))), "_"), `[`, 1)), collapse = ";"),
  paste("Database type:", tax_database),
  "\ndada2 Parameters\n",
  paste("maximum expected errors R1:", my_maxEEf),
  paste("minimum length: ", my_minLen),
  paste("truncate at first instance of Qscore: ", my_truncQ),
  paste("taxonomy bootstrap threshold: ", my_minBootstrap),
  paste("Collapse overlapping ASVs: ", Collapse_overlapping),
  paste("Minimum overlap if collapsing overlapping ASVs:",
        Collapse_minOverlap),
  sep = "\n")
write.table(parameters, file.path(outpath, paste0("parameters_PacBio_DADA2",
                                                  format(Sys.time(), "%d-%m-%y_%H%M"), ".txt")),
            row.names = FALSE)
# Create lists of forward and reverse file names and a list of sample names
#pacbio_fastqs <- sort(list.files(file.path(path, marker,"trim"),
# pattern = "_R1_001.fastq(.*)trim.fastq.gz",
# full.names = TRUE))
#Extract sample names
sample.names <- sapply(strsplit(basename(pacbio_fastqs), "\\.fastq"), `[`, 1)
######################################################################################
# Forward Reads Only
###################################################################################
# Create paths for the filtered files in a subdirectory called filtered/
filtFsR1 <- file.path(outpath, "quality_filtered",
paste0(sample.names, "_filt.fastq.gz"))
outp <- file.path(outpath, "dada2_output_files")
# Filter sequences
outR1 <- filterAndTrim(pacbio_fastqs, filtFsR1,
maxEE = my_maxEEf, truncQ = my_truncQ, minLen = my_minLen,
maxLen = 9999,
maxN = 0, rm.phix = FALSE, compress = TRUE, multithread = TRUE)
head(outR1)
write.table(outR1, file.path(outp, "out.txt"))
#if (plotQC == "TRUE") {
#Plot Forward/Reverse Quality scores for each sample (not strictly
# necessary, time intensive)
# pl <- lapply(c(fnFL), plotQualityProfile)
# print(pl)
# ml <- marrangeGrob(pl, nrow = 2,ncol = 1)
# ggsave(file.path(outp, "qualityplots_fwd.pdf"),
# ml, width = 20, height = 26, unit = "cm", dpi = 300)
#}
#if (plotQC == "SUB") {
# Plot Forward/Reverse Quality scores for each sample (not strictly
# necessary, time intensive)
# sample_number <- if (length(fnFL) < 10) length(fnFL) else 10
# pl <- lapply(sample(c(fnFL), sample_number), plotQualityProfile)
# print(pl)
# ml <- marrangeGrob(pl, nrow = 2, ncol = 1)
# ggsave(file.path(outp, "qualityplots_fwd.pdf"),
# ml, width = 20, height = 26, unit = "cm", dpi = 300)
#}
# dereplicate
derepFsR1 <- derepFastq(filtFsR1[outR1[,2] > 0], verbose = TRUE)
# Name the derep-class objects by the sample names
names(derepFsR1) <- sample.names[outR1[,2] > 0]
# Train dada2 to your dataset
errFR1 <- learnErrors(derepFsR1, multithread = TRUE,
errorEstimationFunction=PacBioErrfun, BAND_SIZE=32)
saveRDS(errFR1, file.path(outp, "errs.rds"))
#eFplot <- plotErrors(errFR1, nominalQ = TRUE)
#ggsave(file.path(outp, "R1_error_profile.pdf"),
# eFplot, width = 20, height = 26, unit = "cm", dpi = 300)
# sample inference
dadaFsR1 <- dada(derepFsR1, err = errFR1, multithread = TRUE)
# make sequence table and distribution table for sequence lengths
seqtabR1 <- makeSequenceTable(dadaFsR1)
## SWR: If Collapse_overlapping = TRUE, collapse identical sequences together
# based on the overlap specified in 'Collapse_minOverlap'.
if (Collapse_overlapping == TRUE) {
  seqtabR1 <- collapseNoMismatch(seqtabR1, minOverlap = Collapse_minOverlap,
                                 orderBy = "abundance", identicalOnly = FALSE,
                                 vec = TRUE, verbose = FALSE)
}
dim(seqtabR1)
table(nchar(getSequences(seqtabR1)))
# RDP can't assign taxonomy to sequences shorter than 50bp, so filter to > 50bp
seqtabR1 <- seqtabR1[,nchar(colnames(seqtabR1)) > 49]
## SWR: save RDS format seqtab to allow for merging of multiple runs and later
# analysis without rerunning the whole pipeline
saveRDS(seqtabR1, file.path(outp, "seqtab.rds"))
# remove chimeric sequences
seqtabR1.nochim <- removeBimeraDenovo(seqtabR1, method = "consensus",
multithread = TRUE, verbose = TRUE)
dim(seqtabR1.nochim)
# calculate percent of sequences that are non-chimeric
sum(seqtabR1.nochim)/sum(seqtabR1)
# make sure the R01_only output directory exists before writing tables to it
dir.create(file.path(path, marker, "R01_only"), showWarnings = FALSE)
write.table(seqtabR1.nochim, file.path(path, marker,
                                       "R01_only", "seqtabR1.nochim"))
## SWR: additional .RDS output table for easy downstream use in e.g. phyloseq
saveRDS(seqtabR1.nochim, file.path(path, marker,
                                   "R01_only", "seqtab_nochim.rds"))
# output table for each sample
getN <- function(x) sum(getUniques(x))
if (min(outR1[,2]) > 0) {
  trackR1 <- cbind(outR1, sapply(dadaFsR1, getN), rowSums(seqtabR1.nochim),
                   rowSums(seqtabR1.nochim)/outR1[,1]*100)
  colnames(trackR1) <- c("input", "filtered", "denoisedF",
                         "nonchim", "percent_retained")
  rownames(trackR1) <- c(sample.names)
} else {
  trackR1 <- cbind(outR1[outR1[,2] > 0,], sapply(dadaFsR1, getN),
                   rowSums(seqtabR1.nochim),
                   rowSums(seqtabR1.nochim)/outR1[,1][outR1[,2] > 0]*100)
  trackR1 <- rbind(trackR1, cbind(outR1[,1][outR1[,2] == 0],
                                  outR1[,2][outR1[,2] == 0],
                                  NA, NA, NA))
  colnames(trackR1) <- c("input", "filtered", "denoisedF",
                         "nonchim", "percent_retained")
  rownames(trackR1) <- c(sample.names[!outR1[,2] == 0],
                         sample.names[outR1[,2] == 0])
}
head(trackR1)
write.table(trackR1, file.path(outp, "trackR1.txt"))
if (assign_taxonomy == "TRUE") {
  taxaR1 <- assignTaxonomy(seqtabR1.nochim, tax_database, multithread = TRUE,
                           minBoot = my_minBootstrap, outputBootstraps = TRUE)
  sum(rownames(taxaR1[[1]]) == colnames(seqtabR1.nochim)) == nrow(taxaR1[[1]])
  tmp <- cbind(rownames(taxaR1[[1]]), taxaR1[[1]], taxaR1[[2]])
  rownames(tmp) <- paste0("ASV", 1:length(colnames(seqtabR1.nochim)))
  tmp <- data.frame(tmp)
  write.table(tmp, file.path(path, marker,
                             "R01_only", "taxonomy_ASVs_NC_R1.txt"))
  ## SWR: save RDS file omitting bootstrap values for downstream use in
  # e.g. phyloseq
  saveRDS(taxaR1$tax, file.path(outp, "taxa.rds"))
  tmp2 <- cbind(t(seqtabR1.nochim), tmp)
  rownames(tmp2) <- paste0("ASV", 1:length(colnames(seqtabR1.nochim)))
  write.table(tmp2, file.path(path, marker,
                              "R01_only", "seqtabR1.nochim_withtax.txt"))
  # write raw ASVs to a fasta file
  seqsnochimR1 <- DNAStringSet(colnames(seqtabR1.nochim))
  seqsnochimR1@ranges@NAMES <- paste0("ASV", 1:length(
    colnames(seqtabR1.nochim)))
  writeXStringSet(seqsnochimR1, file.path(path, marker,
                                          "R01_only", "ASVs_raw.fasta"),
                  format = "fasta")
  # write ASVs with taxonomy in the header to a fasta file
  seqsnochim <- DNAStringSet(colnames(seqtabR1.nochim))
  seqsnochim@ranges@NAMES <- paste0("ASV",
                                    1:length(colnames(seqtabR1.nochim)),
                                    "|", "|",
                                    paste(tmp$Kingdom, tmp$Phylum, tmp$Class,
                                          tmp$Order, tmp$Family, tmp$Genus,
                                          tmp$Species, sep = ";"))
  writeXStringSet(seqsnochim, file.path(path, marker,
                                        "R01_only", "ASVs_withtax_R1.fasta"),
                  format = "fasta")
} else {
  # write raw ASVs to a fasta file
  seqsnochimR1 <- DNAStringSet(colnames(seqtabR1.nochim))
  seqsnochimR1@ranges@NAMES <- paste0("ASV", 1:length(
    colnames(seqtabR1.nochim)))
  writeXStringSet(seqsnochimR1, file.path(path, marker,
                                          "R01_only", "ASVs_raw.fasta"),
                  format = "fasta")
}
#make paths available to subsequent chunks
dada2_dir <- outp
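# The note above this chunk mentions passing the output directory to the shell
# environment on Linux (as done for 'path' and 'marker' in the setup chunk).
# A minimal sketch of that hand-off; the environment variable name is an
# assumption, not taken from the repository:
if (Sys.info()['sysname'] == "Linux") {
  Sys.setenv(dada2_dir = dada2_dir)
}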
```
# Rysto supplementary code
This repository contains supplementary code and intermediate files for the submitted manuscript "Diversity of the _Rysto_ gene conferring resistance to potato virus Y in wild relatives of potato" by Paluchowska et al.
Raw data deposited in the SRA were demultiplexed using lima (see lima_command.txt), and ASVs were determined from the resulting files using the script in "Dada2_pipeline_var_PacBio_v4-3.Rmd". The resulting ASVs and sequence table are found in the folder "DADA2 output".
```
lima --split-named xxxx.gz barcodes.fasta xxxx.demux.fastq
```