From 08ee62b65402d761e5fc04ec44b48cccb83d2e07 Mon Sep 17 00:00:00 2001
From: simeon <simeon.rossmann@nibio.no>
Date: Fri, 25 Oct 2024 14:49:56 +0200
Subject: [PATCH] ENA parser

---
 ENA submission/ena_parser.Rmd                 | 102 ++++++++++++++++++
 .../fastq2_template_1729686589164.tsv         |   2 +
 2 files changed, 104 insertions(+)
 create mode 100644 ENA submission/ena_parser.Rmd
 create mode 100644 ENA submission/fastq2_template_1729686589164.tsv

diff --git a/ENA submission/ena_parser.Rmd b/ENA submission/ena_parser.Rmd
new file mode 100644
index 0000000..243a876
--- /dev/null
+++ b/ENA submission/ena_parser.Rmd
@@ -0,0 +1,102 @@
+---
+title: "ena parser"
+author: "Simeon Lim Rossmann"
+date: "2024-10-23"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+library(tidyverse)
+```
+
+```{r fill tbl}
+# ENA read-submission template: the first line is the file-type header,
+# the second line holds the column names.
+template <- "fastq2_template_1729686589164.tsv"
+
+# Split the header on tabs only, so that "Read submission file type" stays
+# a single field.
+header <- scan(
+  template, nlines = 1, what = character(), sep = "\t", quiet = TRUE
+)
+
+# Library settings shared by every row of the submission.
+constants <- tibble(
+  instrument_model = "Illumina MiSeq",
+  library_source = "METAGENOMIC",
+  library_selection = "PCR",
+  library_strategy = "AMPLICON",
+  library_layout = "PAIRED"
+)
+
+# Template columns filled with the shared settings; the join keys are given
+# explicitly so the join does not depend on (or report) guessed keys.
+tbl <- readr::read_tsv(template, skip = 1) %>%
+  dplyr::right_join(constants, by = names(constants))
+
+# Build the ENA read-submission rows for one marker.
+#
+# marker:  name of the marker sub-directory (e.g. "16S").
+# run_dir: directory with one sub-directory per marker; fastq files are
+#          read from <run_dir>/<marker>/raw_data.
+# Returns a tibble with one row per forward/reverse fastq pair. Stops if
+# the directory is missing or the R1 and R2 listings do not pair up.
+file_reader <- function(
+    marker,
+    run_dir = "~/Documents/Marte_Metabarcoding/Run_August21"
+) {
+  raw_dir <- file.path(run_dir, marker, "raw_data")
+  if (!dir.exists(raw_dir)) {
+    stop("Raw data directory not found: ", raw_dir, call. = FALSE)
+  }
+
+  # Dots are escaped and the pattern is anchored so that only files ending
+  # in the exact suffix match.
+  fs <- list.files(raw_dir, pattern = "R1_001\\.fastq\\.gz$")
+  rs <- list.files(raw_dir, pattern = "R2_001\\.fastq\\.gz$")
+
+  # Row i pairs fs[i] with rs[i], so both listings must name the same
+  # samples in the same order.
+  paired <- identical(
+    sub("R1_001\\.fastq\\.gz$", "", fs),
+    sub("R2_001\\.fastq\\.gz$", "", rs)
+  )
+  if (!paired) {
+    stop("Unpaired R1/R2 fastq files in ", raw_dir, call. = FALSE)
+  }
+
+  # tools::md5sum() is vectorised; unname() drops the file-path names so
+  # the columns are plain character vectors, also when no files are found.
+  md5_f <- unname(tools::md5sum(file.path(raw_dir, fs)))
+  md5_r <- unname(tools::md5sum(file.path(raw_dir, rs)))
+
+  # Sample id: replace everything up to the last "_S" with "S", then drop
+  # "_L001" and what follows ("x_S12_L001_R1_001.fastq.gz" gives "S12").
+  samp <- fs %>%
+    stringr::str_replace(".*_S", "S") %>%
+    stringr::str_remove("_L001.*")
+
+  tibble(
+    sample = samp,
+    # NOTE(review): looks like a placeholder for the study accession.
+    study = "ENA_acc",
+    instrument_model = "Illumina MiSeq",
+    library_name = paste0("NIBIO_mpt_", marker),
+    library_source = "METAGENOMIC",
+    library_selection = "PCR",
+    library_strategy = "AMPLICON",
+    library_layout = "PAIRED",
+    forward_file_name = fs,
+    forward_file_md5 = md5_f,
+    reverse_file_name = rs,
+    reverse_file_md5 = md5_r
+  )
+}
+
+markers <- c("Nems", "16S", "FITS1", "FITS2", "OITS", "Trich")
+
+# One tibble per marker, stacked into a single submission table.
+tb <- lapply(markers, file_reader) %>%
+  bind_rows()
+```
\ No newline at end of file
diff --git a/ENA submission/fastq2_template_1729686589164.tsv b/ENA submission/fastq2_template_1729686589164.tsv
new file mode 100644
index 0000000..da3b31c
--- /dev/null
+++ b/ENA submission/fastq2_template_1729686589164.tsv
@@ -0,0 +1,2 @@
+FileType	fastq	Read submission file type
+sample	study	instrument_model	library_name	library_source	library_selection	library_strategy	library_layout	forward_file_name	forward_file_md5	reverse_file_name	reverse_file_md5
-- 
GitLab