From 08ee62b65402d761e5fc04ec44b48cccb83d2e07 Mon Sep 17 00:00:00 2001
From: simeon <simeon.rossmann@nibio.no>
Date: Fri, 25 Oct 2024 14:49:56 +0200
Subject: [PATCH] ENA parser

---
 ENA submission/ena_parser.Rmd                 | 69 +++++++++++++++++++
 .../fastq2_template_1729686589164.tsv         |  2 +
 2 files changed, 71 insertions(+)
 create mode 100644 ENA submission/ena_parser.Rmd
 create mode 100644 ENA submission/fastq2_template_1729686589164.tsv

diff --git a/ENA submission/ena_parser.Rmd b/ENA submission/ena_parser.Rmd
new file mode 100644
index 0000000..243a876
--- /dev/null
+++ b/ENA submission/ena_parser.Rmd	
@@ -0,0 +1,69 @@
+---
+title: "ENA parser"
+author: "Simeon Lim Rossmann"
+date: "2024-10-23"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+library(tidyverse)
+```
+
+```{r fill tbl}
+# ENA fastq2 template: line 1 is the FileType record, line 2 the column names.
+template <- "fastq2_template_1729686589164.tsv"
+# Split line 1 on tabs only so "Read submission file type" stays one field.
+header <- scan(template, what = character(), sep = "\t", nlines = 1, quiet = TRUE)
+defaults <- tibble(
+  instrument_model = "Illumina MiSeq", library_source = "METAGENOMIC",
+  library_selection = "PCR", library_strategy = "AMPLICON", library_layout = "PAIRED")
+# Read every column as character; right_join() keeps all rows of `defaults`.
+tbl <- readr::read_tsv(template, skip = 1, col_types = readr::cols(.default = "c")) %>%
+  dplyr::right_join(defaults, by = names(defaults))
+
+#' Build the ENA read-manifest rows for one marker's paired-end FASTQ files.
+#'
+#' @param marker Name of the marker sub-directory (e.g. "16S") under `base_dir`.
+#' @param base_dir Run directory that holds one `<marker>/raw_data` folder each.
+#' @return A tibble with one row per read pair and the columns of the ENA
+#'   fastq2 template; the md5 columns are plain (unnamed) character vectors.
+file_reader <- function(marker,
+                        base_dir = "~/Documents/Marte_Metabarcoding/Run_August21") {
+  raw_dir <- file.path(base_dir, marker, "raw_data")
+  # Escape the dots and anchor at the end so only names ending in .fastq.gz match.
+  fs <- list.files(raw_dir, pattern = "R1_001\\.fastq\\.gz$")
+  rs <- list.files(raw_dir, pattern = "R2_001\\.fastq\\.gz$")
+  # Forward and reverse files are listed independently; refuse to continue if
+  # they do not line up one-to-one, otherwise rows would pair the wrong files.
+  if (!identical(sub("R1_001\\.fastq\\.gz$", "R2_001.fastq.gz", fs), rs)) {
+    stop("Unpaired R1/R2 fastq files in ", raw_dir, call. = FALSE)
+  }
+  # Sample id: text from the last "_S" up to "_L001", e.g. "x_S12_L001_R1..." -> "S12".
+  samp <- stringr::str_replace(fs, ".*_S", "S") %>%
+    stringr::str_remove("_L001.*")
+  tibble(sample = samp,
+         study = "ENA_acc",
+         instrument_model = "Illumina MiSeq",
+         library_name = paste0("NIBIO_mpt_", marker),
+         library_source = "METAGENOMIC",
+         library_selection = "PCR",
+         library_strategy = "AMPLICON",
+         library_layout = "PAIRED",
+         forward_file_name = fs,
+         # md5sum() is vectorised; unname() drops the file-path names it attaches.
+         forward_file_md5 = unname(tools::md5sum(file.path(raw_dir, fs))),
+         reverse_file_name = rs,
+         reverse_file_md5 = unname(tools::md5sum(file.path(raw_dir, rs))))
+}
+
+# Marker names; each is handed to file_reader() as its `marker` argument.
+markers <- c(
+  "Nems", "16S", "FITS1",
+  "FITS2", "OITS", "Trich"
+)
+
+# Run file_reader() once per marker and stack the resulting tables row-wise
+# into a single table.
+tb <- bind_rows(lapply(markers, file_reader))
+```
\ No newline at end of file
diff --git a/ENA submission/fastq2_template_1729686589164.tsv b/ENA submission/fastq2_template_1729686589164.tsv
new file mode 100644
index 0000000..da3b31c
--- /dev/null
+++ b/ENA submission/fastq2_template_1729686589164.tsv	
@@ -0,0 +1,2 @@
+FileType	fastq	Read submission file type
+sample	study	instrument_model	library_name	library_source	library_selection	library_strategy	library_layout	forward_file_name	forward_file_md5	reverse_file_name	reverse_file_md5
-- 
GitLab