From a9bca3ea476d75191a71a86415139752ff506866 Mon Sep 17 00:00:00 2001
From: Simeon <51403284+simeross@users.noreply.github.com>
Date: Fri, 22 Nov 2024 11:30:20 +0100
Subject: [PATCH] sra parser

---
 .../SRA_metadata.txt                          |  0
 SRA submission/SRA_submission.tsv             |  1 +
 SRA submission/sra_parser.Rmd                 | 91 +++++++++++++++++++
 3 files changed, 92 insertions(+)
 rename SRA_metadata.txt => SRA submission/SRA_metadata.txt (100%)
 create mode 100644 SRA submission/SRA_submission.tsv
 create mode 100644 SRA submission/sra_parser.Rmd

diff --git a/SRA_metadata.txt b/SRA submission/SRA_metadata.txt
similarity index 100%
rename from SRA_metadata.txt
rename to SRA submission/SRA_metadata.txt
diff --git a/SRA submission/SRA_submission.tsv b/SRA submission/SRA_submission.tsv
new file mode 100644
index 0000000..9eaa882
--- /dev/null
+++ b/SRA submission/SRA_submission.tsv	
@@ -0,0 +1 @@
+sample_name	library_ID	title	library_strategy	library_source	library_selection	library_layout	platform	instrument_model	design_description	filetype	filename	filename2	filename3	filename4	assembly	fasta_file	sample
diff --git a/SRA submission/sra_parser.Rmd b/SRA submission/sra_parser.Rmd
new file mode 100644
index 0000000..bb4977f
--- /dev/null
+++ b/SRA submission/sra_parser.Rmd	
@@ -0,0 +1,91 @@
+---
+title: "sra parser"
+author: "Simeon Lim Rossmann"
+date: "2024-11-22"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)  # echo each chunk's source in the knitted report
+library(tidyverse)  # attaches %>%, tibble(), the dplyr joins and write_tsv() used unprefixed below
+```
+
+```{r fill tbl}
+# Path of the tab-separated SRA metadata template (relative to the working dir).
+template <- "SRA_metadata.txt"
+# right_join() keeps every row of the one-row constants tibble and lays the
+# result out with the template's columns first.
+tbl <- dplyr::right_join(
+  readr::read_tsv(template),
+  tibble(
+    instrument_model = "Illumina MiSeq", platform = "ILLUMINA",
+    library_source = "METAGENOMIC", library_selection = "PCR",
+    library_strategy = "AMPLICON", library_layout = "PAIRED", filetype = "fastq"
+  )
+)
+
+#' Build the SRA metadata rows for one marker's paired-end FASTQ files.
+#'
+#' @param marker Character vector: [1] marker folder name (also used in the
+#'   library ID and description), [2] target wording for the description.
+#' @param run_dir Directory that holds one `<marker>/raw_data` folder per marker.
+#' @return A tibble with one row per R1 file: sample, library_ID, the fixed
+#'   library/platform fields, design_description, filename (R1), filename2 (R2).
+file_reader <- function(marker,
+                        run_dir = "~/Documents/Marte_Metabarcoding/Run_August21") {
+  raw_dir <- file.path(run_dir, marker[1], "raw_data")
+  # Dots are escaped and the suffix anchored so that only FASTQ archives match
+  # (the bare pattern would also accept e.g. "...R1_001.fastq.gz.md5").
+  r1_suffix <- "R1_001\\.fastq\\.gz$"
+  fs <- list.files(raw_dir, pattern = r1_suffix)
+  rs_found <- list.files(raw_dir, pattern = "R2_001\\.fastq\\.gz$")
+  if (length(fs) == 0) {
+    warning("No R1 FASTQ files found in ", raw_dir, call. = FALSE)
+  }
+
+  # Derive each mate name from its R1 file instead of trusting that two
+  # independent listings line up row by row, and fail on incomplete pairs.
+  rs <- sub(r1_suffix, "R2_001.fastq.gz", fs)
+  unpaired <- union(setdiff(rs, rs_found), setdiff(rs_found, rs))
+  if (length(unpaired) > 0) {
+    stop("Unpaired reads in ", raw_dir, ": ", toString(unpaired), call. = FALSE)
+  }
+
+  # e.g. "<prefix>_S12_L001_R1_001.fastq.gz" -> "S12"
+  samp <- stringr::str_remove(stringr::str_replace(fs, ".*_S", "S"), "_L001.*")
+
+  tibble(
+    sample = samp,
+    library_ID = paste0("NIBIO_mpt_", marker[1], "_", samp),
+    instrument_model = "Illumina MiSeq", platform = "ILLUMINA",
+    library_source = "METAGENOMIC", library_selection = "PCR",
+    library_strategy = "AMPLICON", library_layout = "PAIRED", filetype = "fastq",
+    design_description = paste0(
+      "Total DNA was extracted from 45 mL soil samples. ",
+      "Each soil sample and DNA from the appropriate ",
+      'mock control was amplified with "', marker[1],
+      '" PCR primers targeting the amplification ',
+      "of ", marker[2], " sequences. Samples were ",
+      "indexed in the amplification PCR (1-step) ",
+      "and demultiplexed by the MiSeq."
+    ),
+    filename = fs, filename2 = rs
+  )
+}
+
+# Each entry: c(<marker sub-folder, reused in library_ID>, <target wording>).
+markers <- list(
+  c("Nems", "nematode 18S"), c("16S", "bacterial 16S"),
+  c("FITS1", "fungal ITS1"), c("FITS2", "fungal ITS2"),
+  c("OITS", "oomycete ITS1"), c("Trich", "Trichodoridae 18S")
+)
+tb <- bind_rows(lapply(markers, file_reader))
+### Fuse back to the template so the output uses the template's column layout.
+### NOTE(review): `sample` looks absent from the template; should it be `sample_name`?
+tbl <- right_join(tbl, tb, by = intersect(names(tbl), names(tb)))
+# Compare tb's columns only: tbl also carries template-only columns.
+same <- all.equal(as.data.frame(tbl[names(tb)]), as.data.frame(tb),
+                  check.attributes = FALSE)
+if (!isTRUE(same)) warning("Join changed the parsed rows: ", toString(same))
+write_tsv(tbl, "SRA_submission.tsv", na = "", col_names = TRUE)  # blank, not "NA"
+```
\ No newline at end of file
-- 
GitLab