From a9bca3ea476d75191a71a86415139752ff506866 Mon Sep 17 00:00:00 2001
From: Simeon <51403284+simeross@users.noreply.github.com>
Date: Fri, 22 Nov 2024 11:30:20 +0100
Subject: [PATCH] sra parser

---
 .../SRA_metadata.txt                          |   0
 SRA submission/SRA_submission.tsv             |   1 +
 SRA submission/sra_parser.Rmd                 | 108 ++++++++++++++++++
 3 files changed, 109 insertions(+)
 rename SRA_metadata.txt => SRA submission/SRA_metadata.txt (100%)
 create mode 100644 SRA submission/SRA_submission.tsv
 create mode 100644 SRA submission/sra_parser.Rmd

diff --git a/SRA_metadata.txt b/SRA submission/SRA_metadata.txt
similarity index 100%
rename from SRA_metadata.txt
rename to SRA submission/SRA_metadata.txt
diff --git a/SRA submission/SRA_submission.tsv b/SRA submission/SRA_submission.tsv
new file mode 100644
index 0000000..9eaa882
--- /dev/null
+++ b/SRA submission/SRA_submission.tsv
@@ -0,0 +1 @@
+sample_name	library_ID	title	library_strategy	library_source	library_selection	library_layout	platform	instrument_model	design_description	filetype	filename	filename2	filename3	filename4	assembly	fasta_file	sample
diff --git a/SRA submission/sra_parser.Rmd b/SRA submission/sra_parser.Rmd
new file mode 100644
index 0000000..bb4977f
--- /dev/null
+++ b/SRA submission/sra_parser.Rmd
@@ -0,0 +1,108 @@
+---
+title: "sra parser"
+author: "Simeon Lim Rossmann"
+date: "2024-11-22"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+library(tidyverse)
+```
+
+```{r fill tbl}
+# Read the SRA metadata template. Every column is read as character so the
+# join below cannot fail on column types guessed from an empty template.
+template <- "SRA_metadata.txt"
+tbl <- readr::read_tsv(
+  template,
+  col_types = readr::cols(.default = readr::col_character())
+)
+
+#' Build the SRA metadata rows for the FASTQ files of one marker.
+#'
+#' @param marker Character vector of length two: the name of the marker
+#'   directory and a description of the amplified target.
+#' @param run_dir Directory with one sub-directory per marker, each holding a
+#'   `raw_data` folder with the demultiplexed FASTQ files.
+#' @return A tibble with one row per forward/reverse FASTQ file pair.
+file_reader <- function(
+    marker,
+    run_dir = "~/Documents/Marte_Metabarcoding/Run_August21") {
+  raw_dir <- file.path(run_dir, marker[1], "raw_data")
+
+  # Escape the dots and anchor the suffix so that only FASTQ files match.
+  fs <- list.files(raw_dir, pattern = "R1_001\\.fastq\\.gz$")
+  rs_found <- list.files(raw_dir, pattern = "R2_001\\.fastq\\.gz$")
+
+  if (length(fs) == 0) {
+    warning("No R1 FASTQ files found in ", raw_dir, call. = FALSE)
+  }
+
+  # Derive each reverse file name from its forward file so that filename and
+  # filename2 always belong to the same sample, and fail on unpaired files.
+  rs <- stringr::str_replace(fs, "R1_001\\.fastq\\.gz$", "R2_001.fastq.gz")
+  unpaired <- c(setdiff(rs, rs_found), setdiff(rs_found, rs))
+  if (length(unpaired) > 0) {
+    stop(
+      "Unpaired reverse FASTQ files in ", raw_dir, ": ",
+      paste(unpaired, collapse = ", "),
+      call. = FALSE
+    )
+  }
+
+  samp <- fs %>%
+    stringr::str_replace(".*_S", "S") %>%
+    stringr::str_remove("_L001.*")
+
+  lib_name <- paste0("NIBIO_mpt_", marker[1], "_", samp)
+
+  # NOTE(review): the template column is `sample_name`; `sample` ends up as an
+  # extra column at the end of the output. Confirm this is intended.
+  tibble(
+    sample = samp,
+    library_ID = lib_name,
+    instrument_model = "Illumina MiSeq",
+    platform = "ILLUMINA",
+    library_source = "METAGENOMIC",
+    library_selection = "PCR",
+    library_strategy = "AMPLICON",
+    library_layout = "PAIRED",
+    filetype = "fastq",
+    design_description = paste0(
+      "Total DNA was extracted from 45 mL soil samples. ",
+      "Each soil sample and DNA from the appropriate ",
+      'mock control was amplified with "', marker[1],
+      '" PCR primers targeting the amplification ',
+      "of ", marker[2], " sequences. Samples were ",
+      "indexed in the amplification PCR (1-step) ",
+      "and demultiplexed by the MiSeq."
+    ),
+    filename = fs,
+    filename2 = rs
+  )
+}
+
+markers <- list(
+  c("Nems", "nematode 18S"),
+  c("16S", "bacterial 16S"),
+  c("FITS1", "fungal ITS1"),
+  c("FITS2", "fungal ITS2"),
+  c("OITS", "oomycete ITS1"),
+  c("Trich", "Trichodoridae 18S")
+)
+
+tb <- lapply(markers, file_reader) %>%
+  bind_rows()
+
+### Fuse back to template to make sure all column names are identical
+tbl <- right_join(tbl, tb, by = intersect(names(tbl), names(tb)))
+
+# The join must leave the generated values untouched; stop on a mismatch
+# instead of printing a report that is easy to miss in the knitted output.
+stopifnot(
+  isTRUE(all.equal(as.data.frame(tbl[names(tb)]), as.data.frame(tb)))
+)
+
+write_tsv(tbl, "SRA_submission.tsv", col_names = TRUE)
+```
-- 
GitLab