clustering threshold test for meshclust

e1761832 · Simeon · 2a6eb09c · e1761832 · e1761832
Commit e1761832 authored 1 year ago by Simeon
--- a/R/test_clustering_thresholds.R
+++ b/R/test_clustering_thresholds.R
@@ -7,6 +7,11 @@
 #' @param step_size The step size for the threshold values. Default is 0.01.
 #' @param step_max The maximum threshold value. Default is 0.99.
 #' @param ncores The number of cores to use for parallel processing. Default is 1.
+#' @param method Method used for clustering, can be "Clusterize" (default) or a
+#' file path to the location of meshclust on the user's machine, e.g. obtained by
+#' running `which meshclust`, see also \code{\link{meshclustR}}.
+#' @param meshclust_temp_dir temporary directory for meshclust files.
+#' Temporary files will be removed after running the function.
 #'
 #' @return A list of clustering results, where each element in the list corresponds to a specific threshold value.
 #'
@@ -22,28 +27,45 @@
 #' results <- test_clustering_thresholds(my_sequences, step_size = 0.1, step_max = 0.9, ncores = 2)
 #' @export
 test_clustering_thresholds <- function(MyDNAstring, step_size,
-                                       step_max = 0.99, ncores = 1) {
+                                       step_max = 0.99, ncores = 1,
+                                       method = "Clusterize",
+                                       meshclust_temp_dir = "tmp") {

-  #DECIPHER removed the 'type' argument from IdClusters around v2.24. Currently (Feb23), the function is renamed "Clusterize"
-  if(numeric_version(packageVersion("DECIPHER")) < 2.24){
-    clus_tbl <- IdClusters(method = "inexact", myXStringSet = MyDNAstring,
-                           cutoff = seq(0, step_max, step_size),
-                           processors = ncores,
-                           verbose = FALSE)
+  if(method == "Clusterize"){
+    #DECIPHER removed the 'type' argument from IdClusters around v2.24. Currently (Feb23), the function is renamed "Clusterize"
+    if(numeric_version(packageVersion("DECIPHER")) < "2.24"){ # compare against a version string, not a double literal
+      clus_tbl <- IdClusters(method = "inexact", myXStringSet = MyDNAstring,
+                             cutoff = seq(0, step_max, step_size),
+                             processors = ncores,
+                             verbose = FALSE)
+    }else{
+      clus_tbl <- Clusterize(MyDNAstring,
+                             cutoff = seq(0, step_max, step_size),
+                             processors = ncores,
+                             verbose = FALSE)
+    }
+
+    clus_tbl_list <- as_tibble(clus_tbl, rownames = "seqnames") %>%
+      pivot_longer(cols = -seqnames, names_to = "cutoff", values_to = "cluster") %>%
+      group_by(cutoff) %>%
+      group_split(.keep = FALSE)
+
+  }else if(file.exists(method)){ # 'method' is the path to the meshclust binary (a file), so dir.exists() would reject it
+    preexisting <- dir.exists(meshclust_temp_dir) # remember whether the caller already had this directory
+    if(!preexisting) dir.create(meshclust_temp_dir) # only create the directory when it is missing
+    clus_tbl_list <- lapply(seq(0, step_max, step_size), # one meshclustR() call per threshold; same name as the Clusterize branch
+                             meshclustR, seqs = MyDNAstring,
+                             filepath = meshclust_temp_dir,
+                             meshclust_bin = method)
+    if(!preexisting){
+      unlink(meshclust_temp_dir, recursive = TRUE) # unlink() only deletes directories when recursive = TRUE
+    }
  }else{
-    clus_tbl <- Clusterize(MyDNAstring,
-                           cutoff = seq(0, step_max, step_size),
-                           processors = ncores,
-                           verbose = FALSE)
+    stop("'method' argument needs to be 'Clusterize' or the file path to the meshclust bin ('which meshclust' in shell).")
  }

-  steps_temp <- seq(0, step_max, step_size)
-  clus_tbl_list <- as_tibble(clus_tbl, rownames = "seqnames") %>%
-    pivot_longer(cols = -seqnames, names_to = "cutoff", values_to = "cluster") %>%
-    group_by(cutoff) %>%
-    group_split(.keep = FALSE)
-

+  steps_temp <- seq(0, step_max, step_size) # threshold values, used to name the result list
  names(clus_tbl_list) <- steps_temp
  return(clus_tbl_list)
 }
--- a/man/test_clustering_thresholds.Rd
+++ b/man/test_clustering_thresholds.Rd
@@ -4,7 +4,14 @@
 \alias{test_clustering_thresholds}
 \title{Iterate over threshold values}
 \usage{
-test_clustering_thresholds(MyDNAstring, step_size, step_max = 0.99, ncores = 1)
+test_clustering_thresholds(
+  MyDNAstring,
+  step_size,
+  step_max = 0.99,
+  ncores = 1,
+  method = "Clusterize",
+  meshclust_temp_dir = "tmp"
+)
 }
 \arguments{
 \item{MyDNAstring}{A DNAStringSet object containing the DNA sequences to be clustered.}
@@ -14,6 +21,13 @@ test_clustering_thresholds(MyDNAstring, step_size, step_max = 0.99, ncores = 1)
 \item{step_max}{The maximum threshold value. Default is 0.99.}

 \item{ncores}{The number of cores to use for parallel processing. Default is 1.}
+
+\item{method}{Method used for clustering, can be "Clusterize" (default) or a
+file path to the location of meshclust on the user's machine, e.g. obtained by
+running `which meshclust`, see also \code{\link{meshclustR}}.}
+
+\item{meshclust_temp_dir}{temporary directory for meshclust files.
+Temporary files will be removed after running the function.}
 }
 \value{
 A list of clustering results, where each element in the list corresponds to a specific threshold value.