diff --git a/R/test_clustering_thresholds.R b/R/test_clustering_thresholds.R index 766885c90090854105eaeaadae5d32f846143a00..203cbb82ef35727398a61cfc0237111c690ec9ac 100644 --- a/R/test_clustering_thresholds.R +++ b/R/test_clustering_thresholds.R @@ -7,6 +7,11 @@ #' @param step_size The step size for the threshold values. Default is 0.01. #' @param step_max The maximum threshold value. Default is 0.99. #' @param ncores The number of cores to use for parallel processing. Default is 1. +#' @param method Method used for clustering, can be "Clusterize" (default) or a +#' file path to the location of meshclust on the users machine, e.g. obtained by +#' running `which meshclust`, see also \code{\link{meshclustR}}. +#' @param meshclust_temp_dir temporary directory for meshclust files. +#' Temporary files will be removed after running the function. #' #' @return A list of clustering results, where each element in the list corresponds to a specific threshold value. #' @@ -22,28 +27,45 @@ #' results <- test_clustering_thresholds(my_sequences, step_size = 0.1, step_max = 0.9, ncores = 2) #' @export test_clustering_thresholds <- function(MyDNAstring, step_size, - step_max = 0.99, ncores = 1) { + step_max = 0.99, ncores = 1, + method = "Clusterize", + meshclust_temp_dir = "tmp") { - #DECIPHER removed the 'type' argument from IdClusters around v2.24. Currently (Feb23), the function is renamed "Clusterize" - if(numeric_version(packageVersion("DECIPHER")) < 2.24){ - clus_tbl <- IdClusters(method = "inexact", myXStringSet = MyDNAstring, - cutoff = seq(0, step_max, step_size), - processors = ncores, - verbose = FALSE) + if(method == "Clusterize"){ + #DECIPHER removed the 'type' argument from IdClusters around v2.24. Currently (Feb23), the function is renamed "Clusterize" + if(numeric_version(packageVersion("DECIPHER")) < 2.24){ + clus_tbl <- IdClusters(method = "inexact", myXStringSet = MyDNAstring, + cutoff = seq(0, step_max, step_size), + processors = ncores, + verbose = FALSE) + }else{ + clus_tbl <- Clusterize(MyDNAstring, + cutoff = seq(0, step_max, step_size), + processors = ncores, + verbose = FALSE) + } + + clus_tbl_list <- as_tibble(clus_tbl, rownames = "seqnames") %>% + pivot_longer(cols = -seqnames, names_to = "cutoff", values_to = "cluster") %>% + group_by(cutoff) %>% + group_split(.keep = FALSE) + + }else if(dir.exists(method)){ + preexisting <- dir.exists(meshclust_temp_dir) + dir.create(meshclust_temp_dir) + clust_tbl_list <- lapply(seq(0, step_max, step_size), + meshclustR, seqs = MyDNAstring, + filepath = meshclust_temp_dir, + meshclust_bin = method) + if(!preexisting){ + unlink(meshclust_temp_dir) + } }else{ - clus_tbl <- Clusterize(MyDNAstring, - cutoff = seq(0, step_max, step_size), - processors = ncores, - verbose = FALSE) + stop("'method' argument needs to be 'Clusterize' or the file path to the meshclust bin ('which meshclust' in shell).") } - steps_temp <- seq(0, step_max, step_size) - clus_tbl_list <- as_tibble(clus_tbl, rownames = "seqnames") %>% - pivot_longer(cols = -seqnames, names_to = "cutoff", values_to = "cluster") %>% - group_by(cutoff) %>% - group_split(.keep = FALSE) - + steps_temp <- seq(0, step_max, step_size) names(clus_tbl_list) <- steps_temp return(clus_tbl_list) } diff --git a/man/test_clustering_thresholds.Rd b/man/test_clustering_thresholds.Rd index 3081ae33a2ce5af5014e4d69834e9cdf0c2734e6..88e2194527332181db60e8393bf5d32d376cd1a1 100644 --- a/man/test_clustering_thresholds.Rd +++ b/man/test_clustering_thresholds.Rd @@ -4,7 +4,14 @@ \alias{test_clustering_thresholds} \title{Iterate over threshold values} \usage{ -test_clustering_thresholds(MyDNAstring, step_size, step_max = 0.99, ncores = 1) +test_clustering_thresholds( + MyDNAstring, + step_size, + step_max = 0.99, + ncores = 1, + method = "Clusterize", + meshclust_temp_dir = "tmp" +) } \arguments{ \item{MyDNAstring}{A DNAStringSet object containing the DNA sequences to be clustered.} @@ -14,6 +21,13 @@ test_clustering_thresholds(MyDNAstring, step_size, step_max = 0.99, ncores = 1) \item{step_max}{The maximum threshold value. Default is 0.99.} \item{ncores}{The number of cores to use for parallel processing. Default is 1.} + +\item{method}{Method used for clustering, can be "Clusterize" (default) or a +file path to the location of meshclust on the users machine, e.g. obtained by +running `which meshclust`, see also \code{\link{meshclustR}}.} + +\item{meshclust_temp_dir}{temporary directory for meshclust files. +Temporary files will be removed after running the function.} } \value{ A list of clustering results, where each element in the list corresponds to a specific threshold value.