#' Iterate over threshold values #' #' This function iterates over a range of threshold values and performs clustering #' on a DNA string dataset. #' #' @param MyDNAstring A DNAStringSet object containing the DNA sequences to be clustered. #' @param step_size The step size for the threshold values. Default is 0.01. #' @param step_max The maximum threshold value. Default is 0.99. #' @param ncores The number of cores to use for parallel processing. Default is 1. #' @param method Method used for clustering, can be "Clusterize" (default) or a #' file path to the location of meshclust on the users machine, e.g. obtained by #' running `which meshclust`, see also \code{\link{meshclustR}}. #' @param meshclust_temp_dir temporary directory for meshclust files. #' Temporary files will be removed after running the function. #' @param ... pass on any additional strings to the meshclust commandline tool. #' #' @return A list of clustering results, where each element in the list corresponds to a specific threshold value. #' #' @import DECIPHER #' @import dplyr tibble tidyr #' @importFrom Biostrings DNAStringSet #' #' @examples #' # Create a DNAStringSet object #' my_sequences <- DNAStringSet(c("ACGT", "AGCT", "ATGC")) #' #' # Run the clustering function #' results <- test_clustering_thresholds(my_sequences, step_size = 0.1, step_max = 0.9, ncores = 2) #' @export test_clustering_thresholds <- function(MyDNAstring, step_size, step_max = 0.99, ncores = 1, method = "Clusterize", meshclust_temp_dir = "tmp", ...) { if(method == "Clusterize"){ #DECIPHER removed the 'type' argument from IdClusters around v2.24. Currently (Feb23), the function is renamed "Clusterize" if(numeric_version(packageVersion("DECIPHER")) < 2.24){ clus_tbl <- IdClusters(method = "inexact", myXStringSet = MyDNAstring, cutoff = seq(0, step_max, step_size), processors = ncores, verbose = FALSE) }else{ clus_tbl <- Clusterize(MyDNAstring, cutoff = seq(0, step_max, step_size), processors = ncores, verbose = FALSE) } clus_tbl_list <- as_tibble(clus_tbl, rownames = "seqnames") %>% pivot_longer(cols = -seqnames, names_to = "cutoff", values_to = "cluster") %>% group_by(cutoff) %>% group_split(.keep = FALSE) }else if(file.exists(method)){ preexisting <- dir.exists(meshclust_temp_dir) dir.create(meshclust_temp_dir) clus_tbl_list <- lapply(seq(0, step_max, step_size), meshclustR, seqs = MyDNAstring, filepath = meshclust_temp_dir, meshclust_bin = method, ...) if(!preexisting){ unlink(meshclust_temp_dir) } }else{ stop("'method' argument needs to be 'Clusterize' or the absolute file path to the meshclust bin ('which meshclust' in shell).") } steps_temp <- seq(0, step_max, step_size) names(clus_tbl_list) <- steps_temp return(clus_tbl_list) }