# test_clustering_thresholds.R

  
#' Cluster DNA sequences over a range of threshold values
#'
#' This function iterates over a range of threshold values and performs clustering
#' on a DNA string dataset.
#'
#' @param MyDNAstring A DNAStringSet object containing the DNA sequences to be clustered.
#' @param step_size The step size for the threshold values. Default is 0.01.
#' @param step_max The maximum threshold value. Default is 0.99.
#' @param ncores The number of cores to use for parallel processing. Default is 1.
#'
#' @return A named list of tibbles, one per threshold value. Each tibble has the columns "seqnames" and "cluster", and the list names are the threshold values.
#'
#' @import DECIPHER
#' @import tibble
#' @import dplyr
#' @import tidyr
#'
#' @examples
#' # Create a DNAStringSet object
#' my_sequences <- DNAStringSet(c("ACGT", "AGCT", "ATGC"))
#'
#' # Run the clustering function
#' results <- test_clustering_thresholds(my_sequences, step_size = 0.1, step_max = 0.9, ncores = 2)
#' @export
test_clustering_thresholds <- function(MyDNAstring, step_size = 0.01,
                                       step_max = 0.99, ncores = 1) {
  # Validate the grid parameters up front so a bad value fails with a clear
  # message instead of an obscure error from seq() or from DECIPHER.
  if (!is.numeric(step_size) || length(step_size) != 1L ||
      !is.finite(step_size) || step_size <= 0) {
    stop("`step_size` must be a single positive number.", call. = FALSE)
  }
  if (!is.numeric(step_max) || length(step_max) != 1L ||
      !is.finite(step_max) || step_max < 0) {
    stop("`step_max` must be a single non-negative number.", call. = FALSE)
  }

  # Build the threshold grid once; it drives both the clustering call and the
  # names of the returned list.
  cutoffs <- seq(0, step_max, step_size)

  # DECIPHER removed the 'type' argument from IdClusters around v2.24 and later
  # renamed the function to "Clusterize" (as of Feb 2023).
  # The version is compared against a string rather than a bare number, because
  # a double is coerced through as.character() (2.30 would become "2.3").
  # NOTE(review): confirm that 2.24 is the first release shipping Clusterize().
  if (utils::packageVersion("DECIPHER") < "2.24") {
    clus_tbl <- IdClusters(method = "inexact", myXStringSet = MyDNAstring,
                           cutoff = cutoffs, processors = ncores,
                           verbose = FALSE)
  } else {
    clus_tbl <- Clusterize(MyDNAstring,
                           cutoff = cutoffs,
                           processors = ncores,
                           verbose = FALSE)
  }

  # The list is named by position below, so the number of result columns must
  # equal the number of thresholds; otherwise the labels would be wrong.
  cutoff_cols <- colnames(clus_tbl)
  if (length(cutoff_cols) != length(cutoffs)) {
    stop("Clustering returned ", length(cutoff_cols), " column(s) for ",
         length(cutoffs), " threshold value(s).", call. = FALSE)
  }

  # Reshape to one row per (sequence, cutoff) and split into one tibble per
  # cutoff. group_by() orders groups by sorting the key, so the cutoff column is
  # turned into a factor whose levels follow the original column order. This
  # keeps the split order aligned with `cutoffs` even when the column labels do
  # not sort in numeric order.
  clus_tbl_list <- as_tibble(clus_tbl, rownames = "seqnames") %>%
    pivot_longer(cols = -seqnames, names_to = "cutoff",
                 values_to = "cluster") %>%
    mutate(cutoff = factor(cutoff, levels = cutoff_cols)) %>%
    group_by(cutoff) %>%
    group_split(.keep = FALSE)

  # Each element keeps only `seqnames` and `cluster`; label it by threshold.
  names(clus_tbl_list) <- cutoffs
  clus_tbl_list
}