# test_clustering_thresholds.R 1.92 KiB
#' Cluster sequences over a range of threshold values
#'
#' Clusters a DNA string set at every cutoff in
#' `seq(0, step_max, step_size)` and returns the cluster assignments split by
#' cutoff.
#'
#' @param MyDNAstring A DNAStringSet object containing the DNA sequences to be
#'   clustered.
#' @param step_size The step size for the threshold values. Default is 0.01.
#' @param step_max The maximum threshold value. Default is 0.99.
#' @param ncores The number of cores to use for parallel processing. Default
#'   is 1.
#'
#' @return A list of clustering results, named by threshold value. Each
#'   element is a tibble with the columns `seqnames` (taken from the row names
#'   of the clustering result) and `cluster`.
#'
#' @import DECIPHER
#' @import tibble
#' @import dplyr
#' @import tidyr
#'
#' @examples
#' # Create a DNAStringSet object
#' my_sequences <- DNAStringSet(c("ACGT", "AGCT", "ATGC"))
#'
#' # Run the clustering function
#' results <- test_clustering_thresholds(my_sequences, step_size = 0.1,
#'                                       step_max = 0.9, ncores = 2)
#' @export
test_clustering_thresholds <- function(MyDNAstring, step_size = 0.01,
                                       step_max = 0.99, ncores = 1) {
  # Build the cutoff grid once so the values handed to the clustering call and
  # the names of the returned list always come from the same vector.
  cutoffs <- seq(0, step_max, step_size)

  # DECIPHER removed the 'type' argument from IdClusters around v2.24.
  # Currently (Feb23), the function is renamed "Clusterize".
  # The version is compared against a string: a numeric literal would be
  # parsed through its decimal printout (e.g. 2.30 becomes "2.3").
  if (utils::packageVersion("DECIPHER") < "2.24") {
    clus_tbl <- IdClusters(
      method = "inexact", myXStringSet = MyDNAstring,
      cutoff = cutoffs, processors = ncores, verbose = FALSE
    )
  } else {
    clus_tbl <- Clusterize(
      MyDNAstring,
      cutoff = cutoffs, processors = ncores, verbose = FALSE
    )
  }

  # NOTE(review): assumes clus_tbl holds one column per cutoff, in the same
  # order as `cutoffs` -- confirm against the installed DECIPHER version.
  clus_long <- as_tibble(clus_tbl, rownames = "seqnames") %>%
    pivot_longer(cols = -seqnames, names_to = "cutoff", values_to = "cluster")

  # group_by() orders groups by key. Sorting the raw column labels as text is
  # not guaranteed to follow the numeric order of the cutoffs, so the labels
  # become a factor whose levels keep the original column order. The list is
  # then named positionally from `cutoffs`.
  clus_tbl_list <- clus_long %>%
    mutate(cutoff = factor(cutoff, levels = unique(cutoff))) %>%
    group_by(cutoff) %>%
    group_split(.keep = FALSE)
  names(clus_tbl_list) <- cutoffs

  clus_tbl_list
}