From 7515e0568641031b4c4522cdfc867ad9505370fc Mon Sep 17 00:00:00 2001 From: Chris McComb Date: Sat, 27 Feb 2021 11:42:40 -0500 Subject: [PATCH] Roughing out code for OPTICS --- src/cluster/mod.rs | 1 + src/cluster/optics.rs | 325 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 326 insertions(+) create mode 100644 src/cluster/optics.rs diff --git a/src/cluster/mod.rs b/src/cluster/mod.rs index be6ef9f0..ca040e52 100644 --- a/src/cluster/mod.rs +++ b/src/cluster/mod.rs @@ -6,3 +6,4 @@ pub mod dbscan; /// An iterative clustering algorithm that aims to find local maxima in each iteration. pub mod kmeans; +pub mod optics; diff --git a/src/cluster/optics.rs b/src/cluster/optics.rs new file mode 100644 index 00000000..e8a84937 --- /dev/null +++ b/src/cluster/optics.rs @@ -0,0 +1,325 @@ +//! # OPTICS Clustering +//! +//! Description goes here +//! +//! Example: +//! +//! ``` +//! use smartcore::linalg::naive::dense_matrix::*; +//! use smartcore::cluster::optics::*; +//! use smartcore::math::distance::Distances; +//! use smartcore::algorithm::neighbour::KNNAlgorithmName; +//! use smartcore::dataset::generator; +//! +//! // Generate three blobs +//! let blobs = generator::make_blobs(100, 2, 3); +//! let x = DenseMatrix::from_vec(blobs.num_samples, blobs.num_features, &blobs.data); +//! // Fit the algorithm and predict cluster labels +//! let labels = OPTICS::fit(&x, OPTICSParameters::default().with_eps(3.0)). +//! and_then(|optics| optics.predict(&x)); +//! +//! println!("{:?}", labels); +//! ``` +//! +//! ## References: +//! + +use std::fmt::Debug; +use std::iter::Sum; + +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; + +use crate::algorithm::neighbour::{KNNAlgorithm, KNNAlgorithmName}; +use crate::api::{Predictor, UnsupervisedEstimator}; +use crate::error::Failed; +use crate::linalg::{row_iter, Matrix}; +use crate::math::distance::euclidian::Euclidian; +use crate::math::distance::{Distance, Distances}; +use crate::math::num::RealNumber; +use crate::tree::decision_tree_classifier::which_max; + +/// OPTICS clustering algorithm +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Debug)] +pub struct OPTICS, T>> { + cluster_labels: Vec, + num_classes: usize, + knn_algorithm: KNNAlgorithm, + eps: T, +} + +#[derive(Debug, Clone)] +/// DBSCAN clustering algorithm parameters +pub struct OPTICSParameters, T>> { + /// a function that defines a distance between each pair of point in training data. + /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait. + /// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions. + pub distance: D, + /// The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. + pub min_samples: usize, + /// The maximum distance between two samples for one to be considered as in the neighborhood of the other. + pub eps: T, + /// KNN algorithm to use. + pub algorithm: KNNAlgorithmName, +} + +impl, T>> OPTICSParameters { + /// a function that defines a distance between each pair of point in training data. + /// This function should extend [`Distance`](../../math/distance/trait.Distance.html) trait. + /// See [`Distances`](../../math/distance/struct.Distances.html) for a list of available functions. + pub fn with_distance, T>>(self, distance: DD) -> OPTICSParameters { + OPTICSParameters { + distance, + min_samples: self.min_samples, + eps: self.eps, + algorithm: self.algorithm, + } + } + /// The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. + pub fn with_min_samples(mut self, min_samples: usize) -> Self { + self.min_samples = min_samples; + self + } + /// The maximum distance between two samples for one to be considered as in the neighborhood of the other. + pub fn with_eps(mut self, eps: T) -> Self { + self.eps = eps; + self + } + /// KNN algorithm to use. + pub fn with_algorithm(mut self, algorithm: KNNAlgorithmName) -> Self { + self.algorithm = algorithm; + self + } +} + +impl, T>> PartialEq for OPTICS { + fn eq(&self, other: &Self) -> bool { + self.cluster_labels.len() == other.cluster_labels.len() + && self.num_classes == other.num_classes + && self.eps == other.eps + && self.cluster_labels == other.cluster_labels + } +} + +impl Default for OPTICSParameters { + fn default() -> Self { + OPTICSParameters { + distance: Distances::euclidian(), + min_samples: 5, + eps: T::half(), + algorithm: KNNAlgorithmName::CoverTree, + } + } +} + +impl, D: Distance, T>> + UnsupervisedEstimator> for OPTICS +{ + fn fit(x: &M, parameters: OPTICSParameters) -> Result { + OPTICS::fit(x, parameters) + } +} + +impl, D: Distance, T>> Predictor + for OPTICS +{ + fn predict(&self, x: &M) -> Result { + self.predict(x) + } +} + +impl, T>> OPTICS { + /// Fit algorithm to _NxM_ matrix where _N_ is number of samples and _M_ is number of features. + /// * `data` - training instances to cluster + /// * `k` - number of clusters + /// * `parameters` - cluster parameters + pub fn fit>( + x: &M, + parameters: OPTICSParameters, + ) -> Result, Failed> { + if parameters.min_samples < 1 { + return Err(Failed::fit(&"Invalid minPts".to_string())); + } + + if parameters.eps <= T::zero() { + return Err(Failed::fit(&"Invalid radius: ".to_string())); + } + + let mut k = 0; + let queued = -2; + let outlier = -1; + let undefined = -3; + + let n = x.shape().0; + let mut y = vec![undefined; n]; + + let algo = parameters + .algorithm + .fit(row_iter(x).collect(), parameters.distance)?; + + for (i, e) in row_iter(x).enumerate() { + if y[i] == undefined { + let mut neighbors = algo.find_radius(&e, parameters.eps)?; + if neighbors.len() < parameters.min_samples { + y[i] = outlier; + } else { + y[i] = k; + + for j in 0..neighbors.len() { + if y[neighbors[j].0] == undefined { + y[neighbors[j].0] = queued; + } + } + + while !neighbors.is_empty() { + let neighbor = neighbors.pop().unwrap(); + let index = neighbor.0; + + if y[index] == outlier { + y[index] = k; + } + + if y[index] == undefined || y[index] == queued { + y[index] = k; + + let secondary_neighbors = + algo.find_radius(neighbor.2, parameters.eps)?; + + if secondary_neighbors.len() >= parameters.min_samples { + for j in 0..secondary_neighbors.len() { + let label = y[secondary_neighbors[j].0]; + if label == undefined { + y[secondary_neighbors[j].0] = queued; + } + + if label == undefined || label == outlier { + neighbors.push(secondary_neighbors[j]); + } + } + } + } + } + + k += 1; + } + } + } + + Ok(OPTICS { + cluster_labels: y, + num_classes: k as usize, + knn_algorithm: algo, + eps: parameters.eps, + }) + } + + /// Predict clusters for `x` + /// * `x` - matrix with new data to transform of size _KxM_ , where _K_ is number of new samples and _M_ is number of features. + pub fn predict>(&self, x: &M) -> Result { + let (n, m) = x.shape(); + let mut result = M::zeros(1, n); + let mut row = vec![T::zero(); m]; + + for i in 0..n { + x.copy_row_as_vec(i, &mut row); + let neighbors = self.knn_algorithm.find_radius(&row, self.eps)?; + let mut label = vec![0usize; self.num_classes + 1]; + for neighbor in neighbors { + let yi = self.cluster_labels[neighbor.0]; + if yi < 0 { + label[self.num_classes] += 1; + } else { + label[yi as usize] += 1; + } + } + let class = which_max(&label); + if class != self.num_classes { + result.set(0, i, T::from(class).unwrap()); + } else { + result.set(0, i, -T::one()); + } + } + + Ok(result.to_row_vector()) + } + + fn compute_optics_graph(&self) {} + + fn compute_core_distances(&self) {} + + fn set_reach_dist(&self) {} +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::linalg::naive::dense_matrix::DenseMatrix; + #[cfg(feature = "serde")] + use crate::math::distance::euclidian::Euclidian; + + #[test] + fn fit_predict_optics() { + let x = DenseMatrix::from_2d_array(&[ + &[1.0, 2.0], + &[1.1, 2.1], + &[0.9, 1.9], + &[1.2, 2.2], + &[0.8, 1.8], + &[2.0, 1.0], + &[2.1, 1.1], + &[1.9, 0.9], + &[2.2, 1.2], + &[1.8, 0.8], + &[3.0, 5.0], + ]); + + let expected_labels = vec![0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, -1.0]; + + let optics = OPTICS::fit( + &x, + OPTICSParameters::default() + .with_eps(0.5) + .with_min_samples(2), + ) + .unwrap(); + + let predicted_labels = optics.predict(&x).unwrap(); + + assert_eq!(expected_labels, predicted_labels); + } + + #[test] + #[cfg(feature = "serde")] + fn serde() { + let x = DenseMatrix::from_2d_array(&[ + &[5.1, 3.5, 1.4, 0.2], + &[4.9, 3.0, 1.4, 0.2], + &[4.7, 3.2, 1.3, 0.2], + &[4.6, 3.1, 1.5, 0.2], + &[5.0, 3.6, 1.4, 0.2], + &[5.4, 3.9, 1.7, 0.4], + &[4.6, 3.4, 1.4, 0.3], + &[5.0, 3.4, 1.5, 0.2], + &[4.4, 2.9, 1.4, 0.2], + &[4.9, 3.1, 1.5, 0.1], + &[7.0, 3.2, 4.7, 1.4], + &[6.4, 3.2, 4.5, 1.5], + &[6.9, 3.1, 4.9, 1.5], + &[5.5, 2.3, 4.0, 1.3], + &[6.5, 2.8, 4.6, 1.5], + &[5.7, 2.8, 4.5, 1.3], + &[6.3, 3.3, 4.7, 1.6], + &[4.9, 2.4, 3.3, 1.0], + &[6.6, 2.9, 4.6, 1.3], + &[5.2, 2.7, 3.9, 1.4], + ]); + + let optics = OPTICS::fit(&x, Default::default()).unwrap(); + + let deserialized_optics: OPTICS = + serde_json::from_str(&serde_json::to_string(&optics).unwrap()).unwrap(); + + assert_eq!(optics, deserialized_optics); + } +}