From d2e1907cf1c8a23c6694559f612fcb3cccd86582 Mon Sep 17 00:00:00 2001
From: Dopamine
Date: Fri, 16 Jan 2026 23:33:33 +0800
Subject: [PATCH 1/4] add naive-bayes

---
 DIRECTORY.md                        |   1 +
 src/machine_learning/mod.rs         |   2 +
 src/machine_learning/naive_bayes.rs | 291 ++++++++++++++++++++++++++++
 3 files changed, 294 insertions(+)
 create mode 100644 src/machine_learning/naive_bayes.rs

diff --git a/DIRECTORY.md b/DIRECTORY.md
index c506a261352..ac27b183e8f 100644
--- a/DIRECTORY.md
+++ b/DIRECTORY.md
@@ -207,6 +207,7 @@
     * [K-Nearest Neighbors](https://github.com/TheAlgorithms/Rust/blob/master/src/machine_learning/k_nearest_neighbors.rs)
     * [Linear Regression](https://github.com/TheAlgorithms/Rust/blob/master/src/machine_learning/linear_regression.rs)
     * [Logistic Regression](https://github.com/TheAlgorithms/Rust/blob/master/src/machine_learning/logistic_regression.rs)
+    * [Naive Bayes](https://github.com/TheAlgorithms/Rust/blob/master/src/machine_learning/naive_bayes.rs)
     * Loss Function
       * [Average Margin Ranking Loss](https://github.com/TheAlgorithms/Rust/blob/master/src/machine_learning/loss_function/average_margin_ranking_loss.rs)
       * [Hinge Loss](https://github.com/TheAlgorithms/Rust/blob/master/src/machine_learning/loss_function/hinge_loss.rs)
diff --git a/src/machine_learning/mod.rs b/src/machine_learning/mod.rs
index b4baa8025cf..9856b9a67e5 100644
--- a/src/machine_learning/mod.rs
+++ b/src/machine_learning/mod.rs
@@ -4,6 +4,7 @@ mod k_nearest_neighbors;
 mod linear_regression;
 mod logistic_regression;
 mod loss_function;
+mod naive_bayes;
 mod optimization;
 
 pub use self::cholesky::cholesky;
@@ -18,5 +19,6 @@ pub use self::loss_function::kld_loss;
 pub use self::loss_function::mae_loss;
 pub use self::loss_function::mse_loss;
 pub use self::loss_function::neg_log_likelihood;
+pub use self::naive_bayes::naive_bayes;
 pub use self::optimization::gradient_descent;
 pub use self::optimization::Adam;
diff --git a/src/machine_learning/naive_bayes.rs b/src/machine_learning/naive_bayes.rs
new file mode 100644
index 00000000000..bc15b671c16
--- /dev/null
+++ b/src/machine_learning/naive_bayes.rs
@@ -0,0 +1,291 @@
+/// Naive Bayes classifier for classification tasks.
+/// This implementation uses Gaussian Naive Bayes, which assumes that
+/// features follow a normal (Gaussian) distribution.
+/// The algorithm calculates class priors and feature statistics (mean and variance)
+/// for each class, then uses Bayes' theorem to predict class probabilities.
+
+pub struct ClassStatistics {
+    pub class_label: f64,
+    pub prior: f64,
+    pub feature_means: Vec<f64>,
+    pub feature_variances: Vec<f64>,
+}
+
+fn calculate_class_statistics(
+    training_data: &[(Vec<f64>, f64)],
+    class_label: f64,
+    num_features: usize,
+) -> Option<ClassStatistics> {
+    let class_samples: Vec<&(Vec<f64>, f64)> = training_data
+        .iter()
+        .filter(|(_, label)| (*label - class_label).abs() < 1e-10)
+        .collect();
+
+    if class_samples.is_empty() {
+        return None;
+    }
+
+    let prior = class_samples.len() as f64 / training_data.len() as f64;
+
+    let mut feature_means = vec![0.0; num_features];
+    let mut feature_variances = vec![0.0; num_features];
+
+    // Calculate means
+    for (features, _) in &class_samples {
+        for (i, &feature) in features.iter().enumerate() {
+            if i < num_features {
+                feature_means[i] += feature;
+            }
+        }
+    }
+
+    let n = class_samples.len() as f64;
+    for mean in &mut feature_means {
+        *mean /= n;
+    }
+
+    // Calculate variances
+    for (features, _) in &class_samples {
+        for (i, &feature) in features.iter().enumerate() {
+            if i < num_features {
+                let diff = feature - feature_means[i];
+                feature_variances[i] += diff * diff;
+            }
+        }
+    }
+
+    let epsilon = 1e-9;
+    for variance in &mut feature_variances {
+        *variance = (*variance / n).max(epsilon);
+    }
+
+    Some(ClassStatistics {
+        class_label,
+        prior,
+        feature_means,
+        feature_variances,
+    })
+}
+
+fn gaussian_log_pdf(x: f64, mean: f64, variance: f64) -> f64 {
+    let diff = x - mean;
+    let exponent_term = -(diff * diff) / (2.0 * variance);
+    let log_coefficient = -0.5 * (2.0 * std::f64::consts::PI * variance).ln();
+    log_coefficient + exponent_term
+}
+
+
+pub fn train_naive_bayes(training_data: Vec<(Vec<f64>, f64)>) -> Option<Vec<ClassStatistics>> {
+    if training_data.is_empty() {
+        return None;
+    }
+
+    let num_features = training_data[0].0.len();
+    if num_features == 0 {
+        return None;
+    }
+
+    // Verify all samples have the same number of features
+    if !training_data
+        .iter()
+        .all(|(features, _)| features.len() == num_features)
+    {
+        return None;
+    }
+
+    // Get unique class labels
+    let mut unique_classes = Vec::new();
+    for (_, label) in &training_data {
+        if !unique_classes
+            .iter()
+            .any(|&c: &f64| (c - *label).abs() < 1e-10)
+        {
+            unique_classes.push(*label);
+        }
+    }
+
+    let mut class_stats = Vec::new();
+
+    for class_label in unique_classes {
+        if let Some(mut stats) =
+            calculate_class_statistics(&training_data, class_label, num_features)
+        {
+            stats.class_label = class_label;
+            class_stats.push(stats);
+        }
+    }
+
+    if class_stats.is_empty() {
+        return None;
+    }
+
+    Some(class_stats)
+}
+
+
+pub fn predict_naive_bayes(model: &[ClassStatistics], test_point: &[f64]) -> Option<f64> {
+    if model.is_empty() || test_point.is_empty() {
+        return None;
+    }
+
+    // Get number of features from the first class statistics
+    let num_features = model[0].feature_means.len();
+    if test_point.len() != num_features {
+        return None;
+    }
+
+    let mut best_class = None;
+    let mut best_log_prob = f64::NEG_INFINITY;
+
+    for stats in model {
+        // Calculate log probability to avoid underflow
+        let mut log_prob = stats.prior.ln();
+
+        for (i, &feature) in test_point.iter().enumerate() {
+            if i < stats.feature_means.len() && i < stats.feature_variances.len() {
+                // Use log PDF directly to avoid numerical underflow
+                log_prob +=
+                    gaussian_log_pdf(feature, stats.feature_means[i], stats.feature_variances[i]);
+            }
+        }
+
+        if log_prob > best_log_prob {
+            best_log_prob = log_prob;
+            best_class = Some(stats.class_label);
+        }
+    }
+
+    best_class
+}
+
+
+pub fn naive_bayes(training_data: Vec<(Vec<f64>, f64)>, test_point: Vec<f64>) -> Option<f64> {
+    let model = train_naive_bayes(training_data)?;
+    predict_naive_bayes(&model, &test_point)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_naive_bayes_simple_classification() {
+        let training_data = vec![
+            (vec![1.0, 1.0], 0.0),
+            (vec![1.1, 1.0], 0.0),
+            (vec![1.0, 1.1], 0.0),
+            (vec![5.0, 5.0], 1.0),
+            (vec![5.1, 5.0], 1.0),
+            (vec![5.0, 5.1], 1.0),
+        ];
+
+        // Test point closer to class 0
+        let test_point = vec![1.05, 1.05];
+        let result = naive_bayes(training_data.clone(), test_point);
+        assert_eq!(result, Some(0.0));
+
+        // Test point closer to class 1
+        let test_point = vec![5.05, 5.05];
+        let result = naive_bayes(training_data, test_point);
+        assert_eq!(result, Some(1.0));
+    }
+
+    #[test]
+    fn test_naive_bayes_one_dimensional() {
+        let training_data = vec![
+            (vec![1.0], 0.0),
+            (vec![1.1], 0.0),
+            (vec![1.2], 0.0),
+            (vec![5.0], 1.0),
+            (vec![5.1], 1.0),
+            (vec![5.2], 1.0),
+        ];
+
+        let test_point = vec![1.15];
+        let result = naive_bayes(training_data.clone(), test_point);
+        assert_eq!(result, Some(0.0));
+
+        let test_point = vec![5.15];
+        let result = naive_bayes(training_data, test_point);
+        assert_eq!(result, Some(1.0));
+    }
+
+    #[test]
+    fn test_naive_bayes_empty_training_data() {
+        let training_data = vec![];
+        let test_point = vec![1.0, 2.0];
+        let result = naive_bayes(training_data, test_point);
+        assert_eq!(result, None);
+    }
+
+    #[test]
+    fn test_naive_bayes_empty_test_point() {
+        let training_data = vec![(vec![1.0, 2.0], 0.0)];
+        let test_point = vec![];
+        let result = naive_bayes(training_data, test_point);
+        assert_eq!(result, None);
+    }
+
+    #[test]
+    fn test_naive_bayes_dimension_mismatch() {
+        let training_data = vec![(vec![1.0, 2.0], 0.0), (vec![3.0, 4.0], 1.0)];
+        let test_point = vec![1.0]; // Wrong dimension
+        let result = naive_bayes(training_data, test_point);
+        assert_eq!(result, None);
+    }
+
+    #[test]
+    fn test_naive_bayes_inconsistent_feature_dimensions() {
+        let training_data = vec![
+            (vec![1.0, 2.0], 0.0),
+            (vec![3.0], 1.0), // Different dimension
+        ];
+        let test_point = vec![1.0, 2.0];
+        let result = naive_bayes(training_data, test_point);
+        assert_eq!(result, None);
+    }
+
+    #[test]
+    fn test_naive_bayes_multiple_classes() {
+        let training_data = vec![
+            (vec![1.0, 1.0], 0.0),
+            (vec![1.1, 1.0], 0.0),
+            (vec![5.0, 5.0], 1.0),
+            (vec![5.1, 5.0], 1.0),
+            (vec![9.0, 9.0], 2.0),
+            (vec![9.1, 9.0], 2.0),
+        ];
+
+        let test_point = vec![1.05, 1.05];
+        let result = naive_bayes(training_data.clone(), test_point);
+        assert_eq!(result, Some(0.0));
+
+        let test_point = vec![5.05, 5.05];
+        let result = naive_bayes(training_data.clone(), test_point);
+        assert_eq!(result, Some(1.0));
+
+        let test_point = vec![9.05, 9.05];
+        let result = naive_bayes(training_data, test_point);
+        assert_eq!(result, Some(2.0));
+    }
+
+    #[test]
+    fn test_train_and_predict_separately() {
+        let training_data = vec![
+            (vec![1.0, 1.0], 0.0),
+            (vec![1.1, 1.0], 0.0),
+            (vec![5.0, 5.0], 1.0),
+            (vec![5.1, 5.0], 1.0),
+        ];
+
+        let model = train_naive_bayes(training_data);
+        assert!(model.is_some());
+
+        let model = model.unwrap();
+        assert_eq!(model.len(), 2);
+
+        let test_point = vec![1.05, 1.05];
+        let result = predict_naive_bayes(&model, &test_point);
+        assert_eq!(result, Some(0.0));
+    }
+}

From cb1e2f91a9feed9cfaf79194c7730568fa5e6901 Mon Sep 17 00:00:00 2001
From: Dopamine
Date: Fri, 16 Jan 2026 23:43:15 +0800
Subject: [PATCH 2/4] fmt

---
 src/machine_learning/naive_bayes.rs | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/machine_learning/naive_bayes.rs b/src/machine_learning/naive_bayes.rs
index bc15b671c16..a27ebc35384 100644
--- a/src/machine_learning/naive_bayes.rs
+++ b/src/machine_learning/naive_bayes.rs
@@ -74,7 +74,6 @@ fn gaussian_log_pdf(x: f64, mean: f64, variance: f64) -> f64 {
     log_coefficient + exponent_term
 }
 
-
 pub fn train_naive_bayes(training_data: Vec<(Vec<f64>, f64)>) -> Option<Vec<ClassStatistics>> {
     if training_data.is_empty() {
         return None;
@@ -122,7 +121,6 @@ pub fn train_naive_bayes(training_data: Vec<(Vec<f64>, f64)>) -> Option<Vec<ClassStatistics>>
     Some(class_stats)
 }
 
-
 pub fn predict_naive_bayes(model: &[ClassStatistics], test_point: &[f64]) -> Option<f64> {
     if model.is_empty() || test_point.is_empty() {
         return None;
@@ -158,7 +156,6 @@ pub fn predict_naive_bayes(model: &[ClassStatistics], test_point: &[f64]) -> Opt
     best_class
 }
 
-
 pub fn naive_bayes(training_data: Vec<(Vec<f64>, f64)>, test_point: Vec<f64>) -> Option<f64> {
     let model = train_naive_bayes(training_data)?;
     predict_naive_bayes(&model, &test_point)

From cdd36b62148bf85abd0385e0bb3adeaaca225f35 Mon Sep 17 00:00:00 2001
From: Dopamine
Date: Sun, 18 Jan 2026 21:56:36 +0800
Subject: [PATCH 3/4] pca

---
 DIRECTORY.md                                 |   1 +
 src/machine_learning/mod.rs                  |   2 +
 .../principal_component_analysis.rs          | 325 ++++++++++++++++++
 3 files changed, 328 insertions(+)
 create mode 100644 src/machine_learning/principal_component_analysis.rs

diff --git a/DIRECTORY.md b/DIRECTORY.md
index ac27b183e8f..61b38224b8e 100644
--- a/DIRECTORY.md
+++ b/DIRECTORY.md
@@ -208,6 +208,7 @@
     * [Linear Regression](https://github.com/TheAlgorithms/Rust/blob/master/src/machine_learning/linear_regression.rs)
     * [Logistic Regression](https://github.com/TheAlgorithms/Rust/blob/master/src/machine_learning/logistic_regression.rs)
     * [Naive Bayes](https://github.com/TheAlgorithms/Rust/blob/master/src/machine_learning/naive_bayes.rs)
+    * [Principal Component Analysis](https://github.com/TheAlgorithms/Rust/blob/master/src/machine_learning/principal_component_analysis.rs)
     * Loss Function
       * [Average Margin Ranking Loss](https://github.com/TheAlgorithms/Rust/blob/master/src/machine_learning/loss_function/average_margin_ranking_loss.rs)
       * [Hinge Loss](https://github.com/TheAlgorithms/Rust/blob/master/src/machine_learning/loss_function/hinge_loss.rs)
diff --git a/src/machine_learning/mod.rs b/src/machine_learning/mod.rs
index 9856b9a67e5..40a5e6f5165 100644
--- a/src/machine_learning/mod.rs
+++ b/src/machine_learning/mod.rs
@@ -6,6 +6,7 @@ mod logistic_regression;
 mod loss_function;
 mod naive_bayes;
 mod optimization;
+mod principal_component_analysis;
 
 pub use self::cholesky::cholesky;
 pub use self::k_means::k_means;
@@ -22,3 +23,4 @@ pub use self::loss_function::neg_log_likelihood;
 pub use self::naive_bayes::naive_bayes;
 pub use self::optimization::gradient_descent;
 pub use self::optimization::Adam;
+pub use self::principal_component_analysis::principal_component_analysis;
diff --git a/src/machine_learning/principal_component_analysis.rs b/src/machine_learning/principal_component_analysis.rs
new file mode 100644
index 00000000000..d2546565e85
--- /dev/null
+++ b/src/machine_learning/principal_component_analysis.rs
@@ -0,0 +1,325 @@
+/// Principal Component Analysis (PCA) for dimensionality reduction.
+/// PCA transforms data to a new coordinate system where the greatest
+/// variance lies on the first coordinate (first principal component),
+/// the second greatest variance on the second coordinate, and so on.
+
+/// Compute the mean of each feature across all samples
+fn compute_means(data: &[Vec<f64>]) -> Vec<f64> {
+    if data.is_empty() {
+        return vec![];
+    }
+
+    let num_features = data[0].len();
+    let mut means = vec![0.0; num_features];
+
+    for sample in data {
+        for (i, &feature) in sample.iter().enumerate() {
+            means[i] += feature;
+        }
+    }
+
+    let n = data.len() as f64;
+    for mean in &mut means {
+        *mean /= n;
+    }
+
+    means
+}
+
+/// Center the data by subtracting the mean from each feature
+fn center_data(data: &[Vec<f64>], means: &[f64]) -> Vec<Vec<f64>> {
+    data.iter()
+        .map(|sample| {
+            sample
+                .iter()
+                .zip(means.iter())
+                .map(|(&x, &mean)| x - mean)
+                .collect()
+        })
+        .collect()
+}
+
+/// Compute covariance matrix from centered data
+fn compute_covariance_matrix(centered_data: &[Vec<f64>]) -> Vec<f64> {
+    if centered_data.is_empty() {
+        return vec![];
+    }
+
+    let n = centered_data.len();
+    let num_features = centered_data[0].len();
+
+    let mut cov_matrix = vec![0.0; num_features * num_features];
+
+    for i in 0..num_features {
+        for j in i..num_features {
+            let mut cov = 0.0;
+            for sample in centered_data {
+                cov += sample[i] * sample[j];
+            }
+            cov /= n as f64;
+
+            cov_matrix[i * num_features + j] = cov;
+            cov_matrix[j * num_features + i] = cov;
+        }
+    }
+
+    cov_matrix
+}
+
+/// Power iteration method to find the dominant eigenvalue and eigenvector
+fn power_iteration(matrix: &[f64], n: usize, max_iter: usize, tolerance: f64) -> (f64, Vec<f64>) {
+    let mut b_k = vec![1.0; n];
+    let mut b_k_prev = vec![0.0; n];
+
+    for _ in 0..max_iter {
+        b_k_prev.clone_from(&b_k);
+
+        let mut b_k_new = vec![0.0; n];
+        for i in 0..n {
+            for j in 0..n {
+                b_k_new[i] += matrix[i * n + j] * b_k[j];
+            }
+        }
+
+        let norm = b_k_new.iter().map(|x| x * x).sum::<f64>().sqrt();
+        if norm > 1e-10 {
+            for val in &mut b_k_new {
+                *val /= norm;
+            }
+        }
+
+        b_k = b_k_new;
+
+        let diff: f64 = b_k
+            .iter()
+            .zip(b_k_prev.iter())
+            .map(|(a, b)| (a - b).abs())
+            .fold(0.0, |acc, x| acc.max(x));
+
+        if diff < tolerance {
+            break;
+        }
+    }
+
+    let eigenvalue = b_k
+        .iter()
+        .enumerate()
+        .map(|(i, &val)| {
+            let mut row_sum = 0.0;
+            for j in 0..n {
+                row_sum += matrix[i * n + j] * b_k[j];
+            }
+            row_sum * val
+        })
+        .sum::<f64>()
+        / b_k.iter().map(|x| x * x).sum::<f64>();
+
+    (eigenvalue, b_k)
+}
+
+/// Deflate a matrix by removing the component along a given eigenvector
+fn deflate_matrix(matrix: &[f64], eigenvector: &[f64], eigenvalue: f64, n: usize) -> Vec<f64> {
+    let mut deflated = matrix.to_vec();
+
+    for i in 0..n {
+        for j in 0..n {
+            deflated[i * n + j] -= eigenvalue * eigenvector[i] * eigenvector[j];
+        }
+    }
+
+    deflated
+}
+
+/// Perform PCA on the input data
+/// Returns transformed data with reduced dimensions
+pub fn principal_component_analysis(
+    data: Vec<Vec<f64>>,
+    num_components: usize,
+) -> Option<Vec<Vec<f64>>> {
+    if data.is_empty() {
+        return None;
+    }
+
+    let num_features = data[0].len();
+
+    if num_features == 0 {
+        return None;
+    }
+
+    if num_components > num_features {
+        return None;
+    }
+
+    if num_components == 0 {
+        return None;
+    }
+
+    let means = compute_means(&data);
+    let centered_data = center_data(&data, &means);
+    let cov_matrix = compute_covariance_matrix(&centered_data);
+
+    let mut eigenvectors = Vec::new();
+    let mut deflated_matrix = cov_matrix;
+
+    for _ in 0..num_components {
+        let (_eigenvalue, eigenvector) =
+            power_iteration(&deflated_matrix, num_features, 1000, 1e-10);
+        eigenvectors.push(eigenvector);
+        deflated_matrix = deflate_matrix(
+            &deflated_matrix,
+            eigenvectors.last().unwrap(),
+            _eigenvalue,
+            num_features,
+        );
+    }
+
+    let transformed_data: Vec<Vec<f64>> = centered_data
+        .iter()
+        .map(|sample| {
+            (0..num_components)
+                .map(|k| {
+                    eigenvectors[k]
+                        .iter()
+                        .zip(sample.iter())
+                        .map(|(&ev, &s)| ev * s)
+                        .sum::<f64>()
+                })
+                .collect()
+        })
+        .collect();
+
+    Some(transformed_data)
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn test_pca_simple() {
+        let data = vec![
+            vec![1.0, 2.0],
+            vec![2.0, 3.0],
+            vec![3.0, 4.0],
+            vec![4.0, 5.0],
+            vec![5.0, 6.0],
+        ];
+
+        let result = principal_component_analysis(data, 1);
+        assert!(result.is_some());
+
+        let transformed = result.unwrap();
+        assert_eq!(transformed.len(), 5);
+        assert_eq!(transformed[0].len(), 1);
+
+        let all_values: Vec<f64> = transformed.iter().map(|v| v[0]).collect();
+        let mean = all_values.iter().sum::<f64>() / all_values.len() as f64;
+
+        assert!((mean).abs() < 1e-5);
+    }
+
+    #[test]
+    fn test_pca_empty_data() {
+        let data = vec![];
+        let result = principal_component_analysis(data, 2);
+        assert_eq!(result, None);
+    }
+
+    #[test]
+    fn test_pca_empty_features() {
+        let data = vec![vec![], vec![]];
+        let result = principal_component_analysis(data, 1);
+        assert_eq!(result, None);
+    }
+
+    #[test]
+    fn test_pca_invalid_num_components() {
+        let data = vec![vec![1.0, 2.0], vec![2.0, 3.0]];
+
+        let result = principal_component_analysis(data.clone(), 3);
+        assert_eq!(result, None);
+
+        let result = principal_component_analysis(data, 0);
+        assert_eq!(result, None);
+    }
+
+    #[test]
+    fn test_pca_preserves_dimensions() {
+        let data = vec![
+            vec![1.0, 2.0, 3.0],
+            vec![4.0, 5.0, 6.0],
+            vec![7.0, 8.0, 9.0],
+        ];
+
+        let result = principal_component_analysis(data, 2);
+        assert!(result.is_some());
+
+        let transformed = result.unwrap();
+        assert_eq!(transformed.len(), 3);
+        assert_eq!(transformed[0].len(), 2);
+    }
+
+    #[test]
+    fn test_pca_reconstruction_variance() {
+        let data = vec![
+            vec![2.5, 2.4],
+            vec![0.5, 0.7],
+            vec![2.2, 2.9],
+            vec![1.9, 2.2],
+            vec![3.1, 3.0],
+            vec![2.3, 2.7],
+            vec![2.0, 1.6],
+            vec![1.0, 1.1],
+            vec![1.5, 1.6],
+            vec![1.1, 0.9],
+        ];
+
+        let result = principal_component_analysis(data, 1);
+        assert!(result.is_some());
+
+        let transformed = result.unwrap();
+        assert_eq!(transformed.len(), 10);
+        assert_eq!(transformed[0].len(), 1);
+    }
+
+    #[test]
+    fn test_center_data() {
+        let data = vec![
+            vec![1.0, 2.0, 3.0],
+            vec![4.0, 5.0, 6.0],
+            vec![7.0, 8.0, 9.0],
+        ];
+
+        let means = vec![4.0, 5.0, 6.0];
+        let centered = center_data(&data, &means);
+
+        assert_eq!(centered[0], vec![-3.0, -3.0, -3.0]);
+        assert_eq!(centered[1], vec![0.0, 0.0, 0.0]);
+        assert_eq!(centered[2], vec![3.0, 3.0, 3.0]);
+    }
+
+    #[test]
+    fn test_compute_means() {
+        let data = vec![
+            vec![1.0, 2.0, 3.0],
+            vec![4.0, 5.0, 6.0],
+            vec![7.0, 8.0, 9.0],
+        ];
+
+        let means = compute_means(&data);
+        assert_eq!(means, vec![4.0, 5.0, 6.0]);
+    }
+
+    #[test]
+    fn test_power_iteration() {
+        let matrix = vec![4.0, 1.0, 1.0, 1.0, 3.0, 1.0, 1.0, 1.0, 2.0];
+
+        let (eigenvalue, eigenvector) = power_iteration(&matrix, 3, 1000, 1e-10);
+
+        assert!(eigenvalue > 0.0);
+        assert_eq!(eigenvector.len(), 3);
+
+        let norm: f64 = eigenvector.iter().map(|x| x * x).sum::<f64>().sqrt();
+        assert!((norm - 1.0).abs() < 1e-6);
+    }
+}

From 0df25c7478f1ec2a0280630896e96a255d51f87f Mon Sep 17 00:00:00 2001
From: Dopamine
Date: Sun, 18 Jan 2026 22:08:04 +0800
Subject: [PATCH 4/4] Fix clippy: remove redundant type annotation

---
 src/machine_learning/principal_component_analysis.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/machine_learning/principal_component_analysis.rs b/src/machine_learning/principal_component_analysis.rs
index d2546565e85..09b6bc1ec32 100644
--- a/src/machine_learning/principal_component_analysis.rs
+++ b/src/machine_learning/principal_component_analysis.rs
@@ -319,7 +319,7 @@ mod test {
         assert!(eigenvalue > 0.0);
         assert_eq!(eigenvector.len(), 3);
 
-        let norm: f64 = eigenvector.iter().map(|x| x * x).sum::<f64>().sqrt();
+        let norm = eigenvector.iter().map(|x| x * x).sum::<f64>().sqrt();
         assert!((norm - 1.0).abs() < 1e-6);
     }
 }
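
For review context: Gaussian Naive Bayes, as implemented in PATCH 1, scores each candidate class as ln(prior) plus a sum of per-feature Gaussian log-densities and predicts the class with the highest total. The sketch below is not part of the patch series; it assumes it sits inside the `tests` module of naive_bayes.rs (so `naive_bayes` is in scope via `use super::*`) and the sample data is made up for illustration.

    // Usage sketch (not part of the patch); assumes placement inside the
    // `tests` module of naive_bayes.rs, with illustrative data only.
    #[test]
    fn usage_sketch_two_gaussian_clusters() {
        // Two well-separated clusters: class 0 near (1, 1), class 1 near (5, 5).
        let training_data = vec![
            (vec![1.0, 1.0], 0.0),
            (vec![1.2, 0.9], 0.0),
            (vec![5.0, 5.1], 1.0),
            (vec![4.9, 5.2], 1.0),
        ];

        // Each class contributes ln(prior) plus the sum of per-feature Gaussian
        // log-densities; the larger total wins, so a point near (1, 1) is
        // assigned to class 0.
        assert_eq!(naive_bayes(training_data, vec![1.1, 1.0]), Some(0.0));
    }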
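
Likewise, the PCA entry point from PATCH 3 centres the data, extracts num_components eigenvectors of the covariance matrix by power iteration with deflation, and projects each centred sample onto them. The following sketch of the expected call pattern is again not part of the patch; it assumes it sits inside the `test` module of principal_component_analysis.rs and uses made-up data.

    // Usage sketch (not part of the patch); assumes placement inside the
    // `test` module of principal_component_analysis.rs.
    #[test]
    fn usage_sketch_project_to_one_component() {
        // Points scattered around the line y = x; most variance lies along that
        // diagonal, so one component captures almost all of it.
        let data = vec![
            vec![1.0, 1.1],
            vec![2.0, 1.9],
            vec![3.0, 3.2],
            vec![4.0, 3.9],
        ];

        let reduced = principal_component_analysis(data, 1)
            .expect("non-empty data with 1 <= num_components <= num_features");

        // Four samples remain, each described by a single coordinate; because
        // the samples were mean-centred first, the projections sum to ~0.
        assert_eq!(reduced.len(), 4);
        assert!(reduced.iter().all(|row| row.len() == 1));
        let sum: f64 = reduced.iter().map(|row| row[0]).sum();
        assert!(sum.abs() < 1e-6);
    }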