diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8910a61d3..7883bb169 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -81,6 +81,7 @@ jobs: run: composer analyze-ci - name: Unit Tests + #run: vendor/bin/phpunit --display-warning --display-deprecations --display-notices --testsuite="Anomaly Detectors,Backends,Base,Classifiers,Clusterers,Cross Validation,Datasets,Extractors,Graph,Helpers,Kernels,Loggers,NeuralNet,Persisters,Regressors,Serializers,Specifications,Strategies,Tokenizers,Transformers" run: composer test - name: Check Coding Style diff --git a/composer.json b/composer.json index 59cd8d197..cdc8a4c34 100644 --- a/composer.json +++ b/composer.json @@ -38,6 +38,7 @@ "andrewdalpino/okbloomer": "^1.0", "psr/log": "^1.1|^2.0|^3.0", "rubix/tensor": "^3.0", + "rubixml/numpower": "dev-main", "symfony/polyfill-mbstring": "^1.0", "symfony/polyfill-php80": "^1.17", "symfony/polyfill-php82": "^1.27", @@ -52,7 +53,8 @@ "phpstan/phpstan": "^2.0", "phpstan/phpstan-phpunit": "^2.0", "phpunit/phpunit": "^12.0", - "swoole/ide-helper": "^5.1" + "swoole/ide-helper": "^5.1", + "apphp/pretty-print": "^0.6.0" }, "suggest": { "ext-tensor": "For fast Matrix/Vector computing", diff --git a/docs/datasets/generators/hyperplane.md b/docs/datasets/generators/hyperplane.md index a9bc71cfe..65e2e8b9e 100644 --- a/docs/datasets/generators/hyperplane.md +++ b/docs/datasets/generators/hyperplane.md @@ -1,4 +1,4 @@ -[source] +[source] # Hyperplane Generates a labeled dataset whose samples form a hyperplane in n-dimensional vector space and whose labels are continuous values drawn from a uniform random distribution between -1 and 1. When the number of coefficients is either 1, 2 or 3, the samples form points, lines, and planes respectively. Due to its linearity, Hyperplane is especially useful for testing linear regression models. 
@@ -16,7 +16,7 @@ Generates a labeled dataset whose samples form a hyperplane in n-dimensional vec ## Example ```php -use Rubix\ML\Datasets\Generators\Hyperplane; +use Rubix\ML\Datasets\Generators\Hyperplane\Hyperplane; $generator = new Hyperplane([0.1, 3, -5, 0.01], 150.0, 0.25); ``` diff --git a/docs/datasets/generators/swiss-roll.md b/docs/datasets/generators/swiss-roll.md index 3b3bf4927..3c9e770d8 100644 --- a/docs/datasets/generators/swiss-roll.md +++ b/docs/datasets/generators/swiss-roll.md @@ -1,4 +1,4 @@ -[source] +[source] # Swiss Roll Generate a non-linear 3-dimensional dataset resembling a *swiss roll* or spiral. The labels are the seeds to the swiss roll transformation. @@ -19,7 +19,7 @@ Generate a non-linear 3-dimensional dataset resembling a *swiss roll* or spiral. ## Example ```php -use Rubix\ML\Datasets\Generators\SwissRoll; +use Rubix\ML\Datasets\Generators\SwissRoll\SwissRoll; $generator = new SwissRoll(5.5, 1.5, -2.0, 10, 21.0, 0.2); ``` diff --git a/docs/regressors/adaline.md b/docs/regressors/adaline.md index 3d1722ebe..b3a28fb19 100644 --- a/docs/regressors/adaline.md +++ b/docs/regressors/adaline.md @@ -1,4 +1,4 @@ -[source] +[source] # Adaline *Adaptive Linear Neuron* is a single layer feed-forward neural network with a continuous linear output neuron suitable for regression tasks. Training is equivalent to solving L2 regularized linear regression ([Ridge](ridge.md)) online using Mini Batch Gradient Descent. 
@@ -20,9 +20,9 @@ ## Example ```php -use Rubix\ML\Regressors\Adaline; -use Rubix\ML\NeuralNet\Optimizers\Adam; -use Rubix\ML\NeuralNet\CostFunctions\HuberLoss; +use Rubix\ML\Regressors\Adaline\Adaline; +use Rubix\ML\NeuralNet\Optimizers\Adam\Adam; +use Rubix\ML\NeuralNet\CostFunctions\HuberLoss\HuberLoss; $estimator = new Adaline(256, new Adam(0.001), 1e-4, 500, 1e-6, 5, new HuberLoss(2.5)); ``` diff --git a/docs/regressors/extra-tree-regressor.md b/docs/regressors/extra-tree-regressor.md index d857f3933..5d5e2e388 100644 --- a/docs/regressors/extra-tree-regressor.md +++ b/docs/regressors/extra-tree-regressor.md @@ -1,4 +1,4 @@ -[source] +[source] # Extra Tree Regressor *Extremely Randomized* Regression Trees differ from standard [Regression Trees](regression-tree.md) in that they choose candidate splits at random rather than searching the entire feature column for the best value to split on. Extra Trees are also faster to build and their predictions have higher variance than a regular decision tree regressor. @@ -17,7 +17,7 @@ ## Example ```php -use Rubix\ML\Regressors\ExtraTreeRegressor; +use Rubix\ML\Regressors\ExtraTreeRegressor\ExtraTreeRegressor; $estimator = new ExtraTreeRegressor(30, 5, 0.05, null); ``` diff --git a/docs/regressors/gradient-boost.md b/docs/regressors/gradient-boost.md index 43c52db19..f0247cf5a 100644 --- a/docs/regressors/gradient-boost.md +++ b/docs/regressors/gradient-boost.md @@ -1,4 +1,4 @@ -[source] +[source] # Gradient Boost Gradient Boost (GBM) is a stage-wise additive ensemble that uses a Gradient Descent boosting scheme for training boosters (Decision Trees) to correct the error residuals of a base learner. 
@@ -28,8 +28,8 @@ Gradient Boost (GBM) is a stage-wise additive ensemble that uses a Gradient Desc ## Example ```php -use Rubix\ML\Regressors\GradientBoost; -use Rubix\ML\Regressors\RegressionTree; +use Rubix\ML\Regressors\GradientBoost\GradientBoost; +use Rubix\ML\Regressors\RegressionTree\RegressionTree; use Rubix\ML\CrossValidation\Metrics\SMAPE; $estimator = new GradientBoost(new RegressionTree(3), 0.1, 0.8, 1000, 1e-4, 3, 10, 0.1, new SMAPE()); diff --git a/docs/regressors/knn-regressor.md b/docs/regressors/knn-regressor.md index 987d6ad00..937880f27 100644 --- a/docs/regressors/knn-regressor.md +++ b/docs/regressors/knn-regressor.md @@ -1,4 +1,4 @@ -[source] +[source] # KNN Regressor K Nearest Neighbors (KNN) is a brute-force distance-based learner that locates the k nearest training samples from the training set and averages their labels to make a prediction. K Nearest Neighbors (KNN) is considered a *lazy* learner because it performs most of its computation at inference time. @@ -19,7 +19,7 @@ K Nearest Neighbors (KNN) is a brute-force distance-based learner that locates t ## Example ```php -use Rubix\ML\Regressors\KNNRegressor; +use Rubix\ML\Regressors\KNNRegressor\KNNRegressor; use Rubix\ML\Kernels\Distance\SafeEuclidean; $estimator = new KNNRegressor(5, false, new SafeEuclidean()); diff --git a/docs/regressors/mlp-regressor.md b/docs/regressors/mlp-regressor.md index bff693bc1..bf2a8e337 100644 --- a/docs/regressors/mlp-regressor.md +++ b/docs/regressors/mlp-regressor.md @@ -1,4 +1,4 @@ -[source] +[source] # MLP Regressor A multilayer feed-forward neural network with a continuous output layer suitable for regression problems. The Multilayer Perceptron regressor is able to handle complex non-linear regression problems by forming higher-order representations of the input features using intermediate user-defined hidden layers. 
The MLP also has network snapshotting and progress monitoring to ensure that the model achieves the highest validation score per a given training time budget. @@ -26,12 +26,12 @@ A multilayer feed-forward neural network with a continuous output layer suitable ## Example ```php -use Rubix\ML\Regressors\MLPRegressor; -use Rubix\ML\NeuralNet\CostFunctions\LeastSquares; -use Rubix\ML\NeuralNet\Layers\Dense; -use Rubix\ML\NeuralNet\Layers\Activation; -use Rubix\ML\NeuralNet\ActivationFunctions\ReLU; -use Rubix\ML\NeuralNet\Optimizers\RMSProp; +use Rubix\ML\Regressors\MLPRegressor\MLPRegressor; +use Rubix\ML\NeuralNet\CostFunctions\LeastSquares\LeastSquares; +use Rubix\ML\NeuralNet\Layers\Dense\Dense; +use Rubix\ML\NeuralNet\Layers\Activation\Activation; +use Rubix\ML\NeuralNet\ActivationFunctions\ReLU\ReLU; +use Rubix\ML\NeuralNet\Optimizers\RMSProp\RMSProp; use Rubix\ML\CrossValidation\Metrics\RSquared; $estimator = new MLPRegressor([ diff --git a/docs/regressors/radius-neighbors-regressor.md b/docs/regressors/radius-neighbors-regressor.md index 153bacf72..efd9b53b5 100644 --- a/docs/regressors/radius-neighbors-regressor.md +++ b/docs/regressors/radius-neighbors-regressor.md @@ -1,4 +1,4 @@ -[source] +[source] # Radius Neighbors Regressor This is the regressor version of [Radius Neighbors](../classifiers/radius-neighbors.md) implementing a binary spatial tree under the hood for fast radius queries. The prediction is a weighted average of each label from the training set that is within a fixed user-defined radius. 
@@ -18,7 +18,7 @@ This is the regressor version of [Radius Neighbors](../classifiers/radius-neighb ## Example ```php -use Rubix\ML\Regressors\RadiusNeighborsRegressor; +use Rubix\ML\Regressors\RadiusNeighborsRegressor\RadiusNeighborsRegressor; use Rubix\ML\Graph\Trees\BallTree; use Rubix\ML\Kernels\Distance\Diagonal; diff --git a/docs/regressors/regression-tree.md b/docs/regressors/regression-tree.md index c60bdcc38..27d399886 100644 --- a/docs/regressors/regression-tree.md +++ b/docs/regressors/regression-tree.md @@ -1,4 +1,4 @@ -[source] +[source] # Regression Tree A decision tree based on the CART (*Classification and Regression Tree*) learning algorithm that performs greedy splitting by minimizing the variance of the labels at each node split. Regression Trees can be used on their own or as the booster in algorithms such as [Gradient Boost](gradient-boost.md). @@ -18,7 +18,7 @@ A decision tree based on the CART (*Classification and Regression Tree*) learnin ## Example ```php -use Rubix\ML\Regressors\RegressionTree; +use Rubix\ML\Regressors\RegressionTree\RegressionTree; $estimator = new RegressionTree(20, 2, 1e-3, 10, null); ``` @@ -50,4 +50,4 @@ public balance() : ?int ## References: [^1]: W. Y. Loh. (2011). Classification and Regression Trees. -[^2]: K. Alsabti. et al. (1998). CLOUDS: A Decision Tree Classifier for Large Datasets. \ No newline at end of file +[^2]: K. Alsabti. et al. (1998). CLOUDS: A Decision Tree Classifier for Large Datasets. diff --git a/docs/regressors/ridge.md b/docs/regressors/ridge.md index 505c3eafc..eef48ed6c 100644 --- a/docs/regressors/ridge.md +++ b/docs/regressors/ridge.md @@ -1,4 +1,4 @@ -[source] +[source] # Ridge L2 regularized linear regression solved using a closed-form solution. The addition of regularization, controlled by the *alpha* hyper-parameter, makes Ridge less likely to overfit the training data than ordinary least squares (OLS). 
@@ -14,7 +14,7 @@ L2 regularized linear regression solved using a closed-form solution. The additi ## Example ```php -use Rubix\ML\Regressors\Ridge; +use Rubix\ML\Regressors\Ridge\Ridge; $estimator = new Ridge(2.0); ``` diff --git a/phpstan-baseline.neon b/phpstan-baseline.neon index 92f45b7e7..8ed931c49 100644 --- a/phpstan-baseline.neon +++ b/phpstan-baseline.neon @@ -19,8 +19,8 @@ parameters: path: src/NeuralNet/Networks/FeedForward/FeedForward.php - - message: '#^Parameter \#1 \$array \(list\\>\) of array_values is already a list, call has no effect\.$#' - identifier: arrayValues.list + message: '#^Parameter \#1 \$labels of method Rubix\\ML\\NeuralNet\\Networks\\FeedForward\\FeedForward\:\:backpropagate\(\) expects list\, array\ given\.$#' + identifier: argument.type count: 1 path: src/NeuralNet/Networks/FeedForward/FeedForward.php @@ -78,6 +78,12 @@ parameters: count: 1 path: src/Classifiers/LogitBoost.php + - + message: '#^Parameter \#2 \$labels of method Rubix\\ML\\CrossValidation\\Metrics\\Metric\:\:score\(\) expects list\, array\ given\.$#' + identifier: argument.type + count: 1 + path: src/Regressors/GradientBoost/GradientBoost.php + - message: '#^Instanceof between Rubix\\ML\\NeuralNet\\Layers\\Hidden and Rubix\\ML\\NeuralNet\\Layers\\Hidden will always evaluate to true\.$#' identifier: instanceof.alwaysTrue @@ -102,6 +108,18 @@ parameters: count: 1 path: src/Classifiers/NaiveBayes.php + - + message: '#^Property Rubix\\ML\\Classifiers\\NaiveBayes\:\:\$counts \(array\\>\>\>\) does not accept non\-empty\-array\\>\>\>\.$#' + identifier: assign.propertyType + count: 1 + path: src/Classifiers/NaiveBayes.php + + - + message: '#^Property Rubix\\ML\\Classifiers\\NaiveBayes\:\:\$probs \(array\\>\>\) does not accept non\-empty\-array\\>\>\.$#' + identifier: assign.propertyType + count: 1 + path: src/Classifiers/NaiveBayes.php + - message: '#^PHPDoc tag @var with type array\ is not subtype of native type array\\>\.$#' identifier: varTag.nativeType @@ -114,6 +132,12 
@@ parameters: count: 1 path: src/Classifiers/RandomForest.php + - + message: '#^Parameter \#1 \.\.\.\$arg1 of function min expects non\-empty\-array, array\\> given\.$#' + identifier: argument.type + count: 1 + path: src/Classifiers/RandomForest.php + - message: '#^Method Rubix\\ML\\Clusterers\\DBSCAN\:\:predict\(\) should return list\ but returns array\\>\.$#' identifier: return.type @@ -133,7 +157,7 @@ parameters: path: src/Clusterers/FuzzyCMeans.php - - message: '#^Parameter \#2 \$labels of method Rubix\\ML\\Clusterers\\KMeans\:\:inertia\(\) expects list\, array given\.$#' + message: '#^Parameter \#2 \$labels of method Rubix\\ML\\Clusterers\\KMeans\:\:inertia\(\) expects list\, array\ given\.$#' identifier: argument.type count: 1 path: src/Clusterers/KMeans.php @@ -336,6 +360,12 @@ parameters: count: 1 path: src/Extractors/CSV.php + - + message: '#^Parameter \#2 \$labels of method Rubix\\ML\\CrossValidation\\Metrics\\Metric\:\:score\(\) expects list\, array\ given\.$#' + identifier: argument.type + count: 1 + path: src/Regressors/GradientBoost.php + - message: '#^Parameter \#1 \.\.\.\$arg1 of function max expects non\-empty\-array, list\ given\.$#' identifier: argument.type @@ -439,16 +469,16 @@ parameters: path: src/Pipeline.php - - message: '#^Parameter \#2 \$labels of method Rubix\\ML\\CrossValidation\\Metrics\\Metric\:\:score\(\) expects list\, array\ given\.$#' - identifier: argument.type + message: '#^Method Rubix\\ML\\Regressors\\KNNRegressor\:\:nearest\(\) should return array\{list\, list\\} but returns array\{array\, float\|int\>, array\, float\>\}\.$#' + identifier: return.type count: 1 - path: src/Regressors/GradientBoost.php + path: src/Regressors/KNNRegressor.php - - message: '#^Method Rubix\\ML\\Regressors\\KNNRegressor\:\:nearest\(\) should return array\{list\, list\\} but returns array\{array\, float\|int\>, array\, float\>\}\.$#' + message: '#^Method Rubix\\ML\\Regressors\\KNNRegressor\\KNNRegressor\:\:nearest\(\) should return array\{list\, 
list\\} but returns array\{array\, float\|int\>, array\, float\>\}\.$#' identifier: return.type count: 1 - path: src/Regressors/KNNRegressor.php + path: src/Regressors/KNNRegressor/KNNRegressor.php - message: '#^Parameter \#1 \$a of method Rubix\\ML\\Kernels\\Distance\\Distance\:\:compute\(\) expects list\, array\ given\.$#' @@ -456,24 +486,48 @@ parameters: count: 1 path: src/Regressors/KNNRegressor.php + - + message: '#^Parameter \#1 \$a of method Rubix\\ML\\Kernels\\Distance\\Distance\:\:compute\(\) expects list\, array\ given\.$#' + identifier: argument.type + count: 1 + path: src/Regressors/KNNRegressor/KNNRegressor.php + - message: '#^Parameter \#1 \$array \(list\\) of array_values is already a list, call has no effect\.$#' identifier: arrayValues.list count: 1 path: src/Regressors/KNNRegressor.php + - + message: '#^Parameter \#1 \$array \(list\\) of array_values is already a list, call has no effect\.$#' + identifier: arrayValues.list + count: 1 + path: src/Regressors/KNNRegressor/KNNRegressor.php + - message: '#^Parameter \#2 \$b of method Rubix\\ML\\Kernels\\Distance\\Distance\:\:compute\(\) expects list\, array\ given\.$#' identifier: argument.type count: 1 path: src/Regressors/KNNRegressor.php + - + message: '#^Parameter \#2 \$b of method Rubix\\ML\\Kernels\\Distance\\Distance\:\:compute\(\) expects list\, array\ given\.$#' + identifier: argument.type + count: 1 + path: src/Regressors/KNNRegressor/KNNRegressor.php + - message: '#^Property Rubix\\ML\\Regressors\\KNNRegressor\:\:\$labels \(list\\) does not accept array\\.$#' identifier: assign.propertyType count: 1 path: src/Regressors/KNNRegressor.php + - + message: '#^Property Rubix\\ML\\Regressors\\KNNRegressor\\KNNRegressor\:\:\$labels \(list\\) does not accept array\\.$#' + identifier: assign.propertyType + count: 1 + path: src/Regressors/KNNRegressor/KNNRegressor.php + - message: '#^Instanceof between Rubix\\ML\\NeuralNet\\Layers\\Hidden and Rubix\\ML\\NeuralNet\\Layers\\Hidden will always evaluate 
to true\.$#' identifier: instanceof.alwaysTrue @@ -588,6 +642,12 @@ parameters: count: 1 path: src/functions.php + - + message: '#^Function Rubix\\ML\\array_pack\(\) has parameter \$samples with no value type specified in iterable type array\.$#' + identifier: missingType.iterableValue + count: 1 + path: src/functions.php + - message: '#^Parameter \#1 \.\.\.\$arg1 of function min expects non\-empty\-array, array\<\(int&T\)\|\(string&T\), float\|int\> given\.$#' identifier: argument.type @@ -1512,34 +1572,16 @@ parameters: count: 1 path: src/Graph/Nodes/Isolator.php - - - message: '#^Parameter \#1 \$sample of method Rubix\\ML\\Graph\\Trees\\Spatial::nearest\(\) expects list, non\-empty\-array, mixed> given\.$#' - identifier: argument.type - count: 1 - path: src/Transformers/KNNImputer.php - - - - message: '#^Parameter \#1 \$sample of method Rubix\\ML\\Graph\\Trees\\Spatial::nearest\(\) expects list, non\-empty\-array, mixed> given\.$#' - identifier: argument.type - count: 1 - path: src/Transformers/HotDeckImputer.php - - message: '#^Parameter \#1 \$labels of method Rubix\\ML\\NeuralNet\\FeedForward::backpropagate\(\) expects list, array given\.$#' identifier: argument.type count: 1 path: src/NeuralNet/FeedForward.php - - - message: '#^Parameter \#1 \$labels of method Rubix\\ML\\NeuralNet\\Networks\\FeedForward\\FeedForward::backpropagate\(\) expects list, array given\.$#' - identifier: argument.type - count: 1 - path: src/NeuralNet/Networks/FeedForward/FeedForward.php - - message: '#^Parameter \#1 \$sample of method Rubix\\ML\\Graph\\Trees\\Spatial::range\(\) expects list, array, float|int> given\.$#' identifier: argument.type - count: 6 + count: 4 path: src/Clusterers/MeanShift.php - @@ -1602,3 +1644,8 @@ parameters: count: 1 path: src/Datasets/Labeled.php + - + message: '#^Parameter \#2 \$labels of method Rubix\\ML\\CrossValidation\\Metrics\\Metric\:\:score\(\) expects list, array given\.$#' + identifier: argument.type + count: 1 + path: 
src/Regressors/MLPRegressor/MLPRegressor.php diff --git a/phpstan-ci.neon b/phpstan-ci.neon index 7173262a0..39bd49742 100644 --- a/phpstan-ci.neon +++ b/phpstan-ci.neon @@ -11,19 +11,53 @@ parameters: - message: '#^Property Rubix\\ML\\Classifiers\\NaiveBayes\:\:\$counts \(array>>>\) does not accept non\-empty\-array>>>\.$#' identifier: assign.propertyType + count: 1 path: src/Classifiers/NaiveBayes.php - message: '#^Property Rubix\\ML\\Classifiers\\NaiveBayes\:\:\$probs \(array>>\) does not accept non\-empty\-array>>\.$#' identifier: assign.propertyType + count: 1 path: src/Classifiers/NaiveBayes.php - message: '#^Parameter \#1 \.\.\.\$arg1 of function min expects non\-empty\-array, array> given\.$#' identifier: argument.type + count: 1 path: src/Classifiers/RandomForest.php + - + message: '#^Property Rubix\\ML\\Classifiers\\ClassificationTree\:\:\$classes \(list\) in isset\(\) is not nullable\.$#' + identifier: isset.property + count: 1 + path: src/Classifiers/ClassificationTree.php + + - + message: '#^Property Rubix\\ML\\Classifiers\\ExtraTreeClassifier\:\:\$classes \(array\) in isset\(\) is not nullable\.$#' + identifier: isset.property + count: 1 + path: src/Classifiers/ExtraTreeClassifier.php + + - + message: '#^Property Rubix\\ML\\Regressors\\GradientBoost\:\:\$ensemble \(array\) in isset\(\) is not nullable\.$#' + identifier: isset.property + count: 2 + path: src/Regressors/GradientBoost.php + + - + message: '#^Property Rubix\\ML\\Regressors\\GradientBoost\\GradientBoost\:\:\$ensemble \(array\) in isset\(\) is not nullable\.$#' + identifier: isset.property + count: 2 + path: src/Regressors/GradientBoost/GradientBoost.php + - message: '#^Parameter \#2 \$labels of method Rubix\\ML\\Clusterers\\KMeans\:\:inertia\(\) expects list, array given\.$#' identifier: argument.type + count: 1 + path: src/Clusterers/KMeans.php + + - + message: '#^Parameter \#2 \$labels of method Rubix\\ML\\CrossValidation\\Metrics\\Metric\:\:score\(\) expects list\, array\ given\.$#' + 
identifier: argument.type + count: 1 path: src/Clusterers/KMeans.php diff --git a/src/Datasets/Generators/Hyperplane/Hyperplane.php b/src/Datasets/Generators/Hyperplane/Hyperplane.php new file mode 100644 index 000000000..0e634bcf3 --- /dev/null +++ b/src/Datasets/Generators/Hyperplane/Hyperplane.php @@ -0,0 +1,116 @@ + + */ +class Hyperplane implements Generator +{ + /** + * The n coefficients of the hyperplane where n is the dimensionality. + * + * @var NDArray + */ + protected NDArray $coefficients; + + /** + * The y intercept term. + * + * @var float + */ + protected float $intercept; + + /** + * The factor of gaussian noise to add to the data points. + * + * @var float + */ + protected float $noise; + + /** + * @param (int|float)[] $coefficients + * @param float $intercept + * @param float $noise + * @throws InvalidArgumentException + */ + public function __construct( + array $coefficients = [1, -1], + float $intercept = 0.0, + float $noise = 0.1 + ) { + if (empty($coefficients)) { + throw new InvalidArgumentException('Cannot generate samples' + . ' with dimensionality less than 1.'); + } + + if ($noise < 0.0) { + throw new InvalidArgumentException('Noise must be' + . " greater than 0, $noise given."); + } + + $this->coefficients = NumPower::array($coefficients); + $this->intercept = $intercept; + $this->noise = $noise; + } + + /** + * Return the dimensionality of the data this generates. + * + * @internal + * + * @return int<0,max> + */ + public function dimensions() : int + { + return $this->coefficients->shape()[0]; + } + + /** + * Generate n data points. 
+ * + * @param int<0,max> $n + * @return Labeled + */ + public function generate(int $n) : Labeled + { + $d = $this->dimensions(); + + $y = NumPower::uniform(size: [$n], low: -1.0, high: 1.0); + + $coefficientsRow = NumPower::reshape($this->coefficients, [1, $d]); + + $yCol = NumPower::reshape(NumPower::add($y, $this->intercept), [$n, 1]); + + $noise = NumPower::multiply( + NumPower::normal(size: [$n, $d], loc: 0.0, scale: 1.0), + $this->noise + ); + + $samples = NumPower::add( + NumPower::matmul($yCol, $coefficientsRow), + $noise + )->toArray(); + + $labels = $y->toArray(); + + return Labeled::quick($samples, $labels); + } +} diff --git a/src/Datasets/Generators/SwissRoll/SwissRoll.php b/src/Datasets/Generators/SwissRoll/SwissRoll.php new file mode 100644 index 000000000..ea49efa4f --- /dev/null +++ b/src/Datasets/Generators/SwissRoll/SwissRoll.php @@ -0,0 +1,187 @@ + + */ +class SwissRoll implements Generator +{ + /** + * The center vector of the swiss roll. + * + * @var list + */ + protected array $center; + + /** + * The scaling factor of the swiss roll. + * + * @var float + */ + protected float $scale; + + /** + * The depth of the swiss roll i.e the scale of the y dimension. + * + * @var float + */ + protected float $depth; + + /** + * The standard deviation of the gaussian noise. + * + * @var float + */ + protected float $noise; + + /** + * @param float $x + * @param float $y + * @param float $z + * @param float $scale + * @param float $depth + * @param float $noise + * @throws InvalidArgumentException + */ + public function __construct( + float $x = 0.0, + float $y = 0.0, + float $z = 0.0, + float $scale = 1.0, + float $depth = 21.0, + float $noise = 0.1 + ) { + if ($scale < 0.0) { + throw new InvalidArgumentException('Scale must be' + . " greater than 0, $scale given."); + } + + if ($depth < 0) { + throw new InvalidArgumentException('Depth must be' + . 
" greater than 0, $depth given."); + } + + if ($noise < 0.0) { + throw new InvalidArgumentException('Noise factor cannot be less' + . " than 0, $noise given."); + } + + $this->center = [$x, $y, $z]; + $this->scale = $scale; + $this->depth = $depth; + $this->noise = $noise; + } + + /** + * Return the dimensionality of the data this generates. + * + * @internal + * + * @return int<0,max> + */ + public function dimensions() : int + { + return 3; + } + + /** + * Generate n data points. + * + * @param int<0,max> $n + * @return Labeled + */ + public function generate(int $n) : Labeled + { + $range = M_PI + HALF_PI; + + $t = []; + $y = []; + $coords = []; + + for ($i = 0; $i < $n; ++$i) { + $u = mt_rand() / mt_getrandmax(); + $ti = (($u * 2.0) + 1.0) * $range; + $t[] = $ti; + + $uy = mt_rand() / mt_getrandmax(); + $y[] = $uy * $this->depth; + + $coords[] = [ + $ti * cos($ti), + $y[$i], + $ti * sin($ti), + ]; + } + + $noise = []; + + if ($this->noise > 0.0) { + for ($i = 0; $i < $n; ++$i) { + $row = []; + + for ($j = 0; $j < 3; ++$j) { + $u1 = mt_rand() / mt_getrandmax(); + $u2 = mt_rand() / mt_getrandmax(); + $u1 = $u1 > 0.0 ? 
$u1 : 1e-12; + + $z0 = sqrt(-2.0 * log($u1)) * cos(2.0 * M_PI * $u2); + + $row[] = $z0 * $this->noise; + } + + $noise[] = $row; + } + } else { + for ($i = 0; $i < $n; ++$i) { + $noise[] = [0.0, 0.0, 0.0]; + } + } + + $center = []; + + for ($i = 0; $i < $n; ++$i) { + $center[] = $this->center; + } + + $coords = NumPower::array($coords); + $noise = NumPower::array($noise); + $center = NumPower::array($center); + + $samples = NumPower::add( + NumPower::add( + NumPower::multiply($coords, $this->scale), + $center + ), + $noise + ); + + return Labeled::quick($samples->toArray(), $t); + } +} diff --git a/src/NeuralNet/Networks/FeedForward/FeedForward.php b/src/NeuralNet/Networks/FeedForward/FeedForward.php index 41610e3b1..7d7aeda26 100644 --- a/src/NeuralNet/Networks/FeedForward/FeedForward.php +++ b/src/NeuralNet/Networks/FeedForward/FeedForward.php @@ -17,6 +17,7 @@ use Rubix\ML\NeuralNet\Optimizers\Base\Optimizer; use Traversable; use function array_reverse; +use function Rubix\ML\array_pack; /** * Feed Forward @@ -302,6 +303,6 @@ private function prepareSamples(Dataset $dataset) : array } // Reindex a nested array to ensure all levels have sequential numeric keys - return array_map('array_values', array_values($samples)); + return array_pack($samples); } } diff --git a/src/NeuralNet/Parameters/Parameter.php b/src/NeuralNet/Parameters/Parameter.php index 0cef2e87a..6741a0e49 100644 --- a/src/NeuralNet/Parameters/Parameter.php +++ b/src/NeuralNet/Parameters/Parameter.php @@ -90,9 +90,14 @@ public function update(NDArray $gradient, Optimizer $optimizer) : void /** * Perform a deep copy of the object upon cloning. + * + * Cloning an NDArray directly may trigger native memory corruption in some + * NumPower builds (e.g. heap corruption/segfaults when parameters are + * snapshotted during training). To make cloning deterministic and stable we + * deep-copy through a PHP array roundtrip: NDArray -> PHP array -> NDArray. 
*/ public function __clone() : void { - $this->param = clone $this->param; + $this->param = NumPower::array($this->param->toArray()); } } diff --git a/src/Regressors/Adaline/Adaline.php b/src/Regressors/Adaline/Adaline.php new file mode 100644 index 000000000..b663a38be --- /dev/null +++ b/src/Regressors/Adaline/Adaline.php @@ -0,0 +1,463 @@ + + */ +class Adaline implements Estimator, Learner, Online, RanksFeatures, Verbose, Persistable +{ + use AutotrackRevisions, LoggerAware; + + /** + * The number of training samples to process at a time. + * + * @var positive-int + */ + protected int $batchSize; + + /** + * The gradient descent optimizer used to update the network parameters. + * + * @var Optimizer + */ + protected Optimizer $optimizer; + + /** + * The amount of L2 regularization applied to the weights of the output layer. + * + * @var float + */ + protected float $l2Penalty; + + /** + * The maximum number of training epochs. i.e. the number of times to iterate before terminating. + * + * @var int<0,max> + */ + protected int $epochs; + + /** + * The minimum change in the training loss necessary to continue training. + * + * @var float + */ + protected float $minChange; + + /** + * The number of epochs without improvement in the training loss to wait before considering an early stop. + * + * @var positive-int + */ + protected int $window; + + /** + * The function that computes the loss associated with an erroneous + * activation during training. + * + * @var RegressionLoss + */ + protected RegressionLoss $costFn; + + /** + * The underlying neural network instance. + * + * @var FeedForward|null + */ + protected ?FeedForward $network = null; + + /** + * The loss at each epoch from the last training session. 
+ * + * @var float[]|null + */ + protected ?array $losses = null; + + /** + * @param int $batchSize + * @param Optimizer|null $optimizer + * @param float $l2Penalty + * @param int $epochs + * @param float $minChange + * @param int $window + * @param RegressionLoss|null $costFn + * @throws InvalidArgumentException + */ + public function __construct( + int $batchSize = 128, + ?Optimizer $optimizer = null, + float $l2Penalty = 1e-4, + int $epochs = 1000, + float $minChange = 1e-4, + int $window = 5, + ?RegressionLoss $costFn = null + ) { + if ($batchSize < 1) { + throw new InvalidArgumentException('Batch size must be' + . " greater than 0, $batchSize given."); + } + + if ($l2Penalty < 0.0) { + throw new InvalidArgumentException('L2 Penalty must be' + . " greater than 0, $l2Penalty given."); + } + + if ($epochs < 0) { + throw new InvalidArgumentException('Number of epochs' + . " must be greater than 0, $epochs given."); + } + + if ($minChange < 0.0) { + throw new InvalidArgumentException('Minimum change must be' + . " greater than 0, $minChange given."); + } + + if ($window < 1) { + throw new InvalidArgumentException('Window must be' + . " greater than 0, $window given."); + } + + $this->batchSize = $batchSize; + $this->optimizer = $optimizer ?? new Adam(); + $this->l2Penalty = $l2Penalty; + $this->epochs = $epochs; + $this->minChange = $minChange; + $this->window = $window; + $this->costFn = $costFn ?? new LeastSquares(); + } + + /** + * Return the estimator type. + * + * @internal + * + * @return EstimatorType + */ + public function type() : EstimatorType + { + return EstimatorType::regressor(); + } + + /** + * Return the data types that the estimator is compatible with. + * + * @internal + * + * @return list + */ + public function compatibility() : array + { + return [ + DataType::continuous(), + ]; + } + + /** + * Return the settings of the hyper-parameters in an associative array. 
+ * + * @internal + * + * @return mixed[] + */ + public function params() : array + { + return [ + 'batch size' => $this->batchSize, + 'optimizer' => $this->optimizer, + 'l2 penalty' => $this->l2Penalty, + 'epochs' => $this->epochs, + 'min change' => $this->minChange, + 'window' => $this->window, + 'cost fn' => $this->costFn, + ]; + } + + /** + * Has the learner been trained? + * + * @return bool + */ + public function trained() : bool + { + return isset($this->network); + } + + /** + * Return an iterable progress table with the steps from the last training session. + * + * @return Generator + */ + public function steps() : Generator + { + if (!$this->losses) { + return; + } + + foreach ($this->losses as $epoch => $loss) { + yield [ + 'epoch' => $epoch, + 'loss' => $loss, + ]; + } + } + + /** + * Return the loss for each epoch from the last training session. + * + * @return float[]|null + */ + public function losses() : ?array + { + return $this->losses; + } + + /** + * Return the underlying neural network instance or null if not trained. + * + * @return FeedForward|null + */ + public function network() : ?FeedForward + { + return $this->network; + } + + /** + * Train the estimator with a dataset. + * + * @param Labeled $dataset + */ + public function train(Dataset $dataset) : void + { + DatasetIsNotEmpty::with($dataset)->check(); + + $this->network = new FeedForward( + new Placeholder1D($dataset->numFeatures()), + [new Dense(1, $this->l2Penalty, true, new XavierUniform())], + new Continuous($this->costFn), + $this->optimizer + ); + + $this->network->initialize(); + + $this->partial($dataset); + } + + /** + * Perform a partial train on the learner. 
     *
     * @param Labeled $dataset
     */
    public function partial(Dataset $dataset) : void
    {
        // Lazily initialize the network on the first call, then re-enter here.
        if (!$this->network) {
            $this->train($dataset);

            return;
        }

        SpecificationChain::with([
            new DatasetIsLabeled($dataset),
            new DatasetIsNotEmpty($dataset),
            new SamplesAreCompatibleWithEstimator($dataset, $this),
            new LabelsAreCompatibleWithLearner($dataset, $this),
            new DatasetHasDimensionality($dataset, $this->network->input()->width()),
        ])->check();

        if ($this->logger) {
            $this->logger->info("Training $this");

            $numParams = number_format($this->network->numParams());

            $this->logger->info("{$numParams} trainable parameters");
        }

        $prevLoss = $bestLoss = INF;
        $numWorseEpochs = 0;

        // Note: the loss history is reset on every partial() call.
        $this->losses = [];

        for ($epoch = 1; $epoch <= $this->epochs; ++$epoch) {
            // Mini-batch gradient descent over a freshly shuffled training set.
            $batches = $dataset->randomize()->batch($this->batchSize);

            $loss = 0.0;

            foreach ($batches as $batch) {
                $loss += $this->network->roundtrip($batch);
            }

            // Average the accumulated batch losses over this epoch.
            $loss /= count($batches);

            $lossChange = abs($prevLoss - $loss);

            $this->losses[$epoch] = $loss;

            if ($this->logger) {
                $lossDirection = $loss < $prevLoss ? '↓' : '↑';

                $message = "Epoch: $epoch, "
                    . "{$this->costFn}: $loss, "
                    . "Loss Change: {$lossDirection}{$lossChange}";

                $this->logger->info($message);
            }

            // Stop on numerical instability rather than training on garbage values.
            if (is_nan($loss)) {
                if ($this->logger) {
                    $this->logger->warning('Numerical under/overflow detected');
                }

                break;
            }

            // A non-positive loss cannot improve any further.
            if ($loss <= 0.0) {
                break;
            }

            // Converged: loss moved by less than the minimum change threshold.
            if ($lossChange < $this->minChange) {
                break;
            }

            // Early stopping: count consecutive epochs without a new best loss.
            if ($loss < $bestLoss) {
                $bestLoss = $loss;

                $numWorseEpochs = 0;
            } else {
                ++$numWorseEpochs;
            }

            if ($numWorseEpochs >= $this->window) {
                break;
            }

            $prevLoss = $loss;
        }

        if ($this->logger) {
            $this->logger->info('Training complete');
        }
    }

    /**
     * Make predictions from a dataset.
     *
     * @param Dataset $dataset
     * @throws RuntimeException
     * @return list<int|float>
     */
    public function predict(Dataset $dataset) : array
    {
        if (!$this->network) {
            throw new RuntimeException('Estimator has not been trained.');
        }

        // The query dataset must have the same dimensionality as the training set.
        DatasetHasDimensionality::with($dataset, $this->network->input()->width())->check();

        $activations = $this->network->infer($dataset);

        // Single output neuron, so column 0 of the activation matrix holds the predictions.
        $activations = array_column($activations->toArray(), 0);

        return $activations;
    }

    /**
     * Return the importance scores of each feature column of the training set.
     *
     * @throws RuntimeException
     * @return float[]
     */
    public function featureImportances() : array
    {
        if (!$this->network) {
            throw new RuntimeException('Estimator has not been trained.');
        }

        $layer = current($this->network->hidden());

        if (!$layer instanceof Dense) {
            throw new RuntimeException('Weight layer is missing.');
        }

        // Convert the weight matrix to a plain PHP array because the current NDArray build
        // does not expose a stable row-extraction helper (e.g. rowAsVector())
        $weights = NumPower::abs($layer->weights())->toArray();

        // This model has a single output neuron, so the first row contains the per-feature weights.
        // NOTE(review): this assumes the weight matrix is laid out (outputs x features);
        // if Dense stores it as (features x outputs), the per-feature weights would be
        // column 0 instead of row 0 - TODO confirm against the Dense layer implementation.
        return $weights[0] ?? [];
    }

    /**
     * Return an associative array containing the data used to serialize the object.
     *
     * @return mixed[]
     */
    public function __serialize() : array
    {
        $properties = get_object_vars($this);

        // Training history and the logger are transient - exclude them from the payload.
        unset($properties['losses'], $properties['logger']);

        return $properties;
    }

    /**
     * Return the string representation of the object.
     *
     * @internal
     *
     * @return string
     */
    public function __toString() : string
    {
        return 'Adaline (' . Params::stringify($this->params()) .
')'; + } +} diff --git a/src/Regressors/ExtraTreeRegressor/ExtraTreeRegressor.php b/src/Regressors/ExtraTreeRegressor/ExtraTreeRegressor.php new file mode 100644 index 000000000..edb89eb6a --- /dev/null +++ b/src/Regressors/ExtraTreeRegressor/ExtraTreeRegressor.php @@ -0,0 +1,202 @@ + + */ +class ExtraTreeRegressor extends ExtraTree implements Estimator, Learner, RanksFeatures, Persistable +{ + use AutotrackRevisions; + + /** + * @param int $maxHeight + * @param int $maxLeafSize + * @param float $minPurityIncrease + * @param int|null $maxFeatures + */ + public function __construct( + int $maxHeight = PHP_INT_MAX, + int $maxLeafSize = 3, + float $minPurityIncrease = 1e-7, + ?int $maxFeatures = null + ) { + parent::__construct($maxHeight, $maxLeafSize, $minPurityIncrease, $maxFeatures); + } + + /** + * Return the estimator type. + * + * @internal + * + * @return EstimatorType + */ + public function type() : EstimatorType + { + return EstimatorType::regressor(); + } + + /** + * Return the data types that the estimator is compatible with. + * + * @internal + * + * @return list + */ + public function compatibility() : array + { + return [ + DataType::categorical(), + DataType::continuous(), + ]; + } + + /** + * Return the settings of the hyper-parameters in an associative array. + * + * @internal + * + * @return mixed[] + */ + public function params() : array + { + return [ + 'max height' => $this->maxHeight, + 'max leaf size' => $this->maxLeafSize, + 'max features' => $this->maxFeatures, + 'min purity increase' => $this->minPurityIncrease, + ]; + } + + /** + * Has the learner been trained? + * + * @return bool + */ + public function trained() : bool + { + return !$this->bare(); + } + + /** + * Train the regression tree by learning the optimal splits in the + * training set. 
+ * + * @param Labeled $dataset + */ + public function train(Dataset $dataset) : void + { + SpecificationChain::with([ + new DatasetIsLabeled($dataset), + new DatasetIsNotEmpty($dataset), + new SamplesAreCompatibleWithEstimator($dataset, $this), + new LabelsAreCompatibleWithLearner($dataset, $this), + ])->check(); + + $this->grow($dataset); + } + + /** + * Make a prediction based on the value of a terminal node in the tree. + * + * @param Dataset $dataset + * @throws RuntimeException + * @return list + */ + public function predict(Dataset $dataset) : array + { + if ($this->bare() or !$this->featureCount) { + throw new RuntimeException('Estimator has not been trained.'); + } + + DatasetHasDimensionality::with($dataset, $this->featureCount)->check(); + + return array_map([$this, 'predictSample'], $dataset->samples()); + } + + /** + * Predict a single sample and return the result. + * + * @internal + * + * @param list $sample + * @return int|float + */ + public function predictSample(array $sample) : int|float + { + /** @var Average $node */ + $node = $this->search($sample); + + return $node->outcome(); + } + + /** + * Terminate the branch with the most likely Average. + * + * @param Labeled $dataset + * @return Average + */ + protected function terminate(Labeled $dataset) : Average + { + [$mean, $variance] = Stats::meanVar($dataset->labels()); + + return new Average($mean, $variance, $dataset->numSamples()); + } + + /** + * Calculate the impurity of a set of labels. + * + * @param list $labels + * @return float + */ + protected function impurity(array $labels) : float + { + return Stats::variance($labels); + } + + /** + * Return the string representation of the object. + * + * @internal + * + * @return string + */ + public function __toString() : string + { + return 'Extra Tree Regressor (' . Params::stringify($this->params()) . 
')'; + } +} diff --git a/src/Regressors/GradientBoost/GradientBoost.php b/src/Regressors/GradientBoost/GradientBoost.php new file mode 100644 index 000000000..66182dbba --- /dev/null +++ b/src/Regressors/GradientBoost/GradientBoost.php @@ -0,0 +1,626 @@ + + */ +class GradientBoost implements Estimator, Learner, RanksFeatures, Verbose, Persistable +{ + use AutotrackRevisions, LoggerAware; + + /** + * The class names of the compatible learners to used as boosters. + * + * @var class-string[] + */ + public const COMPATIBLE_BOOSTERS = [ + RegressionTree::class, + ExtraTreeRegressor::class, + ]; + + /** + * The minimum size of each training subset. + * + * @var int + */ + protected const MIN_SUBSAMPLE = 2; + + /** + * The regressor that will fix up the error residuals of the *weak* base learner. + * + * @var Learner + */ + protected Learner $booster; + + /** + * The learning rate of the ensemble i.e. the *shrinkage* applied to each step. + * + * @var float + */ + protected float $rate; + + /** + * The ratio of samples to subsample from the training set for each booster. + * + * @var float + */ + protected float $ratio; + + /** + * The maximum number of training epochs. i.e. the number of times to iterate before terminating. + * + * @var int<0,max> + */ + protected int $epochs; + + /** + * The minimum change in the training loss necessary to continue training. + * + * @var float + */ + protected float $minChange; + + /** + * The number of epochs to train before evaluating the model with the holdout set. + * + * @var int + */ + protected int $evalInterval; + + /** + * The number of epochs without improvement in the validation score to wait before considering an + * early stop. + * + * @var positive-int + */ + protected int $window; + + /** + * The proportion of training samples to use for validation and progress monitoring. + * + * @var float + */ + protected float $holdOut; + + /** + * The metric used to score the generalization performance of the model during training. 
+ * + * @var Metric + */ + protected Metric $metric; + + /** + * An ensemble of weak regressors. + * + * @var mixed[] + */ + protected array $ensemble = [ + // + ]; + + /** + * The validation scores at each epoch. + * + * @var float[]|null + */ + protected ?array $scores = null; + + /** + * The average training loss at each epoch. + * + * @var float[]|null + */ + protected ?array $losses = null; + + /** + * The dimensionality of the training set. + * + * @var int<0,max>|null + */ + protected ?int $featureCount = null; + + /** + * The mean of the labels of the training set. + * + * @var float|null + */ + protected ?float $mu = null; + + /** + * @param Learner|null $booster + * @param float $rate + * @param float $ratio + * @param int $epochs + * @param float $minChange + * @param int $evalInterval + * @param int $window + * @param float $holdOut + * @param Metric|null $metric + * @throws InvalidArgumentException + */ + public function __construct( + ?Learner $booster = null, + float $rate = 0.1, + float $ratio = 0.5, + int $epochs = 1000, + float $minChange = 1e-4, + int $evalInterval = 3, + int $window = 5, + float $holdOut = 0.1, + ?Metric $metric = null + ) { + if ($booster and !in_array(get_class($booster), self::COMPATIBLE_BOOSTERS)) { + throw new InvalidArgumentException('Booster is not compatible' + . ' with the ensemble.'); + } + + if ($rate <= 0.0) { + throw new InvalidArgumentException('Learning rate must be' + . " greater than 0, $rate given."); + } + + if ($ratio <= 0.0 or $ratio > 1.0) { + throw new InvalidArgumentException('Ratio must be' + . " between 0 and 1, $ratio given."); + } + + if ($epochs < 0) { + throw new InvalidArgumentException('Number of epochs' + . " must be greater than 0, $epochs given."); + } + + if ($minChange < 0.0) { + throw new InvalidArgumentException('Minimum change must be' + . " greater than 0, $minChange given."); + } + + if ($evalInterval < 1) { + throw new InvalidArgumentException('Eval interval must be' + . 
" greater than 0, $evalInterval given."); + } + + if ($window < 1) { + throw new InvalidArgumentException('Window must be' + . " greater than 0, $window given."); + } + + if ($holdOut < 0.0 or $holdOut > 0.5) { + throw new InvalidArgumentException('Hold out ratio must be' + . " between 0 and 0.5, $holdOut given."); + } + + if ($metric) { + EstimatorIsCompatibleWithMetric::with($this, $metric)->check(); + } + + $this->booster = $booster ?? new RegressionTree(3); + $this->rate = $rate; + $this->ratio = $ratio; + $this->epochs = $epochs; + $this->minChange = $minChange; + $this->evalInterval = $evalInterval; + $this->window = $window; + $this->holdOut = $holdOut; + $this->metric = $metric ?? new RMSE(); + } + + /** + * Return the estimator type. + * + * @internal + * + * @return EstimatorType + */ + public function type() : EstimatorType + { + return EstimatorType::regressor(); + } + + /** + * Return the data types that the estimator is compatible with. + * + * @internal + * + * @return list<\Rubix\ML\DataType> + */ + public function compatibility() : array + { + return $this->booster->compatibility(); + } + + /** + * Return the settings of the hyper-parameters in an associative array. + * + * @internal + * + * @return mixed[] + */ + public function params() : array + { + return [ + 'booster' => $this->booster, + 'rate' => $this->rate, + 'ratio' => $this->ratio, + 'epochs' => $this->epochs, + 'min change' => $this->minChange, + 'eval interval' => $this->evalInterval, + 'window' => $this->window, + 'hold out' => $this->holdOut, + 'metric' => $this->metric, + ]; + } + + /** + * Has the learner been trained? + * + * @return bool + */ + public function trained() : bool + { + return !empty($this->ensemble); + } + + /** + * Return an iterable progress table with the steps from the last training session. 
+ * + * @return Generator + */ + public function steps() : Generator + { + if (!$this->losses) { + return; + } + + foreach ($this->losses as $epoch => $loss) { + yield [ + 'epoch' => $epoch, + 'score' => $this->scores[$epoch] ?? null, + 'loss' => $loss, + ]; + } + } + + /** + * Return the validation scores at each epoch from the last training session. + * + * @return float[]|null + */ + public function scores() : ?array + { + return $this->scores; + } + + /** + * Return the loss for each epoch from the last training session. + * + * @return float[]|null + */ + public function losses() : ?array + { + return $this->losses; + } + + /** + * Train the estimator with a dataset. + * + * @param Labeled $dataset + */ + public function train(Dataset $dataset) : void + { + SpecificationChain::with([ + new DatasetIsLabeled($dataset), + new DatasetIsNotEmpty($dataset), + new SamplesAreCompatibleWithEstimator($dataset, $this), + new LabelsAreCompatibleWithLearner($dataset, $this), + ])->check(); + + if ($this->logger) { + $this->logger->info("Training $this"); + } + + [$testing, $training] = $dataset->randomize()->split($this->holdOut); + + [$minScore, $maxScore] = $this->metric->range()->list(); + + [$m, $n] = $training->shape(); + + $targets = $training->labels(); + + $mu = Stats::mean($targets); + + $out = array_fill(0, $m, $mu); + + if (!$testing->empty()) { + $outTest = array_fill(0, $testing->numSamples(), $mu); + } elseif ($this->logger) { + $this->logger->notice('Insufficient validation data, ' + . 
'some features are disabled'); + } + + $p = max(self::MIN_SUBSAMPLE, (int) round($this->ratio * $m)); + + $weights = array_fill(0, $m, 1.0 / $m); + + $this->featureCount = $n; + $this->ensemble = $this->scores = $this->losses = []; + $this->mu = $mu; + + $bestScore = $minScore; + $bestEpoch = $numWorseEpochs = 0; + $score = null; + $prevLoss = INF; + + for ($epoch = 1; $epoch <= $this->epochs; ++$epoch) { + $gradient = array_map([$this, 'gradient'], $out, $targets); + $loss = array_reduce($gradient, [$this, 'l2Loss'], 0.0); + + $loss /= $m; + + $lossChange = abs($prevLoss - $loss); + + $this->losses[$epoch] = $loss; + + if ($epoch % $this->evalInterval === 0 && isset($outTest)) { + $score = $this->metric->score($outTest, $testing->labels()); + + $this->scores[$epoch] = $score; + } + + if ($this->logger) { + $message = "Epoch: $epoch, L2 Loss: $loss"; + + if (isset($score)) { + $message .= ", {$this->metric}: $score"; + } + + $this->logger->info($message); + } + + if (is_nan($loss)) { + if ($this->logger) { + $this->logger->warning('Numerical instability detected'); + } + + break; + } + + if (isset($score)) { + if ($score >= $maxScore) { + break; + } + + if ($score > $bestScore) { + $bestScore = $score; + $bestEpoch = $epoch; + + $numWorseEpochs = 0; + } else { + ++$numWorseEpochs; + } + + if ($numWorseEpochs >= $this->window) { + break; + } + + unset($score); + } + + if ($lossChange < $this->minChange) { + break; + } + + $training = Labeled::quick($training->samples(), $gradient); + + $subset = $training->randomWeightedSubsetWithReplacement($p, $weights); + + $booster = clone $this->booster; + + $booster->train($subset); + + $this->ensemble[] = $booster; + + $predictions = $booster->predict($training); + + $out = array_map([$this, 'updateOut'], $predictions, $out); + + if (isset($outTest)) { + $predictions = $booster->predict($testing); + + $outTest = array_map([$this, 'updateOut'], $predictions, $outTest); + } + + $weights = array_map('abs', $gradient); + + 
$prevLoss = $loss; + } + + if ($this->scores and end($this->scores) <= $bestScore) { + $this->ensemble = array_slice($this->ensemble, 0, $bestEpoch); + + if ($this->logger) { + $this->logger->info("Model state restored to epoch $bestEpoch"); + } + } + + if ($this->logger) { + $this->logger->info('Training complete'); + } + } + + /** + * Make a prediction from a dataset. + * + * @param Dataset $dataset + * @throws RuntimeException + * @return list + */ + public function predict(Dataset $dataset) : array + { + if (!isset($this->ensemble, $this->featureCount, $this->mu)) { + throw new RuntimeException('Estimator has not been trained.'); + } + + DatasetHasDimensionality::with($dataset, $this->featureCount)->check(); + + $out = array_fill(0, $dataset->numSamples(), $this->mu); + + foreach ($this->ensemble as $estimator) { + $predictions = $estimator->predict($dataset); + + $out = array_map([$this, 'updateOut'], $predictions, $out); + } + + return $out; + } + + /** + * Return the importance scores of each feature column of the training set. + * + * @throws RuntimeException + * @return float[] + */ + public function featureImportances() : array + { + if (!isset($this->ensemble, $this->featureCount)) { + throw new RuntimeException('Estimator has not been trained.'); + } + + $importances = array_fill(0, $this->featureCount, 0.0); + + foreach ($this->ensemble as $tree) { + $scores = $tree->featureImportances(); + + foreach ($scores as $column => $score) { + $importances[$column] += $score; + } + } + + $numEstimators = count($this->ensemble); + + foreach ($importances as &$importance) { + $importance /= $numEstimators; + } + + return $importances; + } + + /** + * Compute the output for an iteration. + * + * @param float $prediction + * @param float $out + * @return float + */ + protected function updateOut(float $prediction, float $out) : float + { + return $this->rate * $prediction + $out; + } + + /** + * Compute the gradient for a single sample. 
+     *
+     * @param float $out
+     * @param float $target
+     * @return float
+     */
+    protected function gradient(float $out, float $target) : float
+    {
+        return $target - $out;
+    }
+
+    /**
+     * Compute the squared error (L2) loss function.
+     *
+     * @param float $loss
+     * @param float $derivative
+     * @return float
+     */
+    protected function l2Loss(float $loss, float $derivative) : float
+    {
+        return $loss + $derivative ** 2;
+    }
+
+    /**
+     * Return an associative array containing the data used to serialize the object.
+     *
+     * @return mixed[]
+     */
+    public function __serialize() : array
+    {
+        $properties = get_object_vars($this);
+
+        unset($properties['losses'], $properties['scores'], $properties['logger']);
+
+        return $properties;
+    }
+
+    /**
+     * Return the string representation of the object.
+     *
+     * @internal
+     *
+     * @return string
+     */
+    public function __toString() : string
+    {
+        return 'Gradient Boost (' . Params::stringify($this->params()) . ')';
+    }
+}
diff --git a/src/Regressors/KNNRegressor/KNNRegressor.php b/src/Regressors/KNNRegressor/KNNRegressor.php
new file mode 100644
index 000000000..a28be25e7
--- /dev/null
+++ b/src/Regressors/KNNRegressor/KNNRegressor.php
@@ -0,0 +1,260 @@
+ > **Note:** This learner is considered a *lazy* learner because it does the majority
+ * of its computation during inference. For a fast spatial tree-accelerated version, see
+ * KD Neighbors Regressor.
+ *
+ * @category Machine Learning
+ * @package Rubix/ML
+ * @author Andrew DalPino
+ * @author Samuel Akopyan
+ */
+class KNNRegressor implements Estimator, Learner, Online, Persistable
+{
+    use AutotrackRevisions;
+
+    /**
+     * The number of neighbors to consider when making a prediction.
+     *
+     * @var int
+     */
+    protected int $k;
+
+    /**
+     * Should we consider the distances of our nearest neighbors when making predictions?
+     *
+     * @var bool
+     */
+    protected bool $weighted;
+
+    /**
+     * The distance kernel to use when computing the distances.
+ * + * @var Distance + */ + protected Distance $kernel; + + /** + * The training samples. + * + * @var list<(string|int|float)[]> + */ + protected array $samples = [ + // + ]; + + /** + * The training labels. + * + * @var list + */ + protected array $labels = [ + // + ]; + + /** + * @param int $k + * @param bool $weighted + * @param Distance|null $kernel + * @throws InvalidArgumentException + */ + public function __construct(int $k = 5, bool $weighted = false, ?Distance $kernel = null) + { + if ($k < 1) { + throw new InvalidArgumentException('At least 1 neighbor is required' + . " to make a prediction, $k given."); + } + + $this->k = $k; + $this->weighted = $weighted; + $this->kernel = $kernel ?? new Euclidean(); + } + + /** + * Return the estimator type. + * + * @internal + * + * @return EstimatorType + */ + public function type() : EstimatorType + { + return EstimatorType::regressor(); + } + + /** + * Return the data types that the estimator is compatible with. + * + * @internal + * + * @return list<\Rubix\ML\DataType> + */ + public function compatibility() : array + { + return $this->kernel->compatibility(); + } + + /** + * Return the settings of the hyper-parameters in an associative array. + * + * @internal + * + * @return mixed[] + */ + public function params() : array + { + return [ + 'k' => $this->k, + 'weighted' => $this->weighted, + 'kernel' => $this->kernel, + ]; + } + + /** + * Has the learner been trained? + * + * @return bool + */ + public function trained() : bool + { + return $this->samples and $this->labels; + } + + /** + * Train the learner with a dataset. + * + * @param Labeled $dataset + */ + public function train(Dataset $dataset) : void + { + $this->samples = $this->labels = []; + + $this->partial($dataset); + } + + /** + * Perform a partial train on the learner. 
+ * + * @param Labeled $dataset + */ + public function partial(Dataset $dataset) : void + { + SpecificationChain::with([ + new DatasetIsLabeled($dataset), + new DatasetIsNotEmpty($dataset), + new SamplesAreCompatibleWithEstimator($dataset, $this), + new LabelsAreCompatibleWithLearner($dataset, $this), + ])->check(); + + $this->samples = array_merge($this->samples, $dataset->samples()); + $this->labels = array_merge($this->labels, $dataset->labels()); + } + + /** + * Make a prediction based on the nearest neighbors. + * + * @param Dataset $dataset + * @throws RuntimeException + * @return list + */ + public function predict(Dataset $dataset) : array + { + if (!$this->samples or !$this->labels) { + throw new RuntimeException('Estimator has not been trained.'); + } + + DatasetHasDimensionality::with($dataset, count(current($this->samples)))->check(); + + return array_map([$this, 'predictSample'], $dataset->samples()); + } + + /** + * Predict a single sample and return the result. + * + * @internal + * + * @param list $sample + * @return int|float + */ + public function predictSample(array $sample) : int|float + { + [$labels, $distances] = $this->nearest($sample); + + if ($this->weighted) { + $distances = NumPower::array($distances); + $weights = NumPower::divide(1.0, NumPower::add($distances, 1.0))->toArray(); + + return Stats::weightedMean(array_values($labels), $weights); + } + + return Stats::mean($labels); + } + + /** + * Find the K nearest neighbors to the given sample vector using the brute force method. 
+ * + * @param (string|int|float)[] $sample + * @return array{list,list} + */ + protected function nearest(array $sample) : array + { + $distances = []; + + foreach ($this->samples as $neighbor) { + $distances[] = $this->kernel->compute($sample, $neighbor); + } + + asort($distances); + + $distances = array_slice($distances, 0, $this->k, true); + + $labels = array_intersect_key($this->labels, $distances); + + return [$labels, $distances]; + } + + /** + * Return the string representation of the object. + * + * @internal + * + * @return string + */ + public function __toString() : string + { + return 'KNN Regressor (' . Params::stringify($this->params()) . ')'; + } +} diff --git a/src/Regressors/MLPRegressor/MLPRegressor.php b/src/Regressors/MLPRegressor/MLPRegressor.php new file mode 100644 index 000000000..77c13c644 --- /dev/null +++ b/src/Regressors/MLPRegressor/MLPRegressor.php @@ -0,0 +1,572 @@ + + */ +class MLPRegressor implements Estimator, Learner, Online, Verbose, Persistable +{ + use AutotrackRevisions, LoggerAware; + + /** + * An array composing the user-specified hidden layers of the network in order. + * + * @var Hidden[] + */ + protected array $hiddenLayers = [ + // + ]; + + /** + * The number of training samples to process at a time. + * + * @var positive-int + */ + protected int $batchSize; + + /** + * The gradient descent optimizer used to update the network parameters. + * + * @var Optimizer + */ + protected Optimizer $optimizer; + + /** + * The maximum number of training epochs. i.e. the number of times to iterate before terminating. + * + * @var int<0,max> + */ + protected int $epochs; + + /** + * The minimum change in the training loss necessary to continue training. + * + * @var float + */ + protected float $minChange; + + /** + * The number of epochs to train before evaluating the model with the holdout set. 
+ * + * @var int + */ + protected int $evalInterval; + + /** + * The number of epochs without improvement in the validation score to wait before considering an early stop. + * + * @var positive-int + */ + protected int $window; + + /** + * The proportion of training samples to use for validation and progress monitoring. + * + * @var float + */ + protected float $holdOut; + + /** + * The function that computes the loss associated with an erroneous activation during training. + * + * @var RegressionLoss + */ + protected RegressionLoss $costFn; + + /** + * The metric used to score the generalization performance of the model during training. + * + * @var Metric + */ + protected Metric $metric; + + /** + * The underlying neural network instance. + * + * @var FeedForward|null + */ + protected ?FeedForward $network = null; + + /** + * The validation scores at each epoch from the last training session. + * + * @var float[]|null + */ + protected ?array $scores = null; + + /** + * The loss at each epoch from the last training session. + * + * @var float[]|null + */ + protected ?array $losses = null; + + /** + * Whether to pack the samples. + * + * @var bool + */ + private bool $packSamples; + + /** + * @param list $hiddenLayers + * @param int $batchSize + * @param Optimizer|null $optimizer + * @param int $epochs + * @param float $minChange + * @param int $evalInterval + * @param int $window + * @param float $holdOut + * @param RegressionLoss|null $costFn + * @param Metric|null $metric + * @param bool $packSamples + */ + public function __construct( + array $hiddenLayers = [], + int $batchSize = 128, + ?Optimizer $optimizer = null, + int $epochs = 1000, + float $minChange = 1e-4, + int $evalInterval = 3, + int $window = 5, + float $holdOut = 0.1, + ?RegressionLoss $costFn = null, + ?Metric $metric = null, + bool $packSamples = false + ) { + foreach ($hiddenLayers as $layer) { + if (!$layer instanceof Hidden) { + throw new InvalidArgumentException('Hidden layer' + . 
' must implement the Hidden interface.'); + } + } + + if ($batchSize < 1) { + throw new InvalidArgumentException('Batch size must be' + . " greater than 0, $batchSize given."); + } + + if ($epochs < 0) { + throw new InvalidArgumentException('Number of epochs' + . " must be greater than 0, $epochs given."); + } + + if ($minChange < 0.0) { + throw new InvalidArgumentException('Minimum change must be' + . " greater than 0, $minChange given."); + } + + if ($evalInterval < 1) { + throw new InvalidArgumentException('Eval interval must be' + . " greater than 0, $evalInterval given."); + } + + if ($window < 1) { + throw new InvalidArgumentException('Window must be' + . " greater than 0, $window given."); + } + + if ($holdOut < 0.0 or $holdOut > 0.5) { + throw new InvalidArgumentException('Hold out ratio must be' + . " between 0 and 0.5, $holdOut given."); + } + + if ($metric) { + EstimatorIsCompatibleWithMetric::with($this, $metric)->check(); + } + + $this->hiddenLayers = $hiddenLayers; + $this->batchSize = $batchSize; + $this->optimizer = $optimizer ?? new Adam(); + $this->epochs = $epochs; + $this->minChange = $minChange; + $this->evalInterval = $evalInterval; + $this->window = $window; + $this->holdOut = $holdOut; + $this->costFn = $costFn ?? new LeastSquares(); + $this->metric = $metric ?? new RMSE(); + $this->packSamples = $packSamples; + } + + /** + * Return the estimator type. + * + * @internal + * + * @return EstimatorType + */ + public function type() : EstimatorType + { + return EstimatorType::regressor(); + } + + /** + * Return the data types that the estimator is compatible with. + * + * @internal + * + * @return list + */ + public function compatibility() : array + { + return [ + DataType::continuous(), + ]; + } + + /** + * Return the settings of the hyper-parameters in an associative array. 
+ * + * @internal + * + * @return mixed[] + */ + public function params() : array + { + return [ + 'hidden layers' => $this->hiddenLayers, + 'batch size' => $this->batchSize, + 'optimizer' => $this->optimizer, + 'epochs' => $this->epochs, + 'min change' => $this->minChange, + 'eval interval' => $this->evalInterval, + 'window' => $this->window, + 'hold out' => $this->holdOut, + 'cost fn' => $this->costFn, + 'metric' => $this->metric, + ]; + } + + /** + * Has the learner been trained? + * + * @return bool + */ + public function trained() : bool + { + return isset($this->network); + } + + /** + * Return an iterable progress table with the steps from the last training session. + * + * @return Generator + */ + public function steps() : Generator + { + if (!$this->losses) { + return; + } + + foreach ($this->losses as $epoch => $loss) { + yield [ + 'epoch' => $epoch, + 'score' => $this->scores[$epoch] ?? null, + 'loss' => $loss, + ]; + } + } + + /** + * Return the validation score at each epoch. + * + * @return float[]|null + */ + public function scores() : ?array + { + return $this->scores; + } + + /** + * Return the training loss at each epoch. + * + * @return float[]|null + */ + public function losses() : ?array + { + return $this->losses; + } + + /** + * Return the underlying neural network instance or null if not trained. + * + * @return FeedForward|null + */ + public function network() : ?FeedForward + { + return $this->network; + } + + /** + * Train the estimator with a dataset. 
+ * + * @param Labeled $dataset + */ + public function train(Dataset $dataset) : void + { + DatasetIsNotEmpty::with($dataset)->check(); + + $hiddenLayers = $this->hiddenLayers; + + $hiddenLayers[] = new Dense(1, 0.0, true, new XavierUniform()); + + $this->network = new FeedForward( + input: new Placeholder1D($dataset->numFeatures()), + hidden: $hiddenLayers, + output: new Continuous($this->costFn), + optimizer: $this->optimizer, + packSamples: $this->packSamples + ); + + $this->network->initialize(); + + $this->partial($dataset); + } + + /** + * Train the network using mini-batch gradient descent with backpropagation. + * + * @param Labeled $dataset + * @throws RuntimeException + */ + public function partial(Dataset $dataset) : void + { + if (!$this->network) { + $this->train($dataset); + + return; + } + + SpecificationChain::with([ + new DatasetIsLabeled($dataset), + new DatasetIsNotEmpty($dataset), + new SamplesAreCompatibleWithEstimator($dataset, $this), + new LabelsAreCompatibleWithLearner($dataset, $this), + new DatasetHasDimensionality($dataset, $this->network->input()->width()), + ])->check(); + + if ($this->logger) { + $this->logger->info("Training $this"); + + $numParams = number_format($this->network->numParams()); + + $this->logger->info("{$numParams} trainable parameters"); + } + + [$testing, $training] = $dataset->randomize()->split($this->holdOut); + + [$minScore, $maxScore] = $this->metric->range()->list(); + + $bestScore = $minScore; + $bestEpoch = $numWorseEpochs = 0; + $loss = 0.0; + $score = $snapshot = null; + $prevLoss = INF; + + $this->scores = $this->losses = []; + + for ($epoch = 1; $epoch <= $this->epochs; ++$epoch) { + $batches = $training->randomize()->batch($this->batchSize); + + $loss = 0.0; + + foreach ($batches as $batch) { + $loss += $this->network->roundtrip($batch); + } + + $loss /= count($batches); + + $lossChange = abs($prevLoss - $loss); + + $this->losses[$epoch] = $loss; + + if (is_nan($loss)) { + if ($this->logger) { + 
$this->logger->warning('Numerical instability detected'); + } + + break; + } + + if ($epoch % $this->evalInterval === 0 && !$testing->empty()) { + $predictions = $this->predict($testing); + + $score = $this->metric->score($predictions, $testing->labels()); + + $this->scores[$epoch] = $score; + } + + if ($this->logger) { + $message = "Epoch: $epoch, {$this->costFn}: $loss"; + + if (isset($score)) { + $message .= ", {$this->metric}: $score"; + } + + $this->logger->info($message); + } + + if (isset($score)) { + if ($score >= $maxScore) { + break; + } + + if ($score > $bestScore) { + $bestScore = $score; + $bestEpoch = $epoch; + + $snapshot = Snapshot::take($this->network); + + $numWorseEpochs = 0; + } else { + ++$numWorseEpochs; + } + + if ($numWorseEpochs >= $this->window) { + break; + } + + unset($score); + } + + if ($lossChange < $this->minChange) { + break; + } + + $prevLoss = $loss; + } + + if ($snapshot and (end($this->scores) < $bestScore or is_nan($loss))) { + $snapshot->restore(); + + if ($this->logger) { + $this->logger->info("Model state restored to epoch $bestEpoch"); + } + } + + if ($this->logger) { + $this->logger->info('Training complete'); + } + } + + /** + * Feed a sample through the network and make a prediction based on the + * activation of the output neuron. + * + * @param Dataset $dataset + * @throws RuntimeException + * @return list + */ + public function predict(Dataset $dataset) : array + { + if (!$this->network) { + throw new RuntimeException('Estimator has not been trained.'); + } + + DatasetHasDimensionality::with($dataset, $this->network->input()->width())->check(); + + $activations = $this->network->infer($dataset); + + $activations = array_column($activations->toArray(), 0); + + return $activations; + } + + /** + * Export the network architecture as a graph in dot format. 
+ * + * @throws RuntimeException + * @return Encoding + */ + public function exportGraphviz() : Encoding + { + if (!$this->network) { + throw new RuntimeException('Must train network first.'); + } + + return $this->network->exportGraphviz(); + } + + /** + * Return an associative array containing the data used to serialize the object. + * + * @return mixed[] + */ + public function __serialize() : array + { + $properties = get_object_vars($this); + + unset($properties['losses'], $properties['scores'], $properties['logger']); + + return $properties; + } + + /** + * Return the string representation of the object. + * + * @internal + * + * @return string + */ + public function __toString() : string + { + return 'MLP Regressor (' . Params::stringify($this->params()) . ')'; + } +} diff --git a/src/Regressors/RadiusNeighborsRegressor/RadiusNeighborsRegressor.php b/src/Regressors/RadiusNeighborsRegressor/RadiusNeighborsRegressor.php new file mode 100644 index 000000000..715b6f154 --- /dev/null +++ b/src/Regressors/RadiusNeighborsRegressor/RadiusNeighborsRegressor.php @@ -0,0 +1,232 @@ + **Note**: Unknown samples with no training samples within radius are labeled + * *NaN*. As such, Radius Neighbors is also a quasi anomaly detector. + * + * @category Machine Learning + * @package Rubix/ML + * @author Andrew DalPino + * @author Samuel Akopyan + */ +class RadiusNeighborsRegressor implements Estimator, Learner, Persistable +{ + use AutotrackRevisions; + + /** + * The value to assign to outliers when making a prediction. + * + * @var mixed + */ + public const OUTLIER_VALUE = NAN; + + /** + * The radius within which points are considered neighbors. + * + * @var float + */ + protected float $radius; + + /** + * Should we consider the distances of our nearest neighbors when making predictions? + * + * @var bool + */ + protected bool $weighted; + + /** + * The spatial tree used to run range searches. 
+ * + * @var Spatial + */ + protected Spatial $tree; + + /** + * The dimensionality of the training set. + * + * @var int|null + */ + protected ?int $featureCount = null; + + /** + * @param float $radius + * @param bool $weighted + * @param Spatial|null $tree + * @throws InvalidArgumentException + */ + public function __construct(float $radius = 1.0, bool $weighted = false, ?Spatial $tree = null) + { + if ($radius <= 0.0) { + throw new InvalidArgumentException('Radius must be' + . " greater than 0, $radius given."); + } + + $this->radius = $radius; + $this->weighted = $weighted; + $this->tree = $tree ?? new BallTree(); + } + + /** + * Return the estimator type. + * + * @internal + * + * @return EstimatorType + */ + public function type() : EstimatorType + { + return EstimatorType::regressor(); + } + + /** + * Return the data types that the estimator is compatible with. + * + * @internal + * + * @return list<\Rubix\ML\DataType> + */ + public function compatibility() : array + { + return $this->tree->kernel()->compatibility(); + } + + /** + * Return the settings of the hyper-parameters in an associative array. + * + * @internal + * + * @return mixed[] + */ + public function params() : array + { + return [ + 'radius' => $this->radius, + 'weighted' => $this->weighted, + 'tree' => $this->tree, + ]; + } + + /** + * Has the learner been trained? + * + * @return bool + */ + public function trained() : bool + { + return !$this->tree->bare(); + } + + /** + * Return the base spatial tree instance. + * + * @return Spatial + */ + public function tree() : Spatial + { + return $this->tree; + } + + /** + * Train the learner with a dataset. 
+ * + * @param Labeled $dataset + */ + public function train(Dataset $dataset) : void + { + SpecificationChain::with([ + new DatasetIsLabeled($dataset), + new DatasetIsNotEmpty($dataset), + new SamplesAreCompatibleWithEstimator($dataset, $this), + new LabelsAreCompatibleWithLearner($dataset, $this), + ])->check(); + + $this->featureCount = $dataset->numFeatures(); + + $this->tree->grow($dataset); + } + + /** + * Make a prediction based on the nearest neighbors. + * + * @param Dataset $dataset + * @throws RuntimeException + * @return list + */ + public function predict(Dataset $dataset) : array + { + if ($this->tree->bare() or !$this->featureCount) { + throw new RuntimeException('Estimator has not been trained.'); + } + + DatasetHasDimensionality::with($dataset, $this->featureCount)->check(); + + return array_map([$this, 'predictSample'], $dataset->samples()); + } + + /** + * Predict a single sample and return the result. + * + * @internal + * + * @param list $sample + * @return int|float + */ + public function predictSample(array $sample) : int|float + { + [$samples, $labels, $distances] = $this->tree->range($sample, $this->radius); + + if (empty($labels)) { + return self::OUTLIER_VALUE; + } + + if ($this->weighted) { + $distances = NumPower::array($distances); + $weights = NumPower::divide(1.0, NumPower::add($distances, 1.0))->toArray(); + + return Stats::weightedMean($labels, $weights); + } + + return Stats::mean($labels); + } + + /** + * Return the string representation of the object. + * + * @internal + * + * @return string + */ + public function __toString() : string + { + return 'Radius Neighbors Regressor (' . Params::stringify($this->params()) . 
')'; + } +} diff --git a/src/Regressors/RegressionTree/RegressionTree.php b/src/Regressors/RegressionTree/RegressionTree.php new file mode 100644 index 000000000..23e1e84e4 --- /dev/null +++ b/src/Regressors/RegressionTree/RegressionTree.php @@ -0,0 +1,203 @@ + + */ + public function compatibility() : array + { + return [ + DataType::categorical(), + DataType::continuous(), + ]; + } + + /** + * Return the settings of the hyper-parameters in an associative array. + * + * @internal + * + * @return mixed[] + */ + public function params() : array + { + return [ + 'max height' => $this->maxHeight, + 'max leaf size' => $this->maxLeafSize, + 'max features' => $this->maxFeatures, + 'min purity increase' => $this->minPurityIncrease, + 'max bins' => $this->maxBins, + ]; + } + + /** + * Has the learner been trained? + * + * @return bool + */ + public function trained() : bool + { + return !$this->bare(); + } + + /** + * Train the learner with a dataset. + * + * @param Labeled $dataset + */ + public function train(Dataset $dataset) : void + { + SpecificationChain::with([ + new DatasetIsLabeled($dataset), + new DatasetIsNotEmpty($dataset), + new SamplesAreCompatibleWithEstimator($dataset, $this), + new LabelsAreCompatibleWithLearner($dataset, $this), + ])->check(); + + $this->grow($dataset); + } + + /** + * Make a prediction based on the value of a terminal node in the tree. + * + * @param Dataset $dataset + * @throws RuntimeException + * @return list + */ + public function predict(Dataset $dataset) : array + { + if ($this->bare() or !$this->featureCount) { + throw new RuntimeException('Estimator has not been trained.'); + } + + DatasetHasDimensionality::with($dataset, $this->featureCount)->check(); + + return array_map([$this, 'predictSample'], $dataset->samples()); + } + + /** + * Predict a single sample and return the result. 
+ * + * @internal + * + * @param list $sample + * @return int|float + */ + public function predictSample(array $sample) : int|float + { + /** @var Average $node */ + $node = $this->search($sample); + + return $node->outcome(); + } + + /** + * Terminate the branch with the most likely Average. + * + * @param Labeled $dataset + * @return Average + */ + protected function terminate(Labeled $dataset) : Average + { + [$mean, $variance] = Stats::meanVar($dataset->labels()); + + return new Average($mean, $variance, $dataset->numSamples()); + } + + /** + * Calculate the impurity of a set of labels. + * + * @param list $labels + * @return float + */ + protected function impurity(array $labels) : float + { + return Stats::variance($labels); + } + + /** + * Return the string representation of the object. + * + * @internal + * + * @return string + */ + public function __toString() : string + { + return 'Regression Tree (' . Params::stringify($this->params()) . ')'; + } +} diff --git a/src/Regressors/Ridge.php b/src/Regressors/Ridge.php index ff866530a..364fbe839 100644 --- a/src/Regressors/Ridge.php +++ b/src/Regressors/Ridge.php @@ -2,12 +2,11 @@ namespace Rubix\ML\Regressors; -use NDArray; -use NumPower; use Tensor\Matrix; use Tensor\Vector; use Rubix\ML\Learner; use Rubix\ML\DataType; +use Rubix\ML\Datasets\Labeled; use Rubix\ML\Estimator; use Rubix\ML\Persistable; use Rubix\ML\RanksFeatures; @@ -62,8 +61,6 @@ class Ridge implements Estimator, Learner, RanksFeatures, Persistable */ protected ?Vector $coefficients = null; - protected ?NDArray $coefficientsNd = null; - /** * @param float $l2Penalty * @throws InvalidArgumentException @@ -150,8 +147,9 @@ public function bias() : ?float /** * Train the learner with a dataset. 
+ * Formula: (Xᵀ X + λ I)⁻¹ Xᵀ y * - * @param \Rubix\ML\Datasets\Labeled $dataset + * @param Labeled $dataset */ public function train(Dataset $dataset) : void { @@ -165,28 +163,23 @@ public function train(Dataset $dataset) : void $biases = Matrix::ones($dataset->numSamples(), 1); $x = Matrix::build($dataset->samples())->augmentLeft($biases); - $y = NumPower::array($dataset->labels()); + $y = Vector::build($dataset->labels()); /** @var int<0,max> $nHat */ $nHat = $x->n() - 1; $penalties = array_fill(0, $nHat, $this->l2Penalty); - array_unshift($penalties, 0.0); - $penalties = NumPower::array(Matrix::diagonal($penalties)->asArray()); + $penalties = Matrix::diagonal($penalties); - $xNp = NumPower::array($x->asArray()); - $xT = NumPower::transpose($xNp, [1, 0]); + $xT = $x->transpose(); - $xMul = NumPower::matmul($xT, $xNp); - $xMulAdd = NumPower::add($xMul, $penalties); - $xMulAddInv = NumPower::inv($xMulAdd); - $xtDotY = NumPower::dot($xT, $y); - - $coefficientsNd = NumPower::dot($xMulAddInv, $xtDotY); - $this->coefficientsNd = $coefficientsNd; - $coefficients = $coefficientsNd->toArray(); + $coefficients = $xT->matmul($x) + ->add($penalties) + ->inverse() + ->dot($xT->dot($y)) + ->asArray(); $this->bias = (float) array_shift($coefficients); $this->coefficients = Vector::quick($coefficients); @@ -201,16 +194,16 @@ public function train(Dataset $dataset) : void */ public function predict(Dataset $dataset) : array { - if (!$this->coefficients or is_null($this->bias) or is_null($this->coefficientsNd)) { + if (!$this->coefficients or is_null($this->bias)) { throw new RuntimeException('Estimator has not been trained.'); } DatasetHasDimensionality::with($dataset, count($this->coefficients))->check(); - $datasetNd = NumPower::array($dataset->samples()); - $datasetDotCoefficients = NumPower::dot($datasetNd, $this->coefficientsNd); - - return NumPower::add($datasetDotCoefficients, $this->bias)->toArray(); + return Matrix::build($dataset->samples()) + 
->dot($this->coefficients) + ->add($this->bias) + ->asArray(); } /** diff --git a/src/Regressors/Ridge/Ridge.php b/src/Regressors/Ridge/Ridge.php new file mode 100644 index 000000000..3082f9b30 --- /dev/null +++ b/src/Regressors/Ridge/Ridge.php @@ -0,0 +1,260 @@ +l2Penalty = $l2Penalty; + } + + /** + * Return the estimator type. + * + * @internal + * + * @return EstimatorType + */ + public function type() : EstimatorType + { + return EstimatorType::regressor(); + } + + /** + * Return the data types that the estimator is compatible with. + * + * @internal + * + * @return list + */ + public function compatibility() : array + { + return [ + DataType::continuous(), + ]; + } + + /** + * Return the settings of the hyper-parameters in an associative array. + * + * @internal + * + * @return mixed[] + */ + public function params() : array + { + return [ + 'l2 penalty' => $this->l2Penalty, + ]; + } + + /** + * Has the learner been trained? + * + * @return bool + */ + public function trained() : bool + { + return $this->coefficients and isset($this->bias); + } + + /** + * Return the weights of features in the decision function. + * + * @return (int|float)[]|null + */ + public function coefficients() : ?array + { + return $this->coefficients ? $this->coefficients->toArray() : null; + } + + /** + * Return the bias added to the decision function. + * + * @return float|null + */ + public function bias() : ?float + { + return $this->bias; + } + + /** + * Train the learner with a dataset using NumPower for the algebra path. 
+ * Formula: (Xᵀ X + λ I)⁻¹ Xᵀ y + * + * @param Labeled $dataset + */ + public function train(Dataset $dataset) : void + { + SpecificationChain::with([ + new DatasetIsLabeled($dataset), + new DatasetIsNotEmpty($dataset), + new SamplesAreCompatibleWithEstimator($dataset, $this), + new LabelsAreCompatibleWithLearner($dataset, $this), + ])->check(); + + $biases = NumPower::ones([$dataset->numSamples(), 1]); + + $samples = NumPower::array(array_pack($dataset->samples())); + // Add bias from left + $x = NumPower::concatenate([$biases, $samples], axis: 1); + $y = NumPower::array($dataset->labels()); + + /** @var int<0,max> $nHat */ + $nHat = $x->shape()[1] - 1; + + $penalties = array_fill(0, $nHat, $this->l2Penalty); + array_unshift($penalties, 0.0); + + $penalties = NumPower::diag($penalties); + + $xT = NumPower::transpose($x, [1, 0]); + + $a = NumPower::add(NumPower::matmul($xT, $x), $penalties); + $b = NumPower::dot($xT, $y); + + $coefficients = NumPower::dot(NumPower::inv($a), $b)->toArray(); + + $this->bias = (float) array_shift($coefficients); + $this->coefficients = NumPower::array($coefficients); + } + + /** + * Make a prediction based on the line calculated from the training data. + * + * @param Dataset $dataset + * @throws RuntimeException + * @return list + */ + public function predict(Dataset $dataset) : array + { + if (!$this->coefficients or is_null($this->bias)) { + throw new RuntimeException('Estimator has not been trained.'); + } + + $weights = $this->coefficients->toArray(); + + DatasetHasDimensionality::with($dataset, count($weights))->check(); + + $predictions = []; + + foreach ($dataset->samples() as $sample) { + $x = NumPower::array($sample); + $dot = NumPower::dot($x, $this->coefficients); + $result = NumPower::add($dot, $this->bias); + + if (is_float($result)) { + $predictions[] = $result; + + continue; + } + + $value = $result->toArray(); + + if (is_array($value)) { + $value = $value[0] ?? 
null; + } + + $predictions[] = (float) $value; + } + + return $predictions; + } + + /** + * Return the importance scores of each feature column of the training set. + * + * @throws RuntimeException + * @return float[] + */ + public function featureImportances() : array + { + if (is_null($this->coefficients)) { + throw new RuntimeException('Learner has not been trained.'); + } + + return NumPower::abs($this->coefficients)->toArray(); + } + + /** + * Return the string representation of the object. + * + * @internal + * + * @return string + */ + public function __toString() : string + { + return 'Ridge (' . Params::stringify($this->params()) . ')'; + } +} diff --git a/src/functions.php b/src/functions.php index cba6135fd..9a54a78fe 100644 --- a/src/functions.php +++ b/src/functions.php @@ -246,4 +246,15 @@ function warn_deprecated(string $message) : void { trigger_error($message, E_USER_DEPRECATED); } + + /** + * Prepare samples depending on packing configuration. + * @param array $samples + * @return array> + */ + function array_pack(array $samples) : array + { + // Reindex a nested array to ensure all levels have sequential numeric keys + return array_map('array_values', array_values($samples)); + } } diff --git a/tests/CrossValidation/Reports/ErrorAnalysisTest.php b/tests/CrossValidation/Reports/ErrorAnalysisTest.php index 8e67a0cb7..e1ad3ebfe 100644 --- a/tests/CrossValidation/Reports/ErrorAnalysisTest.php +++ b/tests/CrossValidation/Reports/ErrorAnalysisTest.php @@ -101,6 +101,20 @@ public function testGenerate(array $predictions, array $labels, array $expected) ); $this->assertInstanceOf(Report::class, $results); - $this->assertEquals($expected, $results->toArray()); + + $actual = $results->toArray(); + + // Instead of strict whole-array use equality with per-field checks. 
+ foreach ($expected as $name => $value) { + if (is_float($value)) { + $this->assertArrayHasKey($name, $actual); + $this->assertEqualsWithDelta($value, $actual[$name], 1e-6, $name); + + continue; + } + + $this->assertArrayHasKey($name, $actual); + $this->assertEquals($value, $actual[$name], $name); + } } } diff --git a/tests/DataProvider/AdalineProvider.php b/tests/DataProvider/AdalineProvider.php new file mode 100644 index 000000000..86599b598 --- /dev/null +++ b/tests/DataProvider/AdalineProvider.php @@ -0,0 +1,51 @@ +>, 1: list, 2: list}> + */ + public static function trainPredictProvider() : Generator + { + yield '1 feature linear sample' => [ + [ + [0], + [1], + [2], + [3], + ], + [3, 5, 7, 9], + [4], + ]; + + yield '2 feature linear sample' => [ + [ + [0, 0], + [1, 1], + [2, 1], + [1, 2], + ], + [3, 6, 7, 8], + [2, 2], + ]; + + yield '3 feature linear sample' => [ + [ + [0, 0, 0], + [1, 0, 0], + [0, 1, 0], + [0, 0, 1], + ], + [4, 5, 6, 7], + [1, 1, 1], + ]; + } +} diff --git a/tests/DataProvider/ExtraTreeRegressorProvider.php b/tests/DataProvider/ExtraTreeRegressorProvider.php new file mode 100644 index 000000000..195001d12 --- /dev/null +++ b/tests/DataProvider/ExtraTreeRegressorProvider.php @@ -0,0 +1,62 @@ +>, 1: list, 2: list}> + */ + public static function trainPredictProvider() : Generator + { + yield '1 feature sample' => [ + [ + [0], + [1], + [2], + [3], + ], + [2, 4, 6, 8], + [4], + ]; + + yield '2 feature sample' => [ + [ + [0, 0], + [1, 1], + [2, 1], + [1, 2], + ], + [3, 6, 7, 8], + [2, 2], + ]; + + yield '3 feature sample' => [ + [ + [0, 0, 0], + [1, 0, 0], + [0, 1, 0], + [0, 0, 1], + ], + [4, 5, 6, 7], + [1, 1, 1], + ]; + + yield '4 feature sample' => [ + [ + [0, 0, 0, 0], + [1, 0, 0, 0], + [0, 1, 0, 0], + [0, 0, 1, 0], + ], + [2, 4, 6, 8], + [1, 1, 1, 1], + ]; + } +} diff --git a/tests/DataProvider/GradientBoostProvider.php b/tests/DataProvider/GradientBoostProvider.php new file mode 100644 index 000000000..19c0c07d9 --- /dev/null +++ 
b/tests/DataProvider/GradientBoostProvider.php @@ -0,0 +1,22 @@ + + */ + public static function trainPredictAdditionalProvider() : Generator + { + yield 'default swiss roll sample' => [512, 256]; + + yield 'smaller swiss roll sample' => [128, 64]; + } +} diff --git a/tests/DataProvider/RegressionTreeProvider.php b/tests/DataProvider/RegressionTreeProvider.php new file mode 100644 index 000000000..698388816 --- /dev/null +++ b/tests/DataProvider/RegressionTreeProvider.php @@ -0,0 +1,22 @@ + + */ + public static function trainedModelCases() : Generator + { + yield 'standard split' => [512, 256]; + + yield 'smaller split' => [128, 64]; + } +} diff --git a/tests/Datasets/Generators/Hyperplane/HyperplaneTest.php b/tests/Datasets/Generators/Hyperplane/HyperplaneTest.php new file mode 100644 index 000000000..28e5f2d52 --- /dev/null +++ b/tests/Datasets/Generators/Hyperplane/HyperplaneTest.php @@ -0,0 +1,75 @@ +generator = new Hyperplane(coefficients: [0.001, -4.0, 12], intercept: 5.0); + } + + #[Test] + #[TestDox('Returns the correct number of dimensions')] + public function dimensions() : void + { + self::assertEquals(3, $this->generator->dimensions()); + } + + #[Test] + #[TestDox('Can generate a labeled dataset')] + public function generate() : void + { + $dataset = $this->generator->generate(30); + + self::assertInstanceOf(Labeled::class, $dataset); + self::assertInstanceOf(Dataset::class, $dataset); + + self::assertCount(30, $dataset); + + self::assertSame([30, 3], $dataset->shape()); + + $samples = $dataset->samples(); + $labels = $dataset->labels(); + + self::assertCount(30, $samples); + self::assertCount(30, $labels); + + foreach ($labels as $label) { + self::assertIsFloat($label); + self::assertGreaterThanOrEqual(-1.0, $label); + self::assertLessThanOrEqual(1.0, $label); + } + + foreach ($samples as $i => $sample) { + self::assertCount(3, $sample); + + foreach ($sample as $value) { + self::assertIsFloat($value); + } + + $y = $labels[$i]; + + $yFromFeature2 = 
($sample[1] / -4.0) - 5.0; + $yFromFeature3 = ($sample[2] / 12.0) - 5.0; + + self::assertEqualsWithDelta($y, $yFromFeature2, 0.2); + self::assertEqualsWithDelta($y, $yFromFeature3, 0.2); + } + } +} diff --git a/tests/Datasets/Generators/SwissRoll/SwissRollTest.php b/tests/Datasets/Generators/SwissRoll/SwissRollTest.php new file mode 100644 index 000000000..437604c21 --- /dev/null +++ b/tests/Datasets/Generators/SwissRoll/SwissRollTest.php @@ -0,0 +1,47 @@ +generator = new SwissRoll(x: 0.0, y: 0.0, z: 0.0, scale: 1.0, depth: 12.0, noise: 0.3); + } + + #[Test] + #[TestDox('Dimensions returns 3')] + public function testDimensions() : void + { + self::assertEquals(3, $this->generator->dimensions()); + } + + #[Test] + #[TestDox('Generate returns a labeled dataset of the requested size')] + public function testGenerate() : void + { + $dataset = $this->generator->generate(self::DATASET_SIZE); + + self::assertInstanceOf(Labeled::class, $dataset); + self::assertInstanceOf(Dataset::class, $dataset); + + self::assertCount(self::DATASET_SIZE, $dataset); + } +} diff --git a/tests/NeuralNet/Initializers/LeCun/LeCunNormalTest.php b/tests/NeuralNet/Initializers/LeCun/LeCunNormalTest.php index dfdf996bc..ef42ea465 100644 --- a/tests/NeuralNet/Initializers/LeCun/LeCunNormalTest.php +++ b/tests/NeuralNet/Initializers/LeCun/LeCunNormalTest.php @@ -95,7 +95,7 @@ public function testConstructor() : void $this->expectNotToPerformAssertions(); //when - new LeCunNormal(); + $class = new LeCunNormal(); } #[Test] diff --git a/tests/NeuralNet/Initializers/LeCun/LeCunUniformTest.php b/tests/NeuralNet/Initializers/LeCun/LeCunUniformTest.php index 415ebfba0..fd5d5e970 100644 --- a/tests/NeuralNet/Initializers/LeCun/LeCunUniformTest.php +++ b/tests/NeuralNet/Initializers/LeCun/LeCunUniformTest.php @@ -95,7 +95,7 @@ public function testConstructor() : void $this->expectNotToPerformAssertions(); //when - new LeCunUniform(); + $class = new LeCunUniform(); } #[Test] diff --git 
a/tests/NeuralNet/Initializers/Normal/NormalTest.php b/tests/NeuralNet/Initializers/Normal/NormalTest.php index 9d6641966..33b24a043 100644 --- a/tests/NeuralNet/Initializers/Normal/NormalTest.php +++ b/tests/NeuralNet/Initializers/Normal/NormalTest.php @@ -2,7 +2,7 @@ declare(strict_types = 1); -namespace Rubix\ML\Tests\NeuralNet\Initializers\He; +namespace Rubix\ML\Tests\NeuralNet\Initializers\Normal; use PHPUnit\Framework\Attributes\CoversClass; use PHPUnit\Framework\Attributes\DataProvider; diff --git a/tests/NeuralNet/Initializers/Normal/TruncatedNormalTest.php b/tests/NeuralNet/Initializers/Normal/TruncatedNormalTest.php index 82f4e88aa..c3a0b40b6 100644 --- a/tests/NeuralNet/Initializers/Normal/TruncatedNormalTest.php +++ b/tests/NeuralNet/Initializers/Normal/TruncatedNormalTest.php @@ -2,7 +2,7 @@ declare(strict_types = 1); -namespace Rubix\ML\Tests\NeuralNet\Initializers\He; +namespace Rubix\ML\Tests\NeuralNet\Initializers\Normal; use PHPUnit\Framework\Attributes\CoversClass; use PHPUnit\Framework\Attributes\DataProvider; diff --git a/tests/NeuralNet/Initializers/Uniform/UniformTest.php b/tests/NeuralNet/Initializers/Uniform/UniformTest.php index a22d70a47..bfe324801 100644 --- a/tests/NeuralNet/Initializers/Uniform/UniformTest.php +++ b/tests/NeuralNet/Initializers/Uniform/UniformTest.php @@ -2,7 +2,7 @@ declare(strict_types = 1); -namespace Rubix\ML\Tests\NeuralNet\Initializers\He; +namespace Rubix\ML\Tests\NeuralNet\Initializers\Uniform; use PHPUnit\Framework\Attributes\CoversClass; use PHPUnit\Framework\Attributes\DataProvider; diff --git a/tests/NeuralNet/Initializers/Xavier/XavierNormalTest.php b/tests/NeuralNet/Initializers/Xavier/XavierNormalTest.php index 95ed3e6f0..e84b5ec5f 100644 --- a/tests/NeuralNet/Initializers/Xavier/XavierNormalTest.php +++ b/tests/NeuralNet/Initializers/Xavier/XavierNormalTest.php @@ -2,7 +2,7 @@ declare(strict_types = 1); -namespace Rubix\ML\Tests\NeuralNet\Initializers\He; +namespace 
Rubix\ML\Tests\NeuralNet\Initializers\Xavier; use PHPUnit\Framework\Attributes\CoversClass; use PHPUnit\Framework\Attributes\DataProvider; diff --git a/tests/NeuralNet/Initializers/Xavier/XavierUniformTest.php b/tests/NeuralNet/Initializers/Xavier/XavierUniformTest.php index 236d69b80..c20892d75 100644 --- a/tests/NeuralNet/Initializers/Xavier/XavierUniformTest.php +++ b/tests/NeuralNet/Initializers/Xavier/XavierUniformTest.php @@ -2,7 +2,7 @@ declare(strict_types = 1); -namespace Rubix\ML\Tests\NeuralNet\Initializers\He; +namespace Rubix\ML\Tests\NeuralNet\Initializers\Xavier; use PHPUnit\Framework\Attributes\CoversClass; use PHPUnit\Framework\Attributes\DataProvider; @@ -95,7 +95,7 @@ public function consttestConstructorructTest1() : void $this->expectNotToPerformAssertions(); //when - new XavierUniform(); + $class = new XavierUniform(); } #[Test] diff --git a/tests/NeuralNet/NumPower/NumPowerTest.php b/tests/NeuralNet/NumPower/NumPowerTest.php new file mode 100644 index 000000000..20a2ee602 --- /dev/null +++ b/tests/NeuralNet/NumPower/NumPowerTest.php @@ -0,0 +1,50 @@ +shape()); + + $a = $t->toArray(); + + self::assertEqualsWithDelta(0.0, (float) $a[0][0], 1e-12); + self::assertEqualsWithDelta(1000.0, (float) $a[0][1], 1e-12); + self::assertEqualsWithDelta(2000.0, (float) $a[0][2], 1e-12); + + self::assertEqualsWithDelta(255.0, (float) $a[255][0], 1e-12); + self::assertEqualsWithDelta(1255.0, (float) $a[255][1], 1e-12); + self::assertEqualsWithDelta(2255.0, (float) $a[255][2], 1e-12); + + self::assertEqualsWithDelta(42.0, (float) $a[42][0], 1e-12); + self::assertEqualsWithDelta(1042.0, (float) $a[42][1], 1e-12); + self::assertEqualsWithDelta(2042.0, (float) $a[42][2], 1e-12); + } +} diff --git a/tests/Regressors/Adaline/AdalineTest.php b/tests/Regressors/Adaline/AdalineTest.php new file mode 100644 index 000000000..99cb445bc --- /dev/null +++ b/tests/Regressors/Adaline/AdalineTest.php @@ -0,0 +1,215 @@ +generator = new Hyperplane( + coefficients: [1.0, 5.5, -7, 
0.01], + intercept: 0.0, + noise: 1.0 + ); + + $this->estimator = new Adaline( + batchSize: 32, + optimizer: new Adam(rate: 0.001), + l2Penalty: 1e-4, + epochs: 100, + minChange: 1e-4, + window: 5, + costFn: new HuberLoss(1.0) + ); + + $this->metric = new RSquared(); + + srand(self::RANDOM_SEED); + } + + #[Test] + #[TestDox('Assert pre conditions')] + public function preConditions() : void + { + self::assertFalse($this->estimator->trained()); + } + + #[Test] + #[TestDox('Throws an exception for a bad batch size')] + public function badBatchSize() : void + { + $this->expectException(InvalidArgumentException::class); + + new Adaline(-100); + } + + #[Test] + #[TestDox('Reports the estimator type')] + public function type() : void + { + self::assertEquals(EstimatorType::regressor(), $this->estimator->type()); + } + + #[Test] + #[TestDox('Reports compatibility')] + public function compatibility() : void + { + $expected = [ + DataType::continuous(), + ]; + + self::assertEquals($expected, $this->estimator->compatibility()); + } + + #[Test] + #[TestDox('Reports parameters')] + public function params() : void + { + $expected = [ + 'batch size' => 32, + 'optimizer' => new Adam(0.001), + 'l2 penalty' => 1e-4, + 'epochs' => 100, + 'min change' => 1e-4, + 'window' => 5, + 'cost fn' => new HuberLoss(1.0), + ]; + + self::assertEquals($expected, $this->estimator->params()); + } + + #[Test] + #[TestDox('Can train, predict, and provide feature importances')] + public function trainPredictImportances() : void + { + $this->estimator->setLogger(new BlackHole()); + + $training = $this->generator->generate(self::TRAIN_SIZE); + $testing = $this->generator->generate(self::TEST_SIZE); + + $this->estimator->train($training); + + self::assertTrue($this->estimator->trained()); + + $losses = $this->estimator->losses(); + + self::assertIsArray($losses); + self::assertContainsOnlyFloat($losses); + + $importances = $this->estimator->featureImportances(); + + self::assertCount(4, $importances); + 
self::assertContainsOnlyFloat($importances); + + $predictions = $this->estimator->predict($testing); + + /** @var list $labels */ + $labels = $testing->labels(); + $score = $this->metric->score( + predictions: $predictions, + labels: $labels + ); + + self::assertGreaterThanOrEqual(self::MIN_SCORE, $score); + } + + #[Test] + #[TestDox('Throws an exception when training with incompatible data')] + public function trainIncompatible() : void + { + $this->expectException(InvalidArgumentException::class); + + $this->estimator->train(Labeled::quick(samples: [['bad']], labels: [2])); + } + + #[Test] + #[TestDox('Throws an exception when predicting before training')] + public function predictUntrained() : void + { + $this->expectException(RuntimeException::class); + + $this->estimator->predict(Unlabeled::quick()); + } + + #[Test] + #[TestDox('Trains, predicts, and returns acceptable Adaline values')] + #[DataProviderExternal(AdalineProvider::class, 'trainPredictProvider')] + public function trainPredict(array $samples, array $labels, array $prediction) : void + { + $estimator = new Adaline( + batchSize: 32, + optimizer: new Adam(rate: 0.001), + l2Penalty: 1e-4, + epochs: 100, + minChange: 1e-4, + window: 5, + costFn: new HuberLoss(1.0) + ); + + $training = Labeled::quick($samples, $labels); + $estimator->train($training); + + self::assertTrue($estimator->trained()); + $params = $estimator->params(); + + self::assertSame(32, $params['batch size']); + self::assertEquals(1e-4, $params['l2 penalty']); + self::assertSame(100, $params['epochs']); + self::assertEquals(1e-4, $params['min change']); + self::assertSame(5, $params['window']); + + $predictions = $estimator->predict(Unlabeled::quick([$prediction])); + + self::assertIsFloat($predictions[0]); + } +} diff --git a/tests/Regressors/AdalineTest.php b/tests/Regressors/AdalineTest.php index 67ac5b1e0..00f2ae722 100644 --- a/tests/Regressors/AdalineTest.php +++ b/tests/Regressors/AdalineTest.php @@ -5,20 +5,24 @@ namespace 
Rubix\ML\Tests\Regressors; use PHPUnit\Framework\Attributes\CoversClass; +use PHPUnit\Framework\Attributes\DataProviderExternal; use PHPUnit\Framework\Attributes\Group; +use PHPUnit\Framework\Attributes\Test; +use PHPUnit\Framework\Attributes\TestDox; +use PHPUnit\Framework\TestCase; +use Rubix\ML\CrossValidation\Metrics\RSquared; use Rubix\ML\DataType; -use Rubix\ML\EstimatorType; +use Rubix\ML\Datasets\Generators\Hyperplane; use Rubix\ML\Datasets\Labeled; -use Rubix\ML\Loggers\BlackHole; use Rubix\ML\Datasets\Unlabeled; -use Rubix\ML\Regressors\Adaline; -use Rubix\ML\NeuralNet\Optimizers\Adam; -use Rubix\ML\Datasets\Generators\Hyperplane; -use Rubix\ML\CrossValidation\Metrics\RSquared; -use Rubix\ML\NeuralNet\CostFunctions\HuberLoss; +use Rubix\ML\EstimatorType; use Rubix\ML\Exceptions\InvalidArgumentException; use Rubix\ML\Exceptions\RuntimeException; -use PHPUnit\Framework\TestCase; +use Rubix\ML\Loggers\BlackHole; +use Rubix\ML\NeuralNet\CostFunctions\HuberLoss; +use Rubix\ML\NeuralNet\Optimizers\Adam; +use Rubix\ML\Regressors\Adaline; +use Rubix\ML\Tests\DataProvider\AdalineProvider; #[Group('Regressors')] #[CoversClass(Adaline::class)] @@ -160,4 +164,36 @@ public function testPredictUntrained() : void $this->estimator->predict(Unlabeled::quick()); } + + #[Test] + #[TestDox('Trains, predicts, and returns acceptable Adaline values')] + #[DataProviderExternal(AdalineProvider::class, 'trainPredictProvider')] + public function trainPredict(array $samples, array $labels, array $prediction) : void + { + $estimator = new Adaline( + batchSize: 32, + optimizer: new Adam(rate: 0.001), + l2Penalty: 1e-4, + epochs: 100, + minChange: 1e-4, + window: 5, + costFn: new HuberLoss(1.0) + ); + + $training = Labeled::quick($samples, $labels); + $estimator->train($training); + + self::assertTrue($estimator->trained()); + $params = $estimator->params(); + + self::assertSame(32, $params['batch size']); + self::assertEquals(1e-4, $params['l2 penalty']); + self::assertSame(100, 
$params['epochs']); + self::assertEquals(1e-4, $params['min change']); + self::assertSame(5, $params['window']); + + $predictions = $estimator->predict(Unlabeled::quick([$prediction])); + + self::assertIsFloat($predictions[0]); + } } diff --git a/tests/Regressors/ExtraTreeRegressor/ExtraTreeRegressorTest.php b/tests/Regressors/ExtraTreeRegressor/ExtraTreeRegressorTest.php new file mode 100644 index 000000000..70fdb7173 --- /dev/null +++ b/tests/Regressors/ExtraTreeRegressor/ExtraTreeRegressorTest.php @@ -0,0 +1,209 @@ +generator = new Hyperplane( + coefficients: [1.0, 5.5, -7, 0.01], + intercept: 35.0, + noise: 1.0 + ); + + $this->estimator = new ExtraTreeRegressor( + maxHeight: 30, + maxLeafSize: 3, + minPurityIncrease: 1e-7, + maxFeatures: 4 + ); + + $this->metric = new RSquared(); + + srand(self::RANDOM_SEED); + } + + #[Test] + #[TestDox('Is not trained before training')] + public function preConditions() : void + { + self::assertFalse($this->estimator->trained()); + } + + #[Test] + #[TestDox('Throws when max height is invalid')] + public function badMaxDepth() : void + { + $this->expectException(InvalidArgumentException::class); + + new ExtraTreeRegressor(0); + } + + #[Test] + #[TestDox('Returns estimator type')] + public function type() : void + { + self::assertEquals(EstimatorType::regressor(), $this->estimator->type()); + } + + #[Test] + #[TestDox('Declares feature compatibility')] + public function compatibility() : void + { + $expected = [ + DataType::categorical(), + DataType::continuous(), + ]; + + self::assertEquals($expected, $this->estimator->compatibility()); + } + + #[Test] + #[TestDox('Returns hyperparameters')] + public function params() : void + { + $expected = [ + 'max height' => 30, + 'max leaf size' => 3, + 'min purity increase' => 1.0E-7, + 'max features' => 4, + ]; + + self::assertEquals($expected, $this->estimator->params()); + } + + #[Test] + #[TestDox('Trains, predicts, and returns importances for continuous targets')] + public function 
trainPredictImportancesContinuous() : void + { + $training = $this->generator->generate(self::TRAIN_SIZE); + $testing = $this->generator->generate(self::TEST_SIZE); + + $this->estimator->train($training); + + self::assertTrue($this->estimator->trained()); + + $importances = $this->estimator->featureImportances(); + + self::assertCount(4, $importances); + self::assertContainsOnlyFloat($importances); + + $predictions = $this->estimator->predict($testing); + + /** @var list $labels */ + $labels = $testing->labels(); + + $score = $this->metric->score( + predictions: $predictions, + labels: $labels + ); + + self::assertGreaterThanOrEqual(self::MIN_SCORE, $score); + } + + #[Test] + #[TestDox('Can train and predict from provider samples')] + #[DataProviderExternal(ExtraTreeRegressorProvider::class, 'trainPredictProvider')] + public function trainPredictAdditional(array $samples, array $labels, array $prediction) : void + { + $training = Labeled::quick($samples, $labels); + + $this->estimator->train($training); + + self::assertTrue($this->estimator->trained()); + + $importances = $this->estimator->featureImportances(); + + self::assertCount(count($samples[0]), $importances); + self::assertContainsOnlyFloat($importances); + + $predictions = $this->estimator->predict(Unlabeled::quick([$prediction])); + + self::assertIsFloat($predictions[0]); + } + + #[Test] + #[TestDox('Trains and predicts with discretized targets')] + public function trainPredictCategorical() : void + { + $training = $this->generator + ->generate(self::TRAIN_SIZE + self::TEST_SIZE) + ->apply(new IntervalDiscretizer(bins: 5)); + + $testing = $training->randomize()->take(self::TEST_SIZE); + + $this->estimator->train($training); + + self::assertTrue($this->estimator->trained()); + + $predictions = $this->estimator->predict($testing); + + /** @var list $labels */ + $labels = $testing->labels(); + + $score = $this->metric->score( + predictions: $predictions, + labels: $labels + ); + + 
self::assertGreaterThanOrEqual(self::MIN_SCORE, $score); + } + + #[Test] + #[TestDox('Throws when predicting before training')] + public function predictUntrained() : void + { + $this->expectException(RuntimeException::class); + + $this->estimator->predict(Unlabeled::quick()); + } +} diff --git a/tests/Regressors/ExtraTreeRegressorTest.php b/tests/Regressors/ExtraTreeRegressorTest.php index aecd0b367..8456e7b37 100644 --- a/tests/Regressors/ExtraTreeRegressorTest.php +++ b/tests/Regressors/ExtraTreeRegressorTest.php @@ -5,12 +5,16 @@ namespace Rubix\ML\Tests\Regressors; use PHPUnit\Framework\Attributes\CoversClass; +use PHPUnit\Framework\Attributes\DataProviderExternal; use PHPUnit\Framework\Attributes\Group; +use PHPUnit\Framework\Attributes\Test; +use Rubix\ML\Datasets\Labeled; use Rubix\ML\DataType; use Rubix\ML\EstimatorType; use Rubix\ML\Datasets\Unlabeled; use Rubix\ML\Regressors\ExtraTreeRegressor; use Rubix\ML\Datasets\Generators\Hyperplane; +use Rubix\ML\Tests\DataProvider\ExtraTreeRegressorProvider; use Rubix\ML\Transformers\IntervalDiscretizer; use Rubix\ML\CrossValidation\Metrics\RSquared; use Rubix\ML\Exceptions\InvalidArgumentException; @@ -133,6 +137,25 @@ public function testTrainPredictImportancesContinuous() : void $this->assertGreaterThanOrEqual(self::MIN_SCORE, $score); } + #[DataProviderExternal(ExtraTreeRegressorProvider::class, 'trainPredictProvider')] + public function testTrainPredictAdditional(array $samples, array $labels, array $prediction) : void + { + $training = Labeled::quick($samples, $labels); + + $this->estimator->train($training); + + self::assertTrue($this->estimator->trained()); + + $importances = $this->estimator->featureImportances(); + + self::assertCount(count($samples[0]), $importances); + self::assertContainsOnlyFloat($importances); + + $predictions = $this->estimator->predict(Unlabeled::quick([$prediction])); + + self::assertIsFloat($predictions[0]); + } + public function testTrainPredictCategorical() : void { $training 
= $this->generator diff --git a/tests/Regressors/GradientBoost/GradientBoostTest.php b/tests/Regressors/GradientBoost/GradientBoostTest.php new file mode 100644 index 000000000..88d72affa --- /dev/null +++ b/tests/Regressors/GradientBoost/GradientBoostTest.php @@ -0,0 +1,233 @@ +generator = new SwissRoll( + x: 4.0, + y: -7.0, + z: 0.0, + scale: 1.0, + depth: 21.0, + noise: 0.5 + ); + + $this->estimator = new GradientBoost( + booster: new RegressionTree(maxHeight: 3), + rate: 0.1, + ratio: 0.3, + epochs: 300, + minChange: 1e-4, + evalInterval: 3, + window: 10, + holdOut: 0.1, + metric: new RMSE() + ); + + $this->metric = new RSquared(); + + srand(self::RANDOM_SEED); + } + + protected function assertPreConditions() : void + { + self::assertFalse($this->estimator->trained()); + } + + #[Test] + #[TestDox('Throws when booster is incompatible')] + public function incompatibleBooster() : void + { + $this->expectException(InvalidArgumentException::class); + + new GradientBoost(booster: new Ridge()); + } + + #[Test] + #[TestDox('Throws when learning rate is invalid')] + public function badLearningRate() : void + { + $this->expectException(InvalidArgumentException::class); + + new GradientBoost(booster: null, rate: -1e-3); + } + + #[Test] + #[TestDox('Returns estimator type')] + public function type() : void + { + self::assertEquals(EstimatorType::regressor(), $this->estimator->type()); + } + + #[Test] + #[TestDox('Declares feature compatibility')] + public function compatibility() : void + { + $expected = [ + DataType::categorical(), + DataType::continuous(), + ]; + + self::assertEquals($expected, $this->estimator->compatibility()); + } + + #[Test] + #[TestDox('Returns hyperparameters')] + public function params() : void + { + $expected = [ + 'booster' => new RegressionTree(maxHeight: 3), + 'rate' => 0.1, + 'ratio' => 0.3, + 'epochs' => 300, + 'min change' => 0.0001, + 'eval interval' => 3, + 'window' => 10, + 'hold out' => 0.1, + 'metric' => new RMSE(), + ]; + + 
self::assertEquals($expected, $this->estimator->params()); + } + + #[Test] + #[TestDox('Trains, predicts, and returns importances')] + public function trainPredictImportances() : void + { + $this->estimator->setLogger(new BlackHole()); + + $training = $this->generator->generate(self::TRAIN_SIZE); + $testing = $this->generator->generate(self::TEST_SIZE); + + $this->estimator->train($training); + + self::assertTrue($this->estimator->trained()); + + $losses = $this->estimator->losses(); + + self::assertIsArray($losses); + self::assertContainsOnlyFloat($losses); + + $scores = $this->estimator->scores(); + + self::assertIsArray($scores); + self::assertContainsOnlyFloat($scores); + + $importances = $this->estimator->featureImportances(); + + self::assertCount(3, $importances); + self::assertContainsOnlyFloat($importances); + + $predictions = $this->estimator->predict($testing); + + /** @var list $labels */ + $labels = $testing->labels(); + + $score = $this->metric->score( + predictions: $predictions, + labels: $labels + ); + + self::assertGreaterThanOrEqual(self::MIN_SCORE, $score); + } + + #[Test] + #[TestDox('Returns additional training artifacts and prediction details')] + #[DataProviderExternal(GradientBoostProvider::class, 'trainPredictAdditionalProvider')] + public function trainPredictAdditionalChecks(int $trainSize, int $testSize) : void + { + $this->estimator->setLogger(new BlackHole()); + + $training = $this->generator->generate($trainSize); + $testing = $this->generator->generate($testSize); + + $this->estimator->train($training); + + self::assertSame(3, $training->numFeatures()); + + $losses = $this->estimator->losses(); + + self::assertIsArray($losses); + self::assertNotEmpty($losses); + self::assertContainsOnlyFloat($losses); + + $scores = $this->estimator->scores(); + + self::assertIsArray($scores); + self::assertNotEmpty($scores); + self::assertContainsOnlyFloat($scores); + + $importances = $this->estimator->featureImportances(); + + self::assertCount(3, 
$importances); + self::assertContainsOnlyFloat($importances); + self::assertGreaterThan(0.0, array_sum($importances)); + + $predictions = $this->estimator->predict($testing); + + self::assertCount($testSize, $predictions); + self::assertContainsOnlyFloat($predictions); + } + + #[Test] + #[TestDox('Throws when predicting before training')] + public function predictUntrained() : void + { + $this->expectException(RuntimeException::class); + + $this->estimator->predict(Unlabeled::quick()); + } +} diff --git a/tests/Regressors/GradientBoostTest.php b/tests/Regressors/GradientBoostTest.php index 70f5a053d..c66b11fcd 100644 --- a/tests/Regressors/GradientBoostTest.php +++ b/tests/Regressors/GradientBoostTest.php @@ -5,7 +5,9 @@ namespace Rubix\ML\Tests\Regressors; use PHPUnit\Framework\Attributes\CoversClass; +use PHPUnit\Framework\Attributes\DataProviderExternal; use PHPUnit\Framework\Attributes\Group; +use PHPUnit\Framework\Attributes\Test; use Rubix\ML\DataType; use Rubix\ML\EstimatorType; use Rubix\ML\Regressors\Ridge; @@ -19,6 +21,7 @@ use Rubix\ML\Exceptions\InvalidArgumentException; use Rubix\ML\Exceptions\RuntimeException; use PHPUnit\Framework\TestCase; +use Rubix\ML\Tests\DataProvider\GradientBoostProvider; #[Group('Regressors')] #[CoversClass(GradientBoost::class)] @@ -168,6 +171,42 @@ public function testTrainPredictImportances() : void $this->assertGreaterThanOrEqual(self::MIN_SCORE, $score); } + #[DataProviderExternal(GradientBoostProvider::class, 'trainPredictAdditionalProvider')] + public function testTrainPredictAdditionalChecks(int $trainSize, int $testSize) : void + { + $this->estimator->setLogger(new BlackHole()); + + $training = $this->generator->generate($trainSize); + $testing = $this->generator->generate($testSize); + + $this->estimator->train($training); + + self::assertSame(3, $training->numFeatures()); + + $losses = $this->estimator->losses(); + + self::assertIsArray($losses); + self::assertNotEmpty($losses); + 
self::assertContainsOnlyFloat($losses); + + $scores = $this->estimator->scores(); + + self::assertIsArray($scores); + self::assertNotEmpty($scores); + self::assertContainsOnlyFloat($scores); + + $importances = $this->estimator->featureImportances(); + + self::assertCount(3, $importances); + self::assertContainsOnlyFloat($importances); + self::assertGreaterThan(0.0, array_sum($importances)); + + $predictions = $this->estimator->predict($testing); + + self::assertCount($testSize, $predictions); + self::assertContainsOnlyFloat($predictions); + } + public function testPredictUntrained() : void { $this->expectException(RuntimeException::class); diff --git a/tests/Regressors/KNNRegressor/KNNRegressorTest.php b/tests/Regressors/KNNRegressor/KNNRegressorTest.php new file mode 100644 index 000000000..67658d114 --- /dev/null +++ b/tests/Regressors/KNNRegressor/KNNRegressorTest.php @@ -0,0 +1,180 @@ + [self::TRAIN_SIZE, 3]; + } + + protected function setUp() : void + { + $this->generator = new HalfMoon(x: 4.0, y: -7.0, scale: 1.0, rotation: 90, noise: 0.25); + + $this->estimator = new KNNRegressor(k: 10, weighted: true, kernel: new Minkowski(3.0)); + + $this->metric = new RSquared(); + + srand(self::RANDOM_SEED); + } + + #[Test] + #[TestDox('asserts preconditions')] + public function assertsPreConditions() : void + { + self::assertFalse($this->estimator->trained()); + } + + #[Test] + #[TestDox('rejects invalid k values')] + public function rejectsInvalidK() : void + { + $this->expectException(InvalidArgumentException::class); + + new KNNRegressor(k: 0); + } + + #[Test] + #[TestDox('returns the regressor estimator type')] + public function returnsTheRegressorEstimatorType() : void + { + self::assertEquals(EstimatorType::regressor(), $this->estimator->type()); + } + + #[Test] + #[TestDox('returns the expected compatibility types')] + public function returnsTheExpectedCompatibilityTypes() : void + { + $expected = [ + DataType::continuous(), + ]; + + self::assertEquals($expected, 
$this->estimator->compatibility()); + } + + #[Test] + #[TestDox('returns the configured parameters')] + public function returnsTheConfiguredParameters() : void + { + $expected = [ + 'k' => 10, + 'weighted' => true, + 'kernel' => new Minkowski(3.0), + ]; + + self::assertEquals($expected, $this->estimator->params()); + } + + #[Test] + #[TestDox('trains partially and makes accurate predictions')] + public function trainsPartiallyAndMakesAccuratePredictions() : void + { + $training = $this->generator->generate(self::TRAIN_SIZE); + $testing = $this->generator->generate(self::TEST_SIZE); + + $folds = $training->fold(3); + + $this->estimator->train($folds[0]); + $this->estimator->partial($folds[1]); + $this->estimator->partial($folds[2]); + + self::assertTrue($this->estimator->trained()); + + $predictions = $this->estimator->predict($testing); + + /** @var list $labels */ + $labels = $testing->labels(); + $score = $this->metric->score( + predictions: $predictions, + labels: $labels + ); + + self::assertGreaterThanOrEqual(self::MIN_SCORE, $score); + } + + #[Test] + #[TestDox('rejects incompatible training data')] + public function rejectsIncompatibleTrainingData() : void + { + $this->expectException(InvalidArgumentException::class); + + $this->estimator->train(Labeled::quick(samples: [['bad']], labels: [2])); + } + + #[Test] + #[TestDox('rejects predictions from an untrained model')] + public function rejectsPredictionsFromAnUntrainedModel() : void + { + $this->expectException(RuntimeException::class); + + $this->estimator->predict(Unlabeled::quick()); + } + + #[Test] + #[TestDox('becomes trained after partial fitting')] + #[DataProvider('trainedStateCases')] + public function becomesTrainedAfterPartialFitting(int $trainSize, int $folds) : void + { + $training = $this->generator->generate($trainSize); + + $parts = $training->fold($folds); + + $this->estimator->train($parts[0]); + + for ($i = 1; $i < $folds; ++$i) { + $this->estimator->partial($parts[$i]); + } + + 
self::assertTrue($this->estimator->trained()); + } +} diff --git a/tests/Regressors/KNNRegressorTest.php b/tests/Regressors/KNNRegressorTest.php index bb2761fb0..02903a60b 100644 --- a/tests/Regressors/KNNRegressorTest.php +++ b/tests/Regressors/KNNRegressorTest.php @@ -4,8 +4,11 @@ namespace Rubix\ML\Tests\Regressors; +use Generator; use PHPUnit\Framework\Attributes\CoversClass; +use PHPUnit\Framework\Attributes\DataProvider; use PHPUnit\Framework\Attributes\Group; +use PHPUnit\Framework\Attributes\Test; use Rubix\ML\DataType; use Rubix\ML\EstimatorType; use Rubix\ML\Datasets\Labeled; @@ -48,6 +51,11 @@ class KNNRegressorTest extends TestCase protected RSquared $metric; + public static function trainedStateCases() : Generator + { + yield 'three-fold partial fit' => [self::TRAIN_SIZE, 3]; + } + protected function setUp() : void { $this->generator = new HalfMoon(x: 4.0, y: -7.0, scale: 1.0, rotation: 90, noise: 0.25); @@ -134,4 +142,20 @@ public function testPredictUntrained() : void $this->estimator->predict(Unlabeled::quick()); } + + #[DataProvider('trainedStateCases')] + public function testBecomesTrainedAfterPartialFitting(int $trainSize, int $folds) : void + { + $training = $this->generator->generate($trainSize); + + $parts = $training->fold($folds); + + $this->estimator->train($parts[0]); + + for ($i = 1; $i < $folds; ++$i) { + $this->estimator->partial($parts[$i]); + } + + $this->assertTrue($this->estimator->trained()); + } } diff --git a/tests/Regressors/MLPRegressorTest.php b/tests/Regressors/MLPRegressorTest.php index 9d7dc7650..f2f11fd3d 100644 --- a/tests/Regressors/MLPRegressorTest.php +++ b/tests/Regressors/MLPRegressorTest.php @@ -6,6 +6,7 @@ use PHPUnit\Framework\Attributes\CoversClass; use PHPUnit\Framework\Attributes\Group; +use PHPUnit\Framework\Attributes\Test; use Rubix\ML\DataType; use Rubix\ML\EstimatorType; use Rubix\ML\Datasets\Labeled; @@ -188,10 +189,57 @@ public function testTrainIncompatible() : void 
$this->estimator->train(Labeled::quick(samples: [['bad']], labels: [2])); } + #[Test] + public function testTrainedModelExposesNetworkLossesAndScores() : void + { + [$testing] = $this->trainEstimatorAndGetTestingSet(); + + self::assertTrue($this->estimator->trained()); + self::assertNotNull($this->estimator->network()); + + $losses = $this->estimator->losses(); + $scores = $this->estimator->scores(); + + self::assertIsArray($losses); + self::assertIsArray($scores); + self::assertNotEmpty($losses); + self::assertNotEmpty($scores); + self::assertContainsOnlyFloat($losses); + self::assertContainsOnlyFloat($scores); + + $predictions = $this->estimator->predict($testing); + + self::assertCount($testing->numSamples(), $predictions); + + foreach ($predictions as $prediction) { + self::assertIsNumeric($prediction); + } + } + public function testPredictUntrained() : void { $this->expectException(RuntimeException::class); $this->estimator->predict(Unlabeled::quick()); } + + /** + * @return array{0: Unlabeled} + */ + private function trainEstimatorAndGetTestingSet() : array + { + $dataset = $this->generator->generate(self::TRAIN_SIZE + self::TEST_SIZE); + + $dataset->apply(new ZScaleStandardizer()); + + $testing = $dataset->randomize()->take(self::TEST_SIZE); + + $folds = $dataset->fold(3); + + $this->estimator->train($folds[0]); + $this->estimator->partial($folds[1]); + $this->estimator->partial($folds[2]); + + return [$testing]; + } } diff --git a/tests/Regressors/MLPRegressors/MLPRegressorTest.php b/tests/Regressors/MLPRegressors/MLPRegressorTest.php new file mode 100644 index 000000000..e19d5a495 --- /dev/null +++ b/tests/Regressors/MLPRegressors/MLPRegressorTest.php @@ -0,0 +1,347 @@ +generator = new SwissRoll(x: 4.0, y: -7.0, z: 0.0, scale: 1.0, depth: 21.0, noise: 0.5); + + $this->estimator = new MLPRegressor( + hiddenLayers: [ + new Dense(32), + new Activation(new SiLU()), + new Dense(16), + new Activation(new SiLU()), + new Dense(8), + new Activation(new SiLU()), + 
], + batchSize: 32, + optimizer: new Adam(0.01), + epochs: 100, + minChange: 1e-4, + evalInterval: 3, + window: 5, + holdOut: 0.1, + costFn: new LeastSquares(), + metric: new RMSE(), + packSamples: true, + ); + + $this->metric = new RSquared(); + + $this->estimator->setLogger(new BlackHole()); + + srand(self::RANDOM_SEED); + } + + #[Test] + #[TestDox('Assert pre conditions')] + public function preConditions() : void + { + self::assertFalse($this->estimator->trained()); + } + + #[Test] + #[TestDox('Bad batch size')] + public function badBatchSize() : void + { + $this->expectException(InvalidArgumentException::class); + + new MLPRegressor(hiddenLayers: [], batchSize: -100); + } + + #[Test] + #[TestDox('Type')] + public function type() : void + { + self::assertEquals(EstimatorType::regressor(), $this->estimator->type()); + } + + #[Test] + #[TestDox('Compatibility')] + public function compatibility() : void + { + $expected = [ + DataType::continuous(), + ]; + + self::assertEquals($expected, $this->estimator->compatibility()); + } + + #[Test] + #[TestDox('Params')] + public function params() : void + { + $expected = [ + 'hidden layers' => [ + new Dense(32), + new Activation(new SiLU()), + new Dense(16), + new Activation(new SiLU()), + new Dense(8), + new Activation(new SiLU()), + ], + 'batch size' => 32, + 'optimizer' => new Adam(0.01), + 'epochs' => 100, + 'min change' => 1e-4, + 'eval interval' => 3, + 'window' => 5, + 'hold out' => 0.1, + 'cost fn' => new LeastSquares(), + 'metric' => new RMSE(), + ]; + + self::assertEquals($expected, $this->estimator->params()); + } + + #[Test] + #[TestDox('Train partial predict')] + public function trainPartialPredict() : void + { + $dataset = $this->generator->generate(self::TRAIN_SIZE + self::TEST_SIZE); + + $dataset->apply(new ZScaleStandardizer()); + + $testing = $dataset->randomize()->take(self::TEST_SIZE); + + $folds = $dataset->fold(3); + + $this->estimator->train($folds[0]); + $this->estimator->partial($folds[1]); + 
$this->estimator->partial($folds[2]); + + self::assertTrue($this->estimator->trained()); + + $dot = $this->estimator->exportGraphviz(); + + // Graphviz::dotToImage($dot)->saveTo(new Filesystem('test.png')); + + self::assertStringStartsWith('digraph Tree {', (string) $dot); + + $losses = $this->estimator->losses(); + + self::assertIsArray($losses); + self::assertContainsOnlyFloat($losses); + + $scores = $this->estimator->scores(); + + self::assertIsArray($scores); + self::assertContainsOnlyFloat($scores); + + $predictions = $this->estimator->predict($testing); + + /** @var list $labels */ + $labels = $testing->labels(); + $score = $this->metric->score( + predictions: $predictions, + labels: $labels + ); + + self::assertGreaterThanOrEqual(self::MIN_SCORE, $score); + } + + #[Test] + #[TestDox('Predict count matches number of samples')] + public function predictCountMatchesNumberOfSamples() : void + { + [$testing] = $this->trainEstimatorAndGetTestingSet(); + + $predictions = $this->estimator->predict($testing); + + self::assertCount($testing->numSamples(), $predictions); + } + + #[Test] + #[TestDox('Predict returns numeric finite values')] + public function predictReturnsNumericFiniteValues() : void + { + [$testing] = $this->trainEstimatorAndGetTestingSet(); + + $predictions = $this->estimator->predict($testing); + + self::assertCount($testing->numSamples(), $predictions); + + foreach ($predictions as $prediction) { + self::assertIsNumeric($prediction); + self::assertFalse(is_nan((float) $prediction)); + self::assertTrue(is_finite((float) $prediction)); + } + } + + #[Test] + #[TestDox('Predict is repeatable for same model and dataset')] + public function predictIsRepeatableForSameModelAndDataset() : void + { + [$testing] = $this->trainEstimatorAndGetTestingSet(); + + $predictions1 = $this->estimator->predict($testing); + $predictions2 = $this->estimator->predict($testing); + + self::assertCount($testing->numSamples(), $predictions1); + 
self::assertCount($testing->numSamples(), $predictions2); + + foreach ($predictions1 as $i => $prediction) { + self::assertEqualsWithDelta((float) $prediction, (float) $predictions2[$i], 1e-12); + } + } + + #[Test] + #[TestDox('Predict does not mutate dataset samples or labels')] + public function predictDoesNotMutateDataset() : void + { + [$testing] = $this->trainEstimatorAndGetTestingSet(); + + $samplesBefore = $testing->samples(); + $labelsBefore = $testing->labels(); + + $predictions = $this->estimator->predict($testing); + + self::assertCount($testing->numSamples(), $predictions); + self::assertEquals($samplesBefore, $testing->samples()); + self::assertEquals($labelsBefore, $testing->labels()); + } + + #[Test] + #[TestDox('Serialization preserves predict output')] + public function serializationPreservesPredictOutput() : void + { + [$testing] = $this->trainEstimatorAndGetTestingSet(); + + $predictionsBefore = $this->estimator->predict($testing); + + $copy = unserialize(serialize($this->estimator)); + + self::assertInstanceOf(MLPRegressor::class, $copy); + self::assertTrue($copy->trained()); + + $predictionsAfter = $copy->predict($testing); + + self::assertCount($testing->numSamples(), $predictionsAfter); + + foreach ($predictionsAfter as $i => $prediction) { + self::assertEqualsWithDelta((float) $predictionsBefore[$i], (float) $prediction, 1e-8); + } + } + + #[Test] + #[TestDox('Train incompatible')] + public function trainIncompatible() : void + { + $this->expectException(InvalidArgumentException::class); + + $this->estimator->train(Labeled::quick(samples: [['bad']], labels: [2])); + } + + #[Test] + #[TestDox('Predict untrained')] + public function predictUntrained() : void + { + $this->expectException(RuntimeException::class); + + $this->estimator->predict(Unlabeled::quick()); + } + + #[Test] + #[TestDox('Trained model exposes network, losses, and scores')] + public function trainedModelExposesNetworkLossesAndScores() : void + { + [$testing] = 
$this->trainEstimatorAndGetTestingSet(); + + self::assertTrue($this->estimator->trained()); + self::assertNotNull($this->estimator->network()); + + $losses = $this->estimator->losses(); + $scores = $this->estimator->scores(); + + self::assertIsArray($losses); + self::assertIsArray($scores); + self::assertNotEmpty($losses); + self::assertNotEmpty($scores); + self::assertContainsOnlyFloat($losses); + self::assertContainsOnlyFloat($scores); + + $predictions = $this->estimator->predict($testing); + + self::assertCount($testing->numSamples(), $predictions); + + foreach ($predictions as $prediction) { + self::assertIsNumeric($prediction); + } + } + + /** + * @return array{0: Unlabeled} + */ + private function trainEstimatorAndGetTestingSet() : array + { + $dataset = $this->generator->generate(self::TRAIN_SIZE + self::TEST_SIZE); + + $dataset->apply(new ZScaleStandardizer()); + + $testing = $dataset->randomize()->take(self::TEST_SIZE); + + $folds = $dataset->fold(3); + + $this->estimator->train($folds[0]); + $this->estimator->partial($folds[1]); + $this->estimator->partial($folds[2]); + + return [$testing]; + } +} diff --git a/tests/Regressors/RadiusNeighborsRegressor/RadiusNeighborsRegressorTest.php b/tests/Regressors/RadiusNeighborsRegressor/RadiusNeighborsRegressorTest.php new file mode 100644 index 000000000..2a8d93aa9 --- /dev/null +++ b/tests/Regressors/RadiusNeighborsRegressor/RadiusNeighborsRegressorTest.php @@ -0,0 +1,172 @@ + [self::TRAIN_SIZE, self::TEST_SIZE]; + } + + protected function setUp() : void + { + $this->generator = new HalfMoon(x: 4.0, y: -7.0, scale: 1.0, rotation: 90, noise: 0.25); + + $this->estimator = new RadiusNeighborsRegressor(radius: 0.8, weighted: true, tree: new BallTree()); + + $this->metric = new RSquared(); + + srand(self::RANDOM_SEED); + } + + #[Test] + #[TestDox('Estimator is untrained before fitting')] + public function testAssertPreConditions() : void + { + self::assertFalse($this->estimator->trained()); + } + + #[Test] + 
#[TestDox('Radius must be greater than zero')] + public function badRadius() : void + { + $this->expectException(InvalidArgumentException::class); + + new RadiusNeighborsRegressor(radius: 0.0); + } + + #[Test] + #[TestDox('Estimator type is regressor')] + public function type() : void + { + self::assertEquals(EstimatorType::regressor(), $this->estimator->type()); + } + + #[Test] + #[TestDox('Compatibility only includes continuous data')] + public function compatibility() : void + { + $expected = [ + DataType::continuous(), + ]; + + self::assertEquals($expected, $this->estimator->compatibility()); + } + + #[Test] + #[TestDox('It trains and predicts with the expected score')] + public function trainPredict() : void + { + $training = $this->generator->generate(self::TRAIN_SIZE); + $testing = $this->generator->generate(self::TEST_SIZE); + + $this->estimator->train($training); + + self::assertTrue($this->estimator->trained()); + + $predictions = $this->estimator->predict($testing); + + /** @var list $labels */ + $labels = $testing->labels(); + $score = $this->metric->score( + predictions: $predictions, + labels: $labels + ); + + self::assertGreaterThanOrEqual(self::MIN_SCORE, $score); + } + + #[Test] + #[TestDox('Predictions match the test set and remain finite')] + #[DataProvider('predictionChecks')] + public function trainPredictChecks(int $trainSize, int $testSize) : void + { + $training = $this->generator->generate($trainSize); + $testing = $this->generator->generate($testSize); + + $this->estimator->train($training); + + $predictions = $this->estimator->predict($testing); + + self::assertCount($testSize, $predictions); + + foreach ($predictions as $prediction) { + self::assertIsFloat($prediction); + self::assertFalse(is_nan($prediction)); + } + + /** @var list $labels */ + $labels = $testing->labels(); + $score = $this->metric->score(predictions: $predictions, labels: $labels); + + self::assertIsFloat($score); + self::assertGreaterThanOrEqual(self::MIN_SCORE, 
$score); + } + + #[Test] + #[TestDox('Training rejects incompatible labels')] + public function trainIncompatible() : void + { + $this->expectException(InvalidArgumentException::class); + + $this->estimator->train(Labeled::quick(samples: [['bad']], labels: [2])); + } + + #[Test] + #[TestDox('Predicting before training throws an exception')] + public function predictUntrained() : void + { + $this->expectException(RuntimeException::class); + + $this->estimator->predict(Unlabeled::quick()); + } +} diff --git a/tests/Regressors/RadiusNeighborsRegressorTest.php b/tests/Regressors/RadiusNeighborsRegressorTest.php index ebecc902b..f903b6a03 100644 --- a/tests/Regressors/RadiusNeighborsRegressorTest.php +++ b/tests/Regressors/RadiusNeighborsRegressorTest.php @@ -4,8 +4,11 @@ namespace Rubix\ML\Tests\Regressors; +use Generator; use PHPUnit\Framework\Attributes\CoversClass; +use PHPUnit\Framework\Attributes\DataProvider; use PHPUnit\Framework\Attributes\Group; +use PHPUnit\Framework\Attributes\Test; use Rubix\ML\DataType; use Rubix\ML\EstimatorType; use Rubix\ML\Datasets\Labeled; @@ -48,6 +51,11 @@ class RadiusNeighborsRegressorTest extends TestCase protected RSquared $metric; + public static function predictionChecks() : Generator + { + yield 'default dataset sizes' => [self::TRAIN_SIZE, self::TEST_SIZE]; + } + protected function setUp() : void { $this->generator = new HalfMoon(x: 4.0, y: -7.0, scale: 1.0, rotation: 90, noise: 0.25); @@ -106,6 +114,31 @@ public function testTrainPredict() : void $this->assertGreaterThanOrEqual(self::MIN_SCORE, $score); } + #[DataProvider('predictionChecks')] + public function testTrainPredictChecks(int $trainSize, int $testSize) : void + { + $training = $this->generator->generate($trainSize); + $testing = $this->generator->generate($testSize); + + $this->estimator->train($training); + + $predictions = $this->estimator->predict($testing); + + self::assertCount($testSize, $predictions); + + foreach ($predictions as $prediction) { + 
self::assertIsFloat($prediction); + self::assertFalse(is_nan($prediction)); + } + + /** @var list $labels */ + $labels = $testing->labels(); + $score = $this->metric->score(predictions: $predictions, labels: $labels); + + self::assertIsFloat($score); + self::assertGreaterThanOrEqual(self::MIN_SCORE, $score); + } + public function testTrainIncompatible() : void { $this->expectException(InvalidArgumentException::class); diff --git a/tests/Regressors/RegressionTree/RegressionTreeTest.php b/tests/Regressors/RegressionTree/RegressionTreeTest.php new file mode 100644 index 000000000..3a119a4e0 --- /dev/null +++ b/tests/Regressors/RegressionTree/RegressionTreeTest.php @@ -0,0 +1,220 @@ +generator = new Hyperplane( + coefficients: [1.0, 5.5, -7, 0.01], + intercept: 35.0, + noise: 1.0 + ); + + $this->estimator = new RegressionTree( + maxHeight: 30, + maxLeafSize: 5, + minPurityIncrease: 1e-7, + maxFeatures: 3 + ); + + $this->metric = new RSquared(); + + srand(self::RANDOM_SEED); + } + + #[Test] + #[TestDox('Is not trained before training')] + public function preConditions() : void + { + self::assertFalse($this->estimator->trained()); + } + + #[Test] + #[TestDox('Throws when max height is invalid')] + public function badMaxDepth() : void + { + $this->expectException(InvalidArgumentException::class); + + new RegressionTree(maxHeight: 0); + } + + #[Test] + #[TestDox('Returns estimator type')] + public function type() : void + { + self::assertEquals(EstimatorType::regressor(), $this->estimator->type()); + } + + #[Test] + #[TestDox('Declares feature compatibility')] + public function compatibility() : void + { + $expected = [ + DataType::categorical(), + DataType::continuous(), + ]; + + self::assertEquals($expected, $this->estimator->compatibility()); + } + + #[Test] + #[TestDox('Returns hyperparameters')] + public function params() : void + { + $expected = [ + 'max height' => 30, + 'max leaf size' => 5, + 'min purity increase' => 1.0E-7, + 'max features' => 3, + 'max bins' => 
null, + ]; + + self::assertEquals($expected, $this->estimator->params()); + } + + #[Test] + #[TestDox('Trains, predicts, and returns importances for continuous targets')] + public function trainPredictImportancesContinuous() : void + { + $training = $this->generator->generate(self::TRAIN_SIZE); + $testing = $this->generator->generate(self::TEST_SIZE); + + $this->estimator->train($training); + + self::assertTrue($this->estimator->trained()); + + $importances = $this->estimator->featureImportances(); + + self::assertCount(4, $importances); + self::assertContainsOnlyFloat($importances); + + $dot = $this->estimator->exportGraphviz(); + + // Graphviz::dotToImage($dot)->saveTo(new Filesystem('test.png')); + + self::assertStringStartsWith('digraph Tree {', (string) $dot); + + $predictions = $this->estimator->predict($testing); + + /** @var list $labels */ + $labels = $testing->labels(); + $score = $this->metric->score( + predictions: $predictions, + labels: $labels + ); + + self::assertGreaterThanOrEqual(self::MIN_SCORE, $score); + } + + #[Test] + #[TestDox('Trains and predicts with discretized targets')] + public function trainPredictCategorical() : void + { + $training = $this->generator + ->generate(self::TRAIN_SIZE + self::TEST_SIZE) + ->apply(new IntervalDiscretizer(bins: 5)); + + $testing = $training->randomize()->take(self::TEST_SIZE); + + $this->estimator->train($training); + + self::assertTrue($this->estimator->trained()); + + $dot = $this->estimator->exportGraphviz(); + + // Graphviz::dotToImage($dot)->saveTo(new Filesystem('test.png')); + + self::assertStringStartsWith('digraph Tree {', (string) $dot); + + $predictions = $this->estimator->predict($testing); + + /** @var list $labels */ + $labels = $testing->labels(); + $score = $this->metric->score( + predictions: $predictions, + labels: $labels + ); + + self::assertGreaterThanOrEqual(self::MIN_SCORE, $score); + } + + #[Test] + #[TestDox('Exposes trained state, feature importances, and prediction counts after 
fitting')] + #[DataProviderExternal(RegressionTreeProvider::class, 'trainedModelCases')] + public function trainedModelExposesAdditionalChecks(int $trainingSize, int $testingSize) : void + { + $training = $this->generator->generate($trainingSize); + $testing = $this->generator->generate($testingSize); + + $this->estimator->train($training); + + self::assertTrue($this->estimator->trained()); + + $importances = $this->estimator->featureImportances(); + + self::assertCount(4, $importances); + self::assertContainsOnlyFloat($importances); + + $predictions = $this->estimator->predict($testing); + + self::assertCount($testingSize, $predictions); + } + + #[Test] + #[TestDox('Throws when predicting before training')] + public function predictUntrained() : void + { + $this->expectException(RuntimeException::class); + + $this->estimator->predict(Unlabeled::quick()); + } +} diff --git a/tests/Regressors/RegressionTreeTest.php b/tests/Regressors/RegressionTreeTest.php index 0b9903f79..8ee1f2249 100644 --- a/tests/Regressors/RegressionTreeTest.php +++ b/tests/Regressors/RegressionTreeTest.php @@ -5,12 +5,15 @@ namespace Rubix\ML\Tests\Regressors; use PHPUnit\Framework\Attributes\CoversClass; +use PHPUnit\Framework\Attributes\DataProviderExternal; use PHPUnit\Framework\Attributes\Group; +use PHPUnit\Framework\Attributes\Test; use Rubix\ML\DataType; use Rubix\ML\EstimatorType; use Rubix\ML\Datasets\Unlabeled; use Rubix\ML\Regressors\RegressionTree; use Rubix\ML\Datasets\Generators\Hyperplane; +use Rubix\ML\Tests\DataProvider\RegressionTreeProvider; use Rubix\ML\Transformers\IntervalDiscretizer; use Rubix\ML\CrossValidation\Metrics\RSquared; use Rubix\ML\Exceptions\InvalidArgumentException; @@ -169,6 +172,26 @@ public function testTrainPredictCategorical() : void $this->assertGreaterThanOrEqual(self::MIN_SCORE, $score); } + #[DataProviderExternal(RegressionTreeProvider::class, 'trainedModelCases')] + public function testTrainedModelExposesAdditionalChecks(int $trainingSize, int 
$testingSize) : void + { + $training = $this->generator->generate($trainingSize); + $testing = $this->generator->generate($testingSize); + + $this->estimator->train($training); + + self::assertTrue($this->estimator->trained()); + + $importances = $this->estimator->featureImportances(); + + self::assertCount(4, $importances); + self::assertContainsOnlyFloat($importances); + + $predictions = $this->estimator->predict($testing); + + self::assertCount($testingSize, $predictions); + } + public function testPredictUntrained() : void { $this->expectException(RuntimeException::class); diff --git a/tests/Regressors/Ridge/RidgeTest.php b/tests/Regressors/Ridge/RidgeTest.php new file mode 100644 index 000000000..ac4a4c96f --- /dev/null +++ b/tests/Regressors/Ridge/RidgeTest.php @@ -0,0 +1,253 @@ + [ + [ + [0], + [1], + [2], + [3], + ], + [3, 5, 7, 9], + [4], + 11.0, + [2.0], + 3.0, + ]; + + yield 'sample with 2 features and smaller values' => [ + [ + [0, 0], + [1, 1], + [2, 1], + [1, 2], + ], + [3, 6, 7, 8], + [2, 2], + 9.0, + [1.0, 2.0], + 3.0, + ]; + + yield 'sample with 3 features and smaller values' => [ + [ + [0, 0, 0], + [1, 0, 0], + [0, 1, 0], + [0, 0, 1], + ], + [4, 5, 6, 7], + [1, 1, 1], + 10.0, + [1.0, 2.0, 3.0], + 4.0, + ]; + + yield 'sample with 4 features' => [ + [ + [50, 3, 5, 10], + [70, 10, 3, 5], + [40, 2, 8, 30], + ], + [66000, 95000, 45000], + [60, 5, 4, 12], + $isArm ? 77676.53 : 77644.0, + $isArm + ? [1208.26, 360.18, -96.53, -420.41] + : [1172.0, 452.0, -70.0, -424.0], + $isArm ? 8810.75 : 10432.0, + ]; + + yield 'sample with 4 features with shifted values' => [ + [ + [52, 4, 6, 12], + [71, 9, 4, 6], + [38, 3, 7, 28], + ], + [66000, 95000, 45000], + [60, 5, 4, 12], + $isArm ? 77585.35 : 78540.0, + $isArm + ? [1364.07, 476.45, -161.59, -82.90] + : [1366.0, 504.0, -156.0, -91.0], + $isArm ? 
-4999.93 : -4224.0, + ]; + } + + protected function setUp() : void + { + $this->generator = new Hyperplane( + coefficients: [1.0, 5.5, -7, 0.01], + intercept: 0.0, + noise: 1.0 + ); + + $this->estimator = new Ridge(1.0); + + $this->metric = new RSquared(); + + srand(self::RANDOM_SEED); + } + + #[Test] + #[TestDox('Is not trained before training')] + public function preConditions() : void + { + self::assertFalse($this->estimator->trained()); + } + + #[Test] + #[TestDox('Throws when L2 penalty is invalid')] + public function badL2Penalty() : void + { + $this->expectException(InvalidArgumentException::class); + + new Ridge(-1e-4); + } + + #[Test] + #[TestDox('Returns estimator type')] + public function type() : void + { + self::assertEquals(EstimatorType::regressor(), $this->estimator->type()); + } + + #[Test] + #[TestDox('Declares feature compatibility')] + public function compatibility() : void + { + $expected = [ + DataType::continuous(), + ]; + + self::assertEquals($expected, $this->estimator->compatibility()); + } + + #[Test] + #[TestDox('Trains, predicts, and returns importances')] + public function trainPredictImportances() : void + { + $training = $this->generator->generate(self::TRAIN_SIZE); + $testing = $this->generator->generate(self::TEST_SIZE); + + $this->estimator->train($training); + + self::assertTrue($this->estimator->trained()); + + $coefficients = $this->estimator->coefficients(); + + self::assertIsArray($coefficients); + self::assertCount(4, $coefficients); + + self::assertIsFloat($this->estimator->bias()); + + $importances = $this->estimator->featureImportances(); + + self::assertCount(4, $importances); + self::assertContainsOnlyFloat($importances); + + $predictions = $this->estimator->predict($testing); + + /** @var list $labels */ + $labels = $testing->labels(); + $score = $this->metric->score( + predictions: $predictions, + labels: $labels + ); + + self::assertGreaterThanOrEqual(self::MIN_SCORE, $score); + } + + #[Test] + #[TestDox('Throws when 
training set is incompatible')] + public function trainIncompatible() : void + { + $this->expectException(InvalidArgumentException::class); + + $this->estimator->train(Labeled::quick(samples: [['bad']], labels: [2])); + } + + #[Test] + #[TestDox('Throws when predicting before training')] + public function predictUntrained() : void + { + $this->expectException(RuntimeException::class); + + $this->estimator->predict(Unlabeled::quick()); + } + + #[Test] + #[TestDox('Trains, predicts, and returns the expected NumPower ridge values')] + #[DataProvider('trainPredictProvider')] + public function trainPredict(array $samples, array $labels, array $prediction, float $expectedPrediction, array $expectedCoefficients, float $expectedBias) : void + { + $regression = new Ridge(0.01); + $regression->train(new Labeled($samples, $labels)); + + $predictions = $regression->predict(new Unlabeled([$prediction])); + $coefficients = $regression->coefficients(); + + self::assertEqualsWithDelta($expectedPrediction, $predictions[0], 0.2); + self::assertIsArray($coefficients); + self::assertCount(count($expectedCoefficients), $coefficients); + + foreach ($expectedCoefficients as $i => $expectedCoefficient) { + self::assertEqualsWithDelta($expectedCoefficient, $coefficients[$i], 0.2); + } + self::assertEqualsWithDelta($expectedBias, $regression->bias(), 0.2); + } +} diff --git a/tests/Regressors/RidgeTest.php b/tests/Regressors/RidgeTest.php index cd9143b50..caa108d93 100644 --- a/tests/Regressors/RidgeTest.php +++ b/tests/Regressors/RidgeTest.php @@ -4,8 +4,12 @@ namespace Rubix\ML\Tests\Regressors; +use Generator; use PHPUnit\Framework\Attributes\CoversClass; use PHPUnit\Framework\Attributes\Group; +use PHPUnit\Framework\Attributes\DataProvider; +use PHPUnit\Framework\Attributes\Test; +use PHPUnit\Framework\Attributes\TestDox; use Rubix\ML\DataType; use Rubix\ML\EstimatorType; use Rubix\ML\Datasets\Labeled; @@ -47,6 +51,77 @@ class RidgeTest extends TestCase protected RSquared $metric; + 
public static function trainPredictProvider() : Generator + { + yield 'sample with 1 feature and smaller values' => [ + [ + [0], + [1], + [2], + [3], + ], + [3, 5, 7, 9], + [4], + 11.0, + [2.0], + 3.0, + ]; + + yield 'sample with 2 features and smaller values' => [ + [ + [0, 0], + [1, 1], + [2, 1], + [1, 2], + ], + [3, 6, 7, 8], + [2, 2], + 9.0, + [1.0, 2.0], + 3.0, + ]; + + yield 'sample with 3 features and smaller values' => [ + [ + [0, 0, 0], + [1, 0, 0], + [0, 1, 0], + [0, 0, 1], + ], + [4, 5, 6, 7], + [1, 1, 1], + 10.0, + [1.0, 2.0, 3.0], + 4.0, + ]; + + yield 'sample with 4 features' => [ + [ + [50, 3, 5, 10], + [70, 10, 3, 5], + [40, 2, 8, 30], + ], + [66000, 95000, 45000], + [60, 5, 4, 12], + 78037.05, + [1192.98, 401.06, -132.47, -413.58], + 9949.78, + ]; + + yield 'sample with 4 features with shifted values' => [ + [ + [52, 4, 6, 12], + [71, 9, 4, 6], + [38, 3, 7, 28], + ], + [66000, 95000, 45000], + [60, 5, 4, 12], + 77709.72, + [1368.77, 442.49, -158.60, -77.49], + -5054.98, + ]; + } + protected function setUp() : void { $this->generator = new Hyperplane( @@ -90,8 +165,6 @@ public function testCompatibility() : void public function testTrainPredictImportances() : void { - $this->markTestSkipped('TODO: doesn\'t work by some reason'); - $training = $this->generator->generate(self::TRAIN_SIZE); $testing = $this->generator->generate(self::TEST_SIZE); @@ -136,4 +209,25 @@ public function testPredictUntrained() : void $this->estimator->predict(Unlabeled::quick()); } + + #[Test] + #[TestDox('Trains, predicts, and returns the expected legacy ridge values')] + #[DataProvider('trainPredictProvider')] + public function trainPredict(array $samples, array $labels, array $prediction, float $expectedPrediction, array $expectedCoefficients, float $expectedBias) : void + { + $regression = new Ridge(0.01); + $regression->train(new Labeled($samples, $labels)); + + $predictions = $regression->predict(new Unlabeled([$prediction])); + $coefficients = 
$regression->coefficients(); + + self::assertEqualsWithDelta($expectedPrediction, $predictions[0], 0.2); + self::assertIsArray($coefficients); + self::assertCount(count($expectedCoefficients), $coefficients); + + foreach ($expectedCoefficients as $i => $expectedCoefficient) { + self::assertEqualsWithDelta($expectedCoefficient, $coefficients[$i], 0.2); + } + self::assertEqualsWithDelta($expectedBias, $regression->bias(), 0.2); + } }