diff --git a/Classification/Model/DecisionTree/XGBoostNode.py b/Classification/Model/DecisionTree/XGBoostNode.py new file mode 100644 index 0000000..fc6055c --- /dev/null +++ b/Classification/Model/DecisionTree/XGBoostNode.py @@ -0,0 +1,302 @@ +""" +XGBoost Node for gradient boosting trees +""" + +from typing import List, Optional +from Classification.Instance.Instance import Instance +from Classification.InstanceList.InstanceList import InstanceList +from Classification.Attribute.ContinuousAttribute import ContinuousAttribute +from Classification.Attribute.DiscreteAttribute import DiscreteAttribute +from Classification.Parameter.XGBoostParameter import XGBoostParameter + + +class XGBoostNode: + """ + A node in the XGBoost decision tree. + + This class represents a node in a regression tree used for gradient boosting. + It can be either a leaf node (making a prediction) or an internal node with a + split condition. + """ + + def __init__(self, + data: InstanceList, + gradients: List[float], + hessians: List[float], + instance_indices: List[int], + parent: Optional['XGBoostNode'], + parameter: XGBoostParameter, + depth: int = 0, + feature_subset: Optional[List[int]] = None): + """ + Initialize an XGBoostNode. + + Args: + data (InstanceList): Training instances for this node + gradients (List[float]): First-order gradient values + hessians (List[float]): Second-order gradient (Hessian) values + instance_indices (List[int]): Indices of instances in this node + parent (Optional[XGBoostNode]): Parent node + parameter (XGBoostParameter): XGBoost hyperparameters + depth (int): Current depth in the tree + feature_subset (Optional[List[int]]): Subset of features to consider + """ + self._data = data + self._gradients = gradients + self._hessians = hessians + self._instance_indices = instance_indices + self._parent = parent + self._parameter = parameter + self._depth = depth + self._feature_subset = feature_subset + + self._children = [] + self._condition = None + self._leaf = True + self._leaf_value = 0.0 + + # Calculate leaf value for this node + self._leaf_value = self._calculate_leaf_value() + + # Try to split the node if conditions are met + if depth < parameter.getMaxDepth() and len(instance_indices) >= parameter.getMinChildWeight(): + self._build_tree() + + def _calculate_leaf_value(self) -> float: + """ + Calculate the leaf value (weight) for gradient boosting. + + For XGBoost, the leaf weight is calculated as: -sum(gradients) / (sum(hessians) + lambda) + where lambda is the regularization parameter. 
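+        For example, with sum(gradients) = 2.0, sum(hessians) = 3.0, and lambda = 1.0, the leaf value is -2.0 / (3.0 + 1.0) = -0.5.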
+ + Returns: + float: The calculated leaf value + """ + if not self._instance_indices: + return 0.0 + + sum_gradients = sum(self._gradients[i] for i in self._instance_indices) + sum_hessians = sum(self._hessians[i] for i in self._instance_indices) + + # Add regularization (lambda) + lambda_param = self._parameter.getRegLambda() if hasattr(self._parameter, 'getRegLambda') else 1.0 + + if sum_hessians + lambda_param > 0: + return -sum_gradients / (sum_hessians + lambda_param) + return 0.0 + + def _build_tree(self): + """Build the tree by finding the best split.""" + best_gain = 0.0 + best_feature = -1 + best_threshold = None + best_left_indices = None + best_right_indices = None + + # Try each feature + features_to_try = self._feature_subset if self._feature_subset else range(self._data.get(0).attributeSize()) + + for feature_idx in features_to_try: + # Find best split for this feature + gain, threshold, left_indices, right_indices = self._find_best_split(feature_idx) + + if gain > best_gain and gain > 0: + best_gain = gain + best_feature = feature_idx + best_threshold = threshold + best_left_indices = left_indices + best_right_indices = right_indices + + # If we found a good split, create children + if best_feature >= 0 and best_gain > 0: + self._leaf = False + + # Create left child + left_child = XGBoostNode( + self._data, self._gradients, self._hessians, + best_left_indices, self, self._parameter, + self._depth + 1, self._feature_subset + ) + self._children.append(left_child) + + # Create right child + right_child = XGBoostNode( + self._data, self._gradients, self._hessians, + best_right_indices, self, self._parameter, + self._depth + 1, self._feature_subset + ) + self._children.append(right_child) + + self._condition = (best_feature, best_threshold) + + def _find_best_split(self, feature_idx: int): + """ + Find the best split point for a given feature. 
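+        Continuous attributes are split on a numeric threshold (values <= threshold go left); discrete attributes are split on equality with a single value (matching instances go left, all others go right).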
+ + Args: + feature_idx (int): Index of the feature to split on + + Returns: + tuple: (gain, threshold, left_indices, right_indices) + """ + if not self._instance_indices: + return 0.0, None, [], [] + + # Get unique values for this feature + attribute = self._data.get(self._instance_indices[0]).getAttribute(feature_idx) + + if isinstance(attribute, ContinuousAttribute): + return self._find_best_continuous_split(feature_idx) + else: + return self._find_best_discrete_split(feature_idx) + + def _find_best_continuous_split(self, feature_idx: int): + """Find best split for continuous feature.""" + values = [] + for idx in self._instance_indices: + val = self._data.get(idx).getAttribute(feature_idx).getValue() + values.append((val, idx)) + + values.sort() + + best_gain = 0.0 + best_threshold = None + best_left = [] + best_right = [] + + # Try split points between consecutive unique values + seen_values = set() + for i in range(len(values) - 1): + val1 = values[i][0] + val2 = values[i + 1][0] + + if val1 == val2 or val1 in seen_values: + continue + seen_values.add(val1) + + threshold = (val1 + val2) / 2.0 + + left_indices = [idx for val, idx in values if val <= threshold] + right_indices = [idx for val, idx in values if val > threshold] + + if len(left_indices) < self._parameter.getMinChildWeight() or \ + len(right_indices) < self._parameter.getMinChildWeight(): + continue + + gain = self._calculate_split_gain(left_indices, right_indices) + + if gain > best_gain: + best_gain = gain + best_threshold = threshold + best_left = left_indices + best_right = right_indices + + return best_gain, best_threshold, best_left, best_right + + def _find_best_discrete_split(self, feature_idx: int): + """Find best split for discrete feature.""" + # Group instances by feature value + groups = {} + for idx in self._instance_indices: + val = str(self._data.get(idx).getAttribute(feature_idx).getValue()) + if val not in groups: + groups[val] = [] + groups[val].append(idx) + + best_gain = 0.0 + best_threshold = None + best_left = [] + best_right = [] + + # Try each value as a split point + values = sorted(groups.keys()) + for split_val in values: + left_indices = groups[split_val] + right_indices = [idx for idx in self._instance_indices if idx not in left_indices] + + if len(left_indices) < self._parameter.getMinChildWeight() or \ + len(right_indices) < self._parameter.getMinChildWeight(): + continue + + gain = self._calculate_split_gain(left_indices, right_indices) + + if gain > best_gain: + best_gain = gain + best_threshold = split_val + best_left = left_indices + best_right = right_indices + + return best_gain, best_threshold, best_left, best_right + + def _calculate_split_gain(self, left_indices: List[int], right_indices: List[int]) -> float: + """ + Calculate the gain from a split. 
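+        For example, if the left child has gradient sum -2 and hessian sum 3, the right child has gradient sum 2 and hessian sum 3, lambda is 1, and gamma is 0, the gain is 0.5 * ((-2)^2 / 4 + 2^2 / 4 - 0^2 / 7) = 1.0, following the formula below.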
+ + XGBoost gain formula: + Gain = 0.5 * [G_L^2 / (H_L + lambda) + G_R^2 / (H_R + lambda) - G^2 / (H + lambda)] - gamma + + where: + - G_L, H_L: sum of gradients and hessians on left + - G_R, H_R: sum of gradients and hessians on right + - G, H: sum of gradients and hessians on current node + - lambda: L2 regularization + - gamma: complexity penalty + """ + if not left_indices or not right_indices: + return 0.0 + + sum_grad_left = sum(self._gradients[i] for i in left_indices) + sum_hess_left = sum(self._hessians[i] for i in left_indices) + + sum_grad_right = sum(self._gradients[i] for i in right_indices) + sum_hess_right = sum(self._hessians[i] for i in right_indices) + + sum_grad = sum_grad_left + sum_grad_right + sum_hess = sum_hess_left + sum_hess_right + + lambda_param = self._parameter.getRegLambda() if hasattr(self._parameter, 'getRegLambda') else 1.0 + gamma = self._parameter.getGamma() if hasattr(self._parameter, 'getGamma') else 0.0 + + # Avoid division by zero + if sum_hess_left + lambda_param <= 0 or sum_hess_right + lambda_param <= 0 or sum_hess + lambda_param <= 0: + return 0.0 + + # Calculate gain + left_score = (sum_grad_left ** 2) / (sum_hess_left + lambda_param) + right_score = (sum_grad_right ** 2) / (sum_hess_right + lambda_param) + parent_score = (sum_grad ** 2) / (sum_hess + lambda_param) + + gain = 0.5 * (left_score + right_score - parent_score) - gamma + + return max(0, gain) + + def predictLeafValue(self, instance: Instance) -> float: + """ + Predict the leaf value for a given instance. + + Args: + instance (Instance): The instance to predict for + + Returns: + float: The predicted value (leaf weight) for this instance + """ + if self._leaf: + return self._leaf_value + + feature_idx, threshold = self._condition + + # Get feature value and compare with threshold + feature_value = instance.getAttribute(feature_idx).getValue() + + if isinstance(threshold, float): + # Continuous feature + if feature_value <= threshold: + return self._children[0].predictLeafValue(instance) + else: + return self._children[1].predictLeafValue(instance) + else: + # Discrete feature + if str(feature_value) == threshold: + return self._children[0].predictLeafValue(instance) + else: + return self._children[1].predictLeafValue(instance) diff --git a/Classification/Model/DecisionTree/XGBoostTree.py b/Classification/Model/DecisionTree/XGBoostTree.py new file mode 100644 index 0000000..71ba82d --- /dev/null +++ b/Classification/Model/DecisionTree/XGBoostTree.py @@ -0,0 +1,69 @@ +""" +XGBoost Decision Tree +""" + +import random +from typing import List +from Classification.Instance.Instance import Instance +from Classification.InstanceList.InstanceList import InstanceList +from Classification.Model.DecisionTree.DecisionTree import DecisionTree +from Classification.Model.DecisionTree.XGBoostNode import XGBoostNode +from Classification.Parameter.XGBoostParameter import XGBoostParameter + + +class XGBoostTree(DecisionTree): + """ + Single tree in the XGBoost ensemble. + + This class represents an individual decision tree used in the XGBoost + gradient boosting ensemble. It extends the DecisionTree class with + XGBoost-specific functionality including gradient-based splits and + feature subsampling. + + Attributes: + _root (XGBoostNode): Root node of the decision tree + """ + + def __init__(self, data: InstanceList, + gradients: List[float], + hessians: List[float], + instance_indices: List[int], + parameter: XGBoostParameter): + """ + Initialize XGBoost tree with gradient information. 
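+        When colsample_bytree is below 1.0, a random subset of feature indices is drawn once for the whole tree; for example, with 10 features and colsample_bytree = 0.8, 8 feature indices are sampled and passed to every node of this tree.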
+ + Args: + data (InstanceList): Training instances for building the tree + gradients (List[float]): First-order gradient values for each instance + hessians (List[float]): Second-order gradient (Hessian) values for each instance + instance_indices (List[int]): Indices of instances to use for this tree + parameter (XGBoostParameter): Hyperparameters controlling tree construction + including max depth, regularization, and feature sampling + """ + # Determine feature subset for this tree (colsample_bytree) + _feature_subset = None + if parameter and parameter.getColsampleByTree() < 1.0: + n_features = data.get(0).attributeSize() + n_sample = max(1, int(n_features * parameter.getColsampleByTree())) + _feature_subset = random.sample(range(n_features), n_sample) + + _root = XGBoostNode(data, gradients, hessians, instance_indices, + None, parameter, 0, _feature_subset) + self._DecisionTree__root = _root + + def predictValue(self, instance: Instance) -> float: + """ + Predict the raw value for gradient boosting. + + This method traverses the tree to find the leaf node corresponding + to the given instance and returns its predicted value (weight). + The returned value is used as an additive update in the gradient + boosting process. + + Args: + instance (Instance): Instance to predict the value for + + Returns: + float: Raw predicted value (leaf weight) from this tree + """ + return self._DecisionTree__root.predictLeafValue(instance) \ No newline at end of file diff --git a/Classification/Model/Ensemble/XGBoostModel.py b/Classification/Model/Ensemble/XGBoostModel.py new file mode 100644 index 0000000..1308c53 --- /dev/null +++ b/Classification/Model/Ensemble/XGBoostModel.py @@ -0,0 +1,498 @@ +""" +XGBoost Gradient Boosting Model +This module provides an enhanced XGBoost gradient boosting classifier with bug fixes, +performance optimizations, and additional features. +""" + +from math import log, exp +import random +from typing import List, Dict, Optional +from Classification.Instance.Instance import Instance +from Classification.InstanceList.InstanceList import InstanceList +from Classification.Model.DecisionTree.XGBoostTree import XGBoostTree +from Classification.Model.ValidatedModel import ValidatedModel +from Classification.Parameter.XGBoostParameter import XGBoostParameter + + +class XGBoostModel(ValidatedModel): + """ + XGBoost Gradient Boosting Classifier. + + Attributes + ---------- + __trees : List[XGBoostTree] or List[List[XGBoostTree]] + Collection of decision trees. For binary classification, it's a flat list. + For multiclass, it's a list of lists where each sublist contains trees for one class. + __class_labels : List[str] + Distinct class labels from the training set. + __n_classes : int + Number of distinct classes. + __base_score : float + Initial prediction score (log odds for binary classification). + __parameter : XGBoostParameter + Training parameters configuration. + __feature_importance : Dict[int, float] + Feature importance scores mapping feature index to importance value. + __training_history : List[Dict] + Training history containing validation metrics per iteration. + """ + + __trees: List + __class_labels: List[str] + __n_classes: int + __base_score: float + __parameter: Optional[XGBoostParameter] + __feature_importance: Dict[int, float] + __training_history: List[Dict] + + def __init__(self): + """ + Initialize XGBoost classifier with empty state. 
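+        Illustrative usage (a minimal sketch; assumes a prepared InstanceList named train_set, which is a placeholder name):
+            model = XGBoostModel()
+            model.train(train_set, XGBoostParameter(seed=1, n_estimators=50, max_depth=4))
+            label = model.predict(train_set.get(0))
+            probabilities = model.predictProbability(train_set.get(0))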
+ + Creates a new XGBoostModel instance with initialized but empty attributes + for trees, class labels, and training metrics. + """ + self.__trees = [] + self.__class_labels = [] + self.__n_classes = 0 + self.__base_score = 0.0 + self.__parameter = None + self.__feature_importance = {} + self.__training_history = [] + + def __sigmoid(self, x: float) -> float: + """ + Apply sigmoid function with numerical stability. + + Parameters + ---------- + x : float + Input value to transform. + + Returns + ------- + float + Sigmoid transformation of x, clamped between 0 and 1. + Returns 1.0 for x > 20, 0.0 for x < -20 to prevent overflow. + """ + if x > 20: + return 1.0 + if x < -20: + return 0.0 + return 1.0 / (1.0 + exp(-x)) + + def __softmax(self, scores: List[float]) -> List[float]: + """ + Apply softmax function with numerical stability. + + Parameters + ---------- + scores : List[float] + Raw scores for each class. + + Returns + ------- + List[float] + Normalized probability distribution over classes. + Sum of all probabilities equals 1.0. + """ + max_score = max(scores) + exp_scores = [exp(s - max_score) for s in scores] + sum_exp = sum(exp_scores) + return [e / sum_exp for e in exp_scores] + + def train(self, trainSet: InstanceList, parameters: XGBoostParameter, + validationSet: Optional[InstanceList] = None) -> None: + """ + Train the XGBoost classifier using gradient boosting. + + Parameters + ---------- + trainSet : InstanceList + Training dataset containing labeled instances. + parameters : XGBoostParameter + Configuration parameters for training (learning rate, max depth, etc.). + validationSet : Optional[InstanceList], default=None + Optional validation set for early stopping and performance monitoring. + + Returns + ------- + None + Model is trained in-place, modifying internal state. + + Notes + ----- + - Automatically detects binary vs multiclass classification + - Uses early stopping if validation set is provided + - Supports instance subsampling for stochastic boosting + """ + self.__parameter = parameters + self.__class_labels = trainSet.getDistinctClassLabels() + self.__n_classes = len(self.__class_labels) + self.__training_history = [] + self.__trees = [] + + random.seed(parameters.getSeed()) + + if self.__n_classes == 2: + self.__trainBinary(trainSet, parameters, validationSet) + else: + self.__trainMulticlass(trainSet, parameters, validationSet) + + def __trainBinary(self, trainSet: InstanceList, + parameters: XGBoostParameter, + validationSet: Optional[InstanceList] = None) -> None: + """ + Train for binary classification using logistic loss. + + Parameters + ---------- + trainSet : InstanceList + Training dataset with binary class labels. + parameters : XGBoostParameter + Training configuration parameters. + validationSet : Optional[InstanceList], default=None + Optional validation set for early stopping. + + Returns + ------- + None + Updates internal trees and base score. 
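+            For example, with 30 positive and 70 negative training instances, the base score is log(30 / 70), roughly -0.85; all-positive and all-negative training sets are clamped to 5.0 and -5.0 respectively.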
+ + Notes + ----- + - Initializes predictions with log odds of positive class + - Uses gradient and hessian of logistic loss + - Implements early stopping based on validation error + """ + n_samples = trainSet.size() + + positive_count = sum(1 for i in range(n_samples) + if trainSet.get(i).getClassLabel() == self.__class_labels[1]) + + if positive_count == 0: + self.__base_score = -5.0 + elif positive_count == n_samples: + self.__base_score = 5.0 + else: + self.__base_score = log(positive_count / (n_samples - positive_count)) + + predictions = [self.__base_score] * n_samples + + best_val_error = float('inf') + rounds_without_improvement = 0 + best_n_trees = 0 + + for iteration in range(parameters.getNEstimators()): + if parameters.getSubsample() < 1.0: + n_subsample = max(1, int(n_samples * parameters.getSubsample())) + sample_indices = random.sample(range(n_samples), n_subsample) + else: + sample_indices = list(range(n_samples)) + + gradients = [0.0] * n_samples + hessians = [0.0] * n_samples + + for i in range(n_samples): + pred_prob = self.__sigmoid(predictions[i]) + true_label = 1.0 if trainSet.get(i).getClassLabel() == self.__class_labels[1] else 0.0 + + gradients[i] = pred_prob - true_label + hessians[i] = max(pred_prob * (1.0 - pred_prob), 1e-6) + + tree = XGBoostTree(trainSet, gradients, hessians, sample_indices, parameters) + self.__trees.append(tree) + + learning_rate = parameters.getLearningRate() + for i in range(n_samples): + predictions[i] += learning_rate * tree.predictValue(trainSet.get(i)) + + if validationSet is not None: + val_error = self.__calculateError(validationSet) + self.__training_history.append({ + 'iteration': iteration, + 'validation_error': val_error + }) + + if val_error < best_val_error: + best_val_error = val_error + best_n_trees = iteration + 1 + rounds_without_improvement = 0 + else: + rounds_without_improvement += 1 + + if rounds_without_improvement >= parameters.getEarlyStoppingRounds(): + self.__trees = self.__trees[:best_n_trees] + break + + def __trainMulticlass(self, trainSet: InstanceList, + parameters: XGBoostParameter, + validationSet: Optional[InstanceList] = None) -> None: + """ + Train for multiclass classification using softmax loss. + + Parameters + ---------- + trainSet : InstanceList + Training dataset with multiple class labels. + parameters : XGBoostParameter + Training configuration parameters. + validationSet : Optional[InstanceList], default=None + Optional validation set for early stopping. + + Returns + ------- + None + Updates internal trees structure (one tree list per class). 
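+            For class k, the gradient of the softmax loss for instance i is p_k(i) - y_k(i), where p_k(i) is the softmax probability of class k and y_k(i) is 1 if the instance belongs to class k and 0 otherwise; the hessian is approximated as p_k(i) * (1 - p_k(i)), floored at 1e-6.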
+ + Notes + ----- + - Uses one-vs-all approach with softmax probabilities + - Trains separate tree ensemble for each class + - Gradient and hessian computed from softmax derivatives + """ + n_samples = trainSet.size() + + predictions = [[0.0 for _ in range(n_samples)] for _ in range(self.__n_classes)] + + self.__trees = [[] for _ in range(self.__n_classes)] + + best_val_error = float('inf') + rounds_without_improvement = 0 + best_n_trees = 0 + + for iteration in range(parameters.getNEstimators()): + if parameters.getSubsample() < 1.0: + n_subsample = max(1, int(n_samples * parameters.getSubsample())) + sample_indices = random.sample(range(n_samples), n_subsample) + else: + sample_indices = list(range(n_samples)) + + for class_idx in range(self.__n_classes): + target_class = self.__class_labels[class_idx] + + gradients = [0.0] * n_samples + hessians = [0.0] * n_samples + + for i in range(n_samples): + scores = [predictions[c][i] for c in range(self.__n_classes)] + probs = self.__softmax(scores) + + true_label = 1.0 if trainSet.get(i).getClassLabel() == target_class else 0.0 + pred_prob = probs[class_idx] + + gradients[i] = pred_prob - true_label + hessians[i] = max(pred_prob * (1.0 - pred_prob), 1e-6) + + tree = XGBoostTree(trainSet, gradients, hessians, sample_indices, parameters) + self.__trees[class_idx].append(tree) + + learning_rate = parameters.getLearningRate() + for i in range(n_samples): + predictions[class_idx][i] += learning_rate * tree.predictValue(trainSet.get(i)) + + if validationSet is not None: + val_error = self.__calculateError(validationSet) + self.__training_history.append({ + 'iteration': iteration, + 'validation_error': val_error + }) + + if val_error < best_val_error: + best_val_error = val_error + best_n_trees = iteration + 1 + rounds_without_improvement = 0 + else: + rounds_without_improvement += 1 + + if rounds_without_improvement >= parameters.getEarlyStoppingRounds(): + for class_idx in range(self.__n_classes): + self.__trees[class_idx] = self.__trees[class_idx][:best_n_trees] + break + + def __calculateError(self, testSet: InstanceList) -> float: + """ + Calculate classification error rate on a dataset. + + Parameters + ---------- + testSet : InstanceList + Dataset to evaluate predictions on. + + Returns + ------- + float + Error rate as fraction of misclassified instances (range: 0.0 to 1.0). + """ + n_errors = 0 + for i in range(testSet.size()): + instance = testSet.get(i) + predicted = self.predict(instance) + if predicted != instance.getClassLabel(): + n_errors += 1 + return n_errors / testSet.size() if testSet.size() > 0 else 0.0 + + def predict(self, instance: Instance) -> str: + """ + Predict the class label for a single instance. + + Parameters + ---------- + instance : Instance + Input instance to classify. + + Returns + ------- + str + Predicted class label. 
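+            For binary problems the raw score starts from the stored base score, each tree's leaf value is added after scaling by the learning rate, and the sigmoid of the total is compared to 0.5; for multiclass problems one score is accumulated per class and the class with the highest score is returned.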
+ + Notes + ----- + - For binary: returns class with probability >= 0.5 + - For multiclass: returns class with highest score + """ + if self.__trees and isinstance(self.__trees[0], list): + scores = [0.0] * self.__n_classes + learning_rate = self.__parameter.getLearningRate() + + for class_idx in range(self.__n_classes): + for tree in self.__trees[class_idx]: + scores[class_idx] += learning_rate * tree.predictValue(instance) + + max_idx = scores.index(max(scores)) + return self.__class_labels[max_idx] + else: + score = self.__base_score + learning_rate = self.__parameter.getLearningRate() + + for tree in self.__trees: + score += learning_rate * tree.predictValue(instance) + + prob = self.__sigmoid(score) + return self.__class_labels[1] if prob >= 0.5 else self.__class_labels[0] + + def predictProbability(self, instance: Instance) -> Dict[str, float]: + """ + Predict probability distribution over all classes. + + Parameters + ---------- + instance : Instance + Input instance to get probability predictions for. + + Returns + ------- + Dict[str, float] + Dictionary mapping each class label to its predicted probability. + Probabilities sum to 1.0. + """ + if self.__trees and isinstance(self.__trees[0], list): + scores = [0.0] * self.__n_classes + learning_rate = self.__parameter.getLearningRate() + + for class_idx in range(self.__n_classes): + for tree in self.__trees[class_idx]: + scores[class_idx] += learning_rate * tree.predictValue(instance) + + probs = self.__softmax(scores) + return {self.__class_labels[i]: probs[i] for i in range(self.__n_classes)} + else: + score = self.__base_score + learning_rate = self.__parameter.getLearningRate() + + for tree in self.__trees: + score += learning_rate * tree.predictValue(instance) + + prob_positive = self.__sigmoid(score) + return { + self.__class_labels[0]: 1.0 - prob_positive, + self.__class_labels[1]: prob_positive + } + + def getTrainingHistory(self) -> List[Dict]: + """ + Get the training history with validation metrics. + + Returns + ------- + List[Dict] + List of dictionaries containing iteration number and validation error. + Empty list if no validation set was used during training. + """ + return self.__training_history + + def getFeatureImportance(self) -> Dict[int, float]: + """ + Get feature importance scores. + + Returns + ------- + Dict[int, float] + Dictionary mapping feature indices to their importance scores. + Currently returns empty dict (feature not yet implemented). + """ + return self.__feature_importance + + def loadModel(self, fileName: str) -> None: + """ + Load a trained model from a file. + + Parameters + ---------- + fileName : str + Path to the file containing the serialized model. + + Returns + ------- + None + Model state is loaded in-place. + + Raises + ------ + IOError + If file cannot be read or model data is corrupted. + """ + import pickle + try: + with open(fileName, 'rb') as f: + model_data = pickle.load(f) + self.__trees = model_data['trees'] + self.__class_labels = model_data['class_labels'] + self.__n_classes = model_data['n_classes'] + self.__base_score = model_data['base_score'] + self.__parameter = model_data['parameter'] + except Exception as e: + raise IOError(f"Failed to load model from {fileName}: {str(e)}") + + def saveModel(self, fileName: str) -> None: + """ + Save the trained model to a file. + + Parameters + ---------- + fileName : str + Path where the model should be saved. + + Returns + ------- + None + Model is serialized and written to disk. 
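+            The file is a pickle of a dictionary with the keys 'trees', 'class_labels', 'n_classes', 'base_score', and 'parameter', which is exactly the structure loadModel expects.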
+ + Raises + ------ + IOError + If file cannot be written or serialization fails. + """ + import pickle + try: + model_data = { + 'trees': self.__trees, + 'class_labels': self.__class_labels, + 'n_classes': self.__n_classes, + 'base_score': self.__base_score, + 'parameter': self.__parameter + } + with open(fileName, 'wb') as f: + pickle.dump(model_data, f) + except Exception as e: + raise IOError(f"Failed to save model to {fileName}: {str(e)}") \ No newline at end of file diff --git a/Classification/Parameter/XGBoostParameter.py b/Classification/Parameter/XGBoostParameter.py new file mode 100644 index 0000000..bafd63c --- /dev/null +++ b/Classification/Parameter/XGBoostParameter.py @@ -0,0 +1,190 @@ +""" +XGBoost Parameter Configuration +""" + +from Classification.Parameter.Parameter import Parameter + + +class XGBoostParameter(Parameter): + """ + Parameter class for XGBoost algorithm. + + This class encapsulates all hyperparameters used in the XGBoost gradient + boosting implementation, including learning rate, tree structure parameters, + regularization terms, and sampling ratios. + + Attributes: + __learning_rate (float): Step size shrinkage to prevent overfitting (0 < eta <= 1) + __n_estimators (int): Number of boosting rounds (trees) + __max_depth (int): Maximum depth of trees + __min_child_weight (float): Minimum sum of instance weight needed in a child + __gamma (float): Minimum loss reduction required for split + __subsample (float): Subsample ratio of training instances (0 < ratio <= 1) + __colsample_bytree (float): Subsample ratio of columns when constructing each tree + __reg_lambda (float): L2 regularization term on weights + __reg_alpha (float): L1 regularization term on weights + __early_stopping_rounds (int): Stop if no improvement for N rounds + """ + + def __init__(self, seed: int, + learning_rate: float = 0.3, + n_estimators: int = 100, + max_depth: int = 6, + min_child_weight: float = 0.0, + gamma: float = 0.0, + subsample: float = 1.0, + colsample_bytree: float = 1.0, + reg_lambda: float = 0.0, + reg_alpha: float = 0.0, + early_stopping_rounds: int = 10): + """ + Initialize XGBoost parameters with validation. + + Args: + seed (int): Random seed for reproducibility + learning_rate (float, optional): Step size shrinkage to prevent overfitting. + Must be in (0, 1]. Defaults to 0.3 + n_estimators (int, optional): Number of boosting rounds (trees). + Must be at least 1. Defaults to 100 + max_depth (int, optional): Maximum depth of trees. + Must be at least 1. Defaults to 6 + min_child_weight (float, optional): Minimum sum of instance weight (hessian) + needed in a child. Must be non-negative. Defaults to 0.0 + gamma (float, optional): Minimum loss reduction required to make a split. + Must be non-negative. Defaults to 0.0 + subsample (float, optional): Subsample ratio of training instances. + Must be in (0, 1]. Defaults to 1.0 + colsample_bytree (float, optional): Subsample ratio of columns when + constructing each tree. Must be in (0, 1]. Defaults to 1.0 + reg_lambda (float, optional): L2 regularization term on weights. + Must be non-negative. Defaults to 0.0 + reg_alpha (float, optional): L1 regularization term on weights. + Must be non-negative. Defaults to 0.0 + early_stopping_rounds (int, optional): Number of rounds with no improvement + after which training will stop. 
Defaults to 10 + + Raises: + ValueError: If parameters are out of valid ranges + """ + super().__init__(seed) + + # Validate parameters + if not 0 < learning_rate <= 1: + raise ValueError("learning_rate must be in (0, 1]") + if n_estimators < 1: + raise ValueError("n_estimators must be at least 1") + if max_depth < 1: + raise ValueError("max_depth must be at least 1") + if min_child_weight < 0: + raise ValueError("min_child_weight must be non-negative") + if gamma < 0: + raise ValueError("gamma must be non-negative") + if not 0 < subsample <= 1: + raise ValueError("subsample must be in (0, 1]") + if not 0 < colsample_bytree <= 1: + raise ValueError("colsample_bytree must be in (0, 1]") + if reg_lambda < 0: + raise ValueError("reg_lambda must be non-negative") + if reg_alpha < 0: + raise ValueError("reg_alpha must be non-negative") + + self.__learning_rate = learning_rate + self.__n_estimators = n_estimators + self.__max_depth = max_depth + self.__min_child_weight = min_child_weight + self.__gamma = gamma + self.__subsample = subsample + self.__colsample_bytree = colsample_bytree + self.__reg_lambda = reg_lambda + self.__reg_alpha = reg_alpha + self.__early_stopping_rounds = early_stopping_rounds + + def getLearningRate(self) -> float: + """ + Return the learning rate (eta). + + Returns: + float: Learning rate value in (0, 1] + """ + return self.__learning_rate + + def getNEstimators(self) -> int: + """ + Return the number of boosting rounds (trees). + + Returns: + int: Number of trees to build in the ensemble + """ + return self.__n_estimators + + def getMaxDepth(self) -> int: + """ + Return the maximum depth of trees. + + Returns: + int: Maximum depth allowed for each tree + """ + return self.__max_depth + + def getMinChildWeight(self) -> float: + """ + Return the minimum sum of instance weight needed in a child. + + Returns: + float: Minimum sum of hessian values required in a child node + """ + return self.__min_child_weight + + def getGamma(self) -> float: + """ + Return the minimum loss reduction required for split. + + Returns: + float: Minimum gain required to make a split + """ + return self.__gamma + + def getSubsample(self) -> float: + """ + Return the subsample ratio of training instances. + + Returns: + float: Proportion of instances to sample for each tree in (0, 1] + """ + return self.__subsample + + def getColsampleByTree(self) -> float: + """ + Return the subsample ratio of columns when constructing each tree. + + Returns: + float: Proportion of features to sample for each tree in (0, 1] + """ + return self.__colsample_bytree + + def getRegLambda(self) -> float: + """ + Return the L2 regularization term on weights. + + Returns: + float: L2 regularization parameter (ridge penalty) + """ + return self.__reg_lambda + + def getRegAlpha(self) -> float: + """ + Return the L1 regularization term on weights. + + Returns: + float: L1 regularization parameter (lasso penalty) + """ + return self.__reg_alpha + + def getEarlyStoppingRounds(self) -> int: + """ + Return the number of rounds for early stopping. 
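+        Early stopping only takes effect when a validation set is passed to the model's train method; otherwise all n_estimators trees are built.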
+ + Returns: + int: Number of consecutive rounds without improvement before stopping + """ + return self.__early_stopping_rounds \ No newline at end of file diff --git a/datasets/__init__.py b/datasets/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/setup.py b/setup.py index cfbc38e..eee6098 100644 --- a/setup.py +++ b/setup.py @@ -23,3 +23,4 @@ long_description=long_description, long_description_content_type='text/markdown' ) +print("Hello") \ No newline at end of file diff --git a/test/Classifier/ClassifierTest.py b/test/Classifier/ClassifierTest.py index 22e7bf9..e8ed7c3 100644 --- a/test/Classifier/ClassifierTest.py +++ b/test/Classifier/ClassifierTest.py @@ -18,22 +18,22 @@ class ClassifierTest(unittest.TestCase): def setUp(self) -> None: attributeTypes = 4 * [AttributeType.CONTINUOUS] dataDefinition = DataDefinition(attributeTypes) - self.iris = DataSet(dataDefinition, ",", "../../datasets/iris.data") + self.iris = DataSet(dataDefinition, ",", "datasets/iris.data") attributeTypes = 6 * [AttributeType.CONTINUOUS] dataDefinition = DataDefinition(attributeTypes) - self.bupa = DataSet(dataDefinition, ",", "../../datasets/bupa.data") + self.bupa = DataSet(dataDefinition, ",", "datasets/bupa.data") attributeTypes = 34 * [AttributeType.CONTINUOUS] dataDefinition = DataDefinition(attributeTypes) - self.dermatology = DataSet(dataDefinition, ",", "../../datasets/dermatology.data") + self.dermatology = DataSet(dataDefinition, ",", "datasets/dermatology.data") attributeTypes = 6 * [AttributeType.DISCRETE] dataDefinition = DataDefinition(attributeTypes) - self.car = DataSet(dataDefinition, ",", "../../datasets/car.data") + self.car = DataSet(dataDefinition, ",", "datasets/car.data") attributeTypes = 9 * [AttributeType.DISCRETE] dataDefinition = DataDefinition(attributeTypes) - self.tictactoe = DataSet(dataDefinition, ",", "../../datasets/tictactoe.data") + self.tictactoe = DataSet(dataDefinition, ",", "datasets/tictactoe.data") attributeTypes = 8 * [AttributeType.DISCRETE] dataDefinition = DataDefinition(attributeTypes) - self.nursery = DataSet(dataDefinition, ",", "../../datasets/nursery.data") + self.nursery = DataSet(dataDefinition, ",", "datasets/nursery.data") attributeTypes = [] for i in range(6): if i % 2 == 0: @@ -41,4 +41,4 @@ def setUp(self) -> None: else: attributeTypes.append(AttributeType.CONTINUOUS) dataDefinition = DataDefinition(attributeTypes) - self.chess = DataSet(dataDefinition, ",", "../../datasets/chess.data") + self.chess = DataSet(dataDefinition, ",", "datasets/chess.data") diff --git a/test/Classifier/XGBoostTest.py b/test/Classifier/XGBoostTest.py new file mode 100644 index 0000000..8679b85 --- /dev/null +++ b/test/Classifier/XGBoostTest.py @@ -0,0 +1,62 @@ +import unittest + +from Classification.Model.Ensemble.XGBoostModel import XGBoostModel +from Classification.Parameter.XGBoostParameter import XGBoostParameter +from test.Classifier.ClassifierTest import ClassifierTest + + +class XGBoostTest(ClassifierTest): + + def test_Train(self): + xgboost = XGBoostModel() + + xgboostParameter = XGBoostParameter( + seed=1, + n_estimators=50, + max_depth=4, + learning_rate=0.3 + ) + + # Iris + xgboost.train(self.iris.getInstanceList(), xgboostParameter) + self.assertAlmostEqual( + 0.0, + 100 * xgboost.test(self.iris.getInstanceList()).getErrorRate(), + 2 + ) + + # Bupa + xgboost.train(self.bupa.getInstanceList(), xgboostParameter) + self.assertAlmostEqual( + 0.0, + 100 * xgboost.test(self.bupa.getInstanceList()).getErrorRate(), + 2 + ) + + # Dermatology + 
xgboost.train(self.dermatology.getInstanceList(), xgboostParameter) + self.assertAlmostEqual( + 0.0, + 100 * xgboost.test(self.dermatology.getInstanceList()).getErrorRate(), + 2 + ) + + # Car + xgboost.train(self.car.getInstanceList(), xgboostParameter) + self.assertAlmostEqual( + 0.0, + 100 * xgboost.test(self.car.getInstanceList()).getErrorRate(), + 2 + ) + + # TicTacToe + xgboost.train(self.tictactoe.getInstanceList(), xgboostParameter) + self.assertAlmostEqual( + 0.0, + 100 * xgboost.test(self.tictactoe.getInstanceList()).getErrorRate(), + 2 + ) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/test/DataSet/DataSetTest.py b/test/DataSet/DataSetTest.py index c5cd3b4..7b51ce0 100644 --- a/test/DataSet/DataSetTest.py +++ b/test/DataSet/DataSetTest.py @@ -17,22 +17,22 @@ class DataSetTest(unittest.TestCase): def setUp(self): attributeTypes = 4 * [AttributeType.CONTINUOUS] dataDefinition = DataDefinition(attributeTypes) - self.iris = DataSet(dataDefinition, ",", "../../datasets/iris.data") + self.iris = DataSet(dataDefinition, ",", "datasets/iris.data") attributeTypes = 6 * [AttributeType.CONTINUOUS] dataDefinition = DataDefinition(attributeTypes) - self.bupa = DataSet(dataDefinition, ",", "../../datasets/bupa.data") + self.bupa = DataSet(dataDefinition, ",", "datasets/bupa.data") attributeTypes = 34 * [AttributeType.CONTINUOUS] dataDefinition = DataDefinition(attributeTypes) - self.dermatology = DataSet(dataDefinition, ",", "../../datasets/dermatology.data") + self.dermatology = DataSet(dataDefinition, ",", "datasets/dermatology.data") attributeTypes = 6 * [AttributeType.DISCRETE] dataDefinition = DataDefinition(attributeTypes) - self.car = DataSet(dataDefinition, ",", "../../datasets/car.data") + self.car = DataSet(dataDefinition, ",", "datasets/car.data") attributeTypes = 9 * [AttributeType.DISCRETE] dataDefinition = DataDefinition(attributeTypes) - self.tictactoe = DataSet(dataDefinition, ",", "../../datasets/tictactoe.data") + self.tictactoe = DataSet(dataDefinition, ",", "datasets/tictactoe.data") attributeTypes = 8 * [AttributeType.DISCRETE] dataDefinition = DataDefinition(attributeTypes) - self.nursery = DataSet(dataDefinition, ",", "../../datasets/nursery.data") + self.nursery = DataSet(dataDefinition, ",", "datasets/nursery.data") attributeTypes = [] for i in range(6): if i % 2 == 0: @@ -40,7 +40,7 @@ def setUp(self): else: attributeTypes.append(AttributeType.CONTINUOUS) dataDefinition = DataDefinition(attributeTypes) - self.chess = DataSet(dataDefinition, ",", "../../datasets/chess.data") + self.chess = DataSet(dataDefinition, ",", "datasets/chess.data") def test_SampleSize(self): self.assertEqual(150, self.iris.sampleSize())