I tried to build a Python class, CustomStackingClassifier(), to implement the Stacking method in ensemble machine learning. In this implementation, the output of the base classifiers is set to be the predicted probabilities, and StratifiedKFold is used for model training. The input matrix for the meta-classifier has dimensions (samples, models * classes).
This code essentially replicates the functionality of sklearn.ensemble.StackingClassifier() manually. However, after testing it with the wine dataset and comparing the results between the two methods, I found discrepancies. Despite spending a lot of time on it, I could not pinpoint the issue. I would greatly appreciate any help or insights from the community. Thank you so much!
I hope to clarify whether there is a logical issue with the CustomStackingClassifier(). If there is a problem, I would appreciate guidance and suggestions for corrections. If the implementation is correct, why does it show result differences compared to sklearn.ensemble.StackingClassifier()?
The code is as follows, I really need help, please:
class CustomStackingClassifier(BaseEstimator, ClassifierMixin):
def __init__(self, base_classifiers, meta_classifier, n_splits=5):
"""
:param base_classifiers: list of estimators
:param meta_classifier: final_estimator
:param n_splits: cv
"""
self.base_classifiers = base_classifiers
self.meta_classifier = meta_classifier
self.n_splits = n_splits
def fit(self, X, y):
"""
:param X: train data
:param y: train label
"""
n_samples = X.shape[0]
n_classifiers = len(self.base_classifiers)
n_classes = len(np.unique(y)) # Get the number of categories
base_probabilities = np.zeros((n_samples, n_classifiers * n_classes)) # Used to store the predicted probabilities of the base classifier
# Setting up cross validation by StratifiedKFold, consistent with StackingClassifier
kf = StratifiedKFold(n_splits=self.n_splits, shuffle=False, random_state=None)
# reset index of data
X_re_index = X.reset_index(drop=True)
y_re_index = y.reset_index(drop=True)
# Train each base classifier and generate prediction probabilities
for i, (name, clf) in enumerate(self.base_classifiers):
fold_probabilities = np.zeros((n_samples, n_classes))
# Train and predict for each fold
for train_index, val_index in kf.split(X_re_index,y_re_index):
X_train, X_val = X_re_index.iloc[train_index], X_re_index.iloc[val_index]
y_train, y_val = y_re_index.iloc[train_index], y_re_index.iloc[val_index]
# Train base classifier
clf.fit(X_train, y_train)
# Predict probabilities on validation set
fold_probabilities[val_index] = clf.predict_proba(X_val)
# Save the predicted probabilities of each base classifier into base_probabilities
base_probabilities[:, i * n_classes: (i + 1) * n_classes] = fold_probabilities
# train meta classifier
self.meta_classifier.fit(base_probabilities, y_re_index)
return self
def predict(self, X):
"""
:param X: test data
"""
# get the predicted probabilities of each base classifier
base_probabilities = np.column_stack([clf.predict_proba(X) for name, clf in self.base_classifiers])
# predict the label using the meta classifier
return self.meta_classifier.predict(base_probabilities)
def predict_proba(self, X):
"""
:param X: test data
"""
# get the predicted probabilities of each base classifier
base_probabilities = np.column_stack([clf.predict_proba(X) for name, clf in self.base_classifiers])
# predict the label probabilities using the meta classifier
return self.meta_classifier.predict_proba(base_probabilities)
base_models = [
('svm', SVC(probability=True,random_state=42)),
('knn', KNeighborsClassifier()),
('rf', RandomForestClassifier(random_state=42)),
]
meta_model = xgb.XGBClassifier(verbosity=0,random_state=42)
# 1. load wine dataset
iris = load_wine()
X = iris.data
y = pd.Series(iris.target)
# data spilt
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=0.2, random_state=41)
# data preprocessing
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = pd.DataFrame(scaler.transform(X_train),
columns=iris.feature_names)
X_test_scaled = pd.DataFrame(scaler.transform(X_test),
columns=iris.feature_names)
# manual implementation of Stacking Model
stacking_model = CustomStackingClassifier (base_classifiers=base_models, meta_classifier=meta_model, n_splits=5) # accuracy: 0.944
# Stacking Model method in sklearn
stacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_model,cv=5,stack_method='auto',verbose=1) # accuracy: 0.972
stacking_model.fit(X_train_scaled, y_train)
# 4. Evaluate the model
y_pred = stacking_model.predict(X_test_scaled)
print('Evaluating results of Stacking model :')
print('accuracy:', accuracy_score(y_test, y_pred))
print('precision:', precision_score(y_test, y_pred,
average='macro'))
print('recall:', recall_score(y_test, y_pred, average='macro'))
print('F1-score:', f1_score(y_test, y_pred, average='macro'))