This is a follow-up to My last question. Since then, I think I have improved quite a lot thanks to the large amount of feedback I received on that post. This will be the last time I ask for feedback on this code. I am mainly looking for feedback on best practices and coding style, but honestly any kind of tip, information or feedback is greatly appreciated. This is my first real Python module, so I can imagine there is quite a bit of strange or simply wrong stuff in it.
import math
import os.path
from typing import List, Tuple, Union

import joblib  # sklearn.externals.joblib has been removed from scikit-learn
import numpy as np
import pandas as pd
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split


def prepare_dataset(path: str) -> List[Union[pd.DataFrame, pd.Series]]:
    df = pd.read_csv(path)
    features = df.drop('days_remaning', axis=1)
    output = df['days_remaning']
    return train_test_split(features, output, test_size=0.3)


class Model:
    def __init__(self, product: str) -> None:
        self.product = product
        self.save_path = f'models/trained_{product}_classifier_model.pkl'
        self.x_train, self.x_test, self.y_train, self.y_test = prepare_dataset(
            f'datasets/{product}_dataset.csv')
        if os.path.isfile(self.save_path):
            print(f'Loading Local Model, product: {product}.')
            self.model = joblib.load(self.save_path)
        else:
            print(f'Creating New Model, product: {product}.')
            self.model = self.create_model()

    def create_model(self) -> ensemble.GradientBoostingRegressor:
        """Create and fit a gradient boosting model using grid-search-optimised parameters."""
        params = dict(
            learning_rate=0.01,
            loss='squared_error',  # named 'ls' in scikit-learn < 1.2
            max_depth=4,
            max_features=0.3,
            min_samples_leaf=9,
            n_estimators=500,
        )
        model = ensemble.GradientBoostingRegressor(**params)
        model.fit(self.x_train, self.y_train)
        return model

    def use_model(self, data: List[List[int]]) -> np.ndarray:
        return self.model.predict(data)

    def print_model_output(self, data: List[List[int]]) -> None:
        for output in self.use_model(data):
            print(f'In about {math.floor(output)} to {math.ceil(output)} '
                  f'days you will need to buy new {self.product}.')

    def save_model(self) -> None:
        joblib.dump(self.model, self.save_path)

    def calculate_error_rates(self) -> Tuple[float, float]:
        train_error = mean_absolute_error(
            self.y_train, self.model.predict(self.x_train))
        test_error = mean_absolute_error(
            self.y_test, self.model.predict(self.x_test))
        return train_error, test_error

    def print_error_rates(self) -> None:
        train_error, test_error = self.calculate_error_rates()
        print(f'Training Set Mean Absolute Error: {train_error:.4f}')
        print(f'Test Set Mean Absolute Error: {test_error:.4f}')

    def print_features_importance(self) -> None:
        features_labels = np.array(['days_after_restock', 'left_content'])
        importance = self.model.feature_importances_
        features_indexes_by_importance = importance.argsort()
        for feature in features_indexes_by_importance:
            print(f'{features_labels[feature]} - {importance[feature] * 100.0:.2f}%')


if __name__ == '__main__':
    EGGS = Model('eggs')
    EGGS.print_error_rates()
    EGGS.print_features_importance()
    EGGS.print_model_output([[2, 7], [7, 2]])
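
For reference, the parameters hard-coded in create_model came out of a GridSearchCV run that is not part of this module. A minimal sketch of that kind of search follows; the grid values and CV settings below are illustrative placeholders, not the exact ones I used:

from sklearn import ensemble
from sklearn.model_selection import GridSearchCV

x_train, x_test, y_train, y_test = prepare_dataset('datasets/eggs_dataset.csv')

# Illustrative grid only; the ranges actually searched may have differed.
param_grid = {
    'learning_rate': [0.1, 0.05, 0.01],
    'max_depth': [2, 4, 6],
    'max_features': [1.0, 0.3, 0.1],
    'min_samples_leaf': [3, 9, 17],
    'n_estimators': [100, 500],
}

search = GridSearchCV(
    ensemble.GradientBoostingRegressor(),
    param_grid,
    scoring='neg_mean_absolute_error',  # matches the MAE metric reported above
    cv=5,
)
search.fit(x_train, y_train)
print(search.best_params_)  # the winning combination is what create_model hard-codes

The best combination found is then hard-coded into create_model so the search does not have to be repeated on every run.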