Source code for edamame.regressor.regression

#TODO - add parameteres "verbose" for logging message like unable to print/save

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from IPython.display import display, Markdown
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV, KFold, cross_val_predict, cross_val_score
import pickle
from edamame.eda.tools import dataframe_review, dummy_control, setup
from typing import Tuple, List, Literal, Union
# pandas options
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# list of metrics
#get_scorer

[docs]class TrainRegressor: """ This class represents a pipeline for training and handling regression models. Attributes: X_train (pd.DataFrame): The input training data. y_train (pd.Series): The target training data. X_test (pd.DataFrame): The input test data. y_test (pd.Series): The target test data. Example: >>> from edamame.regressor import TrainRegressor >>> regressor = TrainRegressor(X_train, np.log(y_train), X_test, np.log(y_test)) >>> linear = regressor.linear() >>> regressor.model_metrics(model_name="linear") >>> regressor.save_model(model_name="linear") >>> lasso = regressor.lasso() >>> ridge = regressor.ridge() >>> tree = regressor.tree() >>> rf = regressor.random_forest() >>> xgb = regressor.xgboost() >>> regressor.model_metrics() >>> # using AutoML >>> models = regressor.auto_ml() >>> regressor.model_metrics() >>> regressor.save_model() """ def __init__(self, X_train: pd.DataFrame, y_train: pd.DataFrame, X_test, y_test): self.X_train = X_train self.y_train = y_train self.X_test = X_test self.y_test = y_test # check dataframe dataframe_review(self.X_train) dataframe_review(self.X_test) # check columns type dummy_control(self.X_train) dummy_control(self.X_test) # models self.__linear_fit = {} self.__lasso_fit = {} self.__ridge_fit = {} self.__tree_fit = {} self.__random_forest_fit = {} self.__xgb_fit = {}
[docs] def linear(self, **kwargs) -> LinearRegression: """ Train a linear regression model using the training data and return the fitted model. Args: **kwargs: Arbitrary keyword arguments to be passed to the `linear` constructor. Returns: LinearRegression: The trained linear regression model. Example: >>> from edamame.regressor import TrainRegressor >>> regressor = TrainRegressor(X_train, np.log(y_train), X_test, np.log(y_test)) >>> linear = regressor.linear() """ linear = LinearRegression(**kwargs) linear.fit(self.X_train, self.y_train.squeeze()) # save the model in the instance attributes self.__linear_fit = linear # return step return self.__linear_fit
[docs] def lasso(self, alpha: Tuple[float, float, int] = (0.0001, 10., 50), n_folds: int = 5, **kwargs) -> Lasso: """ Train a Lasso regression model using the training data and return the fitted model. Args: alpha (Tuple[float, float, int]): The range of alpha values to test for hyperparameter tuning. Default is (0.0001, 10., 50). n_folds (int): The number of cross-validation folds to use for hyperparameter tuning. Default is 5. **kwargs: Arbitrary keyword arguments to be passed to the `lasso` constructor. Returns: Lasso: The trained Lasso regression model. Example: >>> from edamame.regressor import TrainRegressor >>> regressor = TrainRegressor(X_train, np.log(y_train), X_test, np.log(y_test)) >>> lasso = regressor.lasso(alpha=(0.0001, 10., 50), n_folds=5) """ # lasso hyperparameter alphas = np.linspace(alpha[0], alpha[1], alpha[2]) # hyperparameter gridsearch lasso = Lasso(**kwargs) tuned_parameters = [{"alpha": alphas}] reg_lasso = GridSearchCV(lasso, tuned_parameters, cv=n_folds, refit=True, verbose=0, scoring='r2') reg_lasso.fit(self.X_train, self.y_train.squeeze()) # save the model in the instance attributes self.__lasso_fit = reg_lasso.best_estimator_ # return step return self.__lasso_fit
[docs] def ridge(self, alpha: Tuple[float, float, int] = (0.1, 50., 50), n_folds: int = 5, **kwargs) -> Ridge: """ Train a Ridge regression model using the training data and return the fitted model. Args: alpha (Tuple[float, float, int]): The range of alpha values to test for hyperparameter tuning. Default is (0.1, 50, 50). n_folds (int): The number of cross-validation folds to use for hyperparameter tuning. Default is 5. **kwargs: Arbitrary keyword arguments to be passed to the `ridge` constructor. Returns: Ridge: The trained Ridge regression model. Example: >>> from edamame.regressor import TrainRegressor >>> regressor = TrainRegressor(X_train, np.log(y_train), X_test, np.log(y_test)) >>> ridge = regressor.ridge(alpha=((0.1, 50., 50), n_folds=5) """ # ridge hyperparameter alphas = np.linspace(alpha[0], alpha[1], alpha[2]) # hyperparameter gridsearch ridge = Ridge(**kwargs) tuned_parameters = [{"alpha": alphas}] reg_ridge = GridSearchCV(ridge, tuned_parameters, cv=n_folds, refit=True, verbose=0, scoring='r2') reg_ridge.fit(self.X_train, self.y_train.squeeze()) # save the model in the instance attributes self.__ridge_fit = reg_ridge.best_estimator_ # return step return self.__ridge_fit
[docs] def tree(self, alpha: Tuple[float, float, int] = (0., 0.001, 5), impurity: Tuple[float, float, int] = (0., 0.00001, 5), n_folds: int = 5, **kwargs) -> DecisionTreeRegressor: """ Fits a decision tree regression model using the provided training data and hyperparameters. Args: alpha (Tuple[float, float, int]): A tuple specifying the range of values to use for the ccp_alpha hyperparameter. The range is given as a tuple (start, stop, num), where `start` is the start of the range, `stop` is the end of the range, and `num` is the number of values to generate within the range. Defaults to (0., 0.001, 5). impurity (Tuple[float, float, int]): A tuple specifying the range of values to use for the min_impurity_decrease hyperparameter. The range is given as a tuple (start, stop, num), where `start` is the start of the range, `stop` is the end of the range, and `num` is the number of values to generate within the range. Defaults to (0., 0.00001, 5). n_folds (int): The number of folds to use for cross-validation. Defaults to 5. **kwargs: Arbitrary keyword arguments to be passed to the `tree` constructor. Returns: DecisionTreeRegressor: The fitted decision tree regressor model. Example: >>> from edamame.regressor import TrainRegressor >>> regressor = TrainRegressor(X_train, np.log(y_train), X_test, np.log(y_test)) >>> tree = regressor.tree(alpha=(0., 0.001, 5), impurity=(0., 0.00001, 5), n_folds=3) """ # hyperparameters gridsearch alphas = np.linspace(alpha[0], alpha[1], alpha[2]) impurities = np.linspace(impurity[0], impurity[1], impurity[2]) tuned_parameters = [{"ccp_alpha": alphas, 'min_impurity_decrease': impurities}] tree = DecisionTreeRegressor(**kwargs) reg_tree = GridSearchCV(tree, tuned_parameters, cv=n_folds, refit=True, verbose=0, scoring='r2') reg_tree.fit(self.X_train, self.y_train.squeeze()) # save the model in the instance attributes self.__tree_fit = reg_tree.best_estimator_ # return step return self.__tree_fit
[docs] def random_forest(self, n_estimators: Tuple[int, int, int] = (50, 1000, 5), n_folds: int = 2, **kwargs) -> RandomForestRegressor: """ Trains a Random Forest regression model on the training data and returns the best estimator found by GridSearchCV. Args: n_estimators (Tuple[int, int, int]): A tuple of integers specifying the minimum and maximum number of trees to include in the forest, and the step size between them. n_folds (int): The number of cross-validation folds to use when evaluating models. **kwargs: Arbitrary keyword arguments to be passed to the `random forest` constructor. Returns: RandomForestRegressor: The best Random Forest model found by GridSearchCV. """ n_estimators = np.linspace(n_estimators[0], n_estimators[1], n_estimators[2]).astype(np.int16) tuned_parameters = [{"n_estimators": n_estimators}] random_forest = RandomForestRegressor(warm_start=True, n_jobs=-1, **kwargs) reg_random_forest = GridSearchCV(random_forest, tuned_parameters, cv=n_folds, refit=True, verbose=0, scoring='r2') reg_random_forest.fit(self.X_train, self.y_train.squeeze()) # save the model in the instance attributes self.__random_forest_fit = reg_random_forest.best_estimator_ # return step return self.__random_forest_fit
[docs] def xgboost(self, n_estimators: Tuple[int, int, int] = (10, 100, 5), n_folds: int = 2, **kwargs) -> xgb.XGBRegressor: """ Trains an XGBoost model using the specified hyperparameters. Args: n_estimators (Tuple[int, int, int]): A tuple containing the start, end and step values for number of estimators. Default is (10, 100, 5). n_folds (int): The number of folds to use in the cross-validation process. Default is 2. **kwargs: Arbitrary keyword arguments to be passed to the `xgboost` constructor. Returns: xgb.XGBRegressor: The trained XGBoost model. Example: >>> from edamame.regressor import TrainRegressor >>> regressor = TrainRegressor(X_train, np.log(y_train), X_test, np.log(y_test)) >>> xgboost = regressor.xgboost(n_estimators=(10, 200, 10), n_folds=5) """ n_est = np.linspace(n_estimators[0], n_estimators[1], n_estimators[2]).astype(np.int16) tuned_parameters = {"n_estimators": n_est} xgb_m = xgb.XGBRegressor(objective ='reg:squarederror', **kwargs) reg_xgb = GridSearchCV(xgb_m, tuned_parameters, cv=n_folds, refit=True, verbose=0, scoring='r2') reg_xgb.fit(self.X_train, self.y_train.squeeze()) # save the model in the instance attributes self.__xgb_fit = reg_xgb.best_estimator_ # return step return self.__xgb_fit
[docs] def model_metrics(self, model_name: Literal["all", "linear", "lasso", "ridge", "tree", "random_forest", "xgboost"] = 'all') -> None: """ Displays the metrics of a trained regression model. The metrics displayed are R2, MSE, and MAE for both the training and test sets. Args: model_name (Literal["all", "linear", "lasso", "ridge", "tree", "random_forest", "xgboost"]): The name of the model to display metrics for. Can be one of 'all', 'linear', 'lasso', 'ridge', 'tree', 'random_forest', or 'xgboost'. Defaults to 'all'. Returns: None Example: >>> from edamame.regressor import TrainRegressor >>> regressor = TrainRegressor(X_train, np.log(y_train), X_test, np.log(y_test)) >>> xgboost = regressor.xgboost(n_estimators=(10, 200, 10), n_folds=5) >>> regressor.model_metrics(model_name="xgboost") """ model_dct = {'linear': 0, 'lasso': 1, 'ridge': 2, 'tree': 3, 'random_forest': 4, 'xgboost': 5} model_list = [self.__linear_fit, self.__lasso_fit, self.__ridge_fit, self.__tree_fit, self.__random_forest_fit, self.__xgb_fit] if model_name == 'all': for key in model_dct: if model_list[model_dct[key]].__class__.__name__ == 'dict': display(f'unable to show {key} model metrics') else: y_pred_train = model_list[model_dct[key]].predict(self.X_train) y_pred_test = model_list[model_dct[key]].predict(self.X_test) # r2 r2_train = r2_score(self.y_train, y_pred_train) r2_test = r2_score(self.y_test, y_pred_test) # MSE mse_train = mean_squared_error(self.y_train, y_pred_train) mse_test = mean_squared_error(self.y_test, y_pred_test) # MAE mae_train = mean_absolute_error(self.y_train, y_pred_train) mae_test = mean_absolute_error(self.y_test, y_pred_test) # display step index_label = ['R2', 'MSE', 'MAE'] metrics = pd.DataFrame([[r2_train, r2_test], [mse_train, mse_test], [mae_train, mae_test]], index = index_label) metrics.columns = [f'Train', 'Test'] string = f'### {key} model metrics:' display(Markdown(string)) display(metrics) else: if model_list[model_dct[model_name]].__class__.__name__ == 'dict': display(f'unable to show {model_name} model metrics') else: y_pred_train = model_list[model_dct[model_name]].predict(self.X_train) y_pred_test = model_list[model_dct[model_name]].predict(self.X_test) # r2 r2_train = r2_score(self.y_train, y_pred_train) r2_test = r2_score(self.y_test, y_pred_test) # MSE mse_train = mean_squared_error(self.y_train, y_pred_train) mse_test = mean_squared_error(self.y_test, y_pred_test) # MAE mae_train = mean_absolute_error(self.y_train, y_pred_train) mae_test = mean_absolute_error(self.y_test, y_pred_test) # display step index_label = ['R2', 'MSE', 'MAE'] metrics = pd.DataFrame([[r2_train, r2_test], [mse_train, mse_test], [mae_train, mae_test]], index = index_label) metrics.columns = [f'Train', 'Test'] string = f'### {model_name} model metrics:' display(Markdown(string)) display(metrics)
[docs] def auto_ml(self, n_folds: int = 5, data: Literal['train', 'test'] = 'train') -> List: """ Perform automated machine learning with cross validation on a list of regression models. Args: n_folds (int): Number of cross-validation folds. Defaults to 5. data (Literal['train', 'test']): Target dataset for cross-validation. Must be either 'train' or 'test'. Defaults to 'train'. Returns: List: List of best-fit regression models for each algorithm. Example: >>> from edamame.regressor import TrainRegressor >>> regressor = TrainRegressor(X_train, np.log(y_train), X_test, np.log(y_test)) >>> model_list = regressor.auto_ml() """ kfold = KFold(n_splits=n_folds) cv_mean = [] score = [] std = [] regressor = ["Linear", "Lasso", "Ridge", "Tree", "Random Forest", "Xgboost"] try: model_list = [LinearRegression(), Lasso(alpha = self.__lasso_fit.alpha), Ridge(alpha = self.__ridge_fit.alpha), DecisionTreeRegressor(ccp_alpha=self.__tree_fit.ccp_alpha, min_impurity_decrease=self.__tree_fit.min_impurity_decrease), RandomForestRegressor(n_estimators = self.__random_forest_fit.n_estimators, warm_start=True, n_jobs=-1), xgb.XGBRegressor(objective ='reg:squarederror', n_estimators = self.__xgb_fit.n_estimators)] except: # find best hyperparameters self.linear() self.lasso() self.ridge() self.tree() self.random_forest() self.xgboost() # model list model_list = [LinearRegression(), Lasso(alpha = self.__lasso_fit.alpha), Ridge(alpha = self.__ridge_fit.alpha), DecisionTreeRegressor(ccp_alpha=self.__tree_fit.ccp_alpha, min_impurity_decrease=self.__tree_fit.min_impurity_decrease), RandomForestRegressor(n_estimators = self.__random_forest_fit.n_estimators, warm_start=True, n_jobs=-1), xgb.XGBRegressor(objective ='reg:squarederror', n_estimators = self.__xgb_fit.n_estimators)] # cross validation loop for model in model_list: if data == 'train': cv_result = cross_val_score(model, self.X_train, self.y_train.squeeze(), cv=kfold, scoring="r2") elif data == 'test': cv_result = cross_val_score(model, self.X_test, self.y_test.squeeze(), cv=kfold, scoring="r2") else: raise ValueError('insert valid target dataset (\'train\' or \'test\')') cv_mean.append(cv_result.mean()) std.append(cv_result.std()) score.append(cv_result) # dataframe for results df_kfold_result = pd.DataFrame({"CV Mean": cv_mean, "Std": std}, index=regressor) # display step string = f'### Metrics results on {data} set:' display(Markdown(string)) display(df_kfold_result) # boxplot on R2 box = pd.DataFrame(score, index=regressor) plt.figure(figsize=(10,8)) box.T.boxplot() plt.show() return [self.__linear_fit, self.__lasso_fit, self.__ridge_fit, self.__tree_fit, self.__random_forest_fit, self.__xgb_fit]
[docs] def save_model(self, model_name: Literal["all", "linear", "lasso", "ridge", "tree", "random_forest", "xgboost"] = 'all') -> None: """ Saves the specified machine learning model or all models in the instance to a pickle file. Args: model_name (Literal["all", "linear", "lasso", "ridge", "tree", "random_forest", "xgboost"]): The name of the model to save. Defaults to 'all'. Returns: None Example: >>> from edamame.regressor import TrainRegressor >>> regressor = TrainRegressor(X_train, np.log(y_train), X_test, np.log(y_test)) >>> model_list = regressor.auto_ml() >>> regressor.save_model(model_name="all") """ model_dct = {'linear': 0, 'lasso': 1, 'ridge': 2, 'tree': 3, 'random_forest': 4, 'xgboost': 5} model_list = [self.__linear_fit, self.__lasso_fit, self.__ridge_fit, self.__tree_fit, self.__random_forest_fit, self.__xgb_fit] if model_name == 'all': for key in model_dct: if model_list[model_dct[key]].__class__.__name__ == 'dict': display(f'unable to save {key} model') else: filename = f'{key}.pkl' with open(filename, 'wb') as file: pickle.dump(model_list[model_dct[key]], file) display(f'{filename} saved') else: if model_list[model_dct[model_name]].__class__.__name__ == 'dict': display(f'unable to save {model_name} model') else: filename = f'{model_name}.pkl' with open(filename, 'wb') as file: pickle.dump(model_list[model_dct[model_name]], file)
[docs]def regression_metrics(model: Union[LinearRegression, Lasso, Ridge, DecisionTreeRegressor, RandomForestRegressor, xgb.XGBRegressor], X: pd.DataFrame, y: pd.DataFrame) -> None: """ Compute and display the regression metrics R2, MSE and MAE of the input model. Args: model (Union[LinearRegression, Lasso, Ridge, DecisionTreeRegressor, RandomForestRegressor, xgb.XGBRegressor]): Regression model. X (pd.DataFrame): Input features. y (pd.DataFrame): Target feature. Returns: None """ # dataframe check dataframe_review(X) dummy_control(X) # pred step y_pred = model.predict(X) # r2 r2 = r2_score(y, y_pred) # MSE mse = mean_squared_error(y, y_pred) # MAE mae = mean_absolute_error(y, y_pred) # display step index_label = ['R2', 'MSE', 'MAE'] metrics = pd.DataFrame([r2,mse,mae], index = index_label) metrics.columns = ['Values'] string = '### Model metrics:' display(Markdown(string)) display(metrics)
if __name__ == '__main__': X = pd.read_csv('/Users/marcosalvalaggio/code/python/ds/data/melb_data/X.csv', sep = ';') y = pd.read_csv('/Users/marcosalvalaggio/code/python/ds/data/melb_data/y.csv', sep = ';') X_train, X_test, y_train, y_test = setup(X, y) regressor = TrainRegressor(X_train, np.log(y_train), X_test, np.log(y_test)) regressor.linear() model_list = regressor.auto_ml() regressor.model_metrics() regressor.save_model()