Source code for edamame.classifier.classification

import numpy as np
import pandas as pd
from edamame.eda.tools import dataframe_review, dummy_control
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import pickle
from IPython.display import display, Markdown
import matplotlib.pyplot as plt 
from typing import Tuple, Literal, List, Union
from sklearn.svm import SVC
# pandas options
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)


class TrainClassifier:
    """
    This class represents a pipeline for training and handling classification models.

    Attributes:
        X_train (pd.DataFrame): The input training data.
        y_train (pd.Series): The target training data.
        X_test (pd.DataFrame): The input test data.
        y_test (pd.Series): The target test data.

    Example:
        >>> from edamame.classifier import TrainClassifier
        >>> classifier = TrainClassifier(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
        >>> logistic = classifier.logistic()
        >>> classifier.model_metrics(model_name="logistic")
        >>> classifier.save_model(model_name="logistic")
        >>> nb = classifier.gaussian_nb()
        >>> knn = classifier.knn()
        >>> tree = classifier.tree()
        >>> rf = classifier.random_forest()
        >>> xgb = classifier.xgboost()
        >>> svm = classifier.svm()
        >>> classifier.model_metrics()
        >>> # using AutoML
        >>> models = classifier.auto_ml()
        >>> classifier.save_model()
    """
    def __init__(self, X_train, y_train, X_test, y_test):
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        # check dataframe
        dataframe_review(self.X_train)
        dataframe_review(self.X_test)
        # check columns type
        dummy_control(self.X_train)
        dummy_control(self.X_test)
        # init the models: an empty dict marks a model that is not trained yet
        self.__logistic_fit = {}
        self.__gaussian_nb_fit = {}
        self.__knn_fit = {}
        self.__tree_fit = {}
        self.__random_forest_fit = {}
        self.__xgb_fit = {}
        self.__svm_fit = {}
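    # Usage sketch (illustrative, not executed): the constructor expects
    # dummy-encoded feature dataframes, so a typical setup might look like the
    # following, where `df` and the column name "target" are hypothetical.
    #   >>> import pandas as pd
    #   >>> from sklearn.model_selection import train_test_split
    #   >>> X = pd.get_dummies(df.drop(columns="target"), dtype=int)
    #   >>> y = df["target"]
    #   >>> X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    #   >>> classifier = TrainClassifier(X_train, y_train, X_test, y_test)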
    def logistic(self, **kwargs) -> LogisticRegression:
        """
        Trains a logistic regression model using the training data and returns the fitted model.

        Args:
            **kwargs: Arbitrary keyword arguments to be passed to the `LogisticRegression` constructor.

        Returns:
            LogisticRegression: The trained logistic regression model.

        Example:
            >>> from edamame.classifier import TrainClassifier
            >>> classifier = TrainClassifier(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
            >>> logistic = classifier.logistic()
        """
        logistic = LogisticRegression(**kwargs)
        logistic.fit(self.X_train, self.y_train.squeeze())
        # save the model in the instance attributes
        self.__logistic_fit = logistic
        return self.__logistic_fit
    def gaussian_nb(self, **kwargs) -> GaussianNB:
        """
        Trains a Gaussian Naive Bayes classifier using the training data and returns the fitted model.

        Args:
            **kwargs: Arbitrary keyword arguments to be passed to the `GaussianNB` constructor.

        Returns:
            GaussianNB: The trained Gaussian Naive Bayes classifier.

        Example:
            >>> from edamame.classifier import TrainClassifier
            >>> classifier = TrainClassifier(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
            >>> nb = classifier.gaussian_nb()
        """
        gauss_nb = GaussianNB(**kwargs)
        gauss_nb.fit(self.X_train, self.y_train.squeeze())
        # save the model in the instance attributes
        self.__gaussian_nb_fit = gauss_nb
        return self.__gaussian_nb_fit
    # ------------ #
    # KNN
    # ------------ #
    def knn(self, n_neighbors: Tuple[int, int, int] = (1, 50, 50), n_folds: int = 5, **kwargs) -> KNeighborsClassifier:
        """
        Train a k-Nearest Neighbors classification model using the training data, and perform a grid search to find the best value of the 'n_neighbors' hyperparameter.

        Args:
            n_neighbors (Tuple[int, int, int]): A tuple with three integers. The first and second integers are the bounds of the 'n_neighbors' range searched by the grid search, and the third integer is the number of values to generate in the interval [n_neighbors[0], n_neighbors[1]]. Default is (1, 50, 50).
            n_folds (int): The number of cross-validation folds to use for the grid search. Default is 5.
            **kwargs: Arbitrary keyword arguments to be passed to the `KNeighborsClassifier` constructor.

        Returns:
            KNeighborsClassifier: The trained k-Nearest Neighbors classification model with the best 'n_neighbors' hyperparameter found by the grid search.

        Example:
            >>> from edamame.classifier import TrainClassifier
            >>> classifier = TrainClassifier(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
            >>> knn = classifier.knn(n_neighbors=(1, 50, 50), n_folds=3)
        """
        n_n = np.linspace(n_neighbors[0], n_neighbors[1], n_neighbors[2]).astype(np.int32)
        knn = KNeighborsClassifier(**kwargs)
        tuned_parameters = [{"n_neighbors": n_n}]
        grid_knn = GridSearchCV(knn, tuned_parameters, cv=n_folds, refit=True, verbose=0, scoring='accuracy')
        grid_knn.fit(self.X_train, self.y_train.squeeze())
        # save the model in the instance attributes
        self.__knn_fit = grid_knn.best_estimator_
        return self.__knn_fit
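    # Note on knn() above: with the default n_neighbors=(1, 50, 50),
    # np.linspace(1, 50, 50).astype(np.int32) produces the integer grid
    # 1, 2, ..., 50, so every neighborhood size in that range is searched.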
    def tree(self, alpha: Tuple[float, float, int] = (0., 0.001, 5), impurity: Tuple[float, float, int] = (0., 0.00001, 5), n_folds: int = 5, **kwargs) -> DecisionTreeClassifier:
        """
        Trains a decision tree classifier using the training data and returns the fitted model.

        Args:
            alpha (Tuple[float, float, int]): A tuple containing the minimum and maximum values of ccp_alpha and the number of values to try (default: (0., 0.001, 5)).
            impurity (Tuple[float, float, int]): A tuple containing the minimum and maximum values of min_impurity_decrease and the number of values to try (default: (0., 0.00001, 5)).
            n_folds (int): The number of cross-validation folds to use for grid search (default: 5).
            **kwargs: Arbitrary keyword arguments to be passed to the `DecisionTreeClassifier` constructor.

        Returns:
            DecisionTreeClassifier: The trained decision tree classifier model.

        Example:
            >>> from edamame.classifier import TrainClassifier
            >>> classifier = TrainClassifier(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
            >>> tree = classifier.tree(alpha=(0., 0.001, 5), impurity=(0., 0.00001, 5), n_folds=3)
        """
        alphas = np.linspace(alpha[0], alpha[1], alpha[2])
        impurities = np.linspace(impurity[0], impurity[1], impurity[2])
        tuned_parameters = [{"ccp_alpha": alphas, "min_impurity_decrease": impurities}]
        tree = DecisionTreeClassifier(**kwargs)
        grid_tree = GridSearchCV(tree, tuned_parameters, cv=n_folds, refit=True, verbose=0, scoring='accuracy')
        grid_tree.fit(self.X_train, self.y_train.squeeze())
        # save the model in the instance attributes
        self.__tree_fit = grid_tree.best_estimator_
        return self.__tree_fit
    def random_forest(self, n_estimators: Tuple[int, int, int] = (50, 1000, 5), n_folds: int = 2, **kwargs) -> RandomForestClassifier:
        """
        Train a Random Forest classifier using the training data and return the fitted model.

        Args:
            n_estimators (Tuple[int, int, int]): The range of the number of trees in the forest. Default is (50, 1000, 5).
            n_folds (int): The number of folds in cross-validation. Default is 2.
            **kwargs: Arbitrary keyword arguments to be passed to the `RandomForestClassifier` constructor.

        Returns:
            RandomForestClassifier: The trained Random Forest classifier.

        Example:
            >>> from edamame.classifier import TrainClassifier
            >>> classifier = TrainClassifier(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
            >>> rf = classifier.random_forest(n_estimators=(50, 1000, 5), n_folds=2)
        """
        # use a separate name for the generated grid to avoid shadowing the argument
        n_est = np.linspace(n_estimators[0], n_estimators[1], n_estimators[2]).astype(np.int16)
        tuned_parameters = [{"n_estimators": n_est}]
        random_forest = RandomForestClassifier(warm_start=True, n_jobs=-1, **kwargs)
        grid_random_forest = GridSearchCV(random_forest, tuned_parameters, cv=n_folds, refit=True, verbose=0, scoring='accuracy')
        grid_random_forest.fit(self.X_train, self.y_train.squeeze())
        # save the model in the instance attributes
        self.__random_forest_fit = grid_random_forest.best_estimator_
        return self.__random_forest_fit
    def xgboost(self, n_estimators: Tuple[int, int, int] = (10, 100, 5), n_folds: int = 2, **kwargs) -> XGBClassifier:
        """
        Train an XGBoost classifier using the training data and return the fitted model.

        Args:
            n_estimators (Tuple[int, int, int]): The range of the number of boosting rounds. Default is (10, 100, 5).
            n_folds (int): The number of folds in cross-validation. Default is 2.
            **kwargs: Arbitrary keyword arguments to be passed to the `XGBClassifier` constructor.

        Returns:
            XGBClassifier: The trained XGBoost classifier.

        Example:
            >>> from edamame.classifier import TrainClassifier
            >>> classifier = TrainClassifier(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
            >>> xgboost = classifier.xgboost(n_estimators=(10, 100, 5), n_folds=2)
        """
        n_est = np.linspace(n_estimators[0], n_estimators[1], n_estimators[2]).astype(np.int16)
        tuned_parameters = {"n_estimators": n_est}
        xgb_m = XGBClassifier(**kwargs)
        grid_xgb = GridSearchCV(xgb_m, tuned_parameters, cv=n_folds, refit=True, verbose=0, scoring='accuracy')
        grid_xgb.fit(self.X_train, self.y_train.squeeze())
        # save the model in the instance attributes
        self.__xgb_fit = grid_xgb.best_estimator_
        return self.__xgb_fit
    def svm(self, n_folds: int = 2, **kwargs) -> SVC:
        """
        Trains an SVM classifier using the training data and returns the fitted model. The kernel is selected by a grid search over 'linear', 'poly', 'rbf' and 'sigmoid'.

        Args:
            n_folds (int): The number of folds in cross-validation. Default is 2.
            **kwargs: Arbitrary keyword arguments to be passed to the `SVC` constructor.

        Returns:
            SVC: The trained SVM classifier.

        Example:
            >>> from edamame.classifier import TrainClassifier
            >>> classifier = TrainClassifier(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
            >>> svm = classifier.svm(n_folds=3, C=1.0, gamma="auto")
        """
        n_kernel = ["linear", "poly", "rbf", "sigmoid"]
        tuned_parameters = {"kernel": n_kernel}
        svm_c = SVC(probability=True, **kwargs)
        grid_svm_c = GridSearchCV(svm_c, tuned_parameters, cv=n_folds, refit=True, verbose=0, scoring='accuracy')
        grid_svm_c.fit(self.X_train, self.y_train.squeeze())
        # save the model in the instance attributes
        self.__svm_fit = grid_svm_c.best_estimator_
        return self.__svm_fit
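    # Note on svm() above: the grid only tunes the kernel, so SVC parameters
    # passed through **kwargs (e.g. C or gamma) stay fixed across the kernel
    # grid. A hypothetical extension that also searches the regularization
    # strength could widen the grid, for example:
    #   tuned_parameters = {"kernel": n_kernel, "C": [0.1, 1.0, 10.0]}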
    def model_metrics(self, model_name: Literal["all", "logistic", "gaussian_nb", "knn", "tree", "random_forest", "xgboost", "svm"] = 'all', cm: bool = False) -> None:
        """
        Display classification metrics (confusion matrix and classification report) for specified or all trained models.

        Args:
            model_name (Literal["all", "logistic", "gaussian_nb", "knn", "tree", "random_forest", "xgboost", "svm"]): The name of the model to display the metrics for. Defaults to 'all'.
            cm (bool): Whether to display the confusion matrix. Defaults to False.

        Returns:
            None

        Example:
            >>> from edamame.classifier import TrainClassifier
            >>> classifier = TrainClassifier(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
            >>> xgboost = classifier.xgboost(n_estimators=(10, 100, 5), n_folds=2)
            >>> classifier.model_metrics(model_name="xgboost")
        """
        model_dct = {'logistic': 0, 'gaussian_nb': 1, 'knn': 2, 'tree': 3, 'random_forest': 4, 'xgboost': 5, 'svm': 6}
        model_list = [self.__logistic_fit, self.__gaussian_nb_fit, self.__knn_fit, self.__tree_fit, self.__random_forest_fit, self.__xgb_fit, self.__svm_fit]
        # report on every model or only on the requested one
        keys = list(model_dct) if model_name == 'all' else [model_name]
        for key in keys:
            model = model_list[model_dct[key]]
            # an untrained model is still the empty dict set in __init__
            if isinstance(model, dict):
                display(f'unable to show {key} model metrics')
                continue
            display(Markdown(f'### {key} model metrics:'))
            y_pred_train = model.predict(self.X_train)
            y_pred_test = model.predict(self.X_test)
            if cm:
                plt.figure(figsize=(10, 4))
                plt.subplot(121)
                sns.heatmap(confusion_matrix(self.y_train, y_pred_train), annot=True, fmt="2.0f")
                plt.title(f'{key} train')
                plt.subplot(122)
                sns.heatmap(confusion_matrix(self.y_test, y_pred_test), annot=True, fmt="2.0f")
                plt.title(f'{key} test')
                plt.show()
            display(Markdown('#### Train classification report'))
            print(classification_report(self.y_train, y_pred_train))
            display(Markdown('#### Test classification report'))
            print(classification_report(self.y_test, y_pred_test))
    def auto_ml(self, n_folds: int = 5, data: Literal['train', 'test'] = 'train') -> List:
        """
        Perform automated machine learning with cross-validation on a list of classification models.

        Args:
            n_folds (int): Number of cross-validation folds. Defaults to 5.
            data (Literal['train', 'test']): Target dataset for cross-validation. Must be either 'train' or 'test'. Defaults to 'train'.

        Returns:
            List: List of best-fit classification models for each algorithm.

        Example:
            >>> from edamame.classifier import TrainClassifier
            >>> classifier = TrainClassifier(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
            >>> model_list = classifier.auto_ml()
        """
        kfold = KFold(n_splits=n_folds)
        cv_mean = []
        score = []
        std = []
        classifier = ["Logistic", "Gaussian NB", "KNN", "Tree", "Random forest", "Xgboost", "SVM"]

        def tuned_models():
            # fresh estimators configured with the best hyperparameters found so far
            return [LogisticRegression(),
                    GaussianNB(),
                    KNeighborsClassifier(n_neighbors=self.__knn_fit.n_neighbors),
                    DecisionTreeClassifier(ccp_alpha=self.__tree_fit.ccp_alpha, min_impurity_decrease=self.__tree_fit.min_impurity_decrease),
                    RandomForestClassifier(n_estimators=self.__random_forest_fit.n_estimators, warm_start=True, n_jobs=-1),
                    XGBClassifier(n_estimators=self.__xgb_fit.n_estimators),
                    SVC(kernel=self.__svm_fit.kernel)]

        try:
            model_list = tuned_models()
        except AttributeError:
            # untrained models are still empty dicts: find the best hyperparameters first
            self.logistic()
            self.gaussian_nb()
            self.knn()
            self.tree()
            self.random_forest()
            self.xgboost()
            self.svm()
            model_list = tuned_models()
        # cross-validation loop
        for model in model_list:
            if data == 'train':
                cv_result = cross_val_score(model, self.X_train, self.y_train.squeeze(), cv=kfold, scoring="accuracy")
            elif data == 'test':
                cv_result = cross_val_score(model, self.X_test, self.y_test.squeeze(), cv=kfold, scoring="accuracy")
            else:
                raise ValueError("insert a valid target dataset ('train' or 'test')")
            cv_mean.append(cv_result.mean())
            std.append(cv_result.std())
            score.append(cv_result)
        # dataframe for results
        df_kfold_result = pd.DataFrame({"CV Mean": cv_mean, "Std": std}, index=classifier)
        # display step
        display(Markdown(f'### Metrics results on {data} set:'))
        display(df_kfold_result)
        # boxplot of the cross-validation accuracy scores
        box = pd.DataFrame(score, index=classifier)
        plt.figure(figsize=(10, 8))
        box.T.boxplot()
        plt.show()
        return [self.__logistic_fit, self.__gaussian_nb_fit, self.__knn_fit, self.__tree_fit, self.__random_forest_fit, self.__xgb_fit, self.__svm_fit]
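    # Note on auto_ml() above: KFold(n_splits=n_folds) keeps rows in their
    # original order (shuffle=False is the sklearn default). If the target is
    # sorted or grouped, a shuffled variant such as
    # KFold(n_splits=n_folds, shuffle=True, random_state=42) (a suggested
    # alternative, not what the library ships) may give more representative folds.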
    def save_model(self, model_name: Literal["all", "logistic", "gaussian_nb", "knn", "tree", "random_forest", "xgboost", "svm"] = 'all') -> None:
        """
        Saves the specified machine learning model or all models in the instance to a pickle file.

        Args:
            model_name (Literal["all", "logistic", "gaussian_nb", "knn", "tree", "random_forest", "xgboost", "svm"]): The name of the model to save. Defaults to 'all'.

        Returns:
            None

        Example:
            >>> from edamame.classifier import TrainClassifier
            >>> classifier = TrainClassifier(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)
            >>> model_list = classifier.auto_ml()
            >>> classifier.save_model(model_name="all")
        """
        model_dct = {'logistic': 0, 'gaussian_nb': 1, 'knn': 2, 'tree': 3, 'random_forest': 4, 'xgboost': 5, 'svm': 6}
        model_list = [self.__logistic_fit, self.__gaussian_nb_fit, self.__knn_fit, self.__tree_fit, self.__random_forest_fit, self.__xgb_fit, self.__svm_fit]
        # save every trained model or only the requested one
        keys = list(model_dct) if model_name == 'all' else [model_name]
        for key in keys:
            model = model_list[model_dct[key]]
            # an untrained model is still the empty dict set in __init__
            if isinstance(model, dict):
                display(f'unable to save {key} model')
                continue
            filename = f'{key}.pkl'
            with open(filename, 'wb') as file:
                pickle.dump(model, file)
            display(f'{filename} saved')
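# Usage sketch (illustrative, not executed on import): a model written by
# TrainClassifier.save_model() can be loaded back with the standard pickle
# API; file names follow the f'{model_name}.pkl' pattern used above.
#   >>> import pickle
#   >>> with open('xgboost.pkl', 'rb') as file:
#   ...     xgb_model = pickle.load(file)
#   >>> y_pred = xgb_model.predict(X_test)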
def classifier_metrics(model: Union[LogisticRegression, GaussianNB, KNeighborsClassifier, DecisionTreeClassifier, RandomForestClassifier, XGBClassifier, SVC], X: pd.DataFrame, y: pd.DataFrame, cm: bool = False) -> None:
    """
    Display classification metrics (confusion matrix and classification report) for the model passed as input to the function.

    Args:
        model (Union[LogisticRegression, GaussianNB, KNeighborsClassifier, DecisionTreeClassifier, RandomForestClassifier, XGBClassifier, SVC]): Classification model.
        X (pd.DataFrame): Input features.
        y (pd.DataFrame): Target feature.
        cm (bool): Whether to display the confusion matrix. Defaults to False.

    Returns:
        None
    """
    dataframe_review(X)
    dummy_control(X)
    y_pred = model.predict(X)
    display(Markdown('#### Model metrics:'))
    if cm:
        sns.heatmap(confusion_matrix(y, y_pred), annot=True, fmt="2.0f")
        plt.show()
    print(classification_report(y, y_pred))
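# Usage sketch (illustrative): classifier_metrics() pairs naturally with a
# model reloaded from disk; X_test and y_test are assumed to be prepared as in
# the TrainClassifier example above.
#   >>> from edamame.classifier.classification import classifier_metrics
#   >>> classifier_metrics(model=xgb_model, X=X_test, y=y_test, cm=True)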