# Source code for edamame.eda.tools

import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pandas as pd
from typing import Tuple
import numpy as np


def load_model(path: str):
    """
    Load a model previously saved in pickle format.

    Args:
        path (str): Path to the model saved in .pkl

    Returns:
        The unpickled model object.
    """
    # NOTE(review): pickle.load executes arbitrary code during deserialization —
    # only load files from trusted sources.
    with open(path, 'rb') as fh:
        return pickle.load(fh)
def setup(X: pd.DataFrame, y: pd.DataFrame, dummy: bool = False, seed: int = 42, size: float = 0.25) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Split the data into train/test partitions and encode categorical columns.

    Args:
        X (pd.DataFrame): The model matrix X (features matrix).
        y (pd.DataFrame): The target variable.
        dummy (bool): If False, the function produces the OHE. If True, the dummy encoding.
        seed (int): Random seed to apply at the train_test_split function.
        size (float): Size of the test dataset.

    Return:
        Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: X_train, y_train, X_test, y_test.

    Example:
        >>> import edamame.eda as eda
        >>> df = pd.DataFrame({'category': ['A', 'B', 'A', 'B', 'C', 'A'], 'value': [1, 2, 3, 4, 4, 5], 'target': ['A2', 'A2', 'B2', 'B2', 'A2', 'B2']})
        >>> X, y = eda.split_and_scaling(df, 'target')
        >>> X_train, y_train, X_test, y_test = eda.setup(X, y)
    """
    # split dataset in train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=size, random_state=seed)
    X_train = pd.get_dummies(data=X_train, drop_first=dummy)
    X_test = pd.get_dummies(data=X_test, drop_first=dummy)
    # Bug fix: encoding train and test independently can produce mismatched
    # column sets when a category level appears in only one of the two splits.
    # Align the test matrix to the train columns; levels unseen in the test
    # split become all-zero columns, and levels unseen in train are dropped.
    X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
    return X_train, y_train, X_test, y_test
def scaling(X: pd.DataFrame, minmaxscaler: bool = False) -> pd.DataFrame:
    """
    Return a normalised/standardized copy of the matrix.

    Args:
        X (pd.DataFrame): The model matrix X/X_train/X_test.
        minmaxscaler (bool): Select the type of scaling to apply to the numerical
            columns. By default the StandardScaler is used. If minmaxscaler is set
            to True the numerical columns are transformed to the [0,1] range.

    Return:
        pd.DataFrame

    Example:
        >>> import edamame.eda as eda
        >>> X = pd.DataFrame({'category': ['A', 'B', 'A', 'B', 'C', 'A'], 'value': [1, 2, 3, 4, 4, 5]})
        >>> X = eda.scaling(X)
    """
    # dataframe check
    dataframe_review(X)
    # Bug fix: operate on a copy so the caller's DataFrame is not silently
    # mutated in place (the function also returns the scaled frame, which is
    # the documented way to consume the result).
    X = X.copy()
    # scaling quantitative (non-object) variables only
    quant_columns = list(X.dtypes[X.dtypes != 'object'].index)
    scaler = MinMaxScaler() if minmaxscaler else StandardScaler()
    X[quant_columns] = scaler.fit_transform(X[quant_columns])
    return X
def ohe(array: np.ndarray) -> np.ndarray:
    """
    Convert a NumPy array that represents the categorical label of the target
    variable and transform it using one-hot encoding.

    Generalized: labels no longer need to be contiguous integers 0..k-1; any
    label values (ints, strings, ...) are mapped to columns in sorted-unique
    order, which matches the original behaviour for 0..k-1 integer labels.

    Args:
        array (np.ndarray): The target variables passed in input.

    Return:
        np.ndarray: The one-hot encoded NumPy array.

    Example:
        >>> import edamame.eda as eda
        >>> from sklearn import datasets
        >>> iris = datasets.load_iris()
        >>> y = iris.target
        >>> y_ohe = eda.ohe(y)
    """
    # np.unique returns the sorted unique labels plus, for each element, its
    # index into that sorted list; for labels already equal to 0..k-1 the
    # inverse equals the input, so the original use case is unchanged.
    classes, inverse = np.unique(array, return_inverse=True)
    return np.eye(len(classes))[inverse]
def dummy_control(data: pd.DataFrame) -> None:
    """
    Check that the DataFrame passed in input is already dummy/OHE encoded,
    i.e. that it contains no object-dtype (categorical string) columns.

    Args:
        data (pd.Dataframe): A pandas DataFrame passed in input.

    Raises:
        TypeError: If the input DataFrame contains non-numerical columns.

    Returns:
        None
    """
    object_cols = data.dtypes[data.dtypes == 'object']
    if len(object_cols):
        raise TypeError('dataframe with non-numerical columns')
def dataframe_review(data: pd.DataFrame) -> None:
    """
    Check that the object passed is a pandas DataFrame.

    Args:
        data (pd.Dataframe): A pandas DataFrame passed in input.

    Raises:
        TypeError: If the object passed is not a pandas DataFrame.

    Returns:
        None
    """
    # isinstance is the idiomatic type check and, unlike the original
    # class-name string comparison, also accepts DataFrame subclasses.
    if not isinstance(data, pd.DataFrame):
        raise TypeError('The data loaded is not a DataFrame')