
# ---------------------------------------------------------------------------
# Library version check-in: print the version of every third-party package
# this analysis depends on, so results can be reproduced later.
# (PEP 8: one import per statement, stdlib separated from third-party.)
# ---------------------------------------------------------------------------
import sys

import matplotlib
import numpy
import pandas as pd
import scipy
import seaborn as sns
import sklearn

print('Python: {}'.format(sys.version))
print('scipy: {}'.format(scipy.__version__))
print('numpy: {}'.format(numpy.__version__))
print('pandas: {}'.format(pd.__version__))
print('sklearn: {}'.format(sklearn.__version__))
print('matplotlib: {}'.format(matplotlib.__version__))
print('Seaborn: {}'.format(sns.__version__))

# Silence every warning so deprecation notices don't clutter the output.
import warnings

warnings.filterwarnings("ignore")
# Loading required Libraries
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
# Load the Iris dataset straight from the UCI repository.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
# Column names for the dataset (the raw CSV has no header row).
names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']
dataset = pd.read_csv(url, names=names)

# Peek at the first rows. FIX: a bare expression displays nothing when this
# file runs as a plain script, so print explicitly.
print(dataset.head(10))

# Dimensions of the dataset.
r, c = dataset.shape
print('This dataset has ',r,' rows and ' ,c,' columns.')

# Class distribution (should be 50 rows per species).
print(dataset.groupby('class').size())

# Statistical summary of every numeric attribute.
print(dataset.describe())
# Univariate plots to better understand each attribute.
# Draw one subplot per attribute: first box-and-whisker plots (spot outliers
# and spread), then histograms (spot the shape of each distribution).
for plot_kind in ('box', 'hist'):
    dataset.plot(
        kind=plot_kind,
        subplots=True,
        layout=(2, 2),
        sharex=False,
        sharey=False,
        figsize=(15, 13),
    )
    plt.show()
# Multivariate plots to better understand the relationships between attributes.
# Pairwise scatter plots of every attribute combination; structured diagonal
# groupings hint at correlated, highly predictive attribute pairs.
figure_size = (15, 10)
scatter_matrix(dataset, figsize=figure_size)
plt.show()
# Tools and techniques used to build and validate the prediction models.
# Hold out 20% of the rows as a validation set; the remaining 80% trains the
# candidate models below.
from sklearn.model_selection import train_test_split

array = dataset.values
X, y = array[:, 0:4], array[:, 4]
test = 0.2   # fraction of rows reserved for the hold-out set
seed = 53    # fixed RNG seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test, random_state=seed
)
# Metric used by every cross-validation run below.
scoring = 'accuracy'
# We use the 'accuracy' metric to evaluate models: the number of correctly
# predicted instances divided by the total number of instances, multiplied by
# 100 to give a percentage (e.g. 95% accurate). The `scoring` variable is used
# when we build and evaluate each model below.
#
# We'll evaluate a good mixture of simple linear (LR and LDA) and nonlinear
# (KNN, CART, NB and SVM) algorithms. Using the same fixed seed for each run
# ensures every algorithm is evaluated on exactly the same data splits, so the
# results are directly comparable.
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
# Candidate classifiers, each paired with a short display label.
models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CRT', DecisionTreeClassifier()),
    ('GNN', GaussianNB()),
    ('SVM', SVC()),
    ('RF', RandomForestClassifier()),
]
# Evaluate each candidate model with 10-fold cross-validation on the training
# split, reporting mean accuracy and its standard deviation.
from sklearn.model_selection import KFold, cross_val_score

results = []
names = []
# BUG FIX: the loop body below was at column 0 in the original, which raises
# an IndentationError — it is now indented under the for statement.
for name, model in models:
    # Fresh KFold with the same fixed seed per model: every model is scored
    # on exactly the same splits, so the results are directly comparable.
    kfold = KFold(n_splits=10, random_state=seed, shuffle=True)
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    print(name, ': ', cv_results.mean(), cv_results.std())
# Feature importance via an ExtraTreesClassifier fitted on the full dataset.
from sklearn.ensemble import ExtraTreesClassifier

model = ExtraTreesClassifier(random_state=53)
X = dataset.iloc[:, 0:4]
# BUG FIX: `iloc[:, -1:]` returned a one-column DataFrame (2-D); sklearn
# expects a 1-D target, so select the last column as a Series instead.
y = dataset.iloc[:, -1]
model.fit(X, y)
# Rank features by importance, highest first.
ETC_feature_importances = pd.DataFrame(
    model.feature_importances_, index=X.columns, columns=['ETC']
).sort_values('ETC', ascending=False)
model = None  # drop the fitted model
# FIX: print explicitly — a bare expression shows nothing in a script.
print(ETC_feature_importances)
# Feature importance via a RandomForestClassifier (same X and y as above).
model = RandomForestClassifier(random_state=53)
model.fit(X, y)
# Rank features by importance, highest first.
RFC_feature_importances = pd.DataFrame(
    model.feature_importances_, index=X.columns, columns=['RFC']
).sort_values('RFC', ascending=False)
model = None  # drop the fitted model
# FIX: print explicitly — a bare expression shows nothing in a script.
# (Iris has only 4 features, so this is the full ranking.)
print(RFC_feature_importances)
# Feature importance via an AdaBoostClassifier (same X and y as above).
from sklearn.ensemble import AdaBoostClassifier

model = AdaBoostClassifier(random_state=53)
model.fit(X, y)
# Rank features by importance, highest first.
ADB_feature_importances = pd.DataFrame(
    model.feature_importances_, index=X.columns, columns=['ADB']
).sort_values('ADB', ascending=False)
model = None  # drop the fitted model
# FIX: print explicitly — a bare expression shows nothing in a script.
print(ADB_feature_importances)
# Feature importance via a GradientBoostingClassifier (same X and y as above).
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(random_state=53)
model.fit(X, y)
# Rank features by importance, highest first.
GBC_feature_importances = pd.DataFrame(
    model.feature_importances_, index=X.columns, columns=['GBC']
).sort_values('GBC', ascending=False)
model = None  # drop the fitted model
# FIX: print explicitly — a bare expression shows nothing in a script.
print(GBC_feature_importances.head(10))
# Univariate feature selection: keep the 3 best features as scored by
# SelectKBest's default score function (f_classif, the ANOVA F-value).
from sklearn.feature_selection import SelectKBest

kbest = SelectKBest(k=3).fit(X, y)
mask = kbest.get_support()   # boolean mask over X's columns
new_features = X.columns[mask]
# FIX: print explicitly — a bare expression shows nothing in a script.
print(new_features)
# Final evaluation: fit the chosen model on the training split and score it
# on the untouched 20% hold-out set.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# BUG FIX: the original named this variable `lda`, but it holds a
# DecisionTreeClassifier, not Linear Discriminant Analysis — renamed so the
# code no longer misleads the reader.
final_model = DecisionTreeClassifier()
final_model.fit(X_train, y_train)
predictions = final_model.predict(X_test)
print(accuracy_score(y_test, predictions))
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
final_model = None  # drop the fitted model