# import all libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
data = pd.read_excel("./Multiclass-dataset.xlsx")
data.head()
| P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | ... | P12 | P13 | P14 | P15 | P16 | P17 | P18 | P19 | P20 | Target Label | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3.30 | 7.44 | 1.52 | 3.27 | 0.07 | 2.14 | 0.75 | 0.66 | 0.0 | 54.8 | ... | 50.7 | 6.55 | 4.09 | 4.26 | 0.01 | 0.00 | 24.7 | 2.7 | 1.6 | V1 |
| 1 | 3.43 | 7.63 | 1.63 | 3.27 | 0.05 | 2.01 | 0.74 | 0.65 | 0.0 | 51.8 | ... | 47.9 | 8.35 | 5.08 | 5.01 | 0.01 | 0.00 | 23.3 | 2.3 | 1.8 | V1 |
| 2 | 3.41 | 7.32 | 1.52 | 3.18 | 0.07 | 2.09 | 0.80 | 0.70 | 0.0 | 54.0 | ... | 54.4 | 9.27 | 6.85 | 7.14 | 0.19 | 0.06 | 25.0 | 2.5 | -0.9 | V1 |
| 3 | 3.78 | 7.85 | 1.69 | 3.35 | 0.03 | 1.98 | 0.77 | 0.67 | 0.0 | 57.7 | ... | 48.9 | 10.26 | 5.96 | 5.47 | 0.05 | 0.01 | 24.1 | 5.6 | 2.1 | V1 |
| 4 | 3.90 | 7.99 | 1.61 | 3.43 | 0.02 | 2.14 | 0.77 | 0.71 | 0.0 | 59.1 | ... | 54.1 | 8.19 | 5.81 | 4.72 | 0.64 | 0.16 | 26.8 | 2.5 | 2.1 | V1 |
5 rows × 21 columns
X = data.iloc[:,0:20]
X
| P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | P15 | P16 | P17 | P18 | P19 | P20 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 3.30 | 7.44 | 1.52 | 3.27 | 0.07 | 2.14 | 0.75 | 0.66 | 0.0 | 54.8 | 49.7 | 50.7 | 6.55 | 4.09 | 4.26 | 0.01 | 0.00 | 24.7 | 2.7 | 1.6 |
| 1 | 3.43 | 7.63 | 1.63 | 3.27 | 0.05 | 2.01 | 0.74 | 0.65 | 0.0 | 51.8 | 47.3 | 47.9 | 8.35 | 5.08 | 5.01 | 0.01 | 0.00 | 23.3 | 2.3 | 1.8 |
| 2 | 3.41 | 7.32 | 1.52 | 3.18 | 0.07 | 2.09 | 0.80 | 0.70 | 0.0 | 54.0 | 50.5 | 54.4 | 9.27 | 6.85 | 7.14 | 0.19 | 0.06 | 25.0 | 2.5 | -0.9 |
| 3 | 3.78 | 7.85 | 1.69 | 3.35 | 0.03 | 1.98 | 0.77 | 0.67 | 0.0 | 57.7 | 47.2 | 48.9 | 10.26 | 5.96 | 5.47 | 0.05 | 0.01 | 24.1 | 5.6 | 2.1 |
| 4 | 3.90 | 7.99 | 1.61 | 3.43 | 0.02 | 2.14 | 0.77 | 0.71 | 0.0 | 59.1 | 54.1 | 54.1 | 8.19 | 5.81 | 4.72 | 0.64 | 0.16 | 26.8 | 2.5 | 2.1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 395 | 3.78 | 7.63 | 1.89 | 2.91 | 0.07 | 1.54 | 0.82 | 0.69 | 0.0 | 178.1 | 173.0 | 147.0 | 7.78 | 10.44 | 11.53 | 0.02 | 0.01 | 73.2 | 2.9 | 10.7 |
| 396 | 3.94 | 7.75 | 1.78 | 3.22 | 0.05 | 1.81 | 0.83 | 0.69 | 0.0 | 182.2 | 183.5 | 165.6 | 12.58 | 13.42 | 12.30 | 0.01 | 0.00 | 76.3 | 2.2 | 5.2 |
| 397 | 3.43 | 7.39 | 1.63 | 3.11 | 0.05 | 1.91 | 0.79 | 0.68 | 0.0 | 171.8 | 168.4 | 148.6 | 12.81 | 14.65 | 14.40 | 0.01 | 0.00 | 71.6 | 2.9 | 7.6 |
| 398 | 3.96 | 7.80 | 1.88 | 3.03 | 0.06 | 1.61 | 0.82 | 0.70 | 0.0 | 175.0 | 168.9 | 144.1 | 12.03 | 13.00 | 11.39 | 0.00 | 0.00 | 71.9 | 3.2 | 10.4 |
| 399 | 4.17 | 8.02 | 1.86 | 3.27 | 0.06 | 1.75 | 0.82 | 0.69 | 0.0 | 171.7 | 169.5 | 152.8 | 14.24 | 16.01 | 14.09 | 0.09 | 0.02 | 72.0 | 2.9 | 5.8 |
400 rows × 20 columns
y = data[['Target Label']]
y
| Target Label | |
|---|---|
| 0 | V1 |
| 1 | V1 |
| 2 | V1 |
| 3 | V1 |
| 4 | V1 |
| ... | ... |
| 395 | V10 |
| 396 | V10 |
| 397 | V10 |
| 398 | V10 |
| 399 | V10 |
400 rows × 1 columns
y['Target Label'] = pd.factorize(y['Target Label'])[0]
<ipython-input-98-cb5e4653149a>:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy y['Target Label'] = pd.factorize(y['Target Label'])[0]
data.isnull().sum()
P1 0 P2 0 P3 0 P4 0 P5 0 P6 0 P7 0 P8 0 P9 0 P10 0 P11 0 P12 0 P13 0 P14 0 P15 0 P16 0 P17 0 P18 0 P19 0 P20 0 Target Label 0 dtype: int64
data.shape
(400, 21)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train
| P1 | P2 | P3 | P4 | P5 | P6 | P7 | P8 | P9 | P10 | P11 | P12 | P13 | P14 | P15 | P16 | P17 | P18 | P19 | P20 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 93 | 3.47 | 7.56 | 1.73 | 3.04 | 0.05 | 1.75 | 0.76 | 0.66 | 0.0 | 61.2 | 54.7 | 55.9 | 10.02 | 6.63 | 5.94 | 0.00 | 0.00 | 27.3 | 3.5 | 1.5 |
| 23 | 3.26 | 7.37 | 1.52 | 3.18 | 0.08 | 2.09 | 0.75 | 0.67 | 0.0 | 59.4 | 54.1 | 54.4 | 7.46 | 6.47 | 6.08 | 0.50 | 0.15 | 26.9 | 2.7 | 2.0 |
| 299 | 3.55 | 7.54 | 1.61 | 3.18 | 0.05 | 1.98 | 0.79 | 0.69 | 0.0 | 91.4 | 59.5 | 53.1 | 15.25 | 8.95 | 7.46 | 1.79 | 0.51 | 32.9 | 14.5 | 10.9 |
| 13 | 3.35 | 7.42 | 1.52 | 3.18 | 0.05 | 2.09 | 0.76 | 0.69 | 0.0 | 49.7 | 47.5 | 48.9 | 6.55 | 5.40 | 5.23 | 0.09 | 0.03 | 23.1 | 1.3 | 0.8 |
| 90 | 3.65 | 7.49 | 1.61 | 3.18 | 0.07 | 1.98 | 0.82 | 0.71 | 0.0 | 54.7 | 51.9 | 54.4 | 7.74 | 6.20 | 6.05 | 0.00 | 0.00 | 25.5 | 1.9 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 255 | 3.03 | 6.91 | 1.44 | 3.01 | 0.04 | 2.09 | 0.80 | 0.70 | 0.0 | 114.6 | 68.5 | 50.3 | 11.31 | 7.24 | 6.11 | 2.67 | 0.88 | 38.7 | 19.4 | 20.2 |
| 72 | 3.90 | 7.95 | 1.61 | 3.43 | 0.06 | 2.14 | 0.78 | 0.71 | 0.0 | 171.4 | 157.6 | 128.1 | 11.83 | 13.43 | 13.12 | 0.04 | 0.01 | 68.6 | 5.1 | 14.6 |
| 396 | 3.94 | 7.75 | 1.78 | 3.22 | 0.05 | 1.81 | 0.83 | 0.69 | 0.0 | 182.2 | 183.5 | 165.6 | 12.58 | 13.42 | 12.30 | 0.01 | 0.00 | 76.3 | 2.2 | 5.2 |
| 235 | 3.81 | 7.97 | 1.72 | 3.28 | 0.06 | 1.91 | 0.76 | 0.68 | 0.0 | 88.2 | 55.8 | 46.9 | 20.03 | 12.10 | 9.66 | 0.00 | 0.00 | 31.2 | 14.7 | 13.1 |
| 37 | 3.60 | 7.63 | 1.82 | 2.96 | 0.08 | 1.63 | 0.78 | 0.67 | 0.0 | 52.3 | 50.2 | 50.2 | 9.66 | 7.32 | 6.94 | 0.32 | 0.09 | 24.4 | 1.0 | 1.8 |
320 rows × 20 columns
X_train.columns
Index(['P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'P9', 'P10', 'P11',
'P12', 'P13', 'P14', 'P15', 'P16', 'P17', 'P18', 'P19', 'P20'],
dtype='object')
print(X_train.describe())
P1 P2 P3 P4 P5 P6 \
count 320.000000 320.000000 320.000000 320.000000 320.000000 320.000000
mean 3.444937 7.408656 1.594406 3.127719 0.056750 1.973719
std 0.391488 0.415670 0.136553 0.195338 0.014667 0.191690
min 2.500000 6.330000 1.250000 2.600000 0.030000 1.220000
25% 3.167500 7.100000 1.520000 3.010000 0.040000 1.870000
50% 3.440000 7.420000 1.610000 3.135000 0.060000 1.980000
75% 3.700000 7.685000 1.660000 3.270000 0.070000 2.090000
max 4.720000 8.630000 2.210000 3.690000 0.120000 2.580000
P7 P8 P9 P10 P11 P12 \
count 320.000000 320.000000 320.000000 320.000000 320.000000 320.000000
mean 0.787250 0.690500 0.023031 103.526250 81.046250 70.108125
std 0.026978 0.018205 0.098017 40.757266 44.489461 36.350503
min 0.670000 0.640000 0.000000 43.300000 42.600000 42.100000
25% 0.770000 0.680000 0.000000 75.900000 53.600000 49.800000
50% 0.790000 0.690000 0.000000 93.300000 59.550000 53.100000
75% 0.810000 0.700000 0.000000 119.475000 77.025000 60.800000
max 0.870000 0.760000 0.600000 183.200000 183.500000 165.600000
P13 P14 P15 P16 P17 P18 \
count 320.000000 320.000000 320.000000 320.000000 320.000000 320.000000
mean 13.663906 10.147656 8.408312 0.532406 0.156781 39.863750
std 3.950322 3.302641 3.308026 1.025690 0.303630 16.734755
min 5.700000 4.090000 3.800000 0.000000 0.000000 20.200000
25% 10.815000 7.560000 6.057500 0.000000 0.000000 28.800000
50% 13.650000 9.920000 7.335000 0.010000 0.000000 33.100000
75% 16.075000 12.415000 9.825000 0.270000 0.082500 41.725000
max 24.740000 22.180000 20.530000 3.730000 0.990000 76.300000
P19 P20
count 320.000000 320.000000
mean 10.072813 10.469688
std 5.917159 6.097812
min 0.100000 -0.900000
25% 3.800000 6.400000
50% 11.350000 10.800000
75% 15.325000 14.050000
max 21.800000 26.400000
y_train.describe()
| Target Label | |
|---|---|
| count | 320.000000 |
| mean | 4.403125 |
| std | 2.867501 |
| min | 0.000000 |
| 25% | 2.000000 |
| 50% | 4.000000 |
| 75% | 7.000000 |
| max | 9.000000 |
y_train.head()
| Target Label | |
|---|---|
| 93 | 2 |
| 23 | 0 |
| 299 | 7 |
| 13 | 0 |
| 90 | 2 |
scaling=StandardScaler()
scaling.fit(X_train)
Scaled_data=scaling.transform(X_train)
principal=PCA(n_components=3)
principal.fit(Scaled_data)
x=principal.transform(Scaled_data)
print(x.shape)
(320, 3)
y['Target Label']
0 0
1 0
2 0
3 0
4 0
..
395 9
396 9
397 9
398 9
399 9
Name: Target Label, Length: 400, dtype: int64
principal.components_
array([[ 2.53461730e-01, 2.11796403e-01, 2.24410473e-01,
1.17530835e-01, -2.05623020e-02, -1.12610573e-01,
1.42759661e-01, 9.32870155e-02, -6.60062387e-02,
3.57311566e-01, 3.72636250e-01, 3.67433012e-01,
-1.17006310e-02, 2.92132738e-01, 3.47494061e-01,
-3.43828681e-02, -4.21908601e-02, 3.71606307e-01,
-1.52468751e-01, 9.00843286e-02],
[-1.05808254e-01, -1.68413902e-01, -1.24655363e-04,
-2.18960887e-01, 3.57130523e-02, -1.38352480e-01,
1.99793362e-01, 6.77187983e-02, -9.46563937e-02,
1.32500716e-01, -1.55627219e-02, -8.39669464e-02,
3.36260031e-01, 2.25069217e-01, -1.67727086e-02,
3.76919365e-01, 3.87645351e-01, 4.46358734e-02,
4.02011856e-01, 4.40246759e-01],
[-1.71956279e-01, -2.77933150e-01, 2.05352630e-01,
-4.85272923e-01, 3.07789429e-01, -4.77927155e-01,
3.39640637e-01, -2.30707871e-01, 2.95676688e-02,
-3.12664378e-02, 1.75871413e-02, 4.42185780e-02,
-4.90747874e-02, -2.45163183e-02, 4.28856296e-02,
-2.00927659e-01, -1.85685973e-01, -6.62953910e-04,
-1.31392260e-01, -1.50710145e-01]])
plt.figure(figsize=(10,10))
plt.scatter(x[:,0],x[:,1],c=y_train['Target Label'],cmap='plasma')
plt.xlabel('pc1')
plt.ylabel('pc2')
Text(0, 0.5, 'pc2')
# immport relevant libraries for 3d graph
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure(figsize=(10,10))
# choose projection 3d for creating a 3d graph
axis = fig.add_subplot(111, projection='3d')
# x[:,0]is pc1,x[:,1] is pc2 while x[:,2] is pc3
axis.scatter(x[:,0],x[:,1],x[:,2], c=y_train['Target Label'],cmap='plasma')
axis.set_xlabel("PC1", fontsize=10)
axis.set_ylabel("PC2", fontsize=10)
axis.set_zlabel("PC3", fontsize=10)
Text(0.5, 0, 'PC3')
print(principal.explained_variance_ratio_)
[0.31884557 0.17022365 0.12875232]
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(max_depth=2, random_state=0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
<ipython-input-138-4b9b744ebb97>:4: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel(). classifier.fit(X_train, y_train)
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
print('\n')
print(f'Accuracy is {accuracy_score(y_test, y_pred)}')
[[6 0 1 0 0 0 0 0 0 0] [0 6 0 0 0 0 0 0 0 0] [0 0 8 1 0 0 0 0 0 0] [0 0 1 2 0 0 0 0 1 0] [0 0 0 7 0 0 0 0 0 0] [0 0 2 8 2 0 0 0 0 0] [0 0 0 0 0 0 7 1 0 0] [0 0 0 0 0 0 1 7 0 0] [0 0 0 2 0 0 0 0 6 0] [0 6 0 0 0 0 0 0 0 5]] Accuracy is 0.5875
print(f'Accuracy is {accuracy_score(y_test, y_pred)}')
Accuracy is 0.5875