import cv2
from sklearn.model_selection import train_test_split
import numpy as np
from typing import List, Tuple
[docs]
class BuildDataset:
"""
Builds a dataset by merging multiple datasets with the given parameters.
Attributes:
path (List[str]): List of paths to the numpy files containing the datasets.
label (List[int]): List of labels corresponding to each dataset.
size (tuple): Tuple specifying the size of the images in the dataset. Defaults to None.
color (bool): Whether the images are in color or grayscale. Defaults to True.
split (bool): Whether to split the dataset into train and test sets. Defaults to True.
perc (float): The percentage of data to use for the test set. Defaults to 0.1.
Examples:
>>> import xeye
>>> # list of directory (paths for the .npz files)
>>> path = ['batch_1.npz','batch_2.npz', 'batch_3.npz']
>>> # list of labels associated with the images inside the .npz files
>>> label = [0,1,2]
>>> data = xeye.BuildDataset(path=path, label=label, size = None, color=True, split=True, perc=0.2)
>>> data.build()
"""
def __init__(self, path: List[str], label: List[int], size: Tuple = None, color: bool = True, split: bool = True, perc: float = 0.1) -> None:
self.path = path
self.label = label
self.size = size
self.color = color
self.split = split
self.perc = perc
self.height = 0
self.width = 0
self._tensor = {}
self._temp_tensor = []
def _control(self) -> None:
"""
Checks if the datasets to merge have the same colour space.
Raises:
ValueError: If the datasets have different color spaces.
Returns:
None
Notes:
* This method extracts the height, width, and color channels of each image in the dataset using numpy.
* It then checks if the images have the same color space by comparing the color channel values for each image.
* If the sizes of the images in the dataset are not specified in the instance variables, it sets the maximum height and width of all the images as the dataset size.
"""
height = []
width = []
color_ch = []
for i in range(len(self.path)):
data = np.load(f'{self.path[i]}')
x = data['x']
height.append(x.shape[1]) # height
width.append(x.shape[2]) # width
try:
color_ch.append(x.shape[3]) # color
except:
color_ch.append(1)
# Size control
if self.size == None:
self.size = []
self.size.append(max(height))
self.size.append(max(width))
self.size = tuple(self.size)
else:
pass
# Color channels control
if len(set(color_ch)) != 1:
raise ValueError("Datasets with different colour spaces...used datasets with the same colour spaces for the images")
else:
pass
[docs]
def build(self) -> None:
"""
Builds a new dataset by merging the datasets with the parameters indicated by the instance variables.
Raises:
ValueError: If the path and label lists do not have the same length.
ValueError: If the datasets being merged have different color spaces.
Returns:
None
Note:
The method calls the `_control` method to check if the datasets being merged have the same color space. The resulting merged dataset is stored as a numpy array in `_tensor['X']` and `_tensor['y']`. If the `split` instance variable is set to `True`, the merged dataset is split into training and testing sets using the `train_test_split` method from scikit-learn and saved as a numpy array in the file 'dataset.npz'. Otherwise, the merged dataset is saved as a numpy array in the file 'datasetall.npz'.
"""
# control method calling
self._control()
# control if the path and label lists have the same length
if len(self.path) != len(self.label):
raise ValueError("Path and label lists doesn't have the same length...")
# Create the tensor X
if self.color == True:
self._tensor['X'] = np.empty((0,self.size[0],self.size[1],3)).astype('uint8')
else:
self._tensor['X'] = np.empty((0,self.size[0],self.size[1])).astype('uint8')
# array for label y
self._tensor['y'] = np.empty((0))
# loop
for i in range(len(self.path)):
data = np.load(f'{self.path[i]}')
x = data['x']
if self.color == True:
self._temp_tensor = np.zeros((x.shape[0],self.size[0],self.size[1],3)).astype('uint8')
else:
self._temp_tensor = np.zeros((x.shape[0],self.size[0],self.size[1])).astype('uint8')
# inner loop for resizing the images
for j in range(x.shape[0]):
new_img = cv2.resize(x[j], (self.size[1],self.size[0])) # (width, height)
self._temp_tensor[j] = new_img
# save the resized images
self._tensor['X'] = np.concatenate((self._tensor['X'],self._temp_tensor),axis=0)
self._tensor['y'] = np.append(self._tensor['y'], np.repeat(self.label[i], x.shape[0], axis = 0))
# create the dataset
if self.split == True:
self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self._tensor['X'], self._tensor['y'], test_size=self.perc, random_state=123)
np.savez('dataset.npz', X_train=self.X_train, X_test=self.X_test, y_train=self.y_train, y_test=self.y_test)
else:
np.savez('datasetall.npz', x = self._tensor['X'], y = self._tensor['y'])