Datasets¶
How to work with datasets used for training and testing models.
Imports¶
In [1]:
Copied!
from typing import Callable, Optional, Tuple

import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from typing import Callable, Tuple
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
Custom Dataset Classes¶
From the docs:
"All datasets that represent a map from keys to data samples should subclass it. All subclasses should overwrite `__getitem__`, supporting fetching a data sample for a given key. Subclasses could also optionally overwrite `__len__`, which is expected to return the size of the dataset by many `Sampler` implementations and the default options of `DataLoader`."
See the docs for more and for subclasses of Dataset
- e.g. IterableDataset
.
In [2]:
Copied!
# A transform maps one (X, y) sample to another (X, y) sample.
Transformer = Callable[[Tuple[float, float]], Tuple[float, float]]


class MyRandomRegressionDataset(Dataset):
    """Map-style dataset of random (X, y) regression pairs.

    Samples follow y = 0.5 * X + eps with X ~ N(0, 1) and
    eps ~ N(0, 0.75), so y also has unit variance.

    Parameters
    ----------
    n_samples : int
        Number of (X, y) pairs to generate.
    transform : Optional[Transformer]
        Optional callable applied to each sample at load time.
    """

    def __init__(self, n_samples: int, transform: Optional[Transformer] = None):
        self.n_samples = n_samples
        self.transform = transform
        X = np.random.normal(0, 1, n_samples)
        # sqrt(0.75) scales the noise so Var(y) = 0.25 + 0.75 = 1.
        y = 0.5 * X + np.sqrt(0.75) * np.random.normal(0, 1, n_samples)
        self.X = torch.from_numpy(X)
        self.y = torch.from_numpy(y)

    def __len__(self) -> int:
        """Return the number of samples (used by Sampler/DataLoader)."""
        return self.n_samples

    def __getitem__(self, idx) -> Tuple[float, float]:
        """Fetch the (X, y) pair at `idx`, transformed if a transform is set."""
        # Build the sample once; the original duplicated this tuple
        # construction in the else branch.
        sample = (self.X[idx], self.y[idx])
        if self.transform:
            return self.transform(sample)
        return sample
# Instantiate the dataset and sanity-check __len__ and __getitem__.
regression_data = MyRandomRegressionDataset(100)
print(f"regression_data size = {len(regression_data)}")
print(f"regression_data[5] = {regression_data[5]}")
# Signature of a sample-level transform: (X, y) -> (X, y).
Transformer = Callable[[Tuple[float, float]], Tuple[float, float]]


class MyRandomRegressionDataset(Dataset):
    """Synthetic regression data: y = 0.5 * x + Gaussian noise.

    Each sample is an (X, y) tensor pair; an optional transform is
    applied per sample when items are fetched.
    """

    def __init__(self, n_samples: int, transform: Transformer = None):
        self.n_samples = n_samples
        self.transform = transform
        # Two independent standard-normal draws: features, then noise.
        features = np.random.normal(0, 1, n_samples)
        noise = np.random.normal(0, 1, n_samples)
        targets = 0.5 * features + np.sqrt(0.75) * noise
        self.X = torch.from_numpy(features)
        self.y = torch.from_numpy(targets)

    def __len__(self) -> int:
        """Dataset size, consumed by samplers and DataLoader defaults."""
        return self.n_samples

    def __getitem__(self, idx) -> Tuple[float, float]:
        """Return the sample at `idx`, passed through the transform if any."""
        pair = (self.X[idx], self.y[idx])
        return self.transform(pair) if self.transform else pair
# Build a 100-sample dataset and exercise len() and indexing.
regression_data = MyRandomRegressionDataset(100)
print(f"regression_data size = {len(regression_data)}")
print(f"regression_data[5] = {regression_data[5]}")
regression_data size = 100 regression_data[5] = (tensor(-0.5618, dtype=torch.float64), tensor(-0.5182, dtype=torch.float64))
Transformers¶
Data transformations can be handled at load-time on a sample-by-sample basis.
In [3]:
Copied!
class ScalarTransform:
    """Scale both elements of an (X, y) sample by a fixed multiplier."""

    def __init__(self, multiplier: float):
        self.multiplier = multiplier

    def __call__(self, sample: Tuple[float, float]) -> Tuple[float, float]:
        """Return the sample with X and y each multiplied by the factor."""
        scaled = tuple(self.multiplier * component for component in sample)
        return scaled
# Same random dataset, but every sample is doubled at load time.
regression_data_scaled = MyRandomRegressionDataset(100, transform=ScalarTransform(2))
print(f"regression_data_scaled[5] = {regression_data_scaled[5]}")
class ScalarTransform:
    """Callable transform that multiplies each sample component by a constant."""

    def __init__(self, multiplier: float):
        self.multiplier = multiplier

    def __call__(self, sample: Tuple[float, float]) -> Tuple[float, float]:
        """Apply the multiplier to both X and y of the given sample."""
        first, second = sample
        return (self.multiplier * first, self.multiplier * second)
# Dataset whose samples are scaled by 2 via the load-time transform.
regression_data_scaled = MyRandomRegressionDataset(100, transform=ScalarTransform(2))
print(f"regression_data_scaled[5] = {regression_data_scaled[5]}")
regression_data_scaled[5] = (tensor(2.3606, dtype=torch.float64), tensor(-1.7589, dtype=torch.float64))