Linear Regression with SGD¶
We explore how one of the most basic machine learning models can be implemented from first principles using PyTorch and Stochastic Gradient Descent (SGD), and then implement the same model using the PyTorch optimiser.
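For reference, a minimal sketch of the model and the gradient-descent update implemented below, where $\eta$ is the learning rate and $L$ is the mean squared error over a mini-batch:

$$y = b + w x + \varepsilon, \qquad \varepsilon \sim \mathcal{N}(0, 1)$$

$$w \leftarrow w - \eta \, \frac{\partial L}{\partial w}, \qquad b \leftarrow b - \eta \, \frac{\partial L}{\partial b}$$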
Imports¶
In [1]:
from typing import Sequence, Tuple
import torch
from torch.utils.data import Dataset, DataLoader
Configuration¶
In [2]:
torch.manual_seed(1)
Out[2]:
<torch._C.Generator at 0x10572e310>
Linear Regression with Stochastic Gradient Descent¶
Start by creating a dataset and dataloader for the task.
In [3]:
class LinearModelData(Dataset):
    """Synthetic data drawn from a noisy linear model."""

    def __init__(self, b: float, w: float):
        self.w = torch.tensor(w)
        self.b = torch.tensor(b)
        self.X = torch.arange(-2, 2, 0.01).view(-1, 1)
        self.y = self.b + self.w * self.X + torch.randn(self.X.size())
        self.len = self.y.shape[0]

    def __getitem__(self, idx: int) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
        return (self.X[idx], self.y[idx])

    def __len__(self) -> int:
        return self.len


data = LinearModelData(b=0, w=1)
print(f"n_samples = {len(data)}")
print(f"data[0] = {data[0]}")

data_loader = DataLoader(dataset=data, batch_size=5)
data_batches = list(data_loader)
print(f"mini_batch[0] = {data_batches[0]}")
n_samples = 400
data[0] = (tensor([-2.]), tensor([-3.5256]))
mini_batch[0] = [tensor([[-2.0000], [-1.9900], [-1.9800], [-1.9700], [-1.9600]]), tensor([[-3.5256], [-2.7402], [-2.6340], [-3.5795], [-2.0602]])]
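Note that this DataLoader yields the mini-batches in a fixed order. For mini-batch SGD it is common to shuffle the data on every pass; a minimal sketch (the name shuffled_loader is illustrative):

# draw the mini-batches in a random order on each pass through the data
shuffled_loader = DataLoader(dataset=data, batch_size=5, shuffle=True)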
Now define the model.
In [4]:
class LinearRegression(torch.nn.Module):
    """Linear regression from first principles using gradient descent."""

    def __init__(self):
        super().__init__()
        self.b = torch.randn(1, requires_grad=True)
        self.w = torch.randn(1, requires_grad=True)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Compute a prediction."""
        y_hat = self.b + self.w * x
        return y_hat


class MSELoss(torch.nn.Module):
    """Mean squared-error loss from first principles."""

    def __init__(self):
        super().__init__()

    def forward(self, y_hat: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        """Compute loss."""
        return torch.mean((y_hat - y) ** 2)


def train_linear_regression(
    model: torch.nn.Module,
    criterion: torch.nn.Module,
    data_loader: DataLoader,
    n_epochs: int,
    learning_rate: float,
) -> Sequence[float]:
    """Train the model over multiple epochs, recording the loss for each."""

    def process_batch(X: torch.Tensor, y: torch.Tensor) -> float:
        y_hat = model.forward(X)
        loss = criterion(y_hat, y)
        loss.backward()
        # manual SGD update followed by resetting the accumulated gradients
        model.w.data -= lr * model.w.grad.data
        model.w.grad.data.zero_()
        model.b.data -= lr * model.b.grad.data
        model.b.grad.data.zero_()
        return loss.detach().numpy().tolist()

    def process_epoch() -> float:
        return [process_batch(X, y) for X, y in data_loader][-1]

    lr = torch.tensor(learning_rate)
    training_run = [process_epoch() for epoch in range(n_epochs)]
    return training_run
Train the model.
In [5]:
lin_reg = LinearRegression()
mse_loss = MSELoss()

print("initial parameters:")
for k, v in lin_reg.state_dict().items():
    print(f" {k}: {v}")

print("\npost-training parameters:")
per_epoch_loss = train_linear_regression(
    lin_reg, mse_loss, data_loader, n_epochs=20, learning_rate=0.05
)
for k, v in lin_reg.state_dict().items():
    print(f" {k}: {v}")

print("\nloss per-epoch:")
per_epoch_loss
initial parameters:

post-training parameters:

loss per-epoch:
Out[5]:
[1.5875461101531982, 1.5779359340667725, 1.5811569690704346, 1.580075740814209, 1.580438256263733, 1.5803166627883911, 1.5803577899932861, 1.58034348487854, 1.580348253250122, 1.5803463459014893, 1.5803474187850952, 1.580346941947937, 1.580346941947937, 1.5803475379943848, 1.580346941947937, 1.5803475379943848, 1.580346941947937, 1.5803475379943848, 1.580346941947937, 1.5803475379943848]
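Note that state_dict() printed nothing above because b and w are plain tensors rather than registered nn.Parameter objects. A minimal sketch for inspecting the fitted values directly (both should end up close to the true w=1 and b=0):

# read the fitted coefficients via .data
print(f"w = {lin_reg.w.data.item():.4f}")
print(f"b = {lin_reg.b.data.item():.4f}")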
Testing the model on unseen data.
In [6]:
test_data = LinearModelData(b=0, w=1)
rmse = torch.sqrt(torch.mean((lin_reg.forward(test_data.X) - test_data.y) ** 2))
rmse
Out[6]:
tensor(1.0321, grad_fn=<SqrtBackward0>)
This is in line with what one would expect, given that the noise term follows a standard Normal distribution.
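Even a perfectly fitted model cannot remove the irreducible noise in the data, since

$$\mathbb{E}\big[(\hat{y} - y)^2\big] = \mathbb{E}\big[\varepsilon^2\big] = \operatorname{Var}(\varepsilon) = 1 \quad \Rightarrow \quad \mathrm{RMSE} \approx 1.$$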
Linear Regression with the PyTorch Optimiser¶
In [7]:
class LinearRegressionPyTorch(torch.nn.Module):
    """Linear regression using PyTorch's built-in linear layer."""

    def __init__(self, input_size: int, output_size: int):
        super().__init__()
        self.model = torch.nn.Linear(input_size, output_size)

    def forward(self, X: torch.Tensor) -> torch.FloatTensor:
        """Compute a prediction."""
        return self.model(X)


def train_linear_regression_pytorch(
    model: torch.nn.Module,
    criterion: torch.nn.Module,
    data_loader: DataLoader,
    n_epochs: int,
    learning_rate: float,
) -> Sequence[float]:
    """Train the model over multiple epochs, recording the loss for each."""

    def process_batch(X: torch.Tensor, y: torch.Tensor) -> float:
        y_hat = model.forward(X)
        loss = criterion(y_hat, y)
        optimiser.zero_grad()
        loss.backward()
        optimiser.step()
        return loss.detach().numpy().tolist()

    def process_epoch() -> float:
        return [process_batch(X, y) for X, y in data_loader][-1]

    optimiser = torch.optim.SGD(model.parameters(), lr=learning_rate)
    training_run = [process_epoch() for epoch in range(n_epochs)]
    return training_run
We now train the model using optim.
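The training cell itself is not shown in this export. A minimal sketch of it, assuming the built-in torch.nn.MSELoss as the criterion and the same data_loader and hyperparameters as before (the names lin_reg_pt and mse_loss_pt are assumptions, though lin_reg_pt is referenced by the test below):

lin_reg_pt = LinearRegressionPyTorch(input_size=1, output_size=1)
mse_loss_pt = torch.nn.MSELoss()  # PyTorch's built-in mean squared-error criterion

# train with the optimiser-based loop defined above
per_epoch_loss_pt = train_linear_regression_pytorch(
    lin_reg_pt, mse_loss_pt, data_loader, n_epochs=20, learning_rate=0.05
)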
Testing the model on unseen data.
In [9]:
rmse = torch.sqrt(torch.mean((lin_reg_pt.forward(test_data.X) - test_data.y) ** 2))
rmse
Out[9]:
tensor(1.0321, grad_fn=<SqrtBackward0>)