Linear Regression with SGD¶
We explore how one of the most basic machine learning models can be implemented from first principles using PyTorch and Stochastic Gradient Descent (SGD), and then implement the same model using the PyTorch optimiser.
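For reference, a minimal sketch of the model and the gradient-descent update implemented below, where $\eta$ is the learning rate and $L$ is the mean squared error over a mini-batch:

$$y = b + w x + \varepsilon, \qquad \varepsilon \sim \mathcal{N}(0, 1)$$

$$w \leftarrow w - \eta \, \frac{\partial L}{\partial w}, \qquad b \leftarrow b - \eta \, \frac{\partial L}{\partial b}$$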
Imports¶
In [1]:
from typing import Sequence, Tuple
import torch
from torch.utils.data import Dataset, DataLoader
Configuration¶
In [2]:
torch.manual_seed(1)
Out[2]:
<torch._C.Generator at 0x10572e310>
Linear Regression with Stochastic Gradient Descent¶
Start by creating a dataset and dataloader for the task.
In [3]:
class LinearModelData(Dataset):
    """Synthetic data drawn from a noisy linear model."""

    def __init__(self, b: float, w: float):
        self.w = torch.tensor(w)
        self.b = torch.tensor(b)
        self.X = torch.arange(-2, 2, 0.01).view(-1, 1)
        self.y = self.b + self.w * self.X + torch.randn(self.X.size())
        self.len = self.y.shape[0]

    def __getitem__(self, idx: int) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
        return (self.X[idx], self.y[idx])

    def __len__(self) -> int:
        return self.len


data = LinearModelData(b=0, w=1)
print(f"n_samples = {len(data)}")
print(f"data[0] = {data[0]}")

data_loader = DataLoader(dataset=data, batch_size=5)
data_batches = list(data_loader)
print(f"mini_batch[0] = {data_batches[0]}")
n_samples = 400
data[0] = (tensor([-2.]), tensor([-3.5256]))
mini_batch[0] = [tensor([[-2.0000], [-1.9900], [-1.9800], [-1.9700], [-1.9600]]), tensor([[-3.5256], [-2.7402], [-2.6340], [-3.5795], [-2.0602]])]
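Note that this DataLoader yields the mini-batches in a fixed order. For mini-batch SGD it is common to shuffle the data on every pass; a minimal sketch (the name shuffled_loader is illustrative):

# draw the mini-batches in a random order on each pass through the data
shuffled_loader = DataLoader(dataset=data, batch_size=5, shuffle=True)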
Now define the model.
In [4]:
class LinearRegression(torch.nn.Module):
    """Linear regression from first principles using gradient descent."""

    def __init__(self):
        super().__init__()
        self.b = torch.randn(1, requires_grad=True)
        self.w = torch.randn(1, requires_grad=True)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Compute a prediction."""
        y_hat = self.b + self.w * x
        return y_hat


class MSELoss(torch.nn.Module):
    """Mean squared-error loss from first principles."""

    def __init__(self):
        super().__init__()

    def forward(self, y_hat: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        """Compute loss."""
        return torch.mean((y_hat - y) ** 2)


def train_linear_regression(
    model: torch.nn.Module,
    criterion: torch.nn.Module,
    data_loader: DataLoader,
    n_epochs: int,
    learning_rate: float,
) -> Sequence[float]:
    """Train the model over multiple epochs, recording the loss for each."""

    def process_batch(X: torch.Tensor, y: torch.Tensor) -> float:
        y_hat = model.forward(X)
        loss = criterion(y_hat, y)
        loss.backward()
        # manual SGD update followed by resetting the accumulated gradients
        model.w.data -= lr * model.w.grad.data
        model.w.grad.data.zero_()
        model.b.data -= lr * model.b.grad.data
        model.b.grad.data.zero_()
        return loss.detach().numpy().tolist()

    def process_epoch() -> float:
        return [process_batch(X, y) for X, y in data_loader][-1]

    lr = torch.tensor(learning_rate)
    training_run = [process_epoch() for epoch in range(n_epochs)]
    return training_run
Train the model.
In [5]:
lin_reg = LinearRegression()
mse_loss = MSELoss()

print("initial parameters:")
for k, v in lin_reg.state_dict().items():
    print(f" {k}: {v}")

print("\npost-training parameters:")
per_epoch_loss = train_linear_regression(
    lin_reg, mse_loss, data_loader, n_epochs=20, learning_rate=0.05
)
for k, v in lin_reg.state_dict().items():
    print(f" {k}: {v}")

print("\nloss per-epoch:")
per_epoch_loss
initial parameters:

post-training parameters:

loss per-epoch:
Out[5]:
[1.5875461101531982, 1.5779359340667725, 1.5811569690704346, 1.580075740814209, 1.580438256263733, 1.5803166627883911, 1.5803577899932861, 1.58034348487854, 1.580348253250122, 1.5803463459014893, 1.5803474187850952, 1.580346941947937, 1.580346941947937, 1.5803475379943848, 1.580346941947937, 1.5803475379943848, 1.580346941947937, 1.5803475379943848, 1.580346941947937, 1.5803475379943848]
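Note that state_dict() printed nothing above because b and w are plain tensors rather than registered nn.Parameter objects. A minimal sketch for inspecting the fitted values directly (both should end up close to the true w=1 and b=0):

# read the fitted coefficients via .data
print(f"w = {lin_reg.w.data.item():.4f}")
print(f"b = {lin_reg.b.data.item():.4f}")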
Testing the model on unseen data.
In [6]:
test_data = LinearModelData(b=0, w=1)
rmse = torch.sqrt(torch.mean((lin_reg.forward(test_data.X) - test_data.y) ** 2))
rmse
Out[6]:
tensor(1.0321, grad_fn=<SqrtBackward0>)
This is in line with what one would expect, given that the noise term follows a standard Normal distribution.
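Even a perfectly fitted model cannot remove the irreducible noise in the data, since

$$\mathbb{E}\big[(\hat{y} - y)^2\big] = \mathbb{E}\big[\varepsilon^2\big] = \operatorname{Var}(\varepsilon) = 1 \quad \Rightarrow \quad \mathrm{RMSE} \approx 1.$$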
Linear Regression with the PyTorch Optimiser¶
In [7]:
class LinearRegressionPyTorch(torch.nn.Module):
    """Linear regression using PyTorch's built-in linear layer."""

    def __init__(self, input_size: int, output_size: int):
        super().__init__()
        self.model = torch.nn.Linear(input_size, output_size)

    def forward(self, X: torch.Tensor) -> torch.FloatTensor:
        """Compute a prediction."""
        return self.model(X)


def train_linear_regression_pytorch(
    model: torch.nn.Module,
    criterion: torch.nn.Module,
    data_loader: DataLoader,
    n_epochs: int,
    learning_rate: float,
) -> Sequence[float]:
    """Train the model over multiple epochs, recording the loss for each."""

    def process_batch(X: torch.Tensor, y: torch.Tensor) -> float:
        y_hat = model.forward(X)
        loss = criterion(y_hat, y)
        optimiser.zero_grad()
        loss.backward()
        optimiser.step()
        return loss.detach().numpy().tolist()

    def process_epoch() -> float:
        return [process_batch(X, y) for X, y in data_loader][-1]

    optimiser = torch.optim.SGD(model.parameters(), lr=learning_rate)
    training_run = [process_epoch() for epoch in range(n_epochs)]
    return training_run
We now train the model using optim.
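The training cell itself is not shown in this export. A minimal sketch of it, assuming the built-in torch.nn.MSELoss as the criterion and the same data_loader and hyperparameters as before (the names lin_reg_pt and mse_loss_pt are assumptions, though lin_reg_pt is referenced by the test below):

lin_reg_pt = LinearRegressionPyTorch(input_size=1, output_size=1)
mse_loss_pt = torch.nn.MSELoss()  # PyTorch's built-in mean squared-error criterion

# train with the optimiser-based loop defined above
per_epoch_loss_pt = train_linear_regression_pytorch(
    lin_reg_pt, mse_loss_pt, data_loader, n_epochs=20, learning_rate=0.05
)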
Testing the model on unseen data.
In [9]:
rmse = torch.sqrt(torch.mean((lin_reg_pt.forward(test_data.X) - test_data.y) ** 2))
rmse
Out[9]:
tensor(1.0321, grad_fn=<SqrtBackward0>)