API Reference

Tokenizers for LLMs.

GPTSmallTextDataset

Bases: Dataset

GPT dataset interface for any 'small' text data.

This will tokenize all text in-memory using GPT-2's tokenization algorithm, which is a pre-trained Byte Pair Encoding (BPE).
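
A quick way to see what that tokenizer does (a minimal sketch; the example string and printed IDs are illustrative):

```python
import tiktoken

# The same pre-trained GPT-2 BPE encoding the dataset uses internally.
tokenizer = tiktoken.get_encoding("gpt2")

ids = tokenizer.encode("Hello, world!")
print(ids)                    # e.g. [15496, 11, 995, 0]
print(tokenizer.decode(ids))  # round-trips back to "Hello, world!"
```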

Source code in src/llmz/datasets.py
class GPTSmallTextDataset(Dataset):
    """GPT dataset interface for any 'small' text data.

    This will tokenize all text in-memory using GPT-2's tokenization algorithm, which
    is a pre-trained Byte Pair Encoding (BPE).
    """

    def __init__(self, text: str, max_length: int = 256, stride: int = 128):
        """Initialise.

        Args:
            text: Raw text data to convert into tokens.
            max_length: Number of tokens for each data instance. Defaults to 256.
            stride: Separation (in tokens) between consecutive instances. Defaults to
                128.

        """
        tokenizer = tiktoken.get_encoding("gpt2")
        tokens = tokenizer.encode(text)

        n_tokens = len(tokens)
        n_instances = (n_tokens - max_length) // stride
        if n_instances <= 0:
            raise RuntimeError("number of tokens must be >= max_length + stride")

        # Token IDs are integers, so store them in long tensors (required for
        # embedding lookups downstream).
        self._X = torch.zeros((n_instances, max_length), dtype=torch.long)
        self._y = torch.zeros((n_instances, max_length), dtype=torch.long)

        # Slide a window of max_length tokens across the text in steps of
        # stride; targets are the inputs shifted one token to the right.
        for n, i in enumerate(range(0, n_instances * stride, stride)):
            self._X[n,] = torch.tensor(tokens[i : i + max_length])
            self._y[n,] = torch.tensor(tokens[i + 1 : i + max_length + 1])

    def create_data_loader(
        self,
        batch_size: int = 4,
        shuffle: bool = True,
        drop_last: bool = True,
        num_workers: int = 0,
    ) -> DataLoader:
        """Create data loader.

        Args:
            batch_size: The batch size. Defaults to 4.
            shuffle: Whether to randomise instance order after each iteration. Defaults
                to True.
            drop_last: Drop the last batch if it has fewer than `batch_size`
                instances. Defaults to True.
            num_workers: Number of CPU processes to use for pre-processing. Defaults to
                0.

        Returns:
            A fully configured DataLoader

        """
        return DataLoader(
            self,
            batch_size=batch_size,
            shuffle=shuffle,
            drop_last=drop_last,
            num_workers=num_workers,
        )

    def __len__(self) -> int:
        return self._X.size(0)

    def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]:
        return self._X[idx,], self._y[idx,]
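
A minimal usage sketch (the import path is inferred from the source path above; the sample text is illustrative):

```python
from llmz.datasets import GPTSmallTextDataset

text = "some long training document " * 200  # any raw string works
dataset = GPTSmallTextDataset(text, max_length=32, stride=16)

X, y = dataset[0]
print(len(dataset))      # number of (input, target) instances
print(X.shape, y.shape)  # both torch.Size([32]); y is X shifted right by one token
```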

__init__(text, max_length=256, stride=128)

Initialise.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| text | str | Raw text data to convert into tokens. | required |
| max_length | int | Number of tokens for each data instance. | 256 |
| stride | int | Separation (in tokens) between consecutive instances. | 128 |
Source code in src/llmz/datasets.py
def __init__(self, text: str, max_length: int = 256, stride: int = 128):
    """Initialise.

    Args:
        text: Raw text data to convert into tokens.
        max_length: Number of tokens for each data instance. Defaults to 256.
        stride: Separation (in tokens) between consecutive instances. Defaults to
            128.

    """
    tokenizer = tiktoken.get_encoding("gpt2")
    tokens = tokenizer.encode(text)

    n_tokens = len(tokens)
    n_instances = (n_tokens - max_length) // stride
    if n_instances <= 0:
        raise RuntimeError("number of tokens must be >= max_length + stride")

    # Token IDs are integers, so store them in long tensors (required for
    # embedding lookups downstream).
    self._X = torch.zeros((n_instances, max_length), dtype=torch.long)
    self._y = torch.zeros((n_instances, max_length), dtype=torch.long)

    # Slide a window of max_length tokens across the text in steps of
    # stride; targets are the inputs shifted one token to the right.
    for n, i in enumerate(range(0, n_instances * stride, stride)):
        self._X[n,] = torch.tensor(tokens[i : i + max_length])
        self._y[n,] = torch.tensor(tokens[i + 1 : i + max_length + 1])
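
For intuition, the windowing arithmetic works as follows (the numbers below are illustrative, not from the source):

```python
# 1000 stand-in token IDs with the default window settings.
tokens = list(range(1000))
max_length, stride = 256, 128

n_instances = (len(tokens) - max_length) // stride  # (1000 - 256) // 128 = 5
for i in range(0, n_instances * stride, stride):    # i = 0, 128, 256, 384, 512
    X = tokens[i : i + max_length]          # input window
    y = tokens[i + 1 : i + max_length + 1]  # same window, shifted right by one
```

A stride smaller than max_length produces overlapping windows; stride equal to max_length tiles the text with no overlap.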

create_data_loader(batch_size=4, shuffle=True, drop_last=True, num_workers=0)

Create data loader.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| batch_size | int | The batch size. | 4 |
| shuffle | bool | Whether to randomise instance order after each iteration. | True |
| drop_last | bool | Drop the last batch if it has fewer than batch_size instances. | True |
| num_workers | int | Number of CPU processes to use for pre-processing. | 0 |

Returns:

| Type | Description |
| --- | --- |
| DataLoader | A fully configured DataLoader. |

Source code in src/llmz/datasets.py
def create_data_loader(
    self,
    batch_size: int = 4,
    shuffle: bool = True,
    drop_last: bool = True,
    num_workers: int = 0,
) -> DataLoader:
    """Create data loader.

    Args:
        batch_size: The batch size. Defaults to 4.
        shuffle: Whether to randomise instance order after each iteration. Defaults
            to True.
        drop_last: Drop the last batch if it has fewer than `batch_size`
            instances. Defaults to True.
        num_workers: Number of CPU processes to use for pre-processing. Defaults to
            0.

    Returns:
        A fully configured DataLoader

    """
    return DataLoader(
        self,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )
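
A short sketch of wiring this into a training loop (hyperparameter values are illustrative):

```python
from llmz.datasets import GPTSmallTextDataset

text = "some long training document " * 200  # illustrative raw text
dataset = GPTSmallTextDataset(text, max_length=32, stride=16)
loader = dataset.create_data_loader(batch_size=4, shuffle=True)

for X_batch, y_batch in loader:
    # Both tensors have shape (batch_size, max_length). X_batch feeds the
    # model; y_batch holds the next-token targets for the loss.
    print(X_batch.shape, y_batch.shape)
    break
```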