
API Reference

Attention blocks for transformer models.

ModelConfigError

Bases: Exception

Custom exception class for model configuration errors.

Source code in src/llmz/components/attention.py
class ModelConfigError(Exception):
    """Custom exception class for model configuration errors."""

    pass

MultiHeadAttention

Bases: Module

Basic causal attention block.

Source code in src/llmz/components/attention.py
class MultiHeadAttention(nn.Module):
    """Basic causal attention block."""

    def __init__(
        self,
        context_size: int,
        dim_in: int,
        dim_out: int,
        n_heads: int = 1,
        dropout: float = 0.6,
        qkv_bias: bool = False,
    ):
        """Initialise module.

        Args:
            dim_in: Dimension of input word embeddings.
            dim_out: Dimension of output attention embeddings.
            context_size: The number of input word embeddings in the sequence.
            n_heads: The number of attention heads. Defaults to 1.
            dropout: The dropout rate. Defaults to 0.6.
            qkv_bias: Whether or not to include bias in the linear layers used to
                compute W_query, W_key and W_value. Defaults to False.

        Raises:
            ModelConfigError: if dim_out % n_heads != 0

        """
        super().__init__()
        if dim_out % n_heads != 0:
            raise ModelConfigError("dim_out % n_heads != 0")

        self.dim_out = dim_out
        self.n_heads = n_heads
        self.dim_head = dim_out // n_heads  # // --> returns int
        self.W_query = nn.Linear(dim_in, dim_out, bias=qkv_bias)
        self.W_key = nn.Linear(dim_in, dim_out, bias=qkv_bias)
        self.W_value = nn.Linear(dim_in, dim_out, bias=qkv_bias)
        self.out_proj = nn.Linear(dim_out, dim_out)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer(
            "mask", torch.triu(torch.ones(context_size, context_size), diagonal=1)
        )  # these are not parameters

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Execute the module's forward pass.

        Args:
            x: Batch of token embeddings.

        Returns:
            Batch of attention weighted embeddings.

        """
        batch_size, seq_len, dim_in = x.size()

        # get mask for sequence length
        mask_bool = self.mask.bool()[:seq_len, :seq_len]

        # project inputs to queries, keys and values (dims = batch_size, seq_len, dim_out)
        queries = self.W_query(x)
        keys = self.W_key(x)
        values = self.W_value(x)

        # split single head into multiple heads
        queries = queries.view(batch_size, seq_len, self.n_heads, self.dim_head)
        keys = keys.view(batch_size, seq_len, self.n_heads, self.dim_head)
        values = values.view(batch_size, seq_len, self.n_heads, self.dim_head)

        # reshape in size = batch_size, n_heads, seq_len, head_dim
        queries = queries.transpose(1, 2)
        keys = keys.transpose(1, 2)
        values = values.transpose(1, 2)

        # compute attention scores (matrix multiplication works on final two dimensions)
        attn_scores = queries @ keys.transpose(2, 3)
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        # compute attention weights from attention scores (dim = -1 -> last dim in size)
        attn_weights = torch.softmax(attn_scores / keys.size()[-1] ** 0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # compute context embeddings & reshape to batch_size, seq_len, n_heads, head_dim
        context_embeddings = (attn_weights @ values).transpose(1, 2)

        # reshape to factor-out the multiple heads and take into account dim_out
        # (.contiguous() is required because .view() cannot operate on the transposed tensor)
        context_embeddings = context_embeddings.contiguous().view(batch_size, seq_len, self.dim_out)
        context_embeddings = self.out_proj(context_embeddings)
        return context_embeddings
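For illustration, a minimal usage sketch (all sizes are arbitrary, and the import path assumes the module lives at the source location shown above):

```python
import torch

from llmz.components.attention import MultiHeadAttention

# four heads over a 64-dimensional output requires dim_out % n_heads == 0
attn = MultiHeadAttention(context_size=8, dim_in=32, dim_out=64, n_heads=4, dropout=0.1)

x = torch.randn(2, 8, 32)  # batch of 2 sequences, 8 tokens each, 32-dim embeddings
y = attn(x)
print(y.shape)  # torch.Size([2, 8, 64])
```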

__init__(context_size, dim_in, dim_out, n_heads=1, dropout=0.6, qkv_bias=False)

Initialise module.

Parameters:

Name Type Description Default
dim_in int

Dimension of input word embeddings.

required
dim_out int

Dimension of output attention embeddings.

required
context_size int

The number of input word embeddings in the sequence.

required
n_heads int

The number of attention heads. Defaults to 1.

1
dropout float

The dropout rate. Defaults to 0.6.

0.6
qkv_bias bool

Whether or not to include bias in the linear layers used to compute W_query, W_key and W_value. Defaults to False.

False

Raises:

Type Description
ModelConfigError

if dim_out % n_heads != 0

Source code in src/llmz/components/attention.py
def __init__(
    self,
    context_size: int,
    dim_in: int,
    dim_out: int,
    n_heads: int = 1,
    dropout: float = 0.6,
    qkv_bias: bool = False,
):
    """Initialise module.

    Args:
        dim_in: Dimension of input word embeddings.
        dim_out: Dimension of output attention embeddings.
        context_size: The number of input word embeddings in the sequence.
        n_heads: The number of attention heads. Defaults to 1.
        dropout: The dropout rate. Defaults to 0.6.
        qkv_bias: Whether or not to include bias in the linear layers used to
            compute W_query, W_key and W_value. Defaults to False.

    Raises:
        ModelConfigError: if dim_out % n_heads != 0

    """
    super().__init__()
    if dim_out % n_heads != 0:
        raise ModelConfigError("dim_out % n_heads != 0")

    self.dim_out = dim_out
    self.n_heads = n_heads
    self.dim_head = dim_out // n_heads  # // --> returns int
    self.W_query = nn.Linear(dim_in, dim_out, bias=qkv_bias)
    self.W_key = nn.Linear(dim_in, dim_out, bias=qkv_bias)
    self.W_value = nn.Linear(dim_in, dim_out, bias=qkv_bias)
    self.out_proj = nn.Linear(dim_out, dim_out)
    self.dropout = nn.Dropout(dropout)
    self.register_buffer(
        "mask", torch.triu(torch.ones(context_size, context_size), diagonal=1)
    )  # these are not parameters

forward(x)

Execute the module's forward pass.

Parameters:

Name Type Description Default
x Tensor

Batch of token embeddings.

required

Returns:

Type Description
Tensor

Batch of attention weighted embeddings.

Source code in src/llmz/components/attention.py
def forward(self, x: torch.Tensor) -> torch.Tensor:
    """Execute the module's forward pass.

    Args:
        x: Batch of token embeddings.

    Returns:
        Batch of attention weighted embeddings.

    """
    batch_size, seq_len, dim_in = x.size()

    # get mask for sequence length
    mask_bool = self.mask.bool()[:seq_len, :seq_len]

    # project inputs to queries, keys and values (dims = batch_size, seq_len, dim_out)
    queries = self.W_query(x)
    keys = self.W_key(x)
    values = self.W_value(x)

    # split single head into multiple heads
    queries = queries.view(batch_size, seq_len, self.n_heads, self.dim_head)
    keys = keys.view(batch_size, seq_len, self.n_heads, self.dim_head)
    values = values.view(batch_size, seq_len, self.n_heads, self.dim_head)

    # reshape in size = batch_size, n_heads, seq_len, head_dim
    queries = queries.transpose(1, 2)
    keys = keys.transpose(1, 2)
    values = values.transpose(1, 2)

    # compute attention scores (matrix multiplication works on final two dimensions)
    attn_scores = queries @ keys.transpose(2, 3)
    attn_scores.masked_fill_(mask_bool, -torch.inf)

    # compute attention weights from attention scores (dim = -1 -> last dim in size)
    attn_weights = torch.softmax(attn_scores / keys.size()[-1] ** 0.5, dim=-1)
    attn_weights = self.dropout(attn_weights)

    # compute context embeddings & reshape to batch_size, seq_len, n_heads, head_dim
    context_embeddings = (attn_weights @ values).transpose(1, 2)

    # reshape to factor-out the multiple heads and take into account dim_out
    # (.contiguous() is required because .view() cannot operate on the transposed tensor)
    context_embeddings = context_embeddings.contiguous().view(batch_size, seq_len, self.dim_out)
    context_embeddings = self.out_proj(context_embeddings)
    return context_embeddings

Normalisation operations.

LayerNormalisation

Bases: Module

Layer normalisation.

Normalises batches of input tensors to approximately zero mean and unit variance. The module allows for some trained deviation from a mean of zero and a variance of one.

Source code in src/llmz/components/normalisation.py
class LayerNormalisation(nn.Module):
    """Layer normalisation.

    Normalises batches of input tensors to approximately zero mean and unit variance. The
    module allows for some trained deviation from a mean of zero and a variance of one.
    """

    def __init__(self, dim_in: int):
        """Initialise module.

        Args:
            dim_in: Dimension of the input batches.

        """
        super().__init__()
        self.epsilon = 1e-5

        # Trainable element-by-element adjustments to output tensors
        self.shift = nn.Parameter(torch.zeros(dim_in))
        self.scale = nn.Parameter(torch.ones(dim_in))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass of module.

        Args:
            x: input tensors.

        Returns:
            Tensor-by-tensor normalised version of the inputs.

        """
        x_mean = x.mean(dim=-1, keepdim=True)
        x_stdev = x.std(dim=-1, keepdim=True, unbiased=False)  # unbiased as n -> inf
        x_norm = (x - x_mean) / (x_stdev + self.epsilon)
        return self.shift + self.scale * x_norm
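A short usage sketch (values are arbitrary): each vector along the final dimension is normalised, then shifted and scaled by the trainable parameters (initially zero and one).

```python
import torch

from llmz.components.normalisation import LayerNormalisation

layer_norm = LayerNormalisation(dim_in=64)

x = 5.0 + 3.0 * torch.randn(2, 8, 64)
y = layer_norm(x)

print(y.mean(dim=-1)[0, 0])                 # ~0.0
print(y.std(dim=-1, unbiased=False)[0, 0])  # ~1.0
```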

__init__(dim_in)

Initialise module.

Parameters:

Name Type Description Default
dim_in int

Dimension of the input batches.

required
Source code in src/llmz/components/normalisation.py
def __init__(self, dim_in: int):
    """Initialise module.

    Args:
        dim_in: Dimension of the input batches.

    """
    super().__init__()
    self.epsilon = 1e-5

    # Trainable element-by-element adjustments to output tensors
    self.shift = nn.Parameter(torch.zeros(dim_in))
    self.scale = nn.Parameter(torch.ones(dim_in))

forward(x)

Forward pass of module.

Parameters:

Name Type Description Default
x Tensor

input tensors.

required

Returns:

Type Description
Tensor

Tensor-by-tensor normalised version of the inputs.

Source code in src/llmz/components/normalisation.py
def forward(self, x: torch.Tensor) -> torch.Tensor:
    """Forward pass of module.

    Args:
        x: input tensors.

    Returns:
        Tensor-by-tensor normalised version of the inputs.

    """
    x_mean = x.mean(dim=-1, keepdim=True)
    x_stdev = x.std(dim=-1, keepdim=True, unbiased=False)  # unbiased as n -> inf
    x_norm = (x - x_mean) / (x_stdev + self.epsilon)
    return self.shift + self.scale * x_norm

Activation functions for transformer models.

GELU

Bases: Module

Gaussian Error Linear Unit (GELU).

Implemented using an approximation to x * F(x), where F is the cumulative normal distribution function. See 'Build a LLM (from scratch)' by S. Raschka (2024), p105.

Source code in src/llmz/components/activations.py
class GELU(nn.Module):
    """Guassian Error Linear Unit (GELU).

    Implemented using an approximation to `x * F(x)`, where `F` is the cumulative
    normal distribution function. See 'Build a LLM (from scratch)' by S. Raschka
    (2024), p105.
    """

    def __init__(self):
        """Initialise module."""
        super().__init__()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Execute the module's forward pass.

        Args:
            x: Batch of input tensors.

        Returns:
            Batch of output tensors that have been filtered on an element-by-element
                basis using the GELU activation function.

        """
        tanh_exponent = torch.sqrt(torch.tensor(2.0 / torch.pi)) * (
            x + 0.044715 * torch.pow(x, 3)
        )
        gelu_x = 0.5 * x * (1.0 + torch.tanh(tanh_exponent))
        return gelu_x
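A quick sketch (inputs are arbitrary) comparing the tanh approximation with PyTorch's built-in GELU:

```python
import torch

from llmz.components.activations import GELU

gelu = GELU()
x = torch.linspace(-3.0, 3.0, steps=7)

print(gelu(x))
print(torch.nn.functional.gelu(x, approximate="tanh"))  # should match closely
```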

__init__()

Initialise module.

Source code in src/llmz/components/activations.py
def __init__(self):
    """Initialise module."""
    super().__init__()

forward(x)

Execute the module's forward pass.

Parameters:

Name Type Description Default
x Tensor

Batch of input tensors.

required

Returns:

Type Description
Tensor

Batch of output tensors that have been filtered on an element-by-element basis using the GELU activation function.

Source code in src/llmz/components/activations.py
def forward(self, x: torch.Tensor) -> torch.Tensor:
    """Execute the module's forward pass.

    Args:
        x: Batch of input tensors.

    Returns:
        Batch of output tensors that have been filtered on an element-by-element
            basis using the GELU activation function.

    """
    tanh_exponent = torch.sqrt(torch.tensor(2.0 / torch.pi)) * (
        x + 0.044715 * torch.pow(x, 3)
    )
    gelu_x = 0.5 * x * (1.0 + torch.tanh(tanh_exponent))
    return gelu_x

Transformer block for LLMs.

TransformerBlockGPT2

Bases: Module

Basic transformer block with multi-head attention as used in GPT2.

Source code in src/llmz/components/transformers.py
class TransformerBlockGPT2(nn.Module):
    """Basic transformer block with multi-head attention as used in GPT2."""

    def __init__(
        self,
        context_size: int,
        dim_in: int,
        n_heads: int = 1,
        dropout: float = 0.6,
        qkv_bias: bool = False,
    ):
        """Initialise module.

        Args:
            dim_in: Dimension of input word embeddings.
            context_size: The number of input word embeddings in the sequence.
            n_heads: The number of attention heads. Defaults to 1.
            dropout: The dropout rate. Defaults to 0.6.
            qkv_bias: Whether or not to include bias in the linear layers used to
                compute W_query, W_key and W_value. Defaults to False.

        """
        super().__init__()
        self.attention = MultiHeadAttention(
            context_size, dim_in, dim_in, n_heads, dropout, qkv_bias
        )
        self.linear_1 = nn.Linear(dim_in, dim_in * 2)
        self.linear_2 = nn.Linear(dim_in * 2, dim_in)
        self.normalise_1 = LayerNormalisation(dim_in)
        self.normalise_2 = LayerNormalisation(dim_in)
        self.dropout = nn.Dropout(dropout)
        self.gelu = GELU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Execute the module's forward pass.

        Args:
            x: Batch of token embeddings.

        Returns:
            Batch of attention weighted embeddings.

        """
        y1 = self.normalise_1(x)
        y1 = self.attention(y1)
        y1 = self.dropout(y1)

        y2 = self.normalise_2(y1 + x)
        y2 = self.linear_1(y2)
        y2 = self.gelu(y2)
        y2 = self.linear_2(y2)
        y2 = self.dropout(y2)

        return y1 + y2
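A minimal usage sketch (sizes are arbitrary): the input and output dimensions are both `dim_in`, so blocks can be stacked.

```python
import torch

from llmz.components.transformers import TransformerBlockGPT2

block = TransformerBlockGPT2(context_size=8, dim_in=64, n_heads=4, dropout=0.1)

x = torch.randn(2, 8, 64)
y = block(x)
print(y.shape)  # torch.Size([2, 8, 64]) - same shape as the input
```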

__init__(context_size, dim_in, n_heads=1, dropout=0.6, qkv_bias=False)

Initialise module.

Parameters:

Name Type Description Default
dim_in int

Dimension of input word embeddings.

required
context_size int

The number of input word embeddings in the sequence.

required
n_heads int

The number of attention heads. Defaults to 1.

1
dropout float

The dropout rate. Defaults to 0.6.

0.6
qkv_bias bool

Whether or not to include bias in the linear layers used to compute W_query, W_key and W_value. Defaults to False.

False
Source code in src/llmz/components/transformers.py
def __init__(
    self,
    context_size: int,
    dim_in: int,
    n_heads: int = 1,
    dropout: float = 0.6,
    qkv_bias: bool = False,
):
    """Initialise module.

    Args:
        dim_in: Dimension of input word embeddings.
        context_size: The number of input word embeddings in the sequence.
        n_heads: The number of attention heads. Defaults to 1.
        dropout: The dropout rate. Defaults to 0.6.
        qkv_bias: Whether or not to include bias in the linear layers used to
            compute W_query, W_key and W_value. Defaults to False.

    """
    super().__init__()
    self.attention = MultiHeadAttention(
        context_size, dim_in, dim_in, n_heads, dropout, qkv_bias
    )
    self.linear_1 = nn.Linear(dim_in, dim_in * 2)
    self.linear_2 = nn.Linear(dim_in * 2, dim_in)
    self.normalise_1 = LayerNormalisation(dim_in)
    self.normalise_2 = LayerNormalisation(dim_in)
    self.dropout = nn.Dropout(dropout)
    self.gelu = GELU()

forward(x)

Execute the module's forward pass.

Parameters:

Name Type Description Default
x Tensor

Batch of token embeddings.

required

Returns:

Type Description
Tensor

Batch of attention weighted embeddings.

Source code in src/llmz/components/transformers.py
def forward(self, x: torch.Tensor) -> torch.Tensor:
    """Execute the module's forward pass.

    Args:
        x: Batch of token embeddings.

    Returns:
        Batch of attention weighted embeddings.

    """
    y1 = self.normalise_1(x)
    y1 = self.attention(y1)
    y1 = self.dropout(y1)

    y2 = self.normalise_2(y1 + x)
    y2 = self.linear_1(y2)
    y2 = self.gelu(y2)
    y2 = self.linear_2(y2)
    y2 = self.dropout(y2)

    return y1 + y2

Datasets for LLMs.

GPTSmallTextDataset

Bases: Dataset

GPT dataset interface for any 'small' text data.

This will tokenize all text in-memory using GPT2's tokenization algorithm, which is a pre-trained Byte Pair Encoding (BPE).

Source code in src/llmz/datasets.py
class GPTSmallTextDataset(Dataset):
    """GPT dataset interface for any 'small' text data.

    This will tokenize all text in-memory using GPT2's tokenization algorithm, which
    is a pre-trained Byte Pair Encoding (BPE).
    """

    def __init__(self, text: str, max_length: int = 256, stride: int = 128):
        """Initialise.

        Args:
            text: Raw text data to convert into tokens.
            max_length: Number of tokens for each data instance. Defaults to 256.
            stride: Separation (in tokens) between consecutive instances. Defaults to
                128.

        """
        tokenizer = tiktoken.get_encoding("gpt2")
        tokens = tokenizer.encode(text)

        n_tokens = len(tokens)
        n_instances = int((n_tokens - max_length) / stride)
        if n_instances == 0:
            raise RuntimeError("max_length + stride <= number of tokens")

        # token ids need an integer dtype for use with nn.Embedding and cross-entropy
        self._X = torch.ones((n_instances, max_length), dtype=torch.long)
        self._y = torch.ones((n_instances, max_length), dtype=torch.long)

        for n, i in enumerate(range(0, n_tokens - max_length, stride)):
            self._X[n,] = torch.tensor(tokens[i : i + max_length])
            self._y[n,] = torch.tensor(tokens[i + 1 : i + max_length + 1])

    def create_data_loader(
        self,
        batch_size: int = 4,
        shuffle: bool = True,
        drop_last: bool = True,
        num_workers: int = 0,
    ) -> DataLoader:
        """Create data loader.

        Args:
            batch_size: The batch size. Defaults to 4.
            shuffle: Whether to randomise instance order after each iteration. Defaults
                to True.
            drop_last: Drop last batch if less than `batch_size`. Defaults to True.
            num_workers: Number of CPU processes to use for pre-processing. Defaults to
                0.

        Returns:
            A fully configured DataLoader

        """
        return DataLoader(
            self,
            batch_size=batch_size,
            shuffle=shuffle,
            drop_last=drop_last,
            num_workers=num_workers,
        )

    def __len__(self) -> int:
        return self._X.size(0)

    def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]:
        return self._X[idx,], self._y[idx,]
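A usage sketch (the file path is a placeholder; the text must be long enough to yield at least one full batch of instances):

```python
from llmz.datasets import GPTSmallTextDataset

text = open("corpus.txt").read()  # placeholder path - any 'small' body of raw text

dataset = GPTSmallTextDataset(text, max_length=32, stride=16)
data_loader = dataset.create_data_loader(batch_size=4)

X, y = next(iter(data_loader))
print(X.shape, y.shape)  # torch.Size([4, 32]) torch.Size([4, 32]) - y is X shifted by one token
```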

__init__(text, max_length=256, stride=128)

Initialise.

Parameters:

Name Type Description Default
text str

Raw text data to convert into tokens.

required
max_length int

Number of tokens for each data instance. Defaults to 256.

256
stride int

Separation (in tokens) between consecutive instances. Defaults to 128.

128
Source code in src/llmz/datasets.py
def __init__(self, text: str, max_length: int = 256, stride: int = 128):
    """Initialise.

    Args:
        text: Raw text data to convert into tokens.
        max_length: Number of tokens for each data instance. Defaults to 256.
        stride: Separation (in tokens) between consecutive instances. Defaults to
            128.

    """
    tokenizer = tiktoken.get_encoding("gpt2")
    tokens = tokenizer.encode(text)

    n_tokens = len(tokens)
    n_instances = int((n_tokens - max_length) / stride)
    if n_instances == 0:
        raise RuntimeError("max_length + stride <= number of tokens")

    # token ids need an integer dtype for use with nn.Embedding and cross-entropy
    self._X = torch.ones((n_instances, max_length), dtype=torch.long)
    self._y = torch.ones((n_instances, max_length), dtype=torch.long)

    for n, i in enumerate(range(0, n_tokens - max_length, stride)):
        self._X[n,] = torch.tensor(tokens[i : i + max_length])
        self._y[n,] = torch.tensor(tokens[i + 1 : i + max_length + 1])

create_data_loader(batch_size=4, shuffle=True, drop_last=True, num_workers=0)

Create data loader.

Parameters:

Name Type Description Default
batch_size int

The batch size. Defaults to 4.

4
shuffle bool

Whether to randomise instance order after each iteration. Defaults to True.

True
drop_last bool

Drop last batch if less than batch_size. Defaults to True.

True
num_workers int

Number of CPU processes to use for pre-processing. Defaults to 0.

0

Returns:

Type Description
DataLoader

A fully configured DataLoader

Source code in src/llmz/datasets.py
def create_data_loader(
    self,
    batch_size: int = 4,
    shuffle: bool = True,
    drop_last: bool = True,
    num_workers: int = 0,
) -> DataLoader:
    """Create data loader.

    Args:
        batch_size: The batch size. Defaults to 4.
        shuffle: Whether to randomise instance order after each iteration. Defaults
            to True.
        drop_last: Drop last batch if less than `batch_size`. Defaults to True.
        num_workers: Number of CPU processes to use for pre-processing. Defaults to
            0.

    Returns:
        A fully configured DataLoader

    """
    return DataLoader(
        self,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

Evaluation and metrics.

EvalResult

Bases: NamedTuple

Container for evaluation results produced during training.

Source code in src/llmz/evaluate.py
class EvalResult(NamedTuple):
    """Container for evaluation results produced during training."""

    step: int
    results: dict[str, Result]

Evaluator

Model evaluator.

This class executes and stores all model evaluations during training.

Source code in src/llmz/evaluate.py
class Evaluator:
    """Model evaluator.

    This class executes and stores all model evaluations during training.
    """

    def __init__(
        self,
        train_dataloader: DataLoader,
        val_dataloader: DataLoader,
        metrics_fn: Callable[[nn.Module, DataLoader], dict[str, Result]],
        scenarios_fn: Callable[[nn.Module], dict[str, Result]] | None = None,
    ):
        """Initialise.

        Args:
            train_dataloader: DataLoader for training data.
            val_dataloader: DataLoader for validation data.
            metrics_fn: Callable that returns a dictionary of metrics given a model and
                a dataloader.
            scenarios_fn: Optional callable that returns a dictionary of results/outputs
                given a model - e.g., generated text given an example prompt. Defaults
                to None.

        """
        self.train_dl = train_dataloader
        self.val_dl = val_dataloader
        self.metrics_fn = metrics_fn
        self.scenarios_fn = scenarios_fn
        self._eval_records: list[EvalResult] = []

    def evaluate(
        self, step: int, model: nn.Module, log: logging.Logger | None = None
    ) -> None:
        """Evaluate model.

        Args:
            step: The number of training steps applied to the model.
            model: The model to evaluate.
            log: Optional logger for logging results. Defaults to None.

        Return:
            All evaluations for the model after training steps.

        """
        train_metrics = {
            f"train_{k}": v for k, v in self.metrics_fn(model, self.train_dl).items()
        }
        val_metrics = {
            f"val_{k}": v for k, v in self.metrics_fn(model, self.val_dl).items()
        }
        scenarios = self.scenarios_fn(model) if self.scenarios_fn else {}

        eval_record = EvalResult(step, {**train_metrics, **val_metrics, **scenarios})
        self._eval_records.append(eval_record)

        if log:
            log_msg = f"{eval_record.step=}: " + ", ".join(
                f"{k}={v}" for k, v in eval_record.results.items()
            )
            log.info(log_msg)

    def __getitem__(self, idx: int) -> EvalResult:
        return self._eval_records[idx]

    def __iter__(self) -> Iterator[EvalResult]:
        return iter(self._eval_records)

    def __len__(self) -> int:
        return len(self._eval_records)

__init__(train_dataloader, val_dataloader, metrics_fn, scenarios_fn=None)

Initialise.

Parameters:

Name Type Description Default
train_dataloader DataLoader

DataLoader for training data.

required
val_dataloader DataLoader

DataLoader for validation data.

required
metrics_fn Callable[[Module, DataLoader], dict[str, Result]]

Callable that returns a dictionary of metrics given a model and a dataloader.

required
scenarios_fn Callable[[Module], dict[str, Result]] | None

Optional callable that returns a dictionary of results/outputs given a model - e.g., generated text given an example prompt. Defaults to None.

None
Source code in src/llmz/evaluate.py
def __init__(
    self,
    train_dataloader: DataLoader,
    val_dataloader: DataLoader,
    metrics_fn: Callable[[nn.Module, DataLoader], dict[str, Result]],
    scenarios_fn: Callable[[nn.Module], dict[str, Result]] | None = None,
):
    """Initialise.

    Args:
        train_dataloader: DataLoader for training data.
        val_dataloader: DataLoader for validation data.
        metrics_fn: Callable that returns a dictionary of metrics given a model and
            a dataloader.
        scenarios_fn: Optional callable that returns a dictionary of results/outputs
            given a model - e.g., generated text given an example prompt. Defaults
            to None.

    """
    self.train_dl = train_dataloader
    self.val_dl = val_dataloader
    self.metrics_fn = metrics_fn
    self.scenarios_fn = scenarios_fn
    self._eval_records: list[EvalResult] = []

evaluate(step, model, log=None)

Evaluate model.

Parameters:

Name Type Description Default
step int

The number of training steps applied to the model.

required
model Module

The model to evaluate.

required
log Logger | None

Optional logger for logging results. Defaults to None.

None
Return

All evaluations for the model after training steps.

Source code in src/llmz/evaluate.py
def evaluate(
    self, step: int, model: nn.Module, log: logging.Logger | None = None
) -> None:
    """Evaluate model.

    Args:
        step: The number of training steps applied to the model.
        model: The model to evaluate.
        log: Optional logger for logging results. Defaults to None.

    Return:
        All evaluations for the model after training steps.

    """
    train_metrics = {
        f"train_{k}": v for k, v in self.metrics_fn(model, self.train_dl).items()
    }
    val_metrics = {
        f"val_{k}": v for k, v in self.metrics_fn(model, self.val_dl).items()
    }
    scenarios = self.scenarios_fn(model) if self.scenarios_fn else {}

    eval_record = EvalResult(step, {**train_metrics, **val_metrics, **scenarios})
    self._eval_records.append(eval_record)

    if log:
        log_msg = f"{eval_record.step=}: " + ", ".join(
            f"{k}={v}" for k, v in eval_record.results.items()
        )
        log.info(log_msg)

basic_llm_metrics(model, dl)

Compute basic LLM metrics for a dataloader.

Parameters:

Name Type Description Default
model Module

Model to use for inference.

required
dl DataLoader

Dataloader with data batches for inference.

required
Source code in src/llmz/evaluate.py
def basic_llm_metrics(model: nn.Module, dl: DataLoader) -> dict[str, float]:
    """Compute basic LLM metrics for a dataloader.

    Args:
        model: Model to use for inference.
        dl: Dataloader with data batches for inference.

    """
    loss = sum(
        f.cross_entropy(model(X).flatten(0, 1), y.flatten()).item() for X, y in dl
    ) / len(dl)
    return {"loss": loss}
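A sketch of how `Evaluator` and `basic_llm_metrics` fit together. Here `model`, `train_dl` and `val_dl` are assumed to exist already (a language model plus two DataLoaders yielding `(X, y)` token batches, e.g. from `GPTSmallTextDataset.create_data_loader`):

```python
from llmz.evaluate import Evaluator, basic_llm_metrics

# model, train_dl and val_dl are assumed to be defined elsewhere (see the dataset and model docs)
evaluator = Evaluator(train_dl, val_dl, metrics_fn=basic_llm_metrics)

evaluator.evaluate(step=100, model=model)

print(len(evaluator))        # 1
print(evaluator[0].step)     # 100
print(evaluator[0].results)  # {'train_loss': ..., 'val_loss': ...}
```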

Tools for text generation.

decode(token_logits, strategy='greedy', temperature=1.0, *, k=5)

Decode generative model output using the specified strategy.

Source code in src/llmz/generate.py
def decode(
    token_logits: torch.Tensor,
    strategy: Literal["greedy", "sample", "topk"] = "greedy",
    temperature: float = 1.0,
    *,
    k: int = 5,
) -> int:
    """Decode generative model output using the specified strategy."""
    match strategy:
        case "greedy":
            return _greedy_decoding(token_logits, temperature)
        case "topk":
            return _top_k_decoding(token_logits, temperature, k)
        case "sample":
            return _sample_decoding(token_logits, temperature)
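A small sketch of the three decoding strategies (the logits are made up, for a four-token vocabulary):

```python
import torch

from llmz.generate import decode

token_logits = torch.tensor([2.0, 0.5, -1.0, 0.1])

print(decode(token_logits))                                      # greedy decoding picks the highest-scoring token (index 0 here)
print(decode(token_logits, strategy="sample", temperature=0.8))  # sample a token, with temperature scaling
print(decode(token_logits, strategy="topk", k=2))                # restrict sampling to the k most likely tokens
```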

format_generated_words(text, prompt)

Format list of words into a readable paragraph.

Source code in src/llmz/generate.py
def format_generated_words(text: str, prompt: str) -> str:
    """Format list of words into a readable paragraph."""
    text = _capitalise_sentences(text, sentence_delimiter=". ")
    text = "==> " + prompt.upper().strip() + " " + text.strip()
    return "\n".join([line for line in wrap(text, width=89)])

generate(model, prompt, tokenizer, strategy='greedy', output_length=60, temperature=1.0, random_seed=42, device=torch.device('cpu'), *, k=2)

Generate new text conditional on a text prompt.

Source code in src/llmz/generate.py
def generate(
    model: nn.Module,
    prompt: str,
    tokenizer: _Tokenizer,
    strategy: Literal["greedy", "sample", "topk"] = "greedy",
    output_length: int = 60,
    temperature: float = 1.0,
    random_seed: int = 42,
    device: torch.device = torch.device("cpu"),
    *,
    k: int = 2,
) -> str:
    """Generate new text conditional on a text prompt."""
    torch.manual_seed(random_seed)

    model.to(device)
    model.eval()

    prompt_tokens = tokenizer(prompt)
    token_sequence = prompt_tokens.copy()
    for _ in range(output_length):
        x = torch.tensor([token_sequence], device=device)
        token_logits = model(x)
        token_pred = decode(token_logits[0, -1], strategy, temperature, k=k)
        token_sequence += [token_pred]

    new_token_sequence = token_sequence[len(prompt_tokens) :]
    return format_generated_words(tokenizer.tokens2text(new_token_sequence), prompt)
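A usage sketch wiring `generate` to an (untrained) GPT2 model and the pre-trained tokenizer; with an untrained model the output is gibberish, but the call pattern is the same after training. All hyper-parameter values are arbitrary:

```python
from llmz.generate import generate
from llmz.gpt2 import GPT2, GPT2Tokenizer

tokenizer = GPT2Tokenizer()
model = GPT2(vocab_size=50257, embed_dim=64, context_size=256, n_attn_heads=4, dropout=0.1)

text = generate(
    model, "every effort moves you", tokenizer, strategy="topk", output_length=20, k=5
)
print(text)
```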

print_wrapped(text, width=89)

Print text with word wrapping.

Source code in src/llmz/generate.py
def print_wrapped(text: str, width: int = 89) -> None:
    """Print text with word wrapping."""
    wrapped_text = "\n".join(wrap(text, width=width))
    print(wrapped_text)

Implementation of GPT2.

GPT2

Bases: Module

Implementation of OpenAI's GPT2 model.

Source code in src/llmz/gpt2.py
class GPT2(nn.Module):
    """Implementation of OpenAI's GPT2 model."""

    def __init__(
        self,
        vocab_size: int,
        embed_dim: int,
        context_size: int,
        n_tsfmr_blocks: int = 1,
        n_attn_heads: int = 1,
        dropout: float = 0.6,
        qkv_bias: bool = False,
    ):
        """Initialise model.

        Args:
            vocab_size: The number of unique tokens that the model expects to encounter.
            embed_dim: Dimension of input word embeddings.
            context_size: The number of input word embeddings in the sequence.
            n_tsfmr_blocks: The number of transformer blocks stacked together.
            n_attn_heads: The number of attention heads in every transformer block.
                Defaults to 1.
            dropout: The dropout rate. Defaults to 0.6.
            qkv_bias: Whether or not to include bias in the linear layers used to
                compute W_query, W_key and W_value. Defaults to False.

        """
        super().__init__()

        self.context_size = context_size
        self.token_embed = nn.Embedding(vocab_size, embed_dim)
        self.position_embed = nn.Embedding(context_size, embed_dim)
        self.dropout_embed = nn.Dropout(p=dropout)

        self.tsfmr_stack = nn.Sequential(
            *[
                TransformerBlockGPT2(
                    context_size, embed_dim, n_attn_heads, dropout, qkv_bias
                )
                for _ in range(n_tsfmr_blocks)
            ]
        )

        self.final_norm = LayerNormalisation(embed_dim)
        self.output_head = nn.Linear(embed_dim, vocab_size, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Execute the module's forward pass.

        Args:
            x: Batch of token embeddings.

        Returns:
            Batch of attention weighted embeddings.

        """
        seq_len = x.size()[1]
        if seq_len > self.context_size:
            msg = f"seq_len ({seq_len}) > context_size ({self.context_size})"
            raise GPT2InferenceError(msg)

        positions = torch.arange(0, seq_len, device=x.device)
        y = self.token_embed(x) + self.position_embed(positions)
        y = self.dropout_embed(y)
        y = self.tsfmr_stack(y)
        y = self.final_norm(y)
        logits = self.output_head(y)
        return logits

__init__(vocab_size, embed_dim, context_size, n_tsfmr_blocks=1, n_attn_heads=1, dropout=0.6, qkv_bias=False)

Initialise model.

Parameters:

Name Type Description Default
vocab_size int

The number of unique tokens that the model expects to encounter.

required
embed_dim int

Dimension of input word embeddings.

required
context_size int

The number of input word embeddings in the sequence.

required
n_tsfmr_blocks int

The number of transformer blocks stacked together.

1
n_attn_heads int

The number of attention heads in every transformer block. Defaults to 1.

1
dropout float

The dropout rate. Defaults to 0.6.

0.6
qkv_bias bool

Whether or not to include bias in the linear layers used to compute W_query, W_key and W_value. Defaults to False.

False
Source code in src/llmz/gpt2.py
def __init__(
    self,
    vocab_size: int,
    embed_dim: int,
    context_size: int,
    n_tsfmr_blocks: int = 1,
    n_attn_heads: int = 1,
    dropout: float = 0.6,
    qkv_bias: bool = False,
):
    """Initialise model.

    Args:
        vocab_size: The number of unique tokens that the model expects to encounter.
        embed_dim: Dimension of input word embeddings.
        context_size: The number of input word embeddings in the sequence.
        n_tsfmr_blocks: The number of transformer blocks stacked together.
        n_attn_heads: The number of attention heads in every transformer block.
            Defaults to 1.
        dropout: The dropout rate. Defaults to 0.6.
        qkv_bias: Whether or not to include bias in the linear layers used to
            compute W_query, W_key and W_value. Defaults to False.

    """
    super().__init__()

    self.context_size = context_size
    self.token_embed = nn.Embedding(vocab_size, embed_dim)
    self.position_embed = nn.Embedding(context_size, embed_dim)
    self.dropout_embed = nn.Dropout(p=dropout)

    self.tsfmr_stack = nn.Sequential(
        *[
            TransformerBlockGPT2(
                context_size, embed_dim, n_attn_heads, dropout, qkv_bias
            )
            for _ in range(n_tsfmr_blocks)
        ]
    )

    self.final_norm = LayerNormalisation(embed_dim)
    self.output_head = nn.Linear(embed_dim, vocab_size, bias=False)

forward(x)

Execute the module's forward pass.

Parameters:

Name Type Description Default
x Tensor

Batch of token embeddings.

required

Returns:

Type Description
Tensor

Batch of attention weighted embeddings.

Source code in src/llmz/gpt2.py
def forward(self, x: torch.Tensor) -> torch.Tensor:
    """Execute the module's forward pass.

    Args:
        x: Batch of token embeddings.

    Returns:
        Batch of attention weighted embeddings.

    """
    seq_len = x.size()[1]
    if seq_len > self.context_size:
        msg = f"seq_len ({seq_len}) > context_size ({self.context_size})"
        raise GPT2InferenceError(msg)

    positions = torch.arange(0, seq_len, device=x.device)
    y = self.token_embed(x) + self.position_embed(positions)
    y = self.dropout_embed(y)
    y = self.tsfmr_stack(y)
    y = self.final_norm(y)
    logits = self.output_head(y)
    return logits
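A minimal sketch of building the model from a validated config and running a forward pass over random token ids (all hyper-parameter values are arbitrary):

```python
import torch

from llmz.gpt2 import GPT2, GPT2Config

config = GPT2Config(
    vocab_size=50257, embed_dim=64, context_size=32, n_tsfmr_blocks=2, n_attn_heads=4, dropout=0.1
)
model = GPT2(**config)  # GPT2Config supports keyword argument expansion

tokens = torch.randint(0, config["vocab_size"], (2, 32))  # batch of 2 sequences of 32 token ids
logits = model(tokens)
print(logits.shape)  # torch.Size([2, 32, 50257]) - one logit per vocabulary entry per position
```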

GPT2Config dataclass

Container class for GPT2 model hyper-parameters.

This class will validate parameters and then allow GPT2 objects to be created using keyword argument expansion, e.g.:

config = GPT2Config(...)
model = GPT2(**config)

Parameters:

Name Type Description Default
vocab_size int

The number of unique tokens that the model expects to encounter.

required
embed_dim int

Dimension of input word embeddings.

required
context_size int

The number of input word embeddings in the sequence.

required
n_tsfmr_blocks int

The number of transformer blocks stacked together.

1
n_attn_heads int

The number of attention heads in every transformer block. Defaults to 1.

1
dropout float

The dropout rate. Defaults to 0.6.

0.6
qkv_bias bool

Whether or not to include bias in the linear layers used to compute W_query, W_key and W_value. Defaults to False.

False

Raises:

Type Description
GPT2ConfigError

if any int or float parameter is <= 0, or embed_dim % n_attn_heads != 0

Source code in src/llmz/gpt2.py
@dataclass(frozen=True)
class GPT2Config:
    """Container class for GPT2 model hyper-parameters.

    This class will validate parameters and then allow GPT2 objects to be created using
    keyword argument expansion, e.g.:

    ```python
    config = GPT2Config(...)
    model = GPT2(**config)
    ```

    Args:
        vocab_size: The number of unique tokens that the model expects to encounter.
        embed_dim: Dimension of input word embeddings.
        context_size: The number of input word embeddings in the sequence.
        n_tsfmr_blocks: The number of transformer blocks stacked together.
        n_attn_heads: The number of attention heads in every transformer block.
            Defaults to 1.
        dropout: The dropout rate. Defaults to 0.6.
        qkv_bias: Whether or not to include bias in the linear layers used to
            compute W_query, W_key and W_value. Defaults to False.

    Raises:
        GPT2ConfigError: if any int or float parameter is <= 0, or
            embed_dim % n_attn_heads != 0

    """

    vocab_size: int
    embed_dim: int
    context_size: int
    n_tsfmr_blocks: int = 1
    n_attn_heads: int = 1
    dropout: float = 0.6
    qkv_bias: bool = False

    def __post_init__(self) -> None:
        """Validate fields after initialisation."""
        errors: list[str] = []

        for field, value in self.__dict__.items():
            if type(value) in (int, float) and value <= 0:
                errors.append(f"{field} is not > 0")

        if self.embed_dim % self.n_attn_heads != 0:
            errors.append("embed_dim % n_attn_heads != 0")

        if errors:
            msg = "invalid GPT2 parameters: " + "\n ".join(errors)
            raise GPT2ConfigError(msg)

    def keys(self) -> KeysView[str]:
        """Get iterator of field keys.

        Part of Mapping protocol required to enable keyword argument expansion using the
        `**` operator.
        """
        return asdict(self).keys()

    def __getitem__(self, key: str) -> GPT2ConfigValue:
        """Get config value via its field name.

        Part of Mapping protocol required to enable keyword argument expansion using the
        `**` operator.
        """
        return asdict(self)[key]

    def __str__(self) -> str:
        """Format config as a string."""
        str_repr = "GPT2Config("
        for key, value in asdict(self).items():
            str_repr += f"{key}={value}, "
        str_repr = str_repr[: len(str_repr) - 2]  # remove final ', '
        str_repr += ")"
        return str_repr

    def __repr__(self) -> str:
        """Format config for the command line."""
        cli_repr = "GPT2Config(\n"
        for key, value in asdict(self).items():
            cli_repr += f"  {key}={value},\n"
        cli_repr += ")"
        return cli_repr

__getitem__(key)

Get config value via its field name.

Part of Mapping protocol required to enable keyword argument expansion using the ** operator.

Source code in src/llmz/gpt2.py
def __getitem__(self, key: str) -> GPT2ConfigValue:
    """Get config value via its field name.

    Part of Mapping protocol required to enable keyword argument expansion using the
    `**` operator.
    """
    return asdict(self)[key]

__post_init__()

Validate fields after initialisation.

Source code in src/llmz/gpt2.py
def __post_init__(self) -> None:
    """Validate fields after initialisation."""
    errors: list[str] = []

    for field, value in self.__dict__.items():
        if type(value) in (int, float) and value <= 0:
            errors.append(f"{field} is not > 0")

    if self.embed_dim % self.n_attn_heads != 0:
        errors.append("embed_dim % n_attn_heads != 0")

    if errors:
        msg = "invalid GPT2 parameters: " + "\n ".join(errors)
        raise GPT2ConfigError(msg)

__repr__()

Format config for the command line.

Source code in src/llmz/gpt2.py
def __repr__(self) -> str:
    """Format config for the command line."""
    cli_repr = "GPT2Config(\n"
    for key, value in asdict(self).items():
        cli_repr += f"  {key}={value},\n"
    cli_repr += ")"
    return cli_repr

__str__()

Format config as a string.

Source code in src/llmz/gpt2.py
def __str__(self) -> str:
    """Format config as a string."""
    str_repr = "GPT2Config("
    for key, value in asdict(self).items():
        str_repr += f"{key}={value}, "
    str_repr = str_repr[: len(str_repr) - 2]  # remove final ', '
    str_repr += ")"
    return str_repr

keys()

Get iterator of field keys.

Part of Mapping protocol required to enable keyword argument expansion using the ** operator.

Source code in src/llmz/gpt2.py
def keys(self) -> KeysView[str]:
    """Get iterator of field keys.

    Part of Mapping protocol required to enable keyword argument expansion using the
    `**` operator.
    """
    return asdict(self).keys()

GPT2ConfigError

Bases: Exception

Custom exception for GPT2 configuration errors.

Source code in src/llmz/gpt2.py
class GPT2ConfigError(Exception):
    """Custom exception for GPT2 inference errors."""

    pass

GPT2InferenceError

Bases: Exception

Custom exception for GPT2 inference errors.

Source code in src/llmz/gpt2.py
class GPT2InferenceError(Exception):
    """Custom exception for GPT2 inference errors."""

    pass

GPT2Tokenizer

Bases: _Tokenizer

Pre-trained version of GPT2's tokenizer.

Source code in src/llmz/gpt2.py
class GPT2Tokenizer(_Tokenizer):
    """Pre-trained version of GPT2's tokenizer."""

    def __init__(self) -> None:
        """Initialise tokenizer."""
        self._tokenizer = tiktoken.get_encoding("gpt2")

    def text2tokens(self, text: str) -> list[int]:
        """Map a string to a list of tokens."""
        return self._tokenizer.encode(text)

    def tokens2text(self, tokens: list[int]) -> str:
        """Map a list of tokens to a string.."""
        return self._tokenizer.decode(tokens)
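A round-trip sketch:

```python
from llmz.gpt2 import GPT2Tokenizer

tokenizer = GPT2Tokenizer()

tokens = tokenizer.text2tokens("Hello, world!")
print(tokens)                         # list of integer token ids from the pre-trained GPT2 vocabulary
print(tokenizer.tokens2text(tokens))  # 'Hello, world!'
```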

__init__()

Initialise tokenizer.

Source code in src/llmz/gpt2.py
def __init__(self) -> None:
    """Initialise tokenizer."""
    self._tokenizer = tiktoken.get_encoding("gpt2")

text2tokens(text)

Map a string to a list of tokens.

Source code in src/llmz/gpt2.py
def text2tokens(self, text: str) -> list[int]:
    """Map a string to a list of tokens."""
    return self._tokenizer.encode(text)

tokens2text(tokens)

Map a list of tokens to a string.

Source code in src/llmz/gpt2.py
def tokens2text(self, tokens: list[int]) -> str:
    """Map a list of tokens to a string.."""
    return self._tokenizer.decode(tokens)

Functions for training LLMs.

GradientClipCallback

Callable class that clips model gradient using max norm.

Source code in src/llmz/train.py
class GradientClipCallback:
    """Callable class that clips model gradient using max norm."""

    def __init__(self, clip_grad_norm: float = torch.inf):
        """Initialise."""
        self.clip_grad_norm = clip_grad_norm

    def __call__(self, model: nn.Module) -> None:
        """Clip model gradients."""
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=self.clip_grad_norm)
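A small self-contained sketch (the toy model is arbitrary); in practice the callback is passed to `train` via `model_backward_callbacks`:

```python
import torch
from torch import nn

from llmz.train import GradientClipCallback

clip_grads = GradientClipCallback(clip_grad_norm=1.0)

model = nn.Linear(4, 2)
loss = model(torch.randn(8, 4)).sum()
loss.backward()

clip_grads(model)  # gradient norm is now capped at 1.0
```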

__call__(model)

Clip model gradients.

Source code in src/llmz/train.py
def __call__(self, model: nn.Module) -> None:
    """Clip model gradients."""
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=self.clip_grad_norm)

__init__(clip_grad_norm=torch.inf)

Initialise.

Source code in src/llmz/train.py
def __init__(self, clip_grad_norm: float = torch.inf):
    """Initialise."""
    self.clip_grad_norm = clip_grad_norm

LinearWarmupCosineAnnealingLRSchedule

LR schedule using cosine annealing with linear warmup.

Source code in src/llmz/train.py
class LinearWarmupCosineAnnealingLRSchedule:
    """LR schedule using cosine annealing with linear warmup."""

    def __init__(
        self, num_steps: int, warmup_steps: int, initial_lr: float, peak_lr: float
    ):
        """Initialise.

        Args:
            num_steps: The total number of steps for the schedule.
            warmup_steps: Number of steps in the linear warmup phase.
            initial_lr: Learning rate at first step.
            peak_lr: Peak learning rate at end of warmup phase.

        """
        value_errors: list[str] = []
        if num_steps <= 0:
            value_errors.append(" * num_steps <= 0")
        if warmup_steps > num_steps:
            value_errors.append(" * warmup_steps > num_steps")
        if initial_lr <= 0.0:
            value_errors.append(" * initial_lr <= 0.0")
        if peak_lr < initial_lr:
            value_errors.append(" * peak_lr < initial_lr")

        if value_errors:
            e = ValueError("Invalid arguments for LR schedule")
            for error in value_errors:
                e.add_note(error)
            raise e

        self.num_steps = num_steps
        self.warmup_steps = warmup_steps
        self.cosine_steps = num_steps - warmup_steps
        self.initial_lr = initial_lr
        self.lr_cosine_delta = peak_lr - initial_lr
        self.lr_warmup_delta = (peak_lr - initial_lr) / warmup_steps

    def __call__(self, step: int) -> float:
        """Get learning rate for given step.

        Args:
            step: The global training step.

        Returns:
            The learning rate for the global training step.

        Raises:
            ValueError: If step < 0.

        """
        if step < 0:
            raise ValueError(f"{step=}, must be >= 0")
        elif step >= 0 and step < self.warmup_steps:
            lr = self.initial_lr + step * self.lr_warmup_delta
        elif step >= self.warmup_steps and step <= self.num_steps:
            step_cosine = step - self.warmup_steps
            x = math.pi * step_cosine / self.cosine_steps
            lr = self.initial_lr + self.lr_cosine_delta * 0.5 * (1.0 + math.cos(x))
        else:
            lr = self.initial_lr

        return lr
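A sketch of the schedule's shape (values chosen arbitrarily): the learning rate rises linearly from `initial_lr` to `peak_lr` over the warmup phase, then decays back to `initial_lr` following a cosine curve.

```python
from llmz.train import LinearWarmupCosineAnnealingLRSchedule

lr_schedule = LinearWarmupCosineAnnealingLRSchedule(
    num_steps=1000, warmup_steps=100, initial_lr=1e-4, peak_lr=1e-3
)

print(lr_schedule(0))     # 0.0001 (initial_lr)
print(lr_schedule(100))   # 0.001 (peak_lr at the end of warmup)
print(lr_schedule(1000))  # 0.0001 (back to initial_lr at the end of the schedule)
```

The callable can be passed directly to `train` as `lr_schedule`, which wraps it in a `LambdaLR` scheduler.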

__call__(step)

Get learning rate for given step.

Parameters:

Name Type Description Default
step int

The global training step.

required

Returns:

Type Description
float

The learning rate for the global training step.

Raises:

Type Description
ValueError

If step < 0.

Source code in src/llmz/train.py
def __call__(self, step: int) -> float:
    """Get learning rate for given step.

    Args:
        step: The global training step.

    Returns:
        The learning rate for the global training step.

    Raises:
        ValueError: If step < 0.

    """
    if step < 0:
        raise ValueError(f"{step=}, must be >= 0")
    elif step >= 0 and step < self.warmup_steps:
        lr = self.initial_lr + step * self.lr_warmup_delta
    elif step >= self.warmup_steps and step <= self.num_steps:
        step_cosine = step - self.warmup_steps
        x = math.pi * step_cosine / self.cosine_steps
        lr = self.initial_lr + self.lr_cosine_delta * 0.5 * (1.0 + math.cos(x))
    else:
        lr = self.initial_lr

    return lr

__init__(num_steps, warmup_steps, initial_lr, peak_lr)

Initialise.

Parameters:

Name Type Description Default
num_steps int

The total number of steps for the schedule.

required
warmup_steps int

Number of steps in the linear warmup phase.

required
initial_lr float

Learning rate at first step.

required
peak_lr float

Peak learning rate at end of warmup phase.

required
Source code in src/llmz/train.py
def __init__(
    self, num_steps: int, warmup_steps: int, initial_lr: float, peak_lr: float
):
    """Initialise.

    Args:
        num_steps: The total number of steps for the schedule.
        warmup_steps: Number of steps in the linear warmup phase.
        initial_lr: Learning rate at first step.
        peak_lr: Peak learning rate at end of warmup phase.

    """
    value_errors: list[str] = []
    if num_steps <= 0:
        value_errors.append(" * num_steps <= 0")
    if warmup_steps > num_steps:
        value_errors.append(" * warmup_steps > num_steps")
    if initial_lr <= 0.0:
        value_errors.append(" * initial_lr <= 0.0")
    if peak_lr < initial_lr:
        value_errors.append(" * peak_lr < initial_lr")

    if value_errors:
        e = ValueError("Invalid arguments for LR schedule")
        for error in value_errors:
            e.add_note(error)
        raise e

    self.num_steps = num_steps
    self.warmup_steps = warmup_steps
    self.cosine_steps = num_steps - warmup_steps
    self.initial_lr = initial_lr
    self.lr_cosine_delta = peak_lr - initial_lr
    self.lr_warmup_delta = (peak_lr - initial_lr) / warmup_steps

autoregressive_llm_loss(model, X_batch, y_batch)

Compute loss for AR LLMs like GPTs.

Parameters:

Name Type Description Default
model Module

The language model.

required
X_batch Tensor

Batch of input tokens.

required
y_batch Tensor

Batch of output tokens - i.e., next token from the input sequence.

required

Returns:

Type Description
Tensor

Mean cross-entropy loss for the batch.

Source code in src/llmz/train.py
def autoregressive_llm_loss(
    model: nn.Module, X_batch: torch.Tensor, y_batch: torch.Tensor
) -> torch.Tensor:
    """Compute loss for AR LLMs like GPTs.

    Args:
        model: The language model.
        X_batch: Batch of input tokens.
        y_batch: Batch of output tokens - i.e., next token from the input sequence.

    Returns:
        Mean cross-entropy loss for the batch.

    """
    # model outputs logits as softmax is implemented in cross-entropy calc.
    logits = model(X_batch)

    # flatten logits from [BATCH, SEQ_LEN, N_CLASSES] to [BATCH * SEQ_LEN, N_CLASSES]
    # flatten y_batch from [BATCH, SEQ_LEN] to [BATCH * SEQ_LEN]
    loss = nn.functional.cross_entropy(logits.flatten(0, 1), y_batch.flatten())
    return loss

train(model, loss_calc, optimiser, lr_schedule, train_dataloader, train_epochs, eval_freq_steps, evaluator, model_backward_callbacks=None, log_freq_steps=100, device=torch.device('cpu'))

Trains model.

Parameters:

Name Type Description Default
model Module

The PyTorch model to train.

required
loss_calc Callable[[Module, Tensor, Tensor], Tensor]

Function that calculates and returns loss for model and batch.

required
optimiser Optimizer

The optimizer for updating model parameters.

required
lr_schedule Callable[[int], float] | LRScheduler

Function to compute learning rate for training step.

required
train_dataloader DataLoader

DataLoader for training data.

required
train_epochs int

Number of training epochs.

required
eval_freq_steps int

Number of steps between evaluations.

required
evaluator Evaluator

A handler for all model evaluations.

required
model_backward_callbacks list[Callable[[Module], None]] | None

Optional callbacks for model after backward pass.

None
log_freq_steps int

Number of steps between basic progress logging to stdout. Defaults to 100.

100
device device

The processor to use for training. Defaults to CPU.

device('cpu')
Source code in src/llmz/train.py
def train(
    model: nn.Module,
    loss_calc: Callable[[nn.Module, torch.Tensor, torch.Tensor], torch.Tensor],
    optimiser: optim.Optimizer,
    lr_schedule: Callable[[int], float] | optim.lr_scheduler.LRScheduler,
    train_dataloader: DataLoader,
    train_epochs: int,
    eval_freq_steps: int,
    evaluator: Evaluator,
    model_backward_callbacks: list[Callable[[nn.Module], None]] | None = None,
    log_freq_steps: int = 100,
    device: torch.device = torch.device("cpu"),
) -> None:
    """Trains model.

    Args:
        model: The PyTorch model to train.
        loss_calc: Function that calculates and returns loss for model and batch.
        optimiser: The optimizer for updating model parameters.
        lr_schedule: Function to compute learning rate for training step.
        train_dataloader: DataLoader for training data.
        train_epochs: Number of training epochs.
        eval_freq_steps: Number of steps between evaluations.
        evaluator: A handler for all model evaluations.
        model_backward_callbacks: Optional callbacks for model after backward pass.
        log_freq_steps: Number of steps between basic progress logging to stdout.
            Defaults to 100.
        device: The processor to use for training. Defaults to CPU.

    """
    if not isinstance(lr_schedule, optim.lr_scheduler.LRScheduler):
        lr_schedule = optim.lr_scheduler.LambdaLR(optimiser, lr_schedule)

    model = model.to(device)
    step = 0

    for epoch in range(1, train_epochs + 1):
        for X_batch, y_batch in train_dataloader:
            X_batch = X_batch.to(device, non_blocking=True)
            y_batch = y_batch.to(device, non_blocking=True)

            step += 1
            model.train()
            optimiser.zero_grad()

            loss = loss_calc(model, X_batch, y_batch)
            loss.backward()

            if model_backward_callbacks:
                for callback in model_backward_callbacks:
                    callback(model)

            optimiser.step()
            lr_schedule.step()

            if step % log_freq_steps == 0:
                log.info(f"{step=}, {epoch=}")

            if step % eval_freq_steps == 0:
                evaluator.evaluate(step, model)
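Putting the pieces together, an end-to-end training sketch (the corpus path and all hyper-parameter values are illustrative; token batches must be integer-typed for the embedding layer):

```python
from torch import optim

from llmz.datasets import GPTSmallTextDataset
from llmz.evaluate import Evaluator, basic_llm_metrics
from llmz.gpt2 import GPT2, GPT2Config
from llmz.train import (
    GradientClipCallback,
    LinearWarmupCosineAnnealingLRSchedule,
    autoregressive_llm_loss,
    train,
)

text = open("corpus.txt").read()  # placeholder path
train_dl = GPTSmallTextDataset(text, max_length=32, stride=16).create_data_loader(batch_size=8)
val_dl = GPTSmallTextDataset(text, max_length=32, stride=32).create_data_loader(batch_size=8)

config = GPT2Config(vocab_size=50257, embed_dim=64, context_size=32, n_attn_heads=4, dropout=0.1)
model = GPT2(**config)

optimiser = optim.AdamW(model.parameters(), lr=1.0)  # LambdaLR multiplies this base lr by the schedule's output
lr_schedule = LinearWarmupCosineAnnealingLRSchedule(
    num_steps=1000, warmup_steps=100, initial_lr=1e-4, peak_lr=1e-3
)
evaluator = Evaluator(train_dl, val_dl, metrics_fn=basic_llm_metrics)

train(
    model,
    autoregressive_llm_loss,
    optimiser,
    lr_schedule,
    train_dl,
    train_epochs=2,
    eval_freq_steps=50,
    evaluator=evaluator,
    model_backward_callbacks=[GradientClipCallback(clip_grad_norm=1.0)],
    log_freq_steps=25,
)
```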