API Reference

Tokenizers for LLMs.

GPTSmallTextDataset

Bases: Dataset

GPT dataset interface for any 'small' text data.

This will tokenize all text in-memory using GPT-2's tokenization algorithm, which is a pre-trained Byte Pair Encoding (BPE).
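
A quick way to see what that tokenizer does (a minimal sketch; the example string and printed IDs are illustrative):

```python
import tiktoken

# The same pre-trained GPT-2 BPE encoding the dataset uses internally.
tokenizer = tiktoken.get_encoding("gpt2")

ids = tokenizer.encode("Hello, world!")
print(ids)                    # e.g. [15496, 11, 995, 0]
print(tokenizer.decode(ids))  # round-trips back to "Hello, world!"
```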

Source code in src/llmz/datasets.py
class GPTSmallTextDataset(Dataset):
    """GPT dataset interface for any 'small' text data.

    This will tokenize all text in-memory using GPT-2's tokenization algorithm, which
    is a pre-trained Byte Pair Encoding (BPE).
    """

    def __init__(self, text: str, max_length: int = 256, stride: int = 128):
        """Initialise.

        Args:
            text: Raw text data to convert into tokens.
            max_length: Number of tokens for each data instance. Defaults to 256.
            stride: Separation (in tokens) between consecutive instances. Defaults to
                128.

        """
        tokenizer = tiktoken.get_encoding("gpt2")
        tokens = tokenizer.encode(text)

        n_tokens = len(tokens)
        n_instances = (n_tokens - max_length) // stride
        if n_instances <= 0:
            raise RuntimeError("number of tokens must be >= max_length + stride")

        # Token IDs are integers, so store them in long tensors (required for
        # embedding lookups downstream).
        self._X = torch.zeros((n_instances, max_length), dtype=torch.long)
        self._y = torch.zeros((n_instances, max_length), dtype=torch.long)

        # Slide a window of max_length tokens across the text in steps of
        # stride; targets are the inputs shifted one token to the right.
        for n, i in enumerate(range(0, n_instances * stride, stride)):
            self._X[n,] = torch.tensor(tokens[i : i + max_length])
            self._y[n,] = torch.tensor(tokens[i + 1 : i + max_length + 1])

    def create_data_loader(
        self,
        batch_size: int = 4,
        shuffle: bool = True,
        drop_last: bool = True,
        num_workers: int = 0,
    ) -> DataLoader:
        """Create data loader.

        Args:
            batch_size: The batch size. Defaults to 4.
            shuffle: Whether to randomise instance order after each iteration. Defaults
                to True.
            drop_last: Drop the last batch if it has fewer than `batch_size`
                instances. Defaults to True.
            num_workers: Number of CPU processes to use for pre-processing. Defaults to
                0.

        Returns:
            A fully configured DataLoader

        """
        return DataLoader(
            self,
            batch_size=batch_size,
            shuffle=shuffle,
            drop_last=drop_last,
            num_workers=num_workers,
        )

    def __len__(self) -> int:
        return self._X.size(0)

    def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]:
        return self._X[idx,], self._y[idx,]
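
A minimal usage sketch (the import path is inferred from the source path above; the sample text is illustrative):

```python
from llmz.datasets import GPTSmallTextDataset

text = "some long training document " * 200  # any raw string works
dataset = GPTSmallTextDataset(text, max_length=32, stride=16)

X, y = dataset[0]
print(len(dataset))      # number of (input, target) instances
print(X.shape, y.shape)  # both torch.Size([32]); y is X shifted right by one token
```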

__init__(text, max_length=256, stride=128)

Initialise.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| text | str | Raw text data to convert into tokens. | required |
| max_length | int | Number of tokens for each data instance. | 256 |
| stride | int | Separation (in tokens) between consecutive instances. | 128 |
Source code in src/llmz/datasets.py
def __init__(self, text: str, max_length: int = 256, stride: int = 128):
    """Initialise.

    Args:
        text: Raw text data to convert into tokens.
        max_length: Number of tokens for each data instance. Defaults to 256.
        stride: Separation (in tokens) between consecutive instances. Defaults to
            128.

    """
    tokenizer = tiktoken.get_encoding("gpt2")
    tokens = tokenizer.encode(text)

    n_tokens = len(tokens)
    n_instances = (n_tokens - max_length) // stride
    if n_instances <= 0:
        raise RuntimeError("number of tokens must be >= max_length + stride")

    # Token IDs are integers, so store them in long tensors (required for
    # embedding lookups downstream).
    self._X = torch.zeros((n_instances, max_length), dtype=torch.long)
    self._y = torch.zeros((n_instances, max_length), dtype=torch.long)

    # Slide a window of max_length tokens across the text in steps of
    # stride; targets are the inputs shifted one token to the right.
    for n, i in enumerate(range(0, n_instances * stride, stride)):
        self._X[n,] = torch.tensor(tokens[i : i + max_length])
        self._y[n,] = torch.tensor(tokens[i + 1 : i + max_length + 1])
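
For intuition, the windowing arithmetic works as follows (the numbers below are illustrative, not from the source):

```python
# 1000 stand-in token IDs with the default window settings.
tokens = list(range(1000))
max_length, stride = 256, 128

n_instances = (len(tokens) - max_length) // stride  # (1000 - 256) // 128 = 5
for i in range(0, n_instances * stride, stride):    # i = 0, 128, 256, 384, 512
    X = tokens[i : i + max_length]          # input window
    y = tokens[i + 1 : i + max_length + 1]  # same window, shifted right by one
```

A stride smaller than max_length produces overlapping windows; stride equal to max_length tiles the text with no overlap.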

create_data_loader(batch_size=4, shuffle=True, drop_last=True, num_workers=0)

Create data loader.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| batch_size | int | The batch size. | 4 |
| shuffle | bool | Whether to randomise instance order after each iteration. | True |
| drop_last | bool | Drop the last batch if it has fewer than batch_size instances. | True |
| num_workers | int | Number of CPU processes to use for pre-processing. | 0 |

Returns:

| Type | Description |
| --- | --- |
| DataLoader | A fully configured DataLoader. |

Source code in src/llmz/datasets.py
def create_data_loader(
    self,
    batch_size: int = 4,
    shuffle: bool = True,
    drop_last: bool = True,
    num_workers: int = 0,
) -> DataLoader:
    """Create data loader.

    Args:
        batch_size: The batch size. Defaults to 4.
        shuffle: Whether to randomise instance order after each iteration. Defaults
            to True.
        drop_last: Drop the last batch if it has fewer than `batch_size`
            instances. Defaults to True.
        num_workers: Number of CPU processes to use for pre-processing. Defaults to
            0.

    Returns:
        A fully configured DataLoader

    """
    return DataLoader(
        self,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )
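
A short sketch of wiring this into a training loop (hyperparameter values are illustrative):

```python
from llmz.datasets import GPTSmallTextDataset

text = "some long training document " * 200  # illustrative raw text
dataset = GPTSmallTextDataset(text, max_length=32, stride=16)
loader = dataset.create_data_loader(batch_size=4, shuffle=True)

for X_batch, y_batch in loader:
    # Both tensors have shape (batch_size, max_length). X_batch feeds the
    # model; y_batch holds the next-token targets for the loss.
    print(X_batch.shape, y_batch.shape)
    break
```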