# build-your-own-x/llm/tokenizer.py

"""
Character-level tokenizer.

Maps every unique character in the training corpus to an integer id.
Simple, requires no external libraries, and good enough for a tiny LLM.
"""


class CharTokenizer:
    """Character-level tokenizer built from a training corpus.

    Every distinct character in the corpus is assigned an integer id equal
    to its position in the sorted list of distinct characters, so ids are
    contiguous in ``range(vocab_size)`` and deterministic for a given corpus.

    Attributes:
        vocab_size: Number of distinct characters found in the corpus.
    """

    def __init__(self, text: str):
        """Build the vocabulary from ``text``.

        Args:
            text: Training corpus; each unique character becomes one token.
        """
        # Sorting makes the char -> id assignment independent of set ordering.
        chars = sorted(set(text))
        self.vocab_size = len(chars)
        self._stoi = {ch: i for i, ch in enumerate(chars)}
        self._itos = {i: ch for i, ch in enumerate(chars)}

    def encode(self, text: str) -> list[int]:
        """Convert ``text`` into a list of token ids, one per character.

        Args:
            text: String to encode.

        Returns:
            The ids of the characters of ``text``, in order.

        Raises:
            KeyError: If ``text`` contains a character that was not present
                in the corpus used to build this tokenizer.
        """
        try:
            return [self._stoi[ch] for ch in text]
        except KeyError as err:
            # Same exception type as a raw dict miss, but say what was missing.
            raise KeyError(
                f"character {err.args[0]!r} is not in the tokenizer vocabulary"
            ) from None

    def decode(self, ids: list[int]) -> str:
        """Convert a sequence of token ids back into a string.

        Args:
            ids: Token ids as produced by :meth:`encode`.

        Returns:
            The characters for ``ids`` concatenated in order.

        Raises:
            KeyError: If any id is not a known token id.
        """
        try:
            return "".join(self._itos[i] for i in ids)
        except KeyError as err:
            # Same exception type as a raw dict miss, but say what was missing.
            raise KeyError(
                f"token id {err.args[0]!r} is not in the tokenizer vocabulary "
                f"(vocab_size={self.vocab_size})"
            ) from None