mirror of
https://github.com/codecrafters-io/build-your-own-x
synced 2026-07-02 16:59:25 +00:00
Implements a character-level GPT-style Transformer:
- model.py: CausalSelfAttention, FeedForward, TransformerBlock, LLM
- tokenizer.py: CharTokenizer (char -> int mapping)
- train.py: training loop with AdamW, gradient clipping, checkpointing, sampling
- generate.py: load checkpoint and generate text from a prompt

Verified working on a built-in Shakespeare excerpt (805k-parameter model).

https://claude.ai/code/session_01SWXLQb3nFTiygbp74dpjVa
20 lines
615 B
Python
20 lines
615 B
Python
"""
Character-level tokenizer.

Maps every unique character in the training corpus to an integer id.
Simple, requires no external libraries, and good enough for a tiny LLM.
"""


class CharTokenizer:
    """Character-level tokenizer built from a training corpus.

    Each distinct character in the corpus is assigned an integer id equal to
    its position in the sorted list of distinct characters, so the mapping is
    deterministic for a given corpus.

    Attributes:
        vocab_size: Number of distinct characters found in the corpus.
    """

    def __init__(self, text: str):
        """Build the character <-> id tables from ``text``.

        Args:
            text: Corpus whose distinct characters define the vocabulary.
                An empty string yields an empty vocabulary.
        """
        # Sorting makes the id assignment independent of set iteration order.
        chars = sorted(set(text))
        self.vocab_size = len(chars)
        self._stoi = {ch: i for i, ch in enumerate(chars)}
        self._itos = dict(enumerate(chars))

    def encode(self, text: str) -> list[int]:
        """Return the list of ids for the characters of ``text``.

        Raises:
            KeyError: If ``text`` contains a character that was not present
                in the corpus the tokenizer was built from.
        """
        try:
            return [self._stoi[ch] for ch in text]
        except KeyError as err:
            # Same exception type as a plain dict lookup, but with a message
            # that says what went wrong instead of only echoing the key.
            raise KeyError(
                f"character {err.args[0]!r} is not in the tokenizer vocabulary"
            ) from None

    def decode(self, ids: list[int]) -> str:
        """Return the string spelled by the sequence of ids ``ids``.

        Raises:
            KeyError: If ``ids`` contains an id outside the vocabulary.
        """
        try:
            return "".join(self._itos[i] for i in ids)
        except KeyError as err:
            raise KeyError(
                f"id {err.args[0]!r} is not in the tokenizer vocabulary"
            ) from None