mirror of
https://github.com/codecrafters-io/build-your-own-x
synced 2026-07-02 16:59:25 +00:00
Merge 1603f8cdcb into 264b4547b3
This commit is contained in:
commit
87214d3600
3 changed files with 456 additions and 4 deletions
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
## Build your own <insert-technology-here>
|
||||
|
||||
This repository is a compilation of well-written, step-by-step guides for re-creating our favorite technologies from scratch.
|
||||
This repository is a compilation of well-written, step-by-step guides for re-creating our favorite technologies from scratch.
|
||||
|
||||
> *What I cannot create, I do not understand — Richard Feynman.*
|
||||
|
||||
|
|
@ -360,7 +360,7 @@ It's a great way to learn.
|
|||
* [**Go**: _How to build a regex engine from scratch_](https://rhaeguard.github.io/posts/regex)
|
||||
* [**JavaScript**: _Build a Regex Engine in Less than 40 Lines of Code_](https://nickdrane.com/build-your-own-regex/)
|
||||
* [**JavaScript**: _How to implement regular expressions in functional javascript using derivatives_](http://dpk.io/dregs/toydregs)
|
||||
* [**JavaScript**: _Implementing a Regular Expression Engine_](https://deniskyashif.com/2019/02/17/implementing-a-regular-expression-engine/)
|
||||
* [**JavaScript**: _Implementing a Regular Expression Engine_](https://deniskyashif.com/2019/02/17/implementing-a-regular-expression-engine/)
|
||||
* [**Perl**: _How Regexes Work_](https://perl.plover.com/Regex/article.html)
|
||||
* [**Python**: _Build Your Own Regular Expression Engines: Backtracking, NFA, DFA_](https://build-your-own.org/b2a/r0_intro)
|
||||
* [**Scala**: _No Magic: Regular Expressions_](https://rcoh.svbtle.com/no-magic-regular-expressions)
|
||||
|
|
@ -380,7 +380,7 @@ It's a great way to learn.
|
|||
* [**C**: _Let's build a shell!_](https://github.com/kamalmarhubi/shell-workshop)
|
||||
* [**C**: _Writing a UNIX Shell_](https://indradhanush.github.io/blog/writing-a-unix-shell-part-1/)
|
||||
* [**C**: _Build Your Own Shell_](https://github.com/tokenrove/build-your-own-shell)
|
||||
* [**C**: Write a shell in C](https://danishpraka.sh/posts/write-a-shell/)
|
||||
* [**C**: _Write a shell in C_](https://danishpraka.sh/posts/write-a-shell/)
|
||||
* [**Go**: _Writing a simple shell in Go_](https://sj14.gitlab.io/post/2018-07-01-go-unix-shell/)
|
||||
* [**Rust**: _Build Your Own Shell using Rust_](https://www.joshmcguigan.com/blog/build-your-own-shell-rust/)
|
||||
|
||||
|
|
@ -494,7 +494,7 @@ It's a great way to learn.
|
|||
* [**Rust**: _WebGL + Rust: Basic Water Tutorial_](https://www.chinedufn.com/3d-webgl-basic-water-tutorial/)
|
||||
* [**TypeScript**: _Tiny Package Manager: Learns how npm or Yarn works_](https://github.com/g-plane/tiny-package-manager)
|
||||
|
||||
## Contribute
|
||||
## Contribute
|
||||
* Submissions welcome, just send a PR, or [create an issue](https://github.com/codecrafters-io/build-your-own-x/issues/new)
|
||||
* Help us review [pending submissions](https://github.com/codecrafters-io/build-your-own-x/issues) by leaving comments and "reactions"
|
||||
|
||||
|
|
|
|||
118
scripts/test_validate_links.py
Normal file
118
scripts/test_validate_links.py
Normal file
|
|
@ -0,0 +1,118 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Tests for validate_links.py"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import tempfile
|
||||
import pytest
|
||||
from unittest.mock import patch, MagicMock
|
||||
from validate_links import (
|
||||
extract_markdown_links,
|
||||
extract_categories,
|
||||
check_single_link,
|
||||
get_archive_url,
|
||||
LinkCache,
|
||||
LinkResult,
|
||||
ValidationReport,
|
||||
validate_links,
|
||||
)
|
||||
|
||||
|
||||
class TestExtractMarkdownLinks:
    """Checks for ``extract_markdown_links`` on small inline markdown samples."""

    def test_basic_link(self):
        # One link on the first line -> one (text, url, line_number) tuple.
        found = extract_markdown_links("[Example](https://example.com)")
        assert len(found) == 1
        first = found[0]
        assert first == ("Example", "https://example.com", 1)

    def test_multiple_links(self):
        # Two lines, one link on each.
        source = "\n".join(["[A](https://a.com)", "[B](https://b.com)"])
        found = extract_markdown_links(source)
        assert len(found) == 2

    def test_ignores_non_http(self):
        # Relative targets are not http(s) URLs and must be dropped.
        found = extract_markdown_links("[Local](./local.md)")
        assert len(found) == 0

    def test_preserves_line_numbers(self):
        # Line numbers are 1-based; the link sits on the second line.
        source = "\n".join(["line1", "[Link](https://x.com)", "line3"])
        found = extract_markdown_links(source)
        assert found[0][2] == 2

    def test_multiple_links_per_line(self):
        # Both links on a single line are reported.
        found = extract_markdown_links("[A](https://a.com) and [B](https://b.com)")
        assert len(found) == 2
|
||||
|
||||
|
||||
class TestExtractCategories:
    """Checks that ``extract_categories`` groups links under their heading."""

    def test_categorizes_links(self):
        # Two level-2 headings, each followed by one link.
        source = "\n".join(
            [
                "## Web Server",
                "[Link](https://x.com)",
                "## Database",
                "[DB](https://db.com)",
            ]
        )
        grouped = extract_categories(source)
        assert "Web Server" in grouped
        assert "Database" in grouped

    def test_uncategorized(self):
        # A link that precedes any heading lands in the fallback bucket.
        grouped = extract_categories("[Link](https://x.com)")
        assert "Uncategorized" in grouped
|
||||
|
||||
|
||||
class TestGetArchiveUrl:
    """Checks the shape of the archive.org fallback URL."""

    def test_generates_archive_url(self):
        # The fallback lives under the Wayback prefix and still names the host.
        fallback = get_archive_url("https://example.com/page")
        assert fallback.startswith("https://web.archive.org/web/")
        assert "example.com" in fallback
|
||||
|
||||
|
||||
class TestLinkCache:
    """Round-trip tests for ``LinkCache``.

    Every test works inside pytest's per-test ``tmp_path`` directory, so no
    file is unlinked while still open and nothing is left behind when an
    assertion fails.
    """

    def test_set_and_get(self, tmp_path):
        cache = LinkCache(str(tmp_path / "cache.json"))
        cache.set("https://example.com", {"status_code": 200, "is_alive": True})
        result = cache.get("https://example.com")
        assert result is not None
        assert result["status_code"] == 200

    def test_cache_miss(self, tmp_path):
        # An existing but empty cache file must behave like an empty cache.
        cache_path = tmp_path / "cache.json"
        cache_path.write_text("", encoding="utf-8")
        cache = LinkCache(str(cache_path))
        result = cache.get("https://nonexistent.com")
        assert result is None

    def test_save_and_reload(self, tmp_path):
        path = str(tmp_path / "cache.json")
        cache = LinkCache(path)
        cache.set("https://test.com", {"status_code": 200, "is_alive": True})
        cache.save()

        # A second instance must see what the first one persisted.
        cache2 = LinkCache(path)
        result = cache2.get("https://test.com")
        assert result is not None
        assert result["status_code"] == 200
|
||||
|
||||
|
||||
class TestLinkResult:
    """Construction checks for the ``LinkResult`` dataclass."""

    def test_creation(self):
        # Optional fields keep their defaults when omitted.
        alive = LinkResult(
            url="https://x.com",
            status_code=200,
            is_alive=True,
            category="Test",
            line_number=1,
        )
        assert alive.is_alive is True
        assert alive.error is None

    def test_dead_link(self):
        # A failed check carries is_alive=False alongside its error text.
        dead = LinkResult(
            url="https://x.com",
            status_code=404,
            is_alive=False,
            category="Test",
            line_number=1,
            error="Not Found",
        )
        assert dead.is_alive is False
|
||||
|
||||
|
||||
class TestValidationReport:
    """Default-value checks for ``ValidationReport``."""

    def test_defaults(self):
        # A freshly built report starts with zeroed counters.
        empty = ValidationReport()
        assert empty.total_links == 0
        assert empty.alive_links == 0
|
||||
|
||||
|
||||
# Allow running this test module directly: hand the file to pytest in verbose mode.
if __name__ == "__main__":
    pytest.main([__file__, "-v"])
|
||||
334
scripts/validate_links.py
Normal file
334
scripts/validate_links.py
Normal file
|
|
@ -0,0 +1,334 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Tutorial Link Validator for build-your-own-x repository.
|
||||
|
||||
Validates all tutorial links in README.md, checking for dead links,
|
||||
categorizing results, and suggesting archive.org fallbacks.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import concurrent.futures
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass, field, asdict
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
|
||||
# Default configuration shared by the library functions and the CLI.
DEFAULT_TIMEOUT = 10  # timeout handed to urlopen() for each request
DEFAULT_MAX_WORKERS = 10  # thread-pool size used when checking links
DEFAULT_CACHE_FILE = ".link_cache.json"  # where LinkCache persists results
USER_AGENT = "Mozilla/5.0 (build-your-own-x link checker)"  # sent with every request
ARCHIVE_ORG_PREFIX = "https://web.archive.org/web/"  # base of archive fallback URLs
|
||||
|
||||
|
||||
@dataclass
class LinkResult:
    """Outcome of probing one URL found in the README."""

    # Where the link came from and what was probed.
    url: str
    # HTTP status of the response; this file stores 0 when no HTTP response
    # was obtained (connection-level errors).
    status_code: int
    is_alive: bool
    # Heading text the link was grouped under.
    category: str
    # 1-based line of the link in the scanned file.
    line_number: int
    # Failure reason, if any.
    error: Optional[str] = None
    # archive.org fallback suggested for links that are not alive.
    archive_url: Optional[str] = None
    # Elapsed time of the check, in seconds.
    response_time: float = 0.0
|
||||
|
||||
|
||||
@dataclass
class ValidationReport:
    """Aggregate counters and per-link results for one validation run."""

    # Counters.
    total_links: int = 0
    alive_links: int = 0
    dead_links: int = 0
    skipped_links: int = 0
    errors: int = 0
    # One LinkResult per checked link.
    results: List[LinkResult] = field(default_factory=list)
    # Category name -> number of links; missing keys read as 0.
    categories: Dict[str, int] = field(default_factory=lambda: defaultdict(int))
    # Wall-clock bounds of the run, as time.time() values.
    start_time: float = 0.0
    end_time: float = 0.0
|
||||
|
||||
|
||||
class LinkCache:
    """JSON-file-backed cache of link check results.

    Entries are plain dicts keyed by the MD5 hex digest of the URL. Each
    stored entry carries a ``timestamp`` (``time.time()`` at insertion) that
    ``get`` uses to expire old results.
    """

    def __init__(self, cache_file: str = DEFAULT_CACHE_FILE):
        """Remember ``cache_file`` and load any entries it already holds."""
        self.cache_file = cache_file
        self.cache: Dict[str, dict] = {}
        self._load()

    def _load(self):
        """Populate ``self.cache`` from disk, starting empty on any problem."""
        if not os.path.exists(self.cache_file):
            return
        try:
            with open(self.cache_file, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (ValueError, OSError):
            # ValueError covers both json.JSONDecodeError and
            # UnicodeDecodeError; an unreadable cache is simply discarded.
            data = None
        # Valid JSON that is not an object (e.g. a list) would break set().
        self.cache = data if isinstance(data, dict) else {}

    def save(self):
        """Write the whole cache to ``cache_file`` as indented JSON."""
        with open(self.cache_file, "w", encoding="utf-8") as f:
            json.dump(self.cache, f, indent=2)

    def _key(self, url: str) -> str:
        """Return the cache key for ``url`` (MD5 hex digest; not security-related)."""
        return hashlib.md5(url.encode()).hexdigest()

    def get(self, url: str, max_age: int = 86400) -> Optional[dict]:
        """Return the cached entry for ``url`` if it is newer than ``max_age`` seconds.

        Returns ``None`` when there is no entry, the entry is malformed, or
        it has expired.
        """
        entry = self.cache.get(self._key(url))
        if not isinstance(entry, dict):
            return None
        stamp = entry.get("timestamp", 0)
        # A hand-edited or corrupted file may hold a non-numeric timestamp.
        if isinstance(stamp, bool) or not isinstance(stamp, (int, float)):
            return None
        if time.time() - stamp < max_age:
            return entry
        return None

    def set(self, url: str, result: dict):
        """Store a timestamped copy of ``result`` under ``url``.

        A copy is stored so the caller's dict is not modified.
        """
        self.cache[self._key(url)] = {**result, "timestamp": time.time()}
|
||||
|
||||
|
||||
def extract_markdown_links(text: str) -> List[Tuple[str, str, int]]:
    """Extract inline markdown links that point at http(s) URLs.

    Args:
        text: Markdown source to scan.

    Returns:
        A list of ``(link_text, url, line_number)`` tuples in document order,
        with 1-based line numbers. Targets that do not start with
        ``http://`` or ``https://`` are skipped.

    The destination may contain one level of balanced parentheses (as in
    ``https://en.wikipedia.org/wiki/Rope_(data_structure)``), and an optional
    link title after the destination (``[t](url "Title")``) is dropped
    rather than folded into the URL.
    """
    # [text](destination) where destination allows one nested "(...)" pair.
    link_re = re.compile(r'\[([^\]]+)\]\(((?:[^()]|\([^()]*\))+)\)')
    # destination followed by a "double", 'single' or (parenthesised) title.
    title_re = re.compile(r'^(\S+)\s+(?:"[^"]*"|\'[^\']*\'|\([^()]*\))$')

    links = []
    for line_num, line in enumerate(text.splitlines(), 1):
        for match in link_re.finditer(line):
            target = match.group(2).strip()
            titled = title_re.match(target)
            url = titled.group(1) if titled else target
            if url.startswith(("http://", "https://")):
                links.append((match.group(1), url, line_num))
    return links
|
||||
|
||||
|
||||
def extract_categories(text: str) -> Dict[str, List[Tuple[str, str, int]]]:
    """Group the http(s) links of a markdown document by preceding heading.

    Args:
        text: Markdown source to scan.

    Returns:
        A dict mapping heading text to a list of
        ``(link_text, url, line_number)`` tuples (1-based line numbers).
        Links that appear before any heading are filed under
        ``"Uncategorized"``. Headings with no links do not appear.

    Any ATX heading level (``#`` to ``######``) starts a new category, so
    links under level 4-6 headings are no longer attributed to the nearest
    shallower heading. Heading lines themselves are not scanned for links.
    Link destinations may contain one level of balanced parentheses, and an
    optional link title after the destination is dropped.
    """
    heading_re = re.compile(r'^#{1,6}\s+(.+)')
    # [text](destination) where destination allows one nested "(...)" pair.
    link_re = re.compile(r'\[([^\]]+)\]\(((?:[^()]|\([^()]*\))+)\)')
    # destination followed by a "double", 'single' or (parenthesised) title.
    title_re = re.compile(r'^(\S+)\s+(?:"[^"]*"|\'[^\']*\'|\([^()]*\))$')

    categories = defaultdict(list)
    current_category = "Uncategorized"

    for line_num, line in enumerate(text.splitlines(), 1):
        header_match = heading_re.match(line)
        if header_match:
            current_category = header_match.group(1).strip()
            continue

        for match in link_re.finditer(line):
            target = match.group(2).strip()
            titled = title_re.match(target)
            url = titled.group(1) if titled else target
            if url.startswith(("http://", "https://")):
                categories[current_category].append((match.group(1), url, line_num))

    return dict(categories)
|
||||
|
||||
|
||||
def check_single_link(url: str, timeout: int = DEFAULT_TIMEOUT) -> Tuple[int, bool, Optional[str], float]:
    """Check whether a single URL is reachable.

    A ``HEAD`` request is tried first. If the server answers it with 403,
    405 or 501, the check is retried once with ``GET``, because some servers
    refuse ``HEAD`` while serving the page normally. The response body is
    not read here.

    Args:
        url: The URL to probe.
        timeout: Timeout passed to ``urlopen`` for each request.

    Returns:
        ``(status_code, is_alive, error, response_time)``. ``status_code`` is
        0 when no HTTP response was obtained; ``error`` is ``None`` on
        success; ``response_time`` is the total elapsed seconds, including
        the fallback request if one was made.

    Never raises: every exception is reported through the returned tuple.
    """
    # Imported explicitly: the module only imports urllib.request/parse and
    # would otherwise rely on urllib.request importing urllib.error for us.
    from urllib.error import HTTPError, URLError

    # Monotonic clock so the elapsed time is immune to wall-clock changes.
    start = time.monotonic()
    status_code: int = 0
    error: Optional[str] = None

    for method in ("HEAD", "GET"):
        try:
            req = urllib.request.Request(url, method=method, headers={"User-Agent": USER_AGENT})
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                return resp.status, True, None, time.monotonic() - start
        except HTTPError as e:
            status_code, error = e.code, str(e.reason)
            # Only a rejected HEAD is worth a second attempt with GET.
            if method == "HEAD" and e.code in (403, 405, 501):
                continue
            break
        except URLError as e:
            status_code, error = 0, str(e.reason)
            break
        except Exception as e:
            # Malformed URLs, socket timeouts, etc.: report, don't crash.
            status_code, error = 0, str(e)
            break

    return status_code, False, error, time.monotonic() - start
|
||||
|
||||
|
||||
def get_archive_url(url: str) -> Optional[str]:
    """Build the archive.org fallback address for ``url``.

    The URL is percent-encoded in full (no characters are treated as safe)
    and appended to ``ARCHIVE_ORG_PREFIX``.
    """
    quoted_target = urllib.parse.quote(url, safe="")
    fallback = f"{ARCHIVE_ORG_PREFIX}{quoted_target}"
    return fallback
|
||||
|
||||
|
||||
def check_link_with_cache(
    url: str,
    category: str,
    line_number: int,
    cache: LinkCache,
    timeout: int = DEFAULT_TIMEOUT,
) -> LinkResult:
    """Check a link, reusing a cached result when one is available.

    Args:
        url: The URL to check.
        category: Heading the link was found under; copied into the result.
        line_number: Line of the link in the source file; copied into the result.
        cache: Cache consulted before, and updated after, a live check.
        timeout: Timeout forwarded to ``check_single_link``.

    Returns:
        A ``LinkResult`` built from the cache entry or from a fresh check.
        Links that are not alive get an archive.org fallback URL.

    Only definitive outcomes are cached. A status code of 0 means no HTTP
    response was obtained (DNS failure, timeout, ...), so such results are
    not stored and the link is re-checked on the next run.
    """
    cached = cache.get(url)
    if cached:
        return LinkResult(
            url=url,
            status_code=cached.get("status_code", 0),
            is_alive=cached.get("is_alive", False),
            category=category,
            line_number=line_number,
            error=cached.get("error"),
            archive_url=cached.get("archive_url"),
            response_time=cached.get("response_time", 0.0),
        )

    status_code, is_alive, error, response_time = check_single_link(url, timeout)
    archive_url = None if is_alive else get_archive_url(url)

    outcome = {
        "status_code": status_code,
        "is_alive": is_alive,
        "error": error,
        "archive_url": archive_url,
        "response_time": response_time,
    }

    # Skip caching transient, connection-level failures (status_code == 0).
    if is_alive or status_code:
        cache.set(url, dict(outcome))

    return LinkResult(url=url, category=category, line_number=line_number, **outcome)
|
||||
|
||||
|
||||
def validate_links(
    readme_path: str,
    max_workers: int = DEFAULT_MAX_WORKERS,
    timeout: int = DEFAULT_TIMEOUT,
    cache_file: str = DEFAULT_CACHE_FILE,
    verbose: bool = False,
) -> ValidationReport:
    """Validate all http(s) links in a markdown file.

    Args:
        readme_path: Path of the markdown file to scan (read as UTF-8).
        max_workers: Size of the thread pool used to check links.
        timeout: Per-request timeout forwarded to the link checker.
        cache_file: Path of the JSON cache file.
        verbose: When true, print the link count and periodic progress.

    Returns:
        A ``ValidationReport`` with counters, per-category link counts and
        one ``LinkResult`` per link, in completion order.

    Raises:
        OSError: If ``readme_path`` cannot be read or the cache cannot be saved.
    """
    report = ValidationReport(start_time=time.time())

    with open(readme_path, "r", encoding="utf-8") as f:
        content = f.read()

    categories = extract_categories(content)
    cache = LinkCache(cache_file)

    # Flatten all links, remembering the category each one came from.
    all_links = []
    for cat, links in categories.items():
        # Each category appears once, so its count is simply its link total.
        report.categories[cat] = len(links)
        all_links.extend((url, cat, line_num) for _, url, line_num in links)

    report.total_links = len(all_links)
    checked = 0

    if verbose:
        print(f"Found {report.total_links} links across {len(categories)} categories")

    # Check links concurrently.
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(check_link_with_cache, url, cat, line_num, cache, timeout): (url, cat, line_num)
            for url, cat, line_num in all_links
        }

        for future in concurrent.futures.as_completed(futures):
            url, cat, line_num = futures[future]
            try:
                result = future.result()
            except Exception as exc:
                # One misbehaving check must not abort the whole run; record
                # it as a failed link so it still shows up in the report.
                result = LinkResult(
                    url=url,
                    status_code=0,
                    is_alive=False,
                    category=cat,
                    line_number=line_num,
                    error=f"{type(exc).__name__}: {exc}",
                )
            report.results.append(result)

            if result.is_alive:
                report.alive_links += 1
            else:
                # Every non-alive link is dead; those with an error message
                # are additionally counted under errors.
                report.dead_links += 1
                if result.error:
                    report.errors += 1

            checked += 1
            if verbose and checked % 10 == 0:
                print(f"Progress: {checked}/{report.total_links} links checked")

    cache.save()
    report.end_time = time.time()
    return report
|
||||
|
||||
|
||||
def print_report(report: ValidationReport):
    """Write a human-readable summary of ``report`` to standard output.

    Prints the overall counters and duration, then the per-category link
    counts sorted by category, then one entry per link that is not alive.
    """
    rule = "=" * 60
    elapsed = report.end_time - report.start_time

    # Header and totals.
    print("\n" + rule)
    print("LINK VALIDATION REPORT")
    print(rule)
    print(f"Total links checked: {report.total_links}")
    print(f"Alive: {report.alive_links}")
    print(f"Dead: {report.dead_links}")
    print(f"Errors: {report.errors}")
    print(f"Duration: {elapsed:.1f}s")
    print()

    # Per-category summary.
    print("Categories:")
    for name in sorted(report.categories.items()):
        cat, count = name
        print(f" {cat}: {count} links")
    print()

    # Details for every link that failed its check.
    failed = [entry for entry in report.results if not entry.is_alive]
    if not failed:
        return
    print("Dead Links:")
    for r in failed:
        print(f" Line {r.line_number}: {r.url}")
        print(f" Status: {r.status_code} | Error: {r.error}")
        if r.archive_url:
            print(f" Archive: {r.archive_url}")
        print()
|
||||
|
||||
|
||||
def main():
    """Command-line entry point.

    Parses the arguments, validates the links of the given README, prints the
    report and optionally writes it as JSON. Exits with status 1 when the
    README is missing or any dead link was found, 0 otherwise. Invalid
    ``--workers`` / ``--timeout`` values are rejected through argparse
    (exit status 2) instead of failing later in the run.
    """
    parser = argparse.ArgumentParser(description="Validate tutorial links in README.md")
    parser.add_argument("readme", nargs="?", default="README.md", help="Path to README.md")
    parser.add_argument("--workers", type=int, default=DEFAULT_MAX_WORKERS, help="Max concurrent workers")
    parser.add_argument("--timeout", type=int, default=DEFAULT_TIMEOUT, help="Request timeout in seconds")
    parser.add_argument("--cache-file", default=DEFAULT_CACHE_FILE, help="Cache file path")
    parser.add_argument("--no-cache", action="store_true", help="Disable caching")
    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
    parser.add_argument("--json-output", help="Write results as JSON to file")
    args = parser.parse_args()

    # ThreadPoolExecutor raises ValueError for max_workers <= 0, and a
    # non-positive timeout makes every request fail; reject both up front.
    if args.workers < 1:
        parser.error("--workers must be at least 1")
    if args.timeout <= 0:
        parser.error("--timeout must be a positive number of seconds")

    if not os.path.exists(args.readme):
        print(f"Error: {args.readme} not found", file=sys.stderr)
        sys.exit(1)

    # With --no-cache the cache is pointed at the null device: nothing useful
    # is read from it and whatever is saved is discarded.
    cache_file = os.devnull if args.no_cache else args.cache_file

    report = validate_links(
        args.readme,
        max_workers=args.workers,
        timeout=args.timeout,
        cache_file=cache_file,
        verbose=args.verbose,
    )

    print_report(report)

    if args.json_output:
        with open(args.json_output, "w", encoding="utf-8") as f:
            json.dump({
                "total": report.total_links,
                "alive": report.alive_links,
                "dead": report.dead_links,
                "errors": report.errors,
                "results": [asdict(r) for r in report.results],
            }, f, indent=2)
        print(f"JSON report written to {args.json_output}")

    # Non-zero exit status lets CI fail when dead links are present.
    sys.exit(1 if report.dead_links > 0 else 0)


if __name__ == "__main__":
    main()
|
||||
Loading…
Reference in a new issue