mirror of
https://github.com/codecrafters-io/build-your-own-x
synced 2026-07-02 16:59:25 +00:00
- Add scripts/validate_links.py with category-based link checking, dead link detection, archive.org fallback suggestions, concurrent checking with threading, and result caching - Add scripts/test_validate_links.py with comprehensive pytest tests - Fix formatting issue in README.md Signed-off-by: Srikanth Patchava <spatchava@meta.com>
334 lines
11 KiB
Python
334 lines
11 KiB
Python
#!/usr/bin/env python3
|
|
"""Tutorial Link Validator for build-your-own-x repository.
|
|
|
|
Validates all tutorial links in README.md, checking for dead links,
|
|
categorizing results, and suggesting archive.org fallbacks.
|
|
"""
|
|
|
|
import argparse
|
|
import concurrent.futures
|
|
import hashlib
|
|
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
import time
|
|
import urllib.parse
|
|
import urllib.request
|
|
from collections import defaultdict
|
|
from dataclasses import dataclass, field, asdict
|
|
from typing import Dict, List, Optional, Tuple
|
|
|
|
|
|
# Default configuration

# Per-request timeout in seconds, handed to urllib.request.urlopen.
DEFAULT_TIMEOUT = 10
# Default size of the thread pool used to check links concurrently.
DEFAULT_MAX_WORKERS = 10
# Default path of the JSON file in which link check results are cached.
DEFAULT_CACHE_FILE = ".link_cache.json"
# User-Agent header sent with every link check request.
USER_AGENT = "Mozilla/5.0 (build-your-own-x link checker)"
# Prefix used to build the archive.org fallback URL suggested for dead links.
ARCHIVE_ORG_PREFIX = "https://web.archive.org/web/"
|
|
|
|
|
|
@dataclass
class LinkResult:
    """Outcome of probing one URL found in the README.

    Attributes:
        url: The URL that was checked.
        status_code: HTTP status code of the response.
        is_alive: Whether the link was judged reachable.
        category: Name of the category the link was found under.
        line_number: Line of the README on which the link appears.
        error: Human-readable failure reason, if any.
        archive_url: Suggested archive.org fallback URL, if any.
        response_time: Time the check took, in seconds.
    """

    # Required fields: always supplied by the checker.
    url: str
    status_code: int
    is_alive: bool
    category: str
    line_number: int

    # Optional details: populated only when relevant.
    error: Optional[str] = field(default=None)
    archive_url: Optional[str] = field(default=None)
    response_time: float = field(default=0.0)
|
|
|
|
|
|
def _new_category_counts() -> Dict[str, int]:
    """Return an empty per-category counter whose missing keys read as 0."""
    return defaultdict(int)


@dataclass
class ValidationReport:
    """Aggregated outcome of one validation run.

    Attributes:
        total_links: Number of links found.
        alive_links: Number of links judged reachable.
        dead_links: Number of links judged unreachable.
        skipped_links: Number of links that were skipped.
        errors: Number of links whose check produced an error message.
        results: One ``LinkResult`` per checked link.
        categories: Link count per category name.
        start_time: Timestamp (``time.time()``) at which the run started.
        end_time: Timestamp (``time.time()``) at which the run finished.
    """

    # Counters
    total_links: int = 0
    alive_links: int = 0
    dead_links: int = 0
    skipped_links: int = 0
    errors: int = 0

    # Per-link details and per-category totals (fresh containers per instance)
    results: List[LinkResult] = field(default_factory=list)
    categories: Dict[str, int] = field(default_factory=_new_category_counts)

    # Wall-clock bounds of the run
    start_time: float = 0.0
    end_time: float = 0.0
|
|
|
|
|
|
class LinkCache:
    """JSON-file-backed cache of link check results.

    Entries are stored in ``self.cache`` under the MD5 hex digest of the URL
    and carry a ``"timestamp"`` key recording when they were stored.  The
    cache file is read once on construction and only written by ``save()``.
    """

    def __init__(self, cache_file: str = DEFAULT_CACHE_FILE):
        """Create the cache and load any existing entries from ``cache_file``."""
        self.cache_file = cache_file
        self.cache: Dict[str, dict] = {}
        self._load()

    def _load(self):
        """Populate ``self.cache`` from disk, falling back to an empty cache.

        A missing, unreadable, undecodable or malformed file is not an error:
        the cache simply starts empty.
        """
        if not os.path.exists(self.cache_file):
            return
        try:
            with open(self.cache_file, "r", encoding="utf-8") as f:
                loaded = json.load(f)
        except (ValueError, OSError):
            # ValueError covers both json.JSONDecodeError and
            # UnicodeDecodeError (e.g. a binary or truncated file).
            loaded = None
        # Valid JSON that is not an object (a list, a number, ...) cannot
        # serve as a key/value cache, so treat it like a corrupt file.
        self.cache = loaded if isinstance(loaded, dict) else {}

    def save(self):
        """Write the current cache contents to ``cache_file`` as JSON."""
        with open(self.cache_file, "w", encoding="utf-8") as f:
            json.dump(self.cache, f, indent=2)

    def _key(self, url: str) -> str:
        """Return the cache key for ``url`` (MD5 hex digest of its UTF-8 bytes)."""
        # MD5 only serves as a compact, fixed-length key here, not for security.
        return hashlib.md5(url.encode()).hexdigest()

    def get(self, url: str, max_age: int = 86400) -> Optional[dict]:
        """Return the cached entry for ``url`` if it is younger than ``max_age``.

        Args:
            url: URL to look up.
            max_age: Maximum entry age in seconds (default: one day).

        Returns:
            The stored entry dict, or ``None`` when there is no entry, the
            entry is too old, or the entry is malformed.
        """
        entry = self.cache.get(self._key(url))
        if not isinstance(entry, dict):
            # Missing, or a hand-edited / corrupted entry.
            return None
        timestamp = entry.get("timestamp", 0)
        if not isinstance(timestamp, (int, float)):
            return None
        if time.time() - timestamp < max_age:
            return entry
        return None

    def set(self, url: str, result: dict):
        """Store ``result`` for ``url``, stamped with the current time.

        The stored entry is a shallow copy, so the caller's dict is left
        untouched (no ``"timestamp"`` key is added to it).
        """
        entry = dict(result)
        entry["timestamp"] = time.time()
        self.cache[self._key(url)] = entry
|
|
|
|
|
|
# Inline markdown link: [text](destination "optional title").
_MARKDOWN_LINK_RE = re.compile(
    r'\[([^\]]+)\]'                      # [link text]
    r'\(\s*'                             # opening parenthesis
    r'((?:[^()\s]|\([^()\s]*\))+)'       # destination; one level of balanced (...) allowed
    r'(?:\s+(?:"[^"]*"|\'[^\']*\'))?'    # optional quoted title, not part of the URL
    r'\s*\)'                             # closing parenthesis
)


def extract_markdown_links(text: str) -> List[Tuple[str, str, int]]:
    """Extract all http(s) markdown links with their text and line numbers.

    Destinations containing one level of balanced parentheses (for example
    ``https://en.wikipedia.org/wiki/Lisp_(programming_language)``) are
    returned whole, and an optional quoted link title is dropped rather than
    being folded into the URL.

    Args:
        text: Markdown source to scan.

    Returns:
        A list of ``(link_text, url, line_number)`` tuples in document order.
        Line numbers are 1-based.  Links whose destination does not start
        with ``http://`` or ``https://`` are ignored.
    """
    links = []
    for line_num, line in enumerate(text.splitlines(), 1):
        for match in _MARKDOWN_LINK_RE.finditer(line):
            link_text, url = match.group(1), match.group(2)
            if url.startswith(("http://", "https://")):
                links.append((link_text, url, line_num))
    return links
|
|
|
|
|
|
# Level 1-3 ATX header ("# ", "## " or "### " followed by the title).
_CATEGORY_HEADER_RE = re.compile(r'^#{1,3}\s+(.+)')

# Inline markdown link: [text](destination "optional title").
_CATEGORY_LINK_RE = re.compile(
    r'\[([^\]]+)\]'                      # [link text]
    r'\(\s*'                             # opening parenthesis
    r'((?:[^()\s]|\([^()\s]*\))+)'       # destination; one level of balanced (...) allowed
    r'(?:\s+(?:"[^"]*"|\'[^\']*\'))?'    # optional quoted title, not part of the URL
    r'\s*\)'                             # closing parenthesis
)


def extract_categories(text: str) -> Dict[str, List[Tuple[str, str, int]]]:
    """Parse README markdown and group its http(s) links by category.

    A category starts at every level 1-3 header and runs until the next such
    header; links appearing before the first header are filed under
    ``"Uncategorized"``.  Links on the header line itself are not collected.
    Destinations containing one level of balanced parentheses are returned
    whole, and an optional quoted link title is dropped from the URL.

    Args:
        text: Markdown source to scan.

    Returns:
        A plain dict mapping category name to a list of
        ``(link_text, url, line_number)`` tuples (1-based line numbers).
        Categories without any http(s) link are absent.
    """
    categories = defaultdict(list)
    current_category = "Uncategorized"

    for line_num, line in enumerate(text.splitlines(), 1):
        header_match = _CATEGORY_HEADER_RE.match(line)
        if header_match:
            current_category = header_match.group(1).strip()
            continue

        for match in _CATEGORY_LINK_RE.finditer(line):
            link_text, url = match.group(1), match.group(2)
            if url.startswith(("http://", "https://")):
                categories[current_category].append((link_text, url, line_num))

    return dict(categories)
|
|
|
|
|
|
# Statuses meaning "this server does not implement HEAD" rather than
# "the page is gone"; such links are re-probed with GET.
_HEAD_NOT_SUPPORTED = (405, 501)


def _fetch_status(url: str, method: str, timeout: int) -> int:
    """Issue one request with the given HTTP method and return its status code."""
    req = urllib.request.Request(url, method=method, headers={"User-Agent": USER_AGENT})
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return resp.status


def check_single_link(url: str, timeout: int = DEFAULT_TIMEOUT) -> Tuple[int, bool, Optional[str], float]:
    """Check if a single URL is alive.

    The URL is probed with a HEAD request first.  If the server answers
    405 (Method Not Allowed) or 501 (Not Implemented), the probe is repeated
    with GET so that servers which merely reject HEAD are not reported dead.

    Args:
        url: URL to probe.
        timeout: Per-request timeout in seconds.

    Returns:
        ``(status_code, is_alive, error, response_time)``.  ``status_code``
        is the HTTP status, or 0 when no HTTP response was obtained;
        ``error`` is ``None`` on success and a description otherwise;
        ``response_time`` is the elapsed wall-clock time in seconds.
        This function never raises: every failure is reported in the tuple.
    """
    start = time.time()
    try:
        try:
            status = _fetch_status(url, "HEAD", timeout)
        except urllib.error.HTTPError as e:
            if e.code not in _HEAD_NOT_SUPPORTED:
                raise
            # HEAD is rejected by this server; GET decides the verdict.
            status = _fetch_status(url, "GET", timeout)
        return status, True, None, time.time() - start
    except urllib.error.HTTPError as e:
        # The server answered, but with an error status.
        return e.code, False, str(e.reason), time.time() - start
    except urllib.error.URLError as e:
        # No HTTP response at all (DNS failure, refused connection, ...).
        return 0, False, str(e.reason), time.time() - start
    except Exception as e:
        # Deliberate catch-all: one malformed URL or unexpected failure must
        # not abort the whole run; it is reported as a dead link instead.
        return 0, False, str(e), time.time() - start
|
|
|
|
|
|
def get_archive_url(url: str) -> Optional[str]:
    """Build the archive.org fallback URL for ``url``.

    The whole URL is percent-encoded (no character is treated as safe) and
    appended to ``ARCHIVE_ORG_PREFIX``.
    """
    return ARCHIVE_ORG_PREFIX + urllib.parse.quote(url, safe="")
|
|
|
|
|
|
def check_link_with_cache(
    url: str,
    category: str,
    line_number: int,
    cache: LinkCache,
    timeout: int = DEFAULT_TIMEOUT,
) -> LinkResult:
    """Return the check result for ``url``, consulting ``cache`` first.

    On a cache hit the stored outcome is reused and no request is made.  On a
    miss the link is checked, an archive.org fallback is attached when it is
    not alive, and the outcome is written to ``cache``.

    Args:
        url: URL to check.
        category: Category name recorded on the result.
        line_number: README line number recorded on the result.
        cache: Cache consulted before, and updated after, a live check.
        timeout: Per-request timeout in seconds for a live check.

    Returns:
        A ``LinkResult`` carrying the cached or freshly measured outcome.
    """
    hit = cache.get(url)
    if hit:
        # Tolerate entries with missing keys by falling back to
        # "unknown / dead" defaults.
        outcome = {
            "status_code": hit.get("status_code", 0),
            "is_alive": hit.get("is_alive", False),
            "error": hit.get("error"),
            "archive_url": hit.get("archive_url"),
            "response_time": hit.get("response_time", 0.0),
        }
    else:
        status_code, is_alive, error, response_time = check_single_link(url, timeout)
        outcome = {
            "status_code": status_code,
            "is_alive": is_alive,
            "error": error,
            "archive_url": get_archive_url(url) if not is_alive else None,
            "response_time": response_time,
        }
        # Hand the cache its own dict: it stamps the entry with a timestamp,
        # which must not leak into the LinkResult keyword arguments below.
        cache.set(url, dict(outcome))

    return LinkResult(url=url, category=category, line_number=line_number, **outcome)
|
|
|
|
|
|
def validate_links(
    readme_path: str,
    max_workers: int = DEFAULT_MAX_WORKERS,
    timeout: int = DEFAULT_TIMEOUT,
    cache_file: str = DEFAULT_CACHE_FILE,
    verbose: bool = False,
) -> ValidationReport:
    """Validate all links in a README file.

    Links are extracted per category, checked concurrently on a thread pool
    (reusing cached outcomes where available) and tallied into a report.  The
    cache is written back to ``cache_file`` once every link has been checked.

    Args:
        readme_path: Path of the markdown file to validate.
        max_workers: Size of the thread pool.
        timeout: Per-request timeout in seconds.
        cache_file: Path of the JSON result cache.
        verbose: When true, print the link count and periodic progress.

    Returns:
        A ``ValidationReport`` whose ``results`` are ordered by README line
        number (then URL), independent of the order in which the concurrent
        checks happened to finish.

    Raises:
        OSError: If ``readme_path`` cannot be read or the cache cannot be
            written.
    """
    report = ValidationReport(start_time=time.time())

    with open(readme_path, "r", encoding="utf-8") as f:
        content = f.read()

    categories = extract_categories(content)
    cache = LinkCache(cache_file)

    # Flatten all links, remembering each one's category.
    all_links = []
    for cat, links in categories.items():
        for _text, url, line_num in links:
            all_links.append((url, cat, line_num))
            report.categories[cat] = report.categories.get(cat, 0) + 1

    report.total_links = len(all_links)

    if verbose:
        print(f"Found {report.total_links} links across {len(categories)} categories")

    # Check links concurrently; outcomes are tallied as they complete.
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(check_link_with_cache, url, cat, line_num, cache, timeout)
            for url, cat, line_num in all_links
        ]

        for checked, future in enumerate(concurrent.futures.as_completed(futures), 1):
            result = future.result()
            report.results.append(result)

            if result.is_alive:
                report.alive_links += 1
            else:
                report.dead_links += 1
                if result.error:
                    # Dead links that carry an error message are also
                    # counted as errors.
                    report.errors += 1

            if verbose and checked % 10 == 0:
                print(f"Progress: {checked}/{report.total_links} links checked")

    # as_completed yields in completion order, which varies from run to run;
    # sort so that the printed report and JSON output are reproducible.
    report.results.sort(key=lambda r: (r.line_number, r.url))

    cache.save()
    report.end_time = time.time()
    return report
|
|
|
|
|
|
def print_report(report: ValidationReport):
    """Write a human-readable summary of ``report`` to standard output.

    The summary has three parts: overall totals and duration, the link count
    per category (sorted by category name), and, if any link is not alive, a
    list of those links with status, error and archive.org suggestion.
    """
    rule = "=" * 60
    out = [
        "\n" + rule,
        "LINK VALIDATION REPORT",
        rule,
        f"Total links checked: {report.total_links}",
        f"Alive: {report.alive_links}",
        f"Dead: {report.dead_links}",
        f"Errors: {report.errors}",
        f"Duration: {report.end_time - report.start_time:.1f}s",
        "",
    ]

    # Per-category summary
    out.append("Categories:")
    out.extend(f" {cat}: {count} links" for cat, count in sorted(report.categories.items()))
    out.append("")

    # Details for every link that is not alive
    dead = [r for r in report.results if not r.is_alive]
    if dead:
        out.append("Dead Links:")
        for r in dead:
            out.append(f" Line {r.line_number}: {r.url}")
            out.append(f" Status: {r.status_code} | Error: {r.error}")
            if r.archive_url:
                out.append(f" Archive: {r.archive_url}")
            out.append("")  # blank line between entries

    print("\n".join(out))
|
|
|
|
|
|
def main():
    """Command-line entry point.

    Parses the arguments, validates the README's links, prints the report
    and optionally writes it as JSON.

    Exit status: 0 when no dead link was found, 1 when the README is missing
    or at least one link is dead, 2 (via argparse) on invalid arguments.
    """
    parser = argparse.ArgumentParser(description="Validate tutorial links in README.md")
    parser.add_argument("readme", nargs="?", default="README.md", help="Path to README.md")
    parser.add_argument("--workers", type=int, default=DEFAULT_MAX_WORKERS, help="Max concurrent workers")
    parser.add_argument("--timeout", type=int, default=DEFAULT_TIMEOUT, help="Request timeout in seconds")
    parser.add_argument("--cache-file", default=DEFAULT_CACHE_FILE, help="Cache file path")
    parser.add_argument("--no-cache", action="store_true", help="Disable caching")
    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
    parser.add_argument("--json-output", help="Write results as JSON to file")
    args = parser.parse_args()

    # Reject values that would otherwise surface as a ThreadPoolExecutor
    # traceback (workers < 1) or make every link look dead (timeout <= 0).
    if args.workers < 1:
        parser.error("--workers must be at least 1")
    if args.timeout < 1:
        parser.error("--timeout must be at least 1 second")

    if not os.path.exists(args.readme):
        print(f"Error: {args.readme} not found", file=sys.stderr)
        sys.exit(1)

    # With --no-cache the cache is pointed at the null device: nothing useful
    # can be loaded from it and everything saved to it is discarded.
    cache_file = os.devnull if args.no_cache else args.cache_file

    report = validate_links(
        args.readme,
        max_workers=args.workers,
        timeout=args.timeout,
        cache_file=cache_file,
        verbose=args.verbose,
    )

    print_report(report)

    if args.json_output:
        with open(args.json_output, "w", encoding="utf-8") as f:
            json.dump({
                "total": report.total_links,
                "alive": report.alive_links,
                "dead": report.dead_links,
                "errors": report.errors,
                "results": [asdict(r) for r in report.results],
            }, f, indent=2)
        print(f"JSON report written to {args.json_output}")

    # A non-zero exit status lets CI fail the build when dead links exist.
    sys.exit(1 if report.dead_links > 0 else 0)


if __name__ == "__main__":
    main()
|