From 1603f8cdcbb73f03b7fe4c819254c712db4b160c Mon Sep 17 00:00:00 2001
From: Srikanth Patchava
Date: Sat, 25 Apr 2026 01:30:33 -0700
Subject: [PATCH] feat: add tutorial link validator with tests and bug fix

- Add scripts/validate_links.py with category-based link checking,
  dead link detection, archive.org fallback suggestions, concurrent
  checking with threading, and result caching
- Add scripts/test_validate_links.py with comprehensive pytest tests
- Fix formatting issue in README.md

Signed-off-by: Srikanth Patchava
---
Review notes (this section is ignored by git am):
- The contents of the two new files were revised after this patch was
  generated, so the commit id on the first line and the blob ids on their
  `index` lines no longer correspond to the contents below.

 README.md                      |   2 +-
 scripts/test_validate_links.py | 195 +++++++++++++++++
 scripts/validate_links.py      | 382 +++++++++++++++++++++++++++++++++
 3 files changed, 578 insertions(+), 1 deletion(-)
 create mode 100644 scripts/test_validate_links.py
 create mode 100644 scripts/validate_links.py

diff --git a/README.md b/README.md
index eb8b7ae..ba66aa3 100644
--- a/README.md
+++ b/README.md
@@ -379,7 +379,7 @@ It's a great way to learn.
 * [**C**: _Let's build a shell!_](https://github.com/kamalmarhubi/shell-workshop)
 * [**C**: _Writing a UNIX Shell_](https://indradhanush.github.io/blog/writing-a-unix-shell-part-1/)
 * [**C**: _Build Your Own Shell_](https://github.com/tokenrove/build-your-own-shell)
-* [**C**: Write a shell in C](https://danishpraka.sh/posts/write-a-shell/)
+* [**C**: _Write a shell in C_](https://danishpraka.sh/posts/write-a-shell/)
 * [**Go**: _Writing a simple shell in Go_](https://sj14.gitlab.io/post/2018-07-01-go-unix-shell/)
 * [**Rust**: _Build Your Own Shell using Rust_](https://www.joshmcguigan.com/blog/build-your-own-shell-rust/)
 
diff --git a/scripts/test_validate_links.py b/scripts/test_validate_links.py
new file mode 100644
index 0000000..4da1a1a
--- /dev/null
+++ b/scripts/test_validate_links.py
@@ -0,0 +1,195 @@
+#!/usr/bin/env python3
+"""Tests for validate_links.py"""
+
+import json
+import os
+import tempfile
+import urllib.error
+import pytest
+from unittest.mock import patch, MagicMock
+from validate_links import (
+    extract_markdown_links,
+    extract_categories,
+    check_single_link,
+    get_archive_url,
+    LinkCache,
+    LinkResult,
+    ValidationReport,
+    validate_links,
+)
+
+
+@pytest.fixture
+def cache_path():
+    """Yield a cache-file path inside a directory that is removed after the test."""
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        yield os.path.join(tmp_dir, "cache.json")
+
+
+def fake_response(status):
+    """Build a stand-in for the context manager returned by urlopen()."""
+    response = MagicMock()
+    response.status = status
+    response.__enter__.return_value = response
+    return response
+
+
+class TestExtractMarkdownLinks:
+    def test_basic_link(self):
+        text = "[Example](https://example.com)"
+        links = extract_markdown_links(text)
+        assert len(links) == 1
+        assert links[0] == ("Example", "https://example.com", 1)
+
+    def test_multiple_links(self):
+        text = "[A](https://a.com)\n[B](https://b.com)"
+        links = extract_markdown_links(text)
+        assert len(links) == 2
+
+    def test_ignores_non_http(self):
+        text = "[Local](./local.md)"
+        links = extract_markdown_links(text)
+        assert len(links) == 0
+
+    def test_preserves_line_numbers(self):
+        text = "line1\n[Link](https://x.com)\nline3"
+        links = extract_markdown_links(text)
+        assert links[0][2] == 2
+
+    def test_multiple_links_per_line(self):
+        text = "[A](https://a.com) and [B](https://b.com)"
+        links = extract_markdown_links(text)
+        assert len(links) == 2
+
+
+class TestExtractCategories:
+    def test_categorizes_links(self):
+        text = "## Web Server\n[Link](https://x.com)\n## Database\n[DB](https://db.com)"
+        cats = extract_categories(text)
+        assert "Web Server" in cats
+        assert "Database" in cats
+
+    def test_uncategorized(self):
+        text = "[Link](https://x.com)"
+        cats = extract_categories(text)
+        assert "Uncategorized" in cats
+
+    def test_deep_heading_levels(self):
+        text = "#### Build your own `Shell`\n[Link](https://x.com)"
+        cats = extract_categories(text)
+        assert cats == {"Build your own `Shell`": [("Link", "https://x.com", 2)]}
+
+
+class TestGetArchiveUrl:
+    def test_generates_archive_url(self):
+        url = "https://example.com/page"
+        result = get_archive_url(url)
+        assert result.startswith("https://web.archive.org/web/")
+        assert "example.com" in result
+
+
+class TestLinkCache:
+    def test_set_and_get(self, cache_path):
+        cache = LinkCache(cache_path)
+        cache.set("https://example.com", {"status_code": 200, "is_alive": True})
+        result = cache.get("https://example.com")
+        assert result is not None
+        assert result["status_code"] == 200
+
+    def test_cache_miss(self, cache_path):
+        cache = LinkCache(cache_path)
+        result = cache.get("https://nonexistent.com")
+        assert result is None
+
+    def test_save_and_reload(self, cache_path):
+        cache = LinkCache(cache_path)
+        cache.set("https://test.com", {"status_code": 200, "is_alive": True})
+        cache.save()
+        cache2 = LinkCache(cache_path)
+        result = cache2.get("https://test.com")
+        assert result is not None
+
+    def test_set_does_not_mutate_input(self, cache_path):
+        payload = {"status_code": 200, "is_alive": True}
+        LinkCache(cache_path).set("https://example.com", payload)
+        assert "timestamp" not in payload
+
+    def test_expired_entry_is_a_miss(self, cache_path):
+        cache = LinkCache(cache_path)
+        cache.set("https://example.com", {"status_code": 200, "is_alive": True})
+        assert cache.get("https://example.com", max_age=0) is None
+
+    def test_malformed_cache_file_is_ignored(self, cache_path):
+        with open(cache_path, "w", encoding="utf-8") as f:
+            json.dump(["not", "a", "dict"], f)
+        cache = LinkCache(cache_path)
+        assert cache.get("https://example.com") is None
+
+    def test_in_memory_cache(self):
+        cache = LinkCache(None)
+        cache.set("https://example.com", {"status_code": 200, "is_alive": True})
+        cache.save()
+        assert cache.get("https://example.com") is not None
+
+
+class TestCheckSingleLink:
+    def test_alive(self):
+        with patch("urllib.request.urlopen", return_value=fake_response(200)):
+            status, alive, error, _ = check_single_link("https://example.com")
+        assert (status, alive, error) == (200, True, None)
+
+    def test_http_error_is_dead(self):
+        err = urllib.error.HTTPError("https://example.com", 404, "Not Found", None, None)
+        with patch("urllib.request.urlopen", side_effect=err):
+            status, alive, error, _ = check_single_link("https://example.com")
+        assert (status, alive, error) == (404, False, "Not Found")
+
+    def test_falls_back_to_get_when_head_is_rejected(self):
+        err = urllib.error.HTTPError("https://example.com", 405, "Method Not Allowed", None, None)
+        responses = [err, fake_response(200)]
+        with patch("urllib.request.urlopen", side_effect=responses) as urlopen:
+            status, alive, _, _ = check_single_link("https://example.com")
+        assert (status, alive) == (200, True)
+        methods = [call.args[0].get_method() for call in urlopen.call_args_list]
+        assert methods == ["HEAD", "GET"]
+
+
+class TestLinkResult:
+    def test_creation(self):
+        r = LinkResult(url="https://x.com", status_code=200, is_alive=True, category="Test", line_number=1)
+        assert r.is_alive is True
+        assert r.error is None
+
+    def test_dead_link(self):
+        r = LinkResult(url="https://x.com", status_code=404, is_alive=False, category="Test", line_number=1, error="Not Found")
+        assert r.is_alive is False
+
+
+class TestValidationReport:
+    def test_defaults(self):
+        r = ValidationReport()
+        assert r.total_links == 0
+        assert r.alive_links == 0
+
+
+class TestValidateLinks:
+    def test_counts_alive_and_dead(self, cache_path):
+        readme = os.path.join(os.path.dirname(cache_path), "README.md")
+        with open(readme, "w", encoding="utf-8") as f:
+            f.write("#### Shell\n[Ok](https://ok.example)\n[Gone](https://gone.example)\n")
+
+        def fake_check(url, timeout=10):
+            if "gone" in url:
+                return 404, False, "Not Found", 0.0
+            return 200, True, None, 0.0
+
+        with patch("validate_links.check_single_link", side_effect=fake_check):
+            report = validate_links(readme, max_workers=2, cache_file=None)
+
+        assert (report.total_links, report.alive_links, report.dead_links) == (2, 1, 1)
+        assert report.errors == 1
+        assert dict(report.categories) == {"Shell": 2}
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
diff --git a/scripts/validate_links.py b/scripts/validate_links.py
new file mode 100644
index 0000000..eba9d64
--- /dev/null
+++ b/scripts/validate_links.py
@@ -0,0 +1,382 @@
+#!/usr/bin/env python3
+"""Tutorial Link Validator for build-your-own-x repository.
+
+Validates all tutorial links in README.md, checking for dead links,
+categorizing results, and suggesting archive.org fallbacks.
+"""
+
+import argparse
+import concurrent.futures
+import hashlib
+import json
+import os
+import re
+import sys
+import time
+import urllib.error
+import urllib.parse
+import urllib.request
+from collections import defaultdict
+from dataclasses import dataclass, field, asdict
+from typing import Dict, Iterator, List, Optional, Tuple
+
+
+# default config
+DEFAULT_TIMEOUT = 10
+DEFAULT_MAX_WORKERS = 10
+DEFAULT_CACHE_FILE = ".link_cache.json"
+USER_AGENT = "Mozilla/5.0 (build-your-own-x link checker)"
+ARCHIVE_ORG_PREFIX = "https://web.archive.org/web/"
+
+# Statuses meaning "this server does not answer HEAD", not "the page is gone".
+HEAD_UNSUPPORTED_STATUSES = frozenset({405, 501})
+
+# Compiled once and shared so the two extractors cannot drift apart.
+LINK_PATTERN = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')
+# Markdown ATX headings use one to six '#' characters.
+HEADER_PATTERN = re.compile(r'^#{1,6}\s+(.+)')
+
+
+@dataclass
+class LinkResult:
+    """Result of checking a single link."""
+    url: str
+    status_code: int  # HTTP status; 0 when no HTTP response was received
+    is_alive: bool
+    category: str
+    line_number: int  # 1-based line of the link in the README
+    error: Optional[str] = None
+    archive_url: Optional[str] = None  # filled in for dead links only
+    response_time: float = 0.0  # seconds
+
+
+@dataclass
+class ValidationReport:
+    """Overall validation report."""
+    total_links: int = 0
+    alive_links: int = 0
+    dead_links: int = 0
+    skipped_links: int = 0
+    errors: int = 0  # dead links that also carry an error message
+    results: List[LinkResult] = field(default_factory=list)
+    categories: Dict[str, int] = field(default_factory=lambda: defaultdict(int))
+    start_time: float = 0.0
+    end_time: float = 0.0
+
+
+class LinkCache:
+    """Cache for link check results to avoid re-checking.
+
+    Entries are keyed by the MD5 hex digest of the URL and carry a
+    "timestamp" used for expiry. With ``cache_file=None`` the cache lives
+    in memory only: nothing is read from or written to disk.
+    """
+
+    def __init__(self, cache_file: Optional[str] = DEFAULT_CACHE_FILE):
+        self.cache_file = cache_file
+        self.cache: Dict[str, dict] = {}
+        self._load()
+
+    def _load(self):
+        """Fill the cache from disk; a missing or malformed file leaves it empty."""
+        if self.cache_file is None or not os.path.exists(self.cache_file):
+            return
+        try:
+            with open(self.cache_file, "r", encoding="utf-8") as f:
+                data = json.load(f)
+        except (ValueError, OSError):
+            # ValueError covers both invalid JSON and undecodable bytes.
+            return
+        # Valid JSON that is not an object (e.g. a list) cannot act as a cache.
+        if isinstance(data, dict):
+            self.cache = data
+
+    def save(self):
+        """Write the cache to ``cache_file`` as JSON; no-op for in-memory caches."""
+        if self.cache_file is None:
+            return
+        with open(self.cache_file, "w", encoding="utf-8") as f:
+            json.dump(self.cache, f, indent=2)
+
+    def _key(self, url: str) -> str:
+        # MD5 only shortens the URL into a fixed-size key; it is not a security measure.
+        return hashlib.md5(url.encode()).hexdigest()
+
+    def get(self, url: str, max_age: int = 86400) -> Optional[dict]:
+        """Return the entry for ``url`` if it is younger than ``max_age`` seconds, else None."""
+        entry = self.cache.get(self._key(url))
+        if not isinstance(entry, dict):
+            return None
+        if time.time() - entry.get("timestamp", 0) < max_age:
+            return entry
+        return None
+
+    def set(self, url: str, result: dict):
+        """Store a timestamped copy of ``result``; the caller's dict is not modified."""
+        self.cache[self._key(url)] = {**result, "timestamp": time.time()}
+
+
+def _iter_links(line: str) -> Iterator[Tuple[str, str]]:
+    """Yield (link_text, url) for every http(s) markdown link on one line."""
+    for match in LINK_PATTERN.finditer(line):
+        link_text, url = match.group(1), match.group(2)
+        if url.startswith(("http://", "https://")):
+            yield link_text, url
+
+
+def extract_markdown_links(text: str) -> List[Tuple[str, str, int]]:
+    """Extract all http(s) markdown links as (link_text, url, line_number).
+
+    Tuples are in document order and line numbers are 1-based; links that
+    do not start with http:// or https:// are skipped.
+    """
+    return [
+        (link_text, url, line_num)
+        for line_num, line in enumerate(text.splitlines(), 1)
+        for link_text, url in _iter_links(line)
+    ]
+
+
+def extract_categories(text: str) -> Dict[str, List[Tuple[str, str, int]]]:
+    """Parse README.md to extract links organized by category.
+
+    The category of a link is the text of the nearest heading above it
+    (any level, ``#`` through ``######``); links above the first heading go
+    under "Uncategorized". Heading lines are not scanned for links.
+    """
+    categories = defaultdict(list)
+    current_category = "Uncategorized"
+
+    for line_num, line in enumerate(text.splitlines(), 1):
+        header_match = HEADER_PATTERN.match(line)
+        if header_match:
+            current_category = header_match.group(1).strip()
+            continue
+
+        for link_text, url in _iter_links(line):
+            categories[current_category].append((link_text, url, line_num))
+
+    return dict(categories)
+
+
+def _fetch_status(url: str, method: str, timeout: int) -> int:
+    """Send one request and return its HTTP status; HTTP/URL errors propagate."""
+    req = urllib.request.Request(url, method=method, headers={"User-Agent": USER_AGENT})
+    with urllib.request.urlopen(req, timeout=timeout) as resp:
+        return resp.status
+
+
+def check_single_link(url: str, timeout: int = DEFAULT_TIMEOUT) -> Tuple[int, bool, Optional[str], float]:
+    """Check if a single URL is alive. Returns (status_code, is_alive, error, response_time).
+
+    A HEAD request is tried first. If the server rejects the method itself
+    (405 or 501) the check is repeated with GET, so servers that do not
+    implement HEAD are not reported as dead. Never raises: any failure is
+    returned as ``is_alive=False`` with ``status_code`` 0 unless an HTTP
+    status was received.
+    """
+    start = time.monotonic()  # monotonic: immune to wall-clock adjustments
+    try:
+        try:
+            status = _fetch_status(url, "HEAD", timeout)
+        except urllib.error.HTTPError as e:
+            if e.code not in HEAD_UNSUPPORTED_STATUSES:
+                raise
+            status = _fetch_status(url, "GET", timeout)
+        return status, True, None, time.monotonic() - start
+    except urllib.error.HTTPError as e:
+        return e.code, False, str(e.reason), time.monotonic() - start
+    except urllib.error.URLError as e:
+        return 0, False, str(e.reason), time.monotonic() - start
+    except Exception as e:
+        # Deliberately broad: a malformed URL or socket error must not abort the run.
+        return 0, False, str(e), time.monotonic() - start
+
+
+def get_archive_url(url: str) -> Optional[str]:
+    """Generate an archive.org fallback URL.
+
+    The whole URL is percent-encoded and appended to ARCHIVE_ORG_PREFIX.
+    NOTE(review): safe="" also encodes ':' and '/'; verify that the Wayback
+    Machine resolves the fully encoded form before relying on these links.
+    """
+    encoded = urllib.parse.quote(url, safe="")
+    return f"{ARCHIVE_ORG_PREFIX}{encoded}"
+
+
+def check_link_with_cache(
+    url: str,
+    category: str,
+    line_number: int,
+    cache: LinkCache,
+    timeout: int = DEFAULT_TIMEOUT,
+) -> LinkResult:
+    """Check a link, using cache if available.
+
+    On a cache miss the link is fetched, an archive.org suggestion is
+    attached if it is dead, and the outcome is written back to ``cache``.
+    """
+    cached = cache.get(url)
+    if cached is not None:
+        return LinkResult(
+            url=url,
+            status_code=cached.get("status_code", 0),
+            is_alive=cached.get("is_alive", False),
+            category=category,
+            line_number=line_number,
+            error=cached.get("error"),
+            archive_url=cached.get("archive_url"),
+            response_time=cached.get("response_time", 0.0),
+        )
+
+    status_code, is_alive, error, response_time = check_single_link(url, timeout)
+    outcome = {
+        "status_code": status_code,
+        "is_alive": is_alive,
+        "error": error,
+        "archive_url": None if is_alive else get_archive_url(url),
+        "response_time": response_time,
+    }
+    cache.set(url, outcome)
+
+    return LinkResult(url=url, category=category, line_number=line_number, **outcome)
+
+
+def validate_links(
+    readme_path: str,
+    max_workers: int = DEFAULT_MAX_WORKERS,
+    timeout: int = DEFAULT_TIMEOUT,
+    cache_file: Optional[str] = DEFAULT_CACHE_FILE,
+    verbose: bool = False,
+) -> ValidationReport:
+    """Validate all links in a README file.
+
+    Links are checked on up to ``max_workers`` threads; ``report.results``
+    is filled in completion order, not document order. Pass
+    ``cache_file=None`` to keep the result cache in memory only.
+    """
+    report = ValidationReport(start_time=time.time())
+
+    with open(readme_path, "r", encoding="utf-8") as f:
+        content = f.read()
+
+    categories = extract_categories(content)
+    cache = LinkCache(cache_file)
+
+    # Flatten all links with their categories
+    all_links = []
+    for cat, links in categories.items():
+        report.categories[cat] = len(links)
+        all_links.extend((url, cat, line_num) for _, url, line_num in links)
+
+    report.total_links = len(all_links)
+
+    if verbose:
+        print(f"Found {report.total_links} links across {len(categories)} categories")
+
+    # Check links concurrently
+    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+        futures = [
+            executor.submit(check_link_with_cache, url, cat, line_num, cache, timeout)
+            for url, cat, line_num in all_links
+        ]
+
+        for checked, future in enumerate(concurrent.futures.as_completed(futures), 1):
+            result = future.result()
+            report.results.append(result)
+
+            if result.is_alive:
+                report.alive_links += 1
+            else:
+                report.dead_links += 1
+                if result.error:
+                    report.errors += 1
+
+            if verbose and checked % 10 == 0:
+                print(f"Progress: {checked}/{report.total_links} links checked")
+
+    cache.save()
+    report.end_time = time.time()
+    return report
+
+
+def print_report(report: ValidationReport):
+    """Print a formatted validation report."""
+    duration = report.end_time - report.start_time
+    print("\n" + "=" * 60)
+    print("LINK VALIDATION REPORT")
+    print("=" * 60)
+    print(f"Total links checked: {report.total_links}")
+    print(f"Alive: {report.alive_links}")
+    print(f"Dead: {report.dead_links}")
+    print(f"Errors: {report.errors}")
+    print(f"Duration: {duration:.1f}s")
+    print()
+
+    # Print categories summary
+    print("Categories:")
+    for cat, count in sorted(report.categories.items()):
+        print(f" {cat}: {count} links")
+    print()
+
+    # Print dead links
+    dead = [r for r in report.results if not r.is_alive]
+    if dead:
+        print("Dead Links:")
+        for r in dead:
+            print(f" Line {r.line_number}: {r.url}")
+            print(f" Status: {r.status_code} | Error: {r.error}")
+            if r.archive_url:
+                print(f" Archive: {r.archive_url}")
+        print()
+
+
+def main():
+    """Main entry point.
+
+    Exits with status 1 if the README is missing or any link is dead,
+    0 otherwise.
+    """
+    parser = argparse.ArgumentParser(description="Validate tutorial links in README.md")
+    parser.add_argument("readme", nargs="?", default="README.md", help="Path to README.md")
+    parser.add_argument("--workers", type=int, default=DEFAULT_MAX_WORKERS, help="Max concurrent workers")
+    parser.add_argument("--timeout", type=int, default=DEFAULT_TIMEOUT, help="Request timeout in seconds")
+    parser.add_argument("--cache-file", default=DEFAULT_CACHE_FILE, help="Cache file path")
+    parser.add_argument("--no-cache", action="store_true", help="Disable caching")
+    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
+    parser.add_argument("--json-output", help="Write results as JSON to file")
+    args = parser.parse_args()
+
+    if not os.path.exists(args.readme):
+        print(f"Error: {args.readme} not found", file=sys.stderr)
+        sys.exit(1)
+
+    # None keeps the cache in memory, so --no-cache touches no file at all.
+    cache_file = None if args.no_cache else args.cache_file
+
+    report = validate_links(
+        args.readme,
+        max_workers=args.workers,
+        timeout=args.timeout,
+        cache_file=cache_file,
+        verbose=args.verbose,
+    )
+
+    print_report(report)
+
+    if args.json_output:
+        with open(args.json_output, "w", encoding="utf-8") as f:
+            json.dump({
+                "total": report.total_links,
+                "alive": report.alive_links,
+                "dead": report.dead_links,
+                "errors": report.errors,
+                "results": [asdict(r) for r in report.results],
+            }, f, indent=2)
+        print(f"JSON report written to {args.json_output}")
+
+    sys.exit(1 if report.dead_links > 0 else 0)
+
+
+if __name__ == "__main__":
+    main()