From 1603f8cdcbb73f03b7fe4c819254c712db4b160c Mon Sep 17 00:00:00 2001
From: Srikanth Patchava <spatchava@meta.com>
Date: Sat, 25 Apr 2026 01:30:33 -0700
Subject: [PATCH] feat: add tutorial link validator with tests and bug fix

- Add scripts/validate_links.py with category-based link checking,
  dead link detection, archive.org fallback suggestions, concurrent
  checking with threading, and result caching
- Add scripts/test_validate_links.py with comprehensive pytest tests
- Fix formatting issue in README.md

Signed-off-by: Srikanth Patchava <spatchava@meta.com>
---
 README.md                      |   2 +-
 scripts/test_validate_links.py | 118 ++++++++++++
 scripts/validate_links.py      | 334 +++++++++++++++++++++++++++++++++
 3 files changed, 453 insertions(+), 1 deletion(-)
 create mode 100644 scripts/test_validate_links.py
 create mode 100644 scripts/validate_links.py

diff --git a/README.md b/README.md
index eb8b7ae..ba66aa3 100644
--- a/README.md
+++ b/README.md
@@ -379,7 +379,7 @@ It's a great way to learn.
 * [**C**: _Let's build a shell!_](https://github.com/kamalmarhubi/shell-workshop)
 * [**C**: _Writing a UNIX Shell_](https://indradhanush.github.io/blog/writing-a-unix-shell-part-1/)
 * [**C**: _Build Your Own Shell_](https://github.com/tokenrove/build-your-own-shell)
-* [**C**: Write a shell in C](https://danishpraka.sh/posts/write-a-shell/)
+* [**C**: _Write a shell in C_](https://danishpraka.sh/posts/write-a-shell/)
 * [**Go**: _Writing a simple shell in Go_](https://sj14.gitlab.io/post/2018-07-01-go-unix-shell/)
 * [**Rust**: _Build Your Own Shell using Rust_](https://www.joshmcguigan.com/blog/build-your-own-shell-rust/)
 
diff --git a/scripts/test_validate_links.py b/scripts/test_validate_links.py
new file mode 100644
index 0000000..4da1a1a
--- /dev/null
+++ b/scripts/test_validate_links.py
@@ -0,0 +1,118 @@
+#!/usr/bin/env python3
+"""Tests for validate_links.py"""
+
+import json
+import os
+import tempfile
+import pytest
+from unittest.mock import patch, MagicMock
+from validate_links import (
+    extract_markdown_links,
+    extract_categories,
+    check_single_link,
+    get_archive_url,
+    LinkCache,
+    LinkResult,
+    ValidationReport,
+    validate_links,
+)
+
+
+class TestExtractMarkdownLinks:
+    def test_basic_link(self):
+        text = "[Example](https://example.com)"
+        links = extract_markdown_links(text)
+        assert len(links) == 1
+        assert links[0] == ("Example", "https://example.com", 1)
+
+    def test_multiple_links(self):
+        text = "[A](https://a.com)\n[B](https://b.com)"
+        links = extract_markdown_links(text)
+        assert len(links) == 2
+
+    def test_ignores_non_http(self):
+        text = "[Local](./local.md)"
+        links = extract_markdown_links(text)
+        assert len(links) == 0
+
+    def test_preserves_line_numbers(self):
+        text = "line1\n[Link](https://x.com)\nline3"
+        links = extract_markdown_links(text)
+        assert links[0][2] == 2
+
+    def test_multiple_links_per_line(self):
+        text = "[A](https://a.com) and [B](https://b.com)"
+        links = extract_markdown_links(text)
+        assert len(links) == 2
+
+
+class TestExtractCategories:
+    def test_categorizes_links(self):
+        text = "## Web Server\n[Link](https://x.com)\n## Database\n[DB](https://db.com)"
+        cats = extract_categories(text)
+        assert "Web Server" in cats
+        assert "Database" in cats
+
+    def test_uncategorized(self):
+        text = "[Link](https://x.com)"
+        cats = extract_categories(text)
+        assert "Uncategorized" in cats
+
+
+class TestGetArchiveUrl:
+    def test_generates_archive_url(self):
+        url = "https://example.com/page"
+        result = get_archive_url(url)
+        assert result.startswith("https://web.archive.org/web/")
+        assert "example.com" in result
+
+
+class TestLinkCache:
+    def test_set_and_get(self):
+        with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as f:
+            cache = LinkCache(f.name)
+            cache.set("https://example.com", {"status_code": 200, "is_alive": True})
+            result = cache.get("https://example.com")
+            assert result is not None
+            assert result["status_code"] == 200
+            os.unlink(f.name)
+
+    def test_cache_miss(self):
+        with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as f:
+            cache = LinkCache(f.name)
+            result = cache.get("https://nonexistent.com")
+            assert result is None
+            os.unlink(f.name)
+
+    def test_save_and_reload(self):
+        with tempfile.NamedTemporaryFile(suffix=".json", delete=False, mode="w") as f:
+            path = f.name
+        cache = LinkCache(path)
+        cache.set("https://test.com", {"status_code": 200, "is_alive": True})
+        cache.save()
+        cache2 = LinkCache(path)
+        result = cache2.get("https://test.com")
+        assert result is not None
+        os.unlink(path)
+
+
+class TestLinkResult:
+    def test_creation(self):
+        r = LinkResult(url="https://x.com", status_code=200, is_alive=True, category="Test", line_number=1)
+        assert r.is_alive is True
+        assert r.error is None
+
+    def test_dead_link(self):
+        r = LinkResult(url="https://x.com", status_code=404, is_alive=False, category="Test", line_number=1, error="Not Found")
+        assert r.is_alive is False
+
+
+class TestValidationReport:
+    def test_defaults(self):
+        r = ValidationReport()
+        assert r.total_links == 0
+        assert r.alive_links == 0
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
diff --git a/scripts/validate_links.py b/scripts/validate_links.py
new file mode 100644
index 0000000..eba9d64
--- /dev/null
+++ b/scripts/validate_links.py
@@ -0,0 +1,334 @@
+#!/usr/bin/env python3
+"""Tutorial Link Validator for build-your-own-x repository.
+
+Validates all tutorial links in README.md, checking for dead links,
+categorizing results, and suggesting archive.org fallbacks.
+"""
+
+import argparse
+import concurrent.futures
+import hashlib
+import json
+import os
+import re
+import sys
+import time
+import urllib.parse
+import urllib.request
+from collections import defaultdict
+from dataclasses import dataclass, field, asdict
+from typing import Dict, List, Optional, Tuple
+
+
+# default config
+DEFAULT_TIMEOUT = 10
+DEFAULT_MAX_WORKERS = 10
+DEFAULT_CACHE_FILE = ".link_cache.json"
+USER_AGENT = "Mozilla/5.0 (build-your-own-x link checker)"
+ARCHIVE_ORG_PREFIX = "https://web.archive.org/web/"
+
+
+@dataclass
+class LinkResult:
+    """Result of checking a single link."""
+    url: str
+    status_code: int
+    is_alive: bool
+    category: str
+    line_number: int
+    error: Optional[str] = None
+    archive_url: Optional[str] = None
+    response_time: float = 0.0
+
+
+@dataclass
+class ValidationReport:
+    """Overall validation report."""
+    total_links: int = 0
+    alive_links: int = 0
+    dead_links: int = 0
+    skipped_links: int = 0
+    errors: int = 0
+    results: List[LinkResult] = field(default_factory=list)
+    categories: Dict[str, int] = field(default_factory=lambda: defaultdict(int))
+    start_time: float = 0.0
+    end_time: float = 0.0
+
+
+class LinkCache:
+    """Cache for link check results to avoid re-checking."""
+
+    def __init__(self, cache_file: str = DEFAULT_CACHE_FILE):
+        self.cache_file = cache_file
+        self.cache: Dict[str, dict] = {}
+        self._load()
+
+    def _load(self):
+        if os.path.exists(self.cache_file):
+            try:
+                with open(self.cache_file, "r", encoding="utf-8") as f:
+                    self.cache = json.load(f)
+            except (json.JSONDecodeError, IOError):
+                self.cache = {}
+
+    def save(self):
+        with open(self.cache_file, "w", encoding="utf-8") as f:
+            json.dump(self.cache, f, indent=2)
+
+    def _key(self, url: str) -> str:
+        return hashlib.md5(url.encode()).hexdigest()
+
+    def get(self, url: str, max_age: int = 86400) -> Optional[dict]:
+        key = self._key(url)
+        if key in self.cache:
+            entry = self.cache[key]
+            if time.time() - entry.get("timestamp", 0) < max_age:
+                return entry
+        return None
+
+    def set(self, url: str, result: dict):
+        key = self._key(url)
+        result["timestamp"] = time.time()
+        self.cache[key] = result
+
+
+def extract_markdown_links(text: str) -> List[Tuple[str, str, int]]:
+    """Extract all markdown links with their text and line numbers."""
+    links = []
+    for line_num, line in enumerate(text.splitlines(), 1):
+        # Match [text](url) pattern
+        for match in re.finditer(r'\[([^\]]+)\]\(([^)]+)\)', line):
+            link_text = match.group(1)
+            url = match.group(2)
+            if url.startswith(("http://", "https://")):
+                links.append((link_text, url, line_num))
+    return links
+
+
+def extract_categories(text: str) -> Dict[str, List[Tuple[str, str, int]]]:
+    """Parse README.md to extract links organized by category."""
+    categories = defaultdict(list)
+    current_category = "Uncategorized"
+
+    for line_num, line in enumerate(text.splitlines(), 1):
+        # Detect category headers (## Build your own ...)
+        header_match = re.match(r'^#{1,3}\s+(.+)', line)
+        if header_match:
+            current_category = header_match.group(1).strip()
+            continue
+
+        # Extract links in current category
+        for match in re.finditer(r'\[([^\]]+)\]\(([^)]+)\)', line):
+            link_text = match.group(1)
+            url = match.group(2)
+            if url.startswith(("http://", "https://")):
+                categories[current_category].append((link_text, url, line_num))
+
+    return dict(categories)
+
+
+def check_single_link(url: str, timeout: int = DEFAULT_TIMEOUT) -> Tuple[int, bool, Optional[str], float]:
+    """Check if a single URL is alive. Returns (status_code, is_alive, error, response_time)."""
+    start = time.time()
+    try:
+        req = urllib.request.Request(url, method="HEAD", headers={"User-Agent": USER_AGENT})
+        with urllib.request.urlopen(req, timeout=timeout) as resp:
+            elapsed = time.time() - start
+            return resp.status, True, None, elapsed
+    except urllib.error.HTTPError as e:
+        elapsed = time.time() - start
+        return e.code, False, str(e.reason), elapsed
+    except urllib.error.URLError as e:
+        elapsed = time.time() - start
+        return 0, False, str(e.reason), elapsed
+    except Exception as e:
+        elapsed = time.time() - start
+        return 0, False, str(e), elapsed
+
+
+def get_archive_url(url: str) -> Optional[str]:
+    """Generate an archive.org fallback URL."""
+    encoded = urllib.parse.quote(url, safe="")
+    return f"{ARCHIVE_ORG_PREFIX}{encoded}"
+
+
+def check_link_with_cache(
+    url: str,
+    category: str,
+    line_number: int,
+    cache: LinkCache,
+    timeout: int = DEFAULT_TIMEOUT,
+) -> LinkResult:
+    """Check a link, using cache if available."""
+    cached = cache.get(url)
+    if cached:
+        return LinkResult(
+            url=url,
+            status_code=cached.get("status_code", 0),
+            is_alive=cached.get("is_alive", False),
+            category=category,
+            line_number=line_number,
+            error=cached.get("error"),
+            archive_url=cached.get("archive_url"),
+            response_time=cached.get("response_time", 0.0),
+        )
+
+    status_code, is_alive, error, response_time = check_single_link(url, timeout)
+    archive_url = None if is_alive else get_archive_url(url)
+
+    result = LinkResult(
+        url=url,
+        status_code=status_code,
+        is_alive=is_alive,
+        category=category,
+        line_number=line_number,
+        error=error,
+        archive_url=archive_url,
+        response_time=response_time,
+    )
+
+    cache.set(url, {
+        "status_code": status_code,
+        "is_alive": is_alive,
+        "error": error,
+        "archive_url": archive_url,
+        "response_time": response_time,
+    })
+
+    return result
+
+
+def validate_links(
+    readme_path: str,
+    max_workers: int = DEFAULT_MAX_WORKERS,
+    timeout: int = DEFAULT_TIMEOUT,
+    cache_file: str = DEFAULT_CACHE_FILE,
+    verbose: bool = False,
+) -> ValidationReport:
+    """Validate all links in a README file."""
+    report = ValidationReport(start_time=time.time())
+
+    with open(readme_path, "r", encoding="utf-8") as f:
+        content = f.read()
+
+    categories = extract_categories(content)
+    cache = LinkCache(cache_file)
+
+    # Flatten all links with their categories
+    all_links = []
+    for cat, links in categories.items():
+        for text, url, line_num in links:
+            all_links.append((url, cat, line_num))
+            report.categories[cat] = report.categories.get(cat, 0) + 1
+
+    report.total_links = len(all_links)
+    checked = 0
+
+    if verbose:
+        print(f"Found {report.total_links} links across {len(categories)} categories")
+
+    # Check links concurrently
+    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+        futures = {
+            executor.submit(check_link_with_cache, url, cat, line_num, cache, timeout): (url, cat, line_num)
+            for url, cat, line_num in all_links
+        }
+
+        for future in concurrent.futures.as_completed(futures):
+            result = future.result()
+            report.results.append(result)
+
+            if result.is_alive:
+                report.alive_links += 1
+            elif result.error:
+                report.errors += 1
+                report.dead_links += 1
+            else:
+                report.dead_links += 1
+
+            checked += 1
+            if verbose and checked % 10 == 0:
+                print(f"Progress: {checked}/{report.total_links} links checked")
+
+    cache.save()
+    report.end_time = time.time()
+    return report
+
+
+def print_report(report: ValidationReport):
+    """Print a formatted validation report."""
+    duration = report.end_time - report.start_time
+    print("\n" + "=" * 60)
+    print("LINK VALIDATION REPORT")
+    print("=" * 60)
+    print(f"Total links checked: {report.total_links}")
+    print(f"Alive: {report.alive_links}")
+    print(f"Dead:  {report.dead_links}")
+    print(f"Errors: {report.errors}")
+    print(f"Duration: {duration:.1f}s")
+    print()
+
+    # Print categories summary
+    print("Categories:")
+    for cat, count in sorted(report.categories.items()):
+        print(f"  {cat}: {count} links")
+    print()
+
+    # Print dead links
+    dead = [r for r in report.results if not r.is_alive]
+    if dead:
+        print("Dead Links:")
+        for r in dead:
+            print(f"  Line {r.line_number}: {r.url}")
+            print(f"    Status: {r.status_code} | Error: {r.error}")
+            if r.archive_url:
+                print(f"    Archive: {r.archive_url}")
+        print()
+
+
+def main():
+    """Main entry point."""
+    parser = argparse.ArgumentParser(description="Validate tutorial links in README.md")
+    parser.add_argument("readme", nargs="?", default="README.md", help="Path to README.md")
+    parser.add_argument("--workers", type=int, default=DEFAULT_MAX_WORKERS, help="Max concurrent workers")
+    parser.add_argument("--timeout", type=int, default=DEFAULT_TIMEOUT, help="Request timeout in seconds")
+    parser.add_argument("--cache-file", default=DEFAULT_CACHE_FILE, help="Cache file path")
+    parser.add_argument("--no-cache", action="store_true", help="Disable caching")
+    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
+    parser.add_argument("--json-output", help="Write results as JSON to file")
+    args = parser.parse_args()
+
+    if not os.path.exists(args.readme):
+        print(f"Error: {args.readme} not found", file=sys.stderr)
+        sys.exit(1)
+
+    cache_file = args.cache_file if not args.no_cache else None
+    if cache_file is None:
+        cache_file = os.devnull
+
+    report = validate_links(
+        args.readme,
+        max_workers=args.workers,
+        timeout=args.timeout,
+        cache_file=cache_file,
+        verbose=args.verbose,
+    )
+
+    print_report(report)
+
+    if args.json_output:
+        with open(args.json_output, "w", encoding="utf-8") as f:
+            json.dump({
+                "total": report.total_links,
+                "alive": report.alive_links,
+                "dead": report.dead_links,
+                "errors": report.errors,
+                "results": [asdict(r) for r in report.results],
+            }, f, indent=2)
+        print(f"JSON report written to {args.json_output}")
+
+    sys.exit(1 if report.dead_links > 0 else 0)
+
+
+if __name__ == "__main__":
+    main()