build-your-own-x/.github/scripts/generate_stats.py

#!/usr/bin/env python3
"""
Generate language statistics from README.md and create a visualization in STATS-main.md
"""

import re
from collections import Counter
from typing import Dict, List, Tuple

def extract_languages_from_readme(filename: str = 'README.md') -> List[str]:
    """Extract all programming languages mentioned in project entries."""
    languages = []

    with open(filename, 'r', encoding='utf-8') as f:
        content = f.read()

    # Pattern to match: * [**Language**: _Title_](url)
    # Handles multiple languages separated by / or ,
    pattern = r'^\* \[\*\*([^*]+)\*\*:'

    matches = re.findall(pattern, content, re.MULTILINE)

    for match in matches:
        # Split by / or , and clean up whitespace
        langs = re.split(r'\s*/\s*|\s*,\s*', match)
        for lang in langs:
            lang = lang.strip()
            if lang:
                languages.append(lang)

    return languages

def normalize_language(lang: str) -> str:
    """Normalize language names for consistency."""
    # Handle common variations
    normalizations = {
        'Node.js': 'JavaScript'
    }
    return normalizations.get(lang, lang)

def count_languages(languages: List[str]) -> Dict[str, int]:
    """Count occurrences of each language."""
    normalized = [normalize_language(lang) for lang in languages]
    return dict(Counter(normalized))

def create_horizontal_bar(count: int, max_count: int, bar_width: int = 50) -> str:
    """Create a horizontal bar for visualization."""
    filled = int((count / max_count) * bar_width)
    bar = '█' * filled + '░' * (bar_width - filled)
    return bar

def generate_stats_markdown(language_counts: Dict[str, int], num_projects: int) -> str:
    """Generate the markdown content for STATS-main.md."""
    # Sort by count (descending) then by name
    sorted_langs = sorted(language_counts.items(), key=lambda x: (-x[1], x[0]))

    total_language_mentions = sum(language_counts.values())
    max_count = max(language_counts.values())

    # Separate languages >= 1% and < 1%
    threshold = num_projects * 0.01  # 1% threshold
    main_langs = []
    other_langs = []

    for lang, count in sorted_langs:
        if count >= threshold:
            main_langs.append((lang, count))
        else:
            other_langs.append((lang, count))

    # Calculate "Other" total
    other_count = sum(count for _, count in other_langs)

    # Build markdown content
    lines = [
        "# Build Your Own X - Language Statistics\n",
        f"**Total Projects:** {num_projects}\n",
        f"**Total Language Mentions:** {total_language_mentions} *(some projects support multiple languages)*\n",
        f"**Unique Languages:** {len(language_counts)}\n",
        f"**Last Updated:** {get_current_date()}\n",
        "---\n",
        "## Language Distribution\n",
        "| Language | Count | Percentage | Distribution |",
        "|----------|-------|------------|--------------|"
    ]

    for lang, count in main_langs:
        percentage = (count / num_projects) * 100
        bar = create_horizontal_bar(count, max_count, 30)
        lines.append(f"| {lang} | {count} | {percentage:.1f}% | {bar} |")

    # Add "Other" category if there are languages < 1%
    if other_langs:
        percentage = (other_count / num_projects) * 100
        bar = create_horizontal_bar(other_count, max_count, 30)
        lines.append(f"| Other* | {other_count} | {percentage:.1f}% | {bar} |")

    lines.append("\n---\n")
    lines.append("## Top 10 Languages\n")

    for i, (lang, count) in enumerate(sorted_langs[:10], 1):
        percentage = (count / num_projects) * 100
        lines.append(f"{i}. **{lang}**: {count} projects ({percentage:.1f}%)")

    # Add footnote for "Other" languages
    if other_langs:
        lines.append("## Footnotes\n")
        lines.append(f"**\\* Other languages** (each < 1% of total projects): ")
        other_names = [f"{lang} ({count})" for lang, count in sorted(other_langs, key=lambda x: (-x[1], x[0]))]
        lines.append(", ".join(other_names))

    return '\n'.join(lines) + '\n'

def get_current_date() -> str:
    """Get current date in YYYY-MM-DD format."""
    from datetime import datetime
    return datetime.now().strftime('%Y-%m-%d')

def count_projects(filename: str = 'README.md') -> int:
    """Count the actual number of project entries."""
    with open(filename, 'r', encoding='utf-8') as f:
        content = f.read()
    pattern = r'^\* \[\*\*([^*]+)\*\*:'
    matches = re.findall(pattern, content, re.MULTILINE)
    return len(matches)

def main():
    print("Analyzing README.md...")
    num_projects = count_projects()
    print(f"Found {num_projects} project entries")

    languages = extract_languages_from_readme()
    print(f"Extracted {len(languages)} language mentions (some projects list multiple languages)")

    language_counts = count_languages(languages)
    print(f"Detected {len(language_counts)} unique languages")

    print("\nGenerating STATS-main.md...")
    stats_content = generate_stats_markdown(language_counts, num_projects)

    with open('STATS-main.md', 'w', encoding='utf-8') as f:
        f.write(stats_content)

    print("✓ STATS-main.md generated successfully!")
    print(f"\nTop 5 languages:")
    sorted_langs = sorted(language_counts.items(), key=lambda x: -x[1])
    for lang, count in sorted_langs[:5]:
        print(f"  - {lang}: {count}")

if __name__ == '__main__':
    main()