From ea0ce0872b3375c5c140f48c5848aa176ed22916 Mon Sep 17 00:00:00 2001
From: Cameron Crouch <19394466+crouchcd@users.noreply.github.com>
Date: Fri, 16 Jan 2026 04:25:51 +0000
Subject: [PATCH] Add language statistics reporting feature

- Add generate_stats.py script to analyze README.md and extract language distribution
- Generate STATS-main.md with visualized language statistics
- Add GitHub Actions workflow to auto-update stats when README.md changes
---
 .github/scripts/generate_stats.py  | 159 +++++++++++++++++++++++++++++
 .github/workflows/update-stats.yml |  49 +++++++++
 STATS-main.md                      |  53 ++++++++++
 3 files changed, 261 insertions(+)
 create mode 100644 .github/scripts/generate_stats.py
 create mode 100644 .github/workflows/update-stats.yml
 create mode 100644 STATS-main.md

diff --git a/.github/scripts/generate_stats.py b/.github/scripts/generate_stats.py
new file mode 100644
--- /dev/null
+++ b/.github/scripts/generate_stats.py
@@ -0,0 +1,159 @@
+#!/usr/bin/env python3
+"""
+Generate language statistics from README.md and create a visualization in STATS-main.md
+"""
+
+import re
+from collections import Counter
+from datetime import datetime, timezone
+from typing import Dict, List, Tuple
+
+# Matches the start of a project entry: * [**Language**: _Title_](url)
+# The captured field may list several languages separated by / or ,
+PROJECT_ENTRY_PATTERN = re.compile(r'^\* \[\*\*([^*]+)\*\*:', re.MULTILINE)
+
+def find_project_entries(filename: str = 'README.md') -> List[str]:
+    """Return the raw language field of every project entry in the file."""
+    with open(filename, 'r', encoding='utf-8') as f:
+        return PROJECT_ENTRY_PATTERN.findall(f.read())
+
+def extract_languages_from_readme(filename: str = 'README.md') -> List[str]:
+    """Extract all programming languages mentioned in project entries."""
+    languages = []
+
+    for match in find_project_entries(filename):
+        # Split by / or , and clean up whitespace
+        for lang in re.split(r'\s*/\s*|\s*,\s*', match):
+            lang = lang.strip()
+            if lang:
+                languages.append(lang)
+
+    return languages
+
+def normalize_language(lang: str) -> str:
+    """Normalize language names for consistency."""
+    # Handle common variations
+    normalizations = {
+        'Node.js': 'JavaScript'
+    }
+    return normalizations.get(lang, lang)
+
+def count_languages(languages: List[str]) -> Dict[str, int]:
+    """Count occurrences of each language."""
+    normalized = [normalize_language(lang) for lang in languages]
+    return dict(Counter(normalized))
+
+def create_horizontal_bar(count: int, max_count: int, bar_width: int = 50) -> str:
+    """Create a horizontal bar for visualization.
+
+    The bar is always exactly bar_width characters wide: a non-positive
+    max_count yields an empty bar, and a count above max_count is clamped
+    to a full bar instead of overflowing the column.
+    """
+    if max_count <= 0:
+        return '░' * bar_width
+    filled = int((count / max_count) * bar_width)
+    filled = max(0, min(bar_width, filled))
+    return '█' * filled + '░' * (bar_width - filled)
+
+def generate_stats_markdown(language_counts: Dict[str, int], num_projects: int) -> str:
+    """Generate the markdown content for STATS-main.md.
+
+    Percentages are relative to num_projects, so they can sum to more than
+    100% when projects list several languages. Empty input (no languages,
+    zero projects) produces a report with an empty table instead of raising.
+    """
+    # Sort by count (descending) then by name
+    sorted_langs = sorted(language_counts.items(), key=lambda x: (-x[1], x[0]))
+
+    total_language_mentions = sum(language_counts.values())
+
+    # Separate languages >= 1% and < 1%
+    threshold = num_projects * 0.01 # 1% threshold
+    main_langs = [(lang, count) for lang, count in sorted_langs if count >= threshold]
+    other_langs = [(lang, count) for lang, count in sorted_langs if count < threshold]
+
+    # Calculate "Other" total
+    other_count = sum(count for _, count in other_langs)
+
+    # Scale bars against the largest row shown: the aggregated "Other" row can
+    # exceed the top single language, and default=0 covers an empty README.
+    max_count = max(max(language_counts.values(), default=0), other_count)
+
+    def percentage(count: int) -> float:
+        # Guard against ZeroDivisionError when no project entries were found
+        return (count / num_projects) * 100 if num_projects else 0.0
+
+    # Build markdown content
+    lines = [
+        "# Build Your Own X - Language Statistics\n",
+        f"**Total Projects:** {num_projects}\n",
+        f"**Total Language Mentions:** {total_language_mentions} *(some projects support multiple languages)*\n",
+        f"**Unique Languages:** {len(language_counts)}\n",
+        f"**Last Updated:** {get_current_date()}\n",
+        "---\n",
+        "## Language Distribution\n",
+        "| Language | Count | Percentage | Distribution |",
+        "|----------|-------|------------|--------------|"
+    ]
+
+    for lang, count in main_langs:
+        bar = create_horizontal_bar(count, max_count, 30)
+        lines.append(f"| {lang} | {count} | {percentage(count):.1f}% | {bar} |")
+
+    # Add "Other" category if there are languages < 1%
+    if other_langs:
+        bar = create_horizontal_bar(other_count, max_count, 30)
+        lines.append(f"| Other* | {other_count} | {percentage(other_count):.1f}% | {bar} |")
+
+    lines.append("\n---\n")
+    lines.append("## Top 10 Languages\n")
+
+    for i, (lang, count) in enumerate(sorted_langs[:10], 1):
+        lines.append(f"{i}. **{lang}**: {count} projects ({percentage(count):.1f}%)")
+
+    # Add footnote for "Other" languages
+    if other_langs:
+        # Separate the footnotes from the list above, like the other sections
+        lines.append("\n---\n")
+        lines.append("## Footnotes\n")
+        lines.append("**\\* Other languages** (each < 1% of total projects): ")
+        # other_langs is already ordered by count (descending) then by name
+        lines.append(", ".join(f"{lang} ({count})" for lang, count in other_langs))
+
+    return '\n'.join(lines) + '\n'
+
+def get_current_date() -> str:
+    """Get current date (UTC) in YYYY-MM-DD format."""
+    return datetime.now(timezone.utc).strftime('%Y-%m-%d')
+
+def count_projects(filename: str = 'README.md') -> int:
+    """Count the actual number of project entries."""
+    return len(find_project_entries(filename))
+
+def main():
+    print("Analyzing README.md...")
+    num_projects = count_projects()
+    print(f"Found {num_projects} project entries")
+
+    languages = extract_languages_from_readme()
+    print(f"Extracted {len(languages)} language mentions (some projects list multiple languages)")
+
+    language_counts = count_languages(languages)
+    print(f"Detected {len(language_counts)} unique languages")
+
+    print("\nGenerating STATS-main.md...")
+    stats_content = generate_stats_markdown(language_counts, num_projects)
+
+    with open('STATS-main.md', 'w', encoding='utf-8') as f:
+        f.write(stats_content)
+
+    print("✓ STATS-main.md generated successfully!")
+    print("\nTop 5 languages:")
+    # Same tie-break (name) as the generated report
+    sorted_langs = sorted(language_counts.items(), key=lambda x: (-x[1], x[0]))
+    for lang, count in sorted_langs[:5]:
+        print(f" - {lang}: {count}")
+
+if __name__ == '__main__':
+    main()
diff --git a/.github/workflows/update-stats.yml b/.github/workflows/update-stats.yml
new file mode 100644
--- /dev/null
+++ b/.github/workflows/update-stats.yml
@@ -0,0 +1,49 @@
+name: Update Language Statistics
+
+on:
+  push:
+    branches:
+      - main
+      - master
+    paths:
+      - 'README.md'
+  workflow_dispatch:
+
+# The job commits and pushes STATS-main.md, so the token needs write access
+permissions:
+  contents: write
+
+jobs:
+  update-stats:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.x'
+
+      - name: Generate language statistics
+        run: python3 .github/scripts/generate_stats.py
+
+      - name: Check for changes
+        id: git-check
+        run: |
+          # git status also reports the file when it is still untracked
+          if [ -n "$(git status --porcelain -- STATS-main.md)" ]; then
+            echo "changed=true" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Commit and push if changed
+        if: steps.git-check.outputs.changed == 'true'
+        run: |
+          git config --local user.email "github-actions[bot]@users.noreply.github.com"
+          git config --local user.name "github-actions[bot]"
+          git add STATS-main.md
+          git commit -m "Auto-update language statistics [skip ci]"
+          git push
diff --git a/STATS-main.md b/STATS-main.md
new file mode 100644
--- /dev/null
+++ b/STATS-main.md
@@ -0,0 +1,53 @@
+# Build Your Own X - Language Statistics
+
+**Total Projects:** 350
+
+**Total Language Mentions:** 356 *(some projects support multiple languages)*
+
+**Unique Languages:** 35
+
+**Last Updated:** 2026-01-16
+
+---
+
+## Language Distribution
+
+| Language | Count | Percentage | Distribution |
+|----------|-------|------------|--------------|
+| JavaScript | 69 | 19.7% | ██████████████████████████████ |
+| Python | 68 | 19.4% | █████████████████████████████░ |
+| C | 49 | 14.0% | █████████████████████░░░░░░░░░ |
+| C++ | 33 | 9.4% | ██████████████░░░░░░░░░░░░░░░░ |
+| Go | 23 | 6.6% | ██████████░░░░░░░░░░░░░░░░░░░░ |
+| Rust | 17 | 4.9% | ███████░░░░░░░░░░░░░░░░░░░░░░░ |
+| C# | 16 | 4.6% | ██████░░░░░░░░░░░░░░░░░░░░░░░░ |
+| Ruby | 13 | 3.7% | █████░░░░░░░░░░░░░░░░░░░░░░░░░ |
+| Java | 9 | 2.6% | ███░░░░░░░░░░░░░░░░░░░░░░░░░░░ |
+| Nim | 9 | 2.6% | ███░░░░░░░░░░░░░░░░░░░░░░░░░░░ |
+| Haskell | 6 | 1.7% | ██░░░░░░░░░░░░░░░░░░░░░░░░░░░░ |
+| PHP | 5 | 1.4% | ██░░░░░░░░░░░░░░░░░░░░░░░░░░░░ |
+| TypeScript | 5 | 1.4% | ██░░░░░░░░░░░░░░░░░░░░░░░░░░░░ |
+| (any) | 4 | 1.1% | █░░░░░░░░░░░░░░░░░░░░░░░░░░░░░ |
+| Other* | 30 | 8.6% | █████████████░░░░░░░░░░░░░░░░░ |
+
+---
+
+## Top 10 Languages
+
+1. **JavaScript**: 69 projects (19.7%)
+2. **Python**: 68 projects (19.4%)
+3. **C**: 49 projects (14.0%)
+4. **C++**: 33 projects (9.4%)
+5. **Go**: 23 projects (6.6%)
+6. **Rust**: 17 projects (4.9%)
+7. **C#**: 16 projects (4.6%)
+8. **Ruby**: 13 projects (3.7%)
+9. **Java**: 9 projects (2.6%)
+10. **Nim**: 9 projects (2.6%)
+
+---
+
+## Footnotes
+
+**\* Other languages** (each < 1% of total projects): 
+Assembly (3), Clojure (2), Crystal (2), F# (2), Kotlin (2), Lua (2), OCaml (2), Scala (2), ATS (1), Alloy (1), CSS (1), Common Lisp (1), Elixir (1), Pascal (1), Perl (1), Pseudocode (1), R (1), Racket (1), Shell (1), Swift (1), Zig (1)