clone-sweeper/clone_sweeper.py at main · HelixCipher/clone-sweeper · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python3
"""
Enhanced: renders SVGs using Jinja2 templates, persists per-repo history snapshots,
and produces five artifacts:

  - stats.svg        (summary card — top-N repos by clones)
  - repo_clones.svg  (full tabular SVG)
  - stats.json       (summary JSON for frontend)
  - repo_clones.json (detailed JSON for frontend)
  - history.json     (monthly/yearly history for frontend)

Behavior notes:
 - By default only public repos are processed. Use INCLUDE_PRIVATE env or --include-private to opt-in.
 - PAT environment variable name defaults to TOKEN (override with --token-env).
 - For more information read README.md
"""
import os
import sys
import shutil
import argparse
import datetime
import requests
import time
import subprocess
import json
from typing import Optional, List, Dict, Any, Tuple
from urllib.parse import urlparse
import re
import sqlite3
from jinja2 import Template

# Base GitHub API constants
# API_BASE is the URL prefix that every endpoint path built in this file is appended to.
API_BASE = "https://api.github.com"
# Headers shared by all requests; request_with_auth() copies this dict and adds
# an Authorization entry on top when a token is supplied.
HEADERS_COMMON = {
    "Accept": "application/vnd.github.v3+json",
    "User-Agent": "clone-sweeper/1.0",
}

# ---------------------------
# Helpers
# ---------------------------
def escape_xml(s: Optional[str]) -> str:
    """
    Return `s` with the five XML special characters replaced by entities.

    Repository names and descriptions originate outside this program, so any
    of &, <, >, " or ' in them would corrupt the generated SVG if written
    verbatim into a text node or attribute value.

    `s` is converted with str() first, so non-string values are accepted.
    None yields an empty string.
    """
    if s is None:
        return ""
    # Each input character is mapped exactly once, so the "&" introduced by an
    # entity is never escaped a second time.
    entities = {
        "&": "&amp;",
        "<": "&lt;",
        ">": "&gt;",
        '"': "&quot;",
        "'": "&apos;",
    }
    return "".join(entities.get(ch, ch) for ch in str(s))

def ensure_dir(path: str):
    """
    Create the directory `path` (and any missing parents) if nothing exists there yet.

    When `path` already exists - as a directory or as anything else - the
    function returns without touching the filesystem.
    """
    if os.path.exists(path):
        return
    os.makedirs(path, exist_ok=True)

# ---------------------------
# HTTP helpers
# ---------------------------
def request_with_auth(url: str, token: Optional[str] = None, params: dict = None) -> requests.Response:
    """
    Issue a GET request to `url`, authenticating with `token` when one is given.

    The request carries the entries of HEADERS_COMMON; a truthy `token` adds an
    `Authorization: token <token>` header on top of them. `params` is sent as
    the query string (an empty dict is used when it is None or empty) and the
    call uses a 30 second timeout.

    The `requests.Response` is returned untouched: checking the status code and
    decoding the body is left to the caller.
    """
    auth_header = {"Authorization": f"token {token}"} if token else {}
    # Merge into a new dict so the shared HEADERS_COMMON is never mutated.
    headers = {**HEADERS_COMMON, **auth_header}
    query = params if params else {}
    return requests.get(url, headers=headers, params=query, timeout=30)

def paginate(url: str, token: Optional[str] = None, params: dict = None) -> List[Dict[str, Any]]:
    """
    Collect every item from an endpoint that pages via the `Link` response header.

    - `url` is the first page to request.
    - `token` is forwarded to request_with_auth() for every page.
    - `params` is sent with the first request only; later pages are fetched
      from the URL given by the `rel="next"` link, with no extra parameters.

    A JSON list response is concatenated onto the result; any other JSON value
    is appended as a single item. Returns the accumulated list.
    Raises RuntimeError as soon as a page answers with HTTP status >= 400.
    """
    collected = []
    page_url = url
    query = params or {}
    while page_url:
        resp = request_with_auth(page_url, token, params=query)
        if resp.status_code >= 400:
            # Include status and raw body so the failure can be diagnosed
            raise RuntimeError(f"GitHub API error {resp.status_code} for {page_url}: {resp.text}")

        payload = resp.json()
        if isinstance(payload, list):
            collected.extend(payload)
        else:
            # Non-list payload (e.g. a single object): keep it as one item
            collected.append(payload)

        # Look for the comma-separated Link entry tagged rel="next" and take
        # the URL between its angle brackets.
        link_header = resp.headers.get("Link", "")
        page_url = None
        for entry in (link_header.split(",") if link_header else []):
            if 'rel="next"' not in entry:
                continue
            page_url = entry[entry.find("<") + 1:entry.find(">")]
            break

        # Only the first request carries the caller's query parameters.
        query = None
    return collected

# ---------------------------
# Owner detection utilities
# ---------------------------
def get_authenticated_username(token: Optional[str]) -> Optional[str]:
    """
    Look up the login that `token` authenticates as by calling the /user endpoint.

    Returns the "login" field of the response when the call answers HTTP 200.
    Returns None when no token is given, when the status is anything else, or
    when the request or JSON decoding raises.
    """
    login = None
    if token:
        try:
            resp = request_with_auth(f"{API_BASE}/user", token)
            if resp.status_code == 200:
                login = resp.json().get("login")
        except Exception:
            # Best effort only: the caller falls back to other detection strategies
            login = None
    return login

def owner_from_github_repository_env() -> Optional[str]:
    """
    Extract the owner part of the GITHUB_REPOSITORY environment variable.

    The variable is expected to look like "owner/repo"; everything before the
    first "/" is returned. Returns None when the variable is unset, empty, or
    contains no "/".
    """
    slug = os.environ.get("GITHUB_REPOSITORY")
    if not slug or "/" not in slug:
        return None
    owner, _, _ = slug.partition("/")
    return owner

def owner_from_git_remote() -> Optional[str]:
    """
    Derive the owner from the URL of the local git remote named 'origin'.

    Runs `git remote get-url origin` in the current working directory and
    matches the output against the github.com URL forms
    (git@github.com:owner/repo.git or https://github.com/owner/repo.git).

    Returns the owner segment, or None when git is unavailable, the command
    fails, the output is empty, or the URL does not match.
    """
    try:
        completed = subprocess.run(["git", "remote", "get-url", "origin"], capture_output=True, text=True, check=True)
        remote = completed.stdout.strip()
        if remote:
            # The owner is the path segment right after "github.com:" or "github.com/"
            match = re.search(r"github\.com[:/]+([^/]+)/[^/]+(?:\.git)?$", remote)
            if match is not None:
                return match.group(1)
    except Exception:
        # Missing git binary or a failing command: let other detection methods run
        pass
    return None

def detect_owner(provided_owner: Optional[str], token_env: str) -> str:
    """
    Resolve the GitHub owner (user or organization) the script should operate on.

    Sources are tried in this order and the first non-empty answer wins:
      1) `provided_owner` (the explicit --owner value)
      2) the GITHUB_REPOSITORY environment variable
      3) the login of the token stored in the environment variable `token_env`
      4) the local git remote 'origin'
      5) an interactive prompt, only when stdin is a TTY

    Sources 2-4 print a one-line note naming where the owner came from.
    Raises RuntimeError when every source comes up empty.
    """
    # 1) An explicit value needs no detection at all
    if provided_owner:
        return provided_owner

    # 2) GitHub Actions style environment
    from_env = owner_from_github_repository_env()
    if from_env:
        print(f"Detected owner from GITHUB_REPOSITORY: {from_env}")
        return from_env

    # 3) Ask the API who the token belongs to (skipped when no token is set)
    pat = os.environ.get(token_env)
    from_token = get_authenticated_username(pat) if pat else None
    if from_token:
        print(f"Detected owner from TOKEN (/user): {from_token}")
        return from_token

    # 4) Local checkout
    from_git = owner_from_git_remote()
    if from_git:
        print(f"Detected owner from git remote: {from_git}")
        return from_git

    # 5) Last resort: ask a human, but only if one can answer
    if sys.stdin and sys.stdin.isatty():
        typed = input("Could not auto-detect GitHub owner. Enter GitHub username: ").strip()
        if typed:
            return typed

    raise RuntimeError("Unable to determine GitHub owner automatically. Provide --owner or set TOKEN/GITHUB_REPOSITORY or run inside a git repo.")

# ---------------------------
# Fetch repos & traffic
# ---------------------------
def fetch_all_repos(owner: str, token: Optional[str]) -> List[Dict[str, Any]]:
    """
    Return the repository list for `owner`, following pagination.

    When `token` authenticates as `owner` (compared case-insensitively) the
    /user/repos endpoint is queried, which can include private repositories if
    the token permits it. In every other case /users/<owner>/repos is queried
    with type=owner, which only yields public repositories.

    Prints the endpoint used and the number of repositories found.
    Errors raised by paginate() propagate to the caller.
    """
    auth_user = get_authenticated_username(token) if token else None
    authenticated_as_owner = bool(auth_user) and auth_user.lower() == owner.lower()

    if authenticated_as_owner:
        # Own account: the authenticated endpoint also exposes private repos
        endpoint = f"{API_BASE}/user/repos"
        query = {"per_page": 100, "sort": "pushed"}
    else:
        # No token, or a token for somebody else: public listing only
        endpoint = f"{API_BASE}/users/{owner}/repos"
        query = {"per_page": 100, "type": "owner", "sort": "pushed"}

    print(f"Fetching repos from: {endpoint} (authenticated as: {auth_user})")
    found = paginate(endpoint, token=token, params=query)
    print(f"Found {len(found)} repos.")
    return found

def fetch_clone_stats(owner: str, repo_name: str, token: Optional[str]) -> Dict[str, Optional[int]]:
    """
    Read clone traffic for one repository from
      GET /repos/{owner}/{repo}/traffic/clones

    Returns a dict with two keys taken from the response body:
      - "count": the response's "count" field
      - "uniques": the response's "uniques" field

    Both values are None when the endpoint answers with anything other than
    HTTP 200, or when the request or JSON decoding raises; in either case a
    short message is printed instead of raising.
    """
    url = f"{API_BASE}/repos/{owner}/{repo_name}/traffic/clones"
    try:
        resp = request_with_auth(url, token)
        if resp.status_code != 200:
            # Usually 401/403: the token lacks access to traffic data
            print(f"  traffic/clones unavailable for {repo_name}: HTTP {resp.status_code}")
            return dict.fromkeys(("count", "uniques"))
        payload = resp.json()
        return {key: payload.get(key) for key in ("count", "uniques")}
    except Exception as e:
        # Network failure or a body that is not the expected JSON
        print(f"  error fetching traffic for {repo_name}: {e}")
        return dict.fromkeys(("count", "uniques"))

def fetch_download_stats(owner: str, repo_name: str, token: Optional[str]) -> Optional[int]:
    """
    Add up the download counters of every release asset of one repository, using
      GET /repos/{owner}/{repo}/releases

    Returns the summed "download_count" of all assets across all releases.
    Returns None, after printing an explanatory line, when the repository has
    no releases, when its releases carry no assets, or when fetching or
    processing raises.

    Note: only release assets are counted here; repo ZIP downloads are not
    part of the data this endpoint returns.
    """
    url = f"{API_BASE}/repos/{owner}/{repo_name}/releases"
    try:
        releases = paginate(url, token)
        downloads = 0
        asset_total = 0
        n_releases = len(releases)

        # Walk every asset of every release in one flat pass
        for asset in (a for rel in releases for a in rel.get("assets", [])):
            downloads += asset.get("download_count", 0)
            asset_total += 1

        if n_releases == 0:
            print(f"  No releases found for {repo_name} - downloads require published releases with assets")
            return None
        if asset_total == 0:
            print(f"  {repo_name} has {n_releases} release(s) but no downloadable assets - downloads show N/A")
            return None

        print(f"  {repo_name}: {downloads} downloads from {asset_total} assets in {n_releases} releases")
        return downloads
    except Exception as e:
        # Network failure or an unexpected response shape
        print(f"  error fetching downloads for {repo_name}: {e}")
        return None

# ---------------------------
# History persistence (SQLite)
# ---------------------------
# Path of the SQLite file opened by every history helper below. It is relative,
# so it resolves against the process's current working directory.
DB_PATH = "history.db"

def init_db():
    """
    Create the history database schema at DB_PATH if it is not there yet.

    Ensures the `repo_clones` table exists with one row per (repo_name, day)
    holding clone_count, unique_clones and download_count. Databases created
    by an older schema without `download_count` are migrated by adding that
    column with a default of 0. Calling the function repeatedly is harmless.

    The connection is always closed, including when a statement raises;
    sqlite3 errors propagate to the caller.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS repo_clones (
                repo_name TEXT NOT NULL,
                day TEXT NOT NULL,
                clone_count INTEGER,
                unique_clones INTEGER,
                download_count INTEGER DEFAULT 0,
                PRIMARY KEY (repo_name, day)
            )
        """)
        # Migration from the older schema: PRAGMA table_info yields one row per
        # column, with the column name in position 1.
        cursor.execute("PRAGMA table_info(repo_clones)")
        columns = {col[1] for col in cursor.fetchall()}
        if "download_count" not in columns:
            cursor.execute("ALTER TABLE repo_clones ADD COLUMN download_count INTEGER DEFAULT 0")
        conn.commit()
    finally:
        # Release the file handle even if schema creation or migration failed
        conn.close()

def upsert_clone_data(repo_name: str, day: str, clone_count: Optional[int], unique_clones: Optional[int], download_count: Optional[int] = 0):
    """
    Store the snapshot for one repository and day, replacing any existing row.

    - `repo_name` and `day` form the primary key of the `repo_clones` table.
    - `clone_count` and `unique_clones` are stored as given (None becomes NULL).
    - `download_count` is stored as 0 when it is None or otherwise falsy.

    Because INSERT OR REPLACE is used, writing the same (repo_name, day) twice
    keeps only the most recent values.

    The connection is always closed, including when the statement raises;
    sqlite3 errors (for example a missing table) propagate to the caller.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()
        cursor.execute("""
            INSERT OR REPLACE INTO repo_clones (repo_name, day, clone_count, unique_clones, download_count)
            VALUES (?, ?, ?, ?, ?)
        """, (repo_name, day, clone_count, unique_clones, download_count or 0))
        conn.commit()
    finally:
        # Do not leak the connection when the write fails
        conn.close()

def remove_missing_repos(current_repos: List[str]):
    """
    Delete the history rows of every repository that is not in `current_repos`.

    This covers repositories that were deleted or moved away from the owner's
    account. An empty `current_repos` is treated as "nothing known" and leaves
    the table untouched rather than wiping it.

    Stale names are found by comparing the stored repo names against
    `current_repos` in Python and are then deleted one name per statement.
    The number of bound SQL parameters therefore does not grow with the
    length of `current_repos`, which keeps very long repository lists clear
    of SQLite's host-parameter limit.

    The connection is always closed, including when a statement raises;
    sqlite3 errors propagate to the caller.
    """
    if not current_repos:
        return
    keep = set(current_repos)
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT DISTINCT repo_name FROM repo_clones")
        stale = [(name,) for (name,) in cursor.fetchall() if name not in keep]
        cursor.executemany("DELETE FROM repo_clones WHERE repo_name = ?", stale)
        conn.commit()
    finally:
        # Do not leak the connection when a statement fails
        conn.close()

def read_history_from_db(repo_name: str) -> List[Tuple[datetime.datetime, Optional[int], Optional[int], Optional[int]]]:
    """
    Load the stored snapshots of one repository, oldest first.

    Returns a list of (datetime, clone_count, unique_clones, download_count)
    tuples ordered by the stored `day` text ascending; the three counters are
    None where the database holds NULL. Rows whose `day` value cannot be
    parsed by datetime.fromisoformat() are skipped. An unknown repository
    yields an empty list.

    The connection is always closed, including when the query raises;
    sqlite3 errors propagate to the caller.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cursor = conn.cursor()
        cursor.execute("""
            SELECT day, clone_count, unique_clones, download_count
            FROM repo_clones
            WHERE repo_name = ?
            ORDER BY day ASC
        """, (repo_name,))
        fetched = cursor.fetchall()
    finally:
        # Do not leak the connection when the query fails
        conn.close()

    rows = []
    for day_str, clone_count, unique_clones, download_count in fetched:
        try:
            dt = datetime.datetime.fromisoformat(day_str)
        except (TypeError, ValueError):
            # Malformed or non-text day value: ignore that row, keep the rest
            continue
        rows.append((dt, clone_count, unique_clones, download_count))
    return rows

def calculate_downloads_14d(repo_name: str, current_downloads: Optional[int], *,
                            history: Optional[List[Tuple[datetime.datetime, Optional[int], Optional[int], Optional[int]]]] = None,
                            today: Optional[datetime.date] = None) -> Optional[int]:
    """
    Estimate how many downloads happened during the last 14 days.

    Download counters are cumulative, so the result is `current_downloads`
    minus the counter recorded by the most recent snapshot taken on or before
    the date 14 days ago. Using the most recent such snapshot (rather than the
    oldest one in the history) keeps the window at roughly 14 days no matter
    how far back the history reaches.

    Parameters:
      - repo_name: repository whose history is consulted.
      - current_downloads: the cumulative counter as of now, or None.
      - history: optional pre-loaded snapshots in the shape returned by
        read_history_from_db(); loaded from the database when omitted.
      - today: optional reference date; defaults to the current UTC date.

    Returns the non-negative difference, or None when `current_downloads` is
    None, the history is empty, or no snapshot with a recorded download count
    is at least 14 days old.
    """
    if current_downloads is None:
        return None

    if history is None:
        history = read_history_from_db(repo_name)
    if not history:
        return None

    if today is None:
        today = datetime.datetime.now(datetime.timezone.utc).date()
    target_date = today - datetime.timedelta(days=14)

    # Scan newest-first so the first match is the snapshot closest to the
    # 14-day mark; snapshots without a recorded counter cannot serve as baseline.
    baseline = None
    for dt, _, _, download_count in sorted(history, key=lambda row: row[0], reverse=True):
        if download_count is not None and dt.date() <= target_date:
            baseline = download_count
            break

    if baseline is None:
        # Nothing old enough to compare against
        return None

    # Clamp at zero: the cumulative counter can shrink (e.g. when assets are removed)
    return max(0, current_downloads - baseline)


# ---------------------------
# Jinja2 templates (embedded)
# ---------------------------
# Jinja2 source of the summary card: a header line with totals, a colour legend
# and, for each entry of `rows`, a label followed by three stacked bars
# (clones, uniques, combined), each drawn over a grey background bar with its
# count text to the right.
#
# Context variables referenced by the template:
#   width, height, padding               - overall SVG geometry
#   owner                                - shown in the title (escaped with |e)
#   total_repos, total_clones, total_uniques, total_combined,
#   total_downloads_14d, total_downloads_all, mode_note - header line values
#   rows                                 - iterable of items exposing name,
#                                          bar_w_clone, bar_w_uniq, bar_w_comb,
#                                          clone_label, uniq_label, comb_label
#   per_block_h                          - vertical distance between repo blocks
#   bar_x, bar_max_width, bar_h, bar_gap - bar placement and size
# Only the values piped through `|e` are explicitly escaped by the template.
SUMMARY_SVG_TEMPLATE = """
<svg xmlns="http://www.w3.org/2000/svg" width="{{ width }}" height="{{ height }}" viewBox="0 0 {{ width }} {{ height }}" role="img" aria-label="GitHub repository clone statistics">
<style>
  .card  { font-family: "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif; }
  .title { font-weight: 700; font-size: 18px; fill: #0b1220; }
  .meta  { font-weight: 400; font-size: 12px; fill: #374151; opacity: 0.95; }
  .label { font-weight: 600; font-size: 12px; fill: #0b1220; }
  .count { font-weight: 700; font-size: 12px; fill: #0b1220; text-anchor: start; }
  .count-small { font-weight: 600; font-size: 11px; fill:#6b7280; text-anchor: start; }
  .muted { font-size:12px; fill:#6b7280; }

  .bar-clone { fill: #1f6feb; rx:6; }
  .bar-uniq  { fill: #06b6d4; rx:6; }
  .bar-comb  { fill: #16a34a; rx:6; }

  .bar-bg { fill: #e5e7eb; rx:6; }
  @media (prefers-color-scheme: dark) {
    .title, .meta, .label, .count, .count-small, .muted { fill: #ffffff; }
    .meta, .count-small { opacity: 0.9; }
  }
</style>

<rect x="0" y="0" width="{{ width }}" height="{{ height }}" rx="12" fill="transparent"/>
<text x="{{ padding }}" y="34" class="title card">GitHub repos — {{ owner|e }}</text>
<text x="{{ padding }}" y="54" class="meta card">Repos: {{ total_repos }} · Clones: {{ total_clones }} · Uniques: {{ total_uniques }} · Combined: {{ total_combined }} · Downloads (14d): {{ total_downloads_14d }} · Downloads (total): {{ total_downloads_all }} · {{ mode_note }}</text>

<!-- LEGEND: colored squares + labels -->
<g id="legend">
  <rect x="{{ padding }}" y="70" width="12" height="12" class="bar-clone"/>
  <text x="{{ padding + 18 }}" y="80" class="muted card">Clones (14d)</text>

  <rect x="{{ padding + 180 }}" y="70" width="12" height="12" class="bar-uniq"/>
  <text x="{{ padding + 198 }}" y="80" class="muted card">Unique cloners (14d)</text>

  <rect x="{{ padding + 420 }}" y="70" width="12" height="12" class="bar-comb"/>
  <text x="{{ padding + 438 }}" y="80" class="muted card">Combined</text>
</g>

{% set base_y = 102 %}
{% for row in rows %}
  {% set block_top = base_y + loop.index0 * per_block_h %}
  <!-- repository label -->
  <text x="{{ padding }}" y="{{ block_top }}" class="label card">{{ row.name|e }}</text>

  <!-- CLONES bar (top) -->
  <rect x="{{ bar_x }}" y="{{ block_top + 14 }}" width="{{ bar_max_width }}" height="{{ bar_h }}" class="bar-bg"/>
  <rect x="{{ bar_x }}" y="{{ block_top + 14 }}" width="{{ row.bar_w_clone }}" height="{{ bar_h }}" class="bar-clone"/>
  <text x="{{ bar_x + bar_max_width + 12 }}" y="{{ block_top + 14 + 12 }}" class="count card">{{ row.clone_label|e }}</text>

  <!-- UNIQUES bar (middle) -->
  <rect x="{{ bar_x }}" y="{{ block_top + 14 + (bar_h + bar_gap) }}" width="{{ bar_max_width }}" height="{{ bar_h }}" class="bar-bg"/>
  <rect x="{{ bar_x }}" y="{{ block_top + 14 + (bar_h + bar_gap) }}" width="{{ row.bar_w_uniq }}" height="{{ bar_h }}" class="bar-uniq"/>
  <text x="{{ bar_x + bar_max_width + 12 }}" y="{{ block_top + 14 + (bar_h + bar_gap) + 12 }}" class="count-small card">{{ row.uniq_label|e }}</text>

  <!-- COMBINED bar (bottom) -->
  <rect x="{{ bar_x }}" y="{{ block_top + 14 + 2*(bar_h + bar_gap) }}" width="{{ bar_max_width }}" height="{{ bar_h }}" class="bar-bg"/>
  <rect x="{{ bar_x }}" y="{{ block_top + 14 + 2*(bar_h + bar_gap) }}" width="{{ row.bar_w_comb }}" height="{{ bar_h }}" class="bar-comb"/>
  <text x="{{ bar_x + bar_max_width + 12 }}" y="{{ block_top + 14 + 2*(bar_h + bar_gap) + 12 }}" class="count card">{{ row.comb_label|e }}</text>
{% endfor %}

</svg>
"""

# Jinja2 source of the tabular SVG: title and meta lines, a bordered table
# background, one header cell per entry of `cols`, alternating row stripes and
# a footer note.
#
# Context variables referenced by the template:
#   table_w, svg_h, padding              - overall SVG geometry
#   owner                                - shown in the title (escaped with |e)
#   generated_at, mode_note, total_repos, total_clones, total_downloads
#                                        - values of the meta line
#   tbl_x, tbl_y, tbl_w, tbl_h           - table rectangle
#   cols                                 - iterable of items exposing x, hdr, key
#   header_y, sep_y                      - header text and separator line positions
#   rows                                 - iterable of mappings indexed by each
#                                          col.key; the 'description' column reads
#                                          r['_desc_lines'] instead and renders at
#                                          most its first two entries ("-" if empty)
#   row_top_offset, row_h                - row placement and height
#   footer_y, footer_note                - footer text
# Cell values and the owner are piped through `|e`; col.hdr, mode_note and
# footer_note are emitted without that filter.
TABLE_SVG_TEMPLATE = """
<svg xmlns="http://www.w3.org/2000/svg" width="{{ table_w }}" height="{{ svg_h }}" viewBox="0 0 {{ table_w }} {{ svg_h }}" role="img" aria-label="GitHub repository clones table">
<style>
  .card { font-family: "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif; }
  .title { font-weight:700; font-size:16px; fill: #0b1220; }
  .meta  { font-size:12px; fill:#374151; opacity:0.9; }
  .th    { font-size:12px; font-weight:700; fill:#0b1220; }
  .td    { font-size:12px; fill:#0b1220; }
  .muted { font-size:11px; fill:#6b7280; }
  .row-even { fill: #ffffff; }
  .row-odd  { fill: #f8fafc; }
  .table-border { stroke: #e6eaf2; stroke-width: 1; fill: none; }
  @media (prefers-color-scheme: dark) {
    .title, .meta, .th, .td, .muted { fill: #ffffff; }
    .row-even { fill: #0b1220; }
    .row-odd  { fill: #071026; }
    .table-border { stroke: #0f172a; }
  }
</style>
<text x="{{ padding }}" y="{{ padding + 14 }}" class="title card">GitHub repositories — {{ owner|e }}</text>
<text x="{{ padding }}" y="{{ padding + 32 }}" class="meta card">Generated: {{ generated_at }} · Mode: {{ mode_note }} · Repos: {{ total_repos }} · Clones: {{ total_clones }} · Downloads: {{ total_downloads }}</text>
<rect x="{{ tbl_x }}" y="{{ tbl_y }}" width="{{ tbl_w }}" height="{{ tbl_h }}" rx="8" class="row-even"/>
<rect x="{{ tbl_x }}" y="{{ tbl_y }}" width="{{ tbl_w }}" height="{{ tbl_h }}" class="table-border"/>
{% for col in cols %}
  <text x="{{ col.x }}" y="{{ header_y }}" class="th card">{{ col.hdr }}</text>
{% endfor %}
<line x1="{{ tbl_x }}" y1="{{ sep_y }}" x2="{{ tbl_x + tbl_w }}" y2="{{ sep_y }}" stroke="#e6eaf2" />
{% for r in rows %}
  {% set i = loop.index0 %}
  <rect x="{{ tbl_x }}" y="{{ tbl_y + row_top_offset + i*row_h }}" width="{{ tbl_w }}" height="{{ row_h }}" class="{{ 'row-even' if (i%2==0) else 'row-odd' }}" opacity="0.95"/>
  {% for col in cols %}
    {% set val = r[col.key] %}
    {% if col.key == 'name' %}
      <text x="{{ col.x }}" y="{{ tbl_y + row_top_offset + i*row_h + 16 }}" class="td card">{{ val|e }}</text>
    {% elif col.key == 'description' %}
      {% set lines = r['_desc_lines'] %}
      {% if lines|length == 0 %}
        <text x="{{ col.x }}" y="{{ tbl_y + row_top_offset + i*row_h + 16 }}" class="muted card">-</text>
      {% else %}
        <text x="{{ col.x }}" y="{{ tbl_y + row_top_offset + i*row_h + 14 }}" class="td card">{{ lines[0]|e }}</text>
        {% if lines|length > 1 %}
          <text x="{{ col.x }}" y="{{ tbl_y + row_top_offset + i*row_h + 28 }}" class="td card">{{ lines[1]|e }}</text>
        {% endif %}
      {% endif %}
    {% else %}
      <text x="{{ col.x }}" y="{{ tbl_y + row_top_offset + i*row_h + 16 }}" class="td card">{{ val|e }}</text>
    {% endif %}
  {% endfor %}
{% endfor %}
<text x="{{ padding }}" y="{{ footer_y }}" class="muted card">{{ footer_note }}</text>
</svg>
"""

# Jinja2 source of the history SVG: a title block followed by two line charts
# stacked vertically - monthly aggregates on top, yearly aggregates below.
# Each chart draws a vertical and a horizontal axis line, one polyline plus a
# coloured legend label per series, and start/end labels under the x axis.
#
# Context variables referenced by the template:
#   width, height                        - overall SVG geometry
#   owner                                - shown in the title (escaped with |e)
#   margin_left, margin_top, plot_w      - plot area placement shared by both charts
#   monthly_plot_h, yearly_plot_h        - plot heights; the yearly chart is
#                                          translated down by 96 + monthly_plot_h + 64
#   months_count, months_start, months_end - text for the monthly chart
#   years_count, years_start, years_end    - text for the yearly chart
#   monthly_series, yearly_series        - iterables of items exposing points
#                                          (polyline "points" attribute value),
#                                          color and label (label escaped with |e)
HISTORY_SVG_TEMPLATE = """
<svg xmlns="http://www.w3.org/2000/svg" width="{{ width }}" height="{{ height }}" viewBox="0 0 {{ width }} {{ height }}" role="img" aria-label="GitHub repositories clone history">
<style>
  .card { font-family: "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif; }
  .title { font-weight:700; font-size:16px; fill:#0b1220; }
  .muted { font-size:12px; fill:#6b7280; }
  .axis { font-size:11px; fill:#6b7280; }
  .line { fill:none; stroke-width:2; }
  @media (prefers-color-scheme: dark) {
    .title, .muted, .axis { fill: #ffffff; }
  }
</style>

<rect x="0" y="0" width="{{ width }}" height="{{ height }}" fill="transparent" />

<text x="18" y="28" class="title card">
  Clone history (snapshots): {{ owner|e }}
</text>
<text x="18" y="46" class="muted card">
  Monthly and yearly aggregates derived from daily snapshots (per repo).
</text>

<!-- Top: Monthly chart -->
<g transform="translate(0,96)">
  <text x="18" y="14" class="muted card">
    Monthly aggregates · months: {{ months_count }} ({{ months_start }} → {{ months_end }})
  </text>

  <!-- axes -->
  <line x1="{{ margin_left }}" y1="{{ margin_top }}"
        x2="{{ margin_left }}" y2="{{ margin_top + monthly_plot_h }}"
        stroke="#e6eaf2"/>
  <line x1="{{ margin_left }}" y1="{{ margin_top + monthly_plot_h }}"
        x2="{{ margin_left + plot_w }}" y2="{{ margin_top + monthly_plot_h }}"
        stroke="#e6eaf2"/>

  {% for s in monthly_series %}
    <polyline points="{{ s.points }}" class="line" style="stroke:{{ s.color }}"/>
    <text x="{{ margin_left + 6 }}"
          y="{{ margin_top + 14 + loop.index0 * 14 }}"
          class="axis"
          style="fill:{{ s.color }}">
      {{ s.label|e }}
    </text>
  {% endfor %}

  <text x="{{ margin_left }}"
        y="{{ margin_top + monthly_plot_h + 20 }}"
        class="axis card">{{ months_start }}</text>
  <text x="{{ margin_left + plot_w }}"
        y="{{ margin_top + monthly_plot_h + 20 }}"
        class="axis card"
        text-anchor="end">{{ months_end }}</text>
</g>

<!-- Bottom: Yearly chart -->
<g transform="translate(0,{{ 96 + monthly_plot_h + 64 }})">
  <text x="18" y="14" class="muted card">
    Yearly aggregates · years: {{ years_count }} ({{ years_start }} → {{ years_end }})
  </text>

  <!-- axes -->
  <line x1="{{ margin_left }}" y1="{{ margin_top }}"
        x2="{{ margin_left }}" y2="{{ margin_top + yearly_plot_h }}"
        stroke="#e6eaf2"/>
  <line x1="{{ margin_left }}" y1="{{ margin_top + yearly_plot_h }}"
        x2="{{ margin_left + plot_w }}" y2="{{ margin_top + yearly_plot_h }}"
        stroke="#e6eaf2"/>

  {% for s in yearly_series %}
    <polyline points="{{ s.points }}" class="line" style="stroke:{{ s.color }}"/>
    <text x="{{ margin_left + 6 }}"
          y="{{ margin_top + 14 + loop.index0 * 14 }}"
          class="axis"
          style="fill:{{ s.color }}">
      {{ s.label|e }}
    </text>
  {% endfor %}

  <text x="{{ margin_left }}"
        y="{{ margin_top + yearly_plot_h + 20 }}"
        class="axis card">{{ years_start }}</text>
  <text x="{{ margin_left + plot_w }}"
        y="{{ margin_top + yearly_plot_h + 20 }}"
        class="axis card"
        text-anchor="end">{{ years_end }}</text>
</g>

</svg>
"""


# ---------------------------
# Rendering helpers
# ---------------------------
def render_template(template_str: str, ctx: dict) -> str:
    """
    Render the Jinja2 source `template_str` and return the resulting text.

    Every key of `ctx` is passed to the template as a keyword argument, so it
    becomes a variable of the same name inside the template. A new Template
    object is built on each call.
    """
    return Template(template_str).render(**ctx)

# ---------------------------
# Aggregate history helpers
# ---------------------------
def month_key(dt: datetime.datetime) -> Tuple[int, int]:
    """Return the (year, month) pair identifying the calendar month of `dt`."""
    year, month = dt.year, dt.month
    return year, month

def aggregate_history_by_month(hist: List[Tuple[datetime.datetime, Optional[int], Optional[int], Optional[int]]]) -> List[Tuple[datetime.datetime, int, int, int]]:
    """
    Sum the snapshots in `hist` per calendar month.

    Each input entry is (datetime, clones, uniques, downloads); a None counter
    contributes 0. The result holds one tuple per month that occurs in the
    input - (first day of that month, clone sum, unique sum, download sum) -
    ordered chronologically. The input may be in any order.
    """
    sums = {}
    for stamp, clones, uniques, downloads in hist:
        acc = sums.setdefault(month_key(stamp), [0, 0, 0])
        for slot, value in enumerate((clones, uniques, downloads)):
            acc[slot] += value or 0
    # Sorting the (year, month) keys gives chronological order
    return [
        (datetime.datetime(year, month, 1), *sums[(year, month)])
        for year, month in sorted(sums)
    ]

def aggregate_history_by_year(hist: List[Tuple[datetime.datetime, Optional[int], Optional[int], Optional[int]]]) -> List[Tuple[datetime.datetime, int, int, int]]:
    """
    Sum the snapshots in `hist` per calendar year.

    Each input entry is (datetime, clones, uniques, downloads); a None counter
    contributes 0. The result holds one tuple per year that occurs in the
    input - (January 1st of that year, clone sum, unique sum, download sum) -
    ordered chronologically. The input may be in any order.
    """
    sums = {}
    for stamp, clones, uniques, downloads in hist:
        acc = sums.setdefault(stamp.year, [0, 0, 0])
        for slot, value in enumerate((clones, uniques, downloads)):
            acc[slot] += value or 0
    return [
        (datetime.datetime(year, 1, 1), *sums[year])
        for year in sorted(sums)
    ]


# ---------------------------
# Outputs - summary SVG (stats.svg)
# ---------------------------
def generate_summary_svg_jinja(owner: str, repo_rows: List[Dict[str, Any]],
                               include_private: bool, out_path="stats.svg", top_n=6):
    """
    Render the summary card SVG and write it to ``out_path``.

    Header totals are summed over every entry of ``repo_rows``. The chart part
    covers the ``top_n`` repos ranked by ``clone_count`` (descending), with
    three stacked bars per repo:
      - top bar: total clones (clone_count)
      - middle bar: unique cloners (clone_uniques)
      - bottom bar: combined (clone_count + clone_uniques)

    Scaling: the clone and unique bars share one reference value (the larger of
    the two per-metric maxima) so they are visually comparable; the combined
    bar is scaled against its own maximum.
    Count labels are passed to the template alongside each bar width. Missing
    values (None) are labelled 'N/A' and drawn as zero-width bars.

    An empty ``repo_rows`` (or ``top_n`` of 0) produces a card with no bars
    instead of raising.

    Args:
        owner: account name passed to the template.
        repo_rows: per-repo dicts; reads "name", "clone_count", "clone_uniques",
            "download_14d" and "download_total".
        include_private: selects the mode note shown on the card.
        out_path: destination file for the rendered SVG.
        top_n: number of repos to chart.
    """
    # totals for header (computed over every repo, not only the charted ones)
    total_clones = sum((r.get("clone_count") or 0) for r in repo_rows)
    total_uniques = sum((r.get("clone_uniques") or 0) for r in repo_rows)
    total_combined = sum(((r.get("clone_count") or 0) + (r.get("clone_uniques") or 0)) for r in repo_rows)
    total_downloads_14d = sum((r.get("download_14d") or 0) for r in repo_rows)
    total_downloads_all = sum((r.get("download_total") or 0) for r in repo_rows)
    total_repos = len(repo_rows)

    # Sort repos by clone_count (descending) and take the top_n for the chart
    rows_sorted = sorted(repo_rows, key=lambda x: (x.get("clone_count") or 0), reverse=True)
    chart_rows = rows_sorted[:top_n]

    # sizing heuristics
    padding = 18
    CHAR_PX = 7.5  # estimated pixel width of one character
    bar_h = 18
    bar_gap = 8

    # compute label width requirement (clamped to 120..420 px)
    labels = [r.get("name") or "" for r in chart_rows]
    max_label_chars = max((len(l) for l in labels), default=0)
    name_col_width = int(max(120, min(max_label_chars * CHAR_PX + 10, 420)))

    # Build textual labels for counts (needed up front to size the count column)
    clone_labels = []
    uniq_labels = []
    comb_labels = []
    for r in chart_rows:
        c = r.get("clone_count")
        u = r.get("clone_uniques")
        clone_labels.append("N/A" if c is None else str(c))
        uniq_labels.append("N/A" if u is None else str(u))
        if c is None and u is None:
            comb_labels.append("N/A")
        else:
            # treat missing as 0 for combined label if one present
            comb_labels.append(str((c or 0) + (u or 0)))

    all_count_labels = clone_labels + uniq_labels + comb_labels
    max_count_chars = max((len(s) for s in all_count_labels), default=1)
    count_text_w = int(max(64, max_count_chars * CHAR_PX + 12))

    # canvas width computation
    width = max(820, padding * 3 + name_col_width + 220 + count_text_w + 18)
    bar_x = padding + name_col_width + 12
    bar_max_width = int(width - bar_x - padding - count_text_w - 18)

    # per-repo block height and total height
    per_block_h = int(12 + 3*bar_h + 2*bar_gap)  # label + three bars + gaps
    height = 120 + len(chart_rows) * per_block_h

    # Per-metric maxima. `default=0` keeps an empty chart from raising
    # ValueError, and `or 1` keeps the divisors below non-zero.
    max_clones = max(((r.get("clone_count") or 0) for r in chart_rows), default=0) or 1
    max_uniques = max(((r.get("clone_uniques") or 0) for r in chart_rows), default=0) or 1
    max_comb = max(
        (((r.get("clone_count") or 0) + (r.get("clone_uniques") or 0)) for r in chart_rows),
        default=0,
    ) or 1

    # Use a common denominator for clones and uniques so bars are visually comparable
    common_max = max(max_clones, max_uniques)

    # build rows with scaled bar widths and labels
    rows_for_template = []
    for r, clab, ulab, comblab in zip(chart_rows, clone_labels, uniq_labels, comb_labels):
        c = r.get("clone_count")
        u = r.get("clone_uniques")
        cval = 0 if c is None else int(c)
        uval = 0 if u is None else int(u)
        comb_val = cval + uval

        rows_for_template.append({
            "name": r.get("name") or "",
            "clone_label": clab,
            "uniq_label": ulab,
            "comb_label": comblab,
            "bar_w_clone": int((cval / common_max) * bar_max_width),
            "bar_w_uniq": int((uval / common_max) * bar_max_width),
            "bar_w_comb": int((comb_val / max_comb) * bar_max_width),
        })

    ctx = {
        "owner": owner,
        "total_repos": total_repos,
        "total_clones": total_clones,
        "total_uniques": total_uniques,
        "total_combined": total_combined,
        "total_downloads_14d": total_downloads_14d,
        "total_downloads_all": total_downloads_all,
        "mode_note": "Includes private repos" if include_private else "Public repos only",
        "rows": rows_for_template,
        "width": width,
        "height": height,
        "bar_x": bar_x,
        "bar_max_width": bar_max_width,
        "bar_h": bar_h,
        "bar_gap": bar_gap,
        "per_block_h": per_block_h,
        "padding": padding,  # exposed for legend/template placement
    }

    svg = render_template(SUMMARY_SVG_TEMPLATE, ctx)
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(svg)
    print(f"Wrote summary svg to {out_path}")


# ---------------------------
# Outputs - full table SVG (repo_clones.svg)
# ---------------------------
def generate_table_svg_jinja(owner: str, repo_rows: List[Dict[str, Any]], include_private: bool,
                             out_path="repo_clones.svg", max_rows: Optional[int] = None):
    """
    Generate a full table as an SVG.

    The produced table contains columns:
      - Repo (name)
      - Description (first two lines, truncated)
      - Stars
      - Forks
      - Open issues
      - Last push (ISO->readable timestamp)
      - Clones (14d)
      - Unique clones (14d)
      - Downloads (all releases)

    Column widths are computed dynamically from the data with min/max clamping so
    the SVG remains visually stable even with unusually long names/descriptions.
    """

    # Copy rows (optionally truncate to max_rows)
    rows = repo_rows[:] if max_rows is None else repo_rows[:max_rows]
    total_repos = len(repo_rows)
    total_clones = sum((r.get("clone_count") or 0) for r in repo_rows)
    total_downloads = sum((r.get("download_count") or 0) for r in repo_rows)

    # Definition of columns: (key, header, min_px, max_px, is_numeric, wrap_chars_for_text)
    COLS = [
        ("name", "Repo", 140, 420, False, 30),
        ("description", "Description", 220, 600, False, 60),
        ("language", "Language", 80, 140, False, 20),
        ("stargazers_count", "Stars", 56, 80, True, 0),
        ("forks_count", "Forks", 56, 80, True, 0),
        ("watchers_count", "Watchers", 56, 80, True, 0),
        ("open_issues_count", "Open issues", 82, 110, True, 0),
        ("pushed_at", "Last push", 140, 200, False, 20),
        ("clone_count", "Clones (14d)", 90, 140, True, 0),
        ("clone_uniques", "Unique clones (14d)", 110, 160, True, 0),
        ("download_14d", "Downloads (14d)", 110, 160, True, 0),
        ("download_total", "Downloads (total)", 110, 160, True, 0),
    ]

    CHAR_PX = 7.2

    def cell_text(col_key, r):
        """
        Return the appropriate display string for a given column key and repo dict.
        Handles formatting for dates and the clone/download columns which may be None.
        """
        if col_key == "name":
            return r.get("name") or ""
        if col_key == "description":
            return r.get("description") or ""
        if col_key == "pushed_at":
            s = r.get("pushed_at") or ""
            return s[:19].replace("T", " ") if s else ""
        if col_key == "clone_count":
            v = r.get("clone_count")
            return str(v) if v is not None else "N/A"
        if col_key == "clone_uniques":
            v = r.get("clone_uniques")
            return str(v) if v is not None else "N/A"
        if col_key == "download_14d":
            v = r.get("download_14d")
            return str(v) if v is not None else "N/A"
        if col_key == "download_total":
            v = r.get("download_total")
            return str(v) if v is not None else "N/A"
        return str(r.get(col_key) or "")

    # Measure the character requirements for each column from the data, capped by wrap heuristics
    col_char_max = {}
    for key, hdr, min_px, max_px, isnumeric, wrap_chars in COLS:
        if isnumeric:
            # For numeric columns base sizing on the max number of digits observed
            col_char_max[key] = max(len(str(cell_text(key, r))) for r in rows) if rows else len(hdr)
        else:
            # For text columns, estimate using header length and data samples; cap by wrap_chars
            maxchars = len(hdr)
            for r in rows:
                c = cell_text(key, r)
                if not c:
                    continue
                # Limit extremely long strings to a conservative multiplier to avoid huge widths
                maxchars = max(maxchars, min(len(c), wrap_chars * 2))
            col_char_max[key] = maxchars

    # Convert char counts to pixel widths respecting each column's min/max constraints
    col_px = {}
    for key, hdr, min_px, max_px, isnumeric, wrap_chars in COLS:
        if isnumeric:
            chars = col_char_max[key] + 1
            estimated = int(chars * CHAR_PX + 10)
            col_px[key] = max(min_px, min(estimated, max_px))
        else:
            chars = col_char_max[key]
            estimated = int(chars * CHAR_PX + 18)
            col_px[key] = max(min_px, min(estimated, max_px))

    # Table layout parameters
    padding = 18
    header_h = 48
    row_h = 26
    gap = 8
    table_x = padding
    table_y = padding + 12

    # Compute overall table width and height
    table_w = sum(col_px[k] for k, *_ in [(c[0],) for c in COLS]) + gap * (len(COLS) - 1) + padding * 2
    table_w = max(table_w, 760)

    visible_rows = rows
    table_h = header_h + row_h * len(visible_rows) + padding * 2
    svg_h = table_y + table_h + padding

    # Build column metadata for template
    col_positions = []
    cur_x = table_x + 12
    for key, hdr, min_px, max_px, isnumeric, wrap_chars in COLS:
        col_positions.append({"key": key, "hdr": hdr, "x": int(cur_x), "px": col_px[key]})
        cur_x += col_px[key] + gap

    # Build row display data (including description wrap)
    rows_display = []
    for r in visible_rows:
        desc = (r.get("description") or "").strip()
        wrap_limit = next((w for (k,_,_,_,_,w) in COLS if k == "description"), 60)
        if not desc:
            desc_lines = []
        else:
            words = desc.split()
            line1 = ""
            line2 = ""
            cur = ""
            for w in words:
                if len(cur) + len(w) + 1 <= wrap_limit:
                    cur = (cur + " " + w).strip()
                else:
                    if not line1:
                        line1 = cur or w
                        cur = w
                    else:
                        line2 = cur + " " + w if cur else w
                        cur = ""
                        break
            if not line1 and cur:
                line1 = cur
            if not line2 and cur and len(line1) + len(cur) <= wrap_limit * 2:
                if not line2:
                    line2 = cur
            def trunc(s, n):
                return (s[:n - 1] + "…") if len(s) > n else s
            line1 = trunc(line1, wrap_limit)
            line2 = trunc(line2, wrap_limit)
            desc_lines = [line1] if line1 else []
            if line2:
                desc_lines.append(line2)
        # Prepare numeric/text fields for template (use humanized values)
        rows_display.append({
            "name": r.get("name") or "",
            "description": r.get("description") or "",
            "_desc_lines": desc_lines,
            "language": r.get("language") or "",
            "stargazers_count": r.get("stargazers_count", 0),
            "forks_count": r.get("forks_count", 0),
            "watchers_count": r.get("watchers_count", r.get("watchers", 0)),
            "open_issues_count": r.get("open_issues_count", 0),
            "pushed_at": (r.get("pushed_at") or "")[:19].replace("T", " "),
            "clone_count": r.get("clone_count") if r.get("clone_count") is not None else "N/A",
            "clone_uniques": r.get("clone_uniques") if r.get("clone_uniques") is not None else "N/A",
            "download_14d": r.get("download_14d") if r.get("download_14d") is not None else "N/A",
            "download_total": r.get("download_total") if r.get("download_total") is not None else "N/A",
        })

    ctx = {
        "owner": owner,
        "generated_at": datetime.datetime.utcnow().isoformat() + "Z",
        "mode_note": "Includes private repos" if include_private else "Public repos only",
        "total_repos": total_repos,
        "total_clones": total_clones,
        "total_downloads": total_downloads,
        "cols": col_positions,
        "rows": rows_display,
        "padding": padding,
        "tbl_x": table_x,
        "tbl_y": table_y,
        "tbl_w": table_w - padding*2,