Skip to content

Commit 648f054

Browse files
committed
[feat] Add component directory scripts
1 parent a2a0b92 commit 648f054

24 files changed

Lines changed: 3426 additions & 0 deletions

.gitattributes

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
directory/compiled/** linguist-generated=true

directory/ranking_config.json

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
{
2+
"schemaVersion": 1,
3+
"halfLifeDays": 90.0,
4+
"weights": {
5+
"stars": 1.0,
6+
"recency": 2.0,
7+
"contributors": 0.5,
8+
"downloads": 0.35
9+
}
10+
}

directory/scripts/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from __future__ import annotations
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
from __future__ import annotations
2+
3+
from .github import GitHubEnricher # type: ignore[import-not-found]
4+
from .pypi import PyPiEnricher # type: ignore[import-not-found]
5+
from .pypistats import PyPiStatsEnricher # type: ignore[import-not-found]
6+
7+
8+
def get_default_enrichers(*, github_token_env: str = "GH_TOKEN") -> list:
    """Build the default enricher set: GitHub, PyPI, then PyPI stats.

    Args:
        github_token_env: Name of the environment variable passed to the
            GitHub enricher as its ``token_env``.

    Returns:
        A new list with one freshly constructed instance of each enricher,
        ordered GitHub, PyPI, PyPI stats.
    """
    github = GitHubEnricher(token_env=github_token_env)
    pypi = PyPiEnricher()
    pypistats = PyPiStatsEnricher()
    return [github, pypi, pypistats]
Lines changed: 216 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,216 @@
1+
from __future__ import annotations
2+
3+
import os
4+
import re
5+
from dataclasses import dataclass
6+
from typing import Any
7+
from urllib.parse import parse_qs, urlparse
8+
9+
from _utils.enrich import should_refetch
10+
from _utils.enrichment_engine import FetchResult, Patch
11+
from _utils.github import parse_owner_repo
12+
from _utils.http import RetryConfig, fetch_json
13+
from _utils.time import utc_now_iso
14+
15+
# Root of the GitHub REST API; the URL builders in this module append
# "/repos/<owner>/<repo>..." paths to it.
GITHUB_API_BASE = "https://api.github.com"
16+
17+
18+
@dataclass(frozen=True)
class GitHubResult:
    """Immutable snapshot of the repository numbers gathered by one fetch.

    Each metric field is ``None`` when no usable value was obtained for it.
    """

    # Repository coordinates the metrics belong to.
    owner: str
    repo: str

    # Integer counters.
    stars: int | None
    forks: int | None
    contributors_count: int | None
    open_issues: int | None

    # Timestamp string of the last push.
    pushed_at: str | None
27+
28+
29+
def _github_repo_api_url(owner: str, repo: str) -> str:
    """Return the REST endpoint URL describing the ``owner/repo`` repository."""
    return "{}/repos/{}/{}".format(GITHUB_API_BASE, owner, repo)
31+
32+
33+
def _github_contributors_api_url(owner: str, repo: str) -> str:
    """Return the contributors-listing URL for ``owner/repo``.

    The query string pins ``per_page=1`` (one contributor per page).
    """
    template = "{base}/repos/{owner}/{repo}/contributors?per_page=1"
    return template.format(base=GITHUB_API_BASE, owner=owner, repo=repo)
35+
36+
37+
_LINK_LAST_RE = re.compile(r'<([^>]+)>;\s*rel="last"')
38+
39+
40+
def _parse_last_page_from_link_header(link: str | None) -> int | None:
41+
if not isinstance(link, str) or not link.strip():
42+
return None
43+
m = _LINK_LAST_RE.search(link)
44+
if not m:
45+
return None
46+
try:
47+
last_url = m.group(1)
48+
parsed = urlparse(last_url)
49+
qs = parse_qs(parsed.query)
50+
page_vals = qs.get("page")
51+
if not page_vals:
52+
return None
53+
page = int(page_vals[0])
54+
return page if page >= 0 else None
55+
except Exception:
56+
return None
57+
58+
59+
def _get_token(token_env: str) -> str | None:
60+
token = os.environ.get(token_env)
61+
if token:
62+
return token.strip() or None
63+
for k in ("GH_TOKEN", "GH_API_TOKEN", "GITHUB_TOKEN"):
64+
token = os.environ.get(k)
65+
if token:
66+
return token.strip() or None
67+
return None
68+
69+
70+
class GitHubEnricher:
    """Enricher that collects repository metrics from the GitHub REST API.

    Patches produced by this class target the ``github`` bucket. A component
    is keyed by the lower-cased ``(owner, repo)`` pair parsed from its
    ``gitHubUrl`` field.
    """

    name = "github"
    bucket = "github"

    def __init__(self, *, token_env: str = "GH_TOKEN") -> None:
        """Resolve the API token once and set up the retry policy.

        Args:
            token_env: Preferred environment variable to read the token from.
        """
        self._token_env = token_env
        self._token = _get_token(token_env)
        # 403 is retried alongside 429 and the 5xx statuses.
        self._retry_cfg = RetryConfig(retry_statuses=(403, 429, 500, 502, 503, 504))

    def key_for_component(self, comp: dict[str, Any]) -> tuple[str, str] | None:
        """Return the lower-cased ``(owner, repo)`` key for ``comp``.

        Returns ``None`` when ``gitHubUrl`` is missing, not a string, blank,
        or cannot be parsed into an owner/repo pair.
        """
        gh_url = comp.get("gitHubUrl")
        if not isinstance(gh_url, str) or not gh_url.strip():
            return None
        try:
            owner, repo = parse_owner_repo(gh_url)
        except Exception:
            # Best effort: an unparseable URL means "no key", not a crash.
            return None
        return (owner.lower(), repo.lower())

    def needs_fetch(
        self, comp: dict[str, Any], refresh_older_than_hours: float | None
    ) -> bool:
        """Decide whether the component's GitHub metrics should be re-fetched.

        Reads ``metrics.github.fetchedAt`` and ``metrics.github.isStale`` from
        ``comp`` (wrongly typed or missing values are passed on as ``None``)
        and delegates the decision to ``should_refetch``.
        """
        metrics = comp.get("metrics")
        gh_metrics = metrics.get("github") if isinstance(metrics, dict) else None
        existing_fetched_at = (
            gh_metrics.get("fetchedAt") if isinstance(gh_metrics, dict) else None
        )
        stale = gh_metrics.get("isStale") if isinstance(gh_metrics, dict) else None
        return should_refetch(
            fetched_at=(
                existing_fetched_at if isinstance(existing_fetched_at, str) else None
            ),
            is_stale=stale if isinstance(stale, bool) else None,
            refresh_older_than_hours=refresh_older_than_hours,
        )

    def _headers(self) -> dict[str, str]:
        """Build the request headers, adding ``Authorization`` when a token is set."""
        headers = {
            "Accept": "application/vnd.github+json",
            "User-Agent": "component-gallery-enrich-github",
            "X-GitHub-Api-Version": "2022-11-28",
        }
        if self._token:
            headers["Authorization"] = f"Bearer {self._token}"
        return headers

    def _fetch_contributors_count(
        self, *, ctx, owner: str, repo: str
    ) -> tuple[int | None, int, int | None, str | None]:
        """Fetch the contributor count for ``owner/repo``.

        The count is taken from the page number of the ``rel="last"`` link in
        the response's ``Link`` header. Without such a link, the count is 1
        for a non-empty response list and 0 for an empty one.

        Returns:
            ``(count, attempts, status, error)``. ``count`` is ``None`` when
            it could not be determined. ``error`` is always a non-empty
            message when the request itself failed.
        """
        url = _github_contributors_api_url(owner, repo)
        r = ctx.request_json(
            url=url,
            headers=self._headers(),
            fetcher=fetch_json,
            retry_cfg=self._retry_cfg,
        )
        if not r.ok:
            # Mirror fetch(): a failed request must always carry a message.
            # Otherwise the caller's ``if err`` check would treat the failure
            # as a success with an unknown contributor count.
            return None, r.attempts, r.status, r.error or "Request failed."
        if not isinstance(r.data, list):
            # Successful response without a list payload: count is unknown.
            return None, r.attempts, r.status, r.error
        link = None
        if isinstance(r.headers, dict):
            link = r.headers.get("Link") or r.headers.get("link")
        last_page = _parse_last_page_from_link_header(link)
        if isinstance(last_page, int):
            return last_page, r.attempts, r.status, None
        return (1 if len(r.data) >= 1 else 0), r.attempts, r.status, None

    def fetch(self, key: tuple[str, str], ctx) -> FetchResult:
        """Fetch repository metadata and contributor count for ``key``.

        Two requests are made: the repository endpoint, then the contributors
        endpoint. ``attempts`` in the result is the sum over both requests. A
        failure of either request yields ``ok=False``.

        Args:
            key: ``(owner, repo)`` pair as produced by ``key_for_component``.
            ctx: Object exposing ``request_json(url=, headers=, fetcher=,
                retry_cfg=)``.

        Returns:
            A ``FetchResult`` whose ``data`` is a ``GitHubResult`` on success
            and ``None`` on failure.
        """
        owner, repo = key
        url = _github_repo_api_url(owner, repo)
        r = ctx.request_json(
            url=url,
            headers=self._headers(),
            fetcher=fetch_json,
            retry_cfg=self._retry_cfg,
        )
        attempts = int(r.attempts)
        if not r.ok or not isinstance(r.data, dict):
            return FetchResult(
                ok=False,
                data=None,
                error=r.error or "Request failed.",
                attempts=attempts,
                status=r.status,
            )

        data = r.data
        stars = data.get("stargazers_count")
        forks = data.get("forks_count")
        open_issues = data.get("open_issues_count")
        pushed_at = data.get("pushed_at")

        contributors_count, contrib_attempts, status, err = (
            self._fetch_contributors_count(ctx=ctx, owner=owner, repo=repo)
        )
        attempts += int(contrib_attempts)
        if err:
            return FetchResult(
                ok=False,
                data=None,
                error=err,
                attempts=attempts,
                status=status,
            )

        # Values of an unexpected type are recorded as None, not coerced.
        result = GitHubResult(
            owner=owner,
            repo=repo,
            stars=int(stars) if isinstance(stars, int) else None,
            forks=int(forks) if isinstance(forks, int) else None,
            contributors_count=(
                int(contributors_count)
                if isinstance(contributors_count, int) and contributors_count >= 0
                else None
            ),
            open_issues=int(open_issues) if isinstance(open_issues, int) else None,
            pushed_at=str(pushed_at) if isinstance(pushed_at, str) else None,
        )
        return FetchResult(
            ok=True, data=result, error=None, attempts=attempts, status=r.status
        )

    def patch_success(
        self, comp: dict[str, Any], result: GitHubResult, fetched_at: str
    ) -> Patch:
        """Build the patch for a successful fetch.

        Only metrics that were actually obtained are included, so a ``None``
        field does not appear in the updates. ``fetchedAt`` falls back to the
        current UTC time when ``fetched_at`` is empty, and ``isStale`` is
        set to ``False``. ``changed`` is ``True`` only when a star count was
        fetched and differs from the stored ``metrics.github.stars``.
        """
        metrics = comp.get("metrics")
        gh_metrics = metrics.get("github") if isinstance(metrics, dict) else None
        prev_stars = gh_metrics.get("stars") if isinstance(gh_metrics, dict) else None

        updates: dict[str, Any] = {}
        if isinstance(result.stars, int):
            updates["stars"] = result.stars
        if isinstance(result.forks, int):
            updates["forks"] = result.forks
        if isinstance(result.contributors_count, int):
            updates["contributorsCount"] = result.contributors_count
        if isinstance(result.open_issues, int):
            updates["openIssues"] = result.open_issues
        if isinstance(result.pushed_at, str):
            updates["lastPushAt"] = result.pushed_at
        updates["fetchedAt"] = fetched_at or utc_now_iso()
        updates["isStale"] = False

        changed = isinstance(result.stars, int) and prev_stars != result.stars
        return Patch(bucket=self.bucket, updates=updates, changed=changed)

    def patch_failure(self, comp: dict[str, Any], error: str | None) -> Patch:
        """Build the patch for a failed fetch: set ``isStale``, report no change."""
        return Patch(bucket=self.bucket, updates={"isStale": True}, changed=False)
Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
from __future__ import annotations
2+
3+
from dataclasses import dataclass
4+
from typing import Any
5+
6+
from _utils.enrich import should_refetch
7+
from _utils.enrichment_engine import FetchResult, Patch
8+
from _utils.http import RetryConfig, fetch_json
9+
from _utils.pypi_helpers import infer_pypi_project_from_piplink
10+
from _utils.time import utc_now_iso
11+
12+
# Root of the PyPI JSON API; project metadata is requested from
# "<PYPI_BASE>/<project>/json".
PYPI_BASE = "https://pypi.org/pypi"
13+
14+
15+
@dataclass(frozen=True)
class PyPiResult:
    """Immutable summary of what one PyPI lookup found for a project."""

    # Project name the lookup was keyed on.
    project: str

    # Version string reported for the project, or None when unavailable.
    latest_version: str | None

    # Upload timestamp string for the latest release, or None when unknown.
    latest_release_at: str | None
20+
21+
22+
def _get_project_for_component(comp: dict[str, Any]) -> str | None:
23+
p = comp.get("pypi")
24+
if isinstance(p, str) and p.strip():
25+
return p.strip()
26+
return infer_pypi_project_from_piplink(comp.get("pipLink"))
27+
28+
29+
def _pypi_api_url(project: str) -> str:
    """Return the JSON metadata URL for ``project``."""
    return "{}/{}/json".format(PYPI_BASE, project)
31+
32+
33+
def _max_upload_time_iso(release_files: Any) -> str | None:
34+
if not isinstance(release_files, list):
35+
return None
36+
times: list[str] = []
37+
for f in release_files:
38+
if not isinstance(f, dict):
39+
continue
40+
t = f.get("upload_time_iso_8601") or f.get("upload_time")
41+
if isinstance(t, str) and t:
42+
times.append(t)
43+
return max(times) if times else None
44+
45+
46+
class PyPiEnricher:
    """Enricher that reads release metadata from the PyPI JSON API.

    Patches produced by this class target the ``pypi`` bucket. A component is
    keyed by its PyPI project name.
    """

    name = "pypi"
    bucket = "pypi"

    def __init__(self) -> None:
        """Set up the retry policy (429 and the listed 5xx statuses)."""
        self._retry_cfg = RetryConfig(retry_statuses=(429, 500, 502, 503, 504))

    def key_for_component(self, comp: dict[str, Any]) -> str | None:
        """Return the PyPI project name for ``comp``, or ``None`` if unknown."""
        return _get_project_for_component(comp)

    def needs_fetch(
        self, comp: dict[str, Any], refresh_older_than_hours: float | None
    ) -> bool:
        """Decide whether the component's PyPI metrics should be re-fetched.

        Reads ``metrics.pypi.fetchedAt`` and ``metrics.pypi.isStale`` from
        ``comp`` (wrongly typed or missing values are passed on as ``None``)
        and delegates the decision to ``should_refetch``.
        """
        metrics = comp.get("metrics")
        stored = metrics.get("pypi") if isinstance(metrics, dict) else None
        if not isinstance(stored, dict):
            stored = {}
        fetched_at = stored.get("fetchedAt")
        stale_flag = stored.get("isStale")
        return should_refetch(
            fetched_at=fetched_at if isinstance(fetched_at, str) else None,
            is_stale=stale_flag if isinstance(stale_flag, bool) else None,
            refresh_older_than_hours=refresh_older_than_hours,
        )

    def fetch(self, key: str, ctx) -> FetchResult:
        """Fetch the latest version and its release time for project ``key``.

        The release time is the newest upload timestamp among the files of
        the version named in ``info.version``. If that yields nothing, the
        newest upload timestamp across all releases is used instead.

        Args:
            key: PyPI project name.
            ctx: Object exposing ``request_json(url=, headers=, fetcher=,
                retry_cfg=)``.

        Returns:
            A ``FetchResult`` whose ``data`` is a ``PyPiResult`` on success
            and ``None`` when the request failed or the payload lacks
            ``info``/``releases`` mappings.
        """
        response = ctx.request_json(
            url=_pypi_api_url(key),
            headers={
                "Accept": "application/json",
                "User-Agent": "component-gallery-enrich-pypi",
            },
            fetcher=fetch_json,
            retry_cfg=self._retry_cfg,
        )

        def failure(message: str) -> FetchResult:
            # Every failed outcome shares the same shape apart from the message.
            return FetchResult(
                ok=False,
                data=None,
                error=message,
                attempts=int(response.attempts),
                status=response.status,
            )

        if not response.ok or not isinstance(response.data, dict):
            return failure(response.error or "Request failed.")

        payload = response.data
        info = payload.get("info")
        releases = payload.get("releases")
        if not isinstance(info, dict) or not isinstance(releases, dict):
            return failure("Missing info/releases.")

        version = info.get("version")
        if isinstance(version, str) and version:
            version = str(version)
        else:
            version = None

        released_at: str | None = None
        if version is not None and version in releases:
            released_at = _max_upload_time_iso(releases.get(version))
        if released_at is None:
            # Fallback: newest upload timestamp over every release.
            per_release = (
                _max_upload_time_iso(files) for files in releases.values()
            )
            released_at = max(
                (stamp for stamp in per_release if stamp), default=None
            )

        return FetchResult(
            ok=True,
            data=PyPiResult(
                project=key,
                latest_version=version,
                latest_release_at=released_at,
            ),
            error=None,
            attempts=int(response.attempts),
            status=response.status,
        )

    def patch_success(
        self, comp: dict[str, Any], result: PyPiResult, fetched_at: str
    ) -> Patch:
        """Build the patch for a successful fetch.

        Writes the version and release time exactly as carried by ``result``
        (including ``None``), stamps ``fetchedAt`` (current UTC time when
        ``fetched_at`` is empty), sets ``isStale`` to ``False``, and always
        reports ``changed=True``.
        """
        stamp = fetched_at or utc_now_iso()
        return Patch(
            bucket=self.bucket,
            updates={
                "latestVersion": result.latest_version,
                "latestReleaseAt": result.latest_release_at,
                "fetchedAt": stamp,
                "isStale": False,
            },
            changed=True,
        )

    def patch_failure(self, comp: dict[str, Any], error: str | None) -> Patch:
        """Build the patch for a failed fetch: set ``isStale``, report no change."""
        stale_only = {"isStale": True}
        return Patch(bucket=self.bucket, updates=stale_only, changed=False)

0 commit comments

Comments
 (0)