-
-
Notifications
You must be signed in to change notification settings - Fork 682
Expand file tree
/
Copy pathpypi_cache.bzl
More file actions
277 lines (215 loc) · 9.27 KB
/
pypi_cache.bzl
File metadata and controls
277 lines (215 loc) · 9.27 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
"""A cache for the PyPI index contents evaluation.
This is designed to work as follows:
- in-memory cache for results of PyPI index queries, so that we are not calling PyPI multiple times
for the same package for different hub repos.
In the future the same will be used to:
- Store PyPI index query results as facts in the MODULE.bazel.lock file
"""
load(":version_from_filename.bzl", "version_from_filename")
_FACT_VERSION = "v1"
def pypi_cache(module_ctx = None, store = None):
    """Construct the cache used for PyPI index queries.

    The cache key is a tuple of the following structure:
    (url, real_url, versions)

    Args:
        module_ctx: The module context; its `facts` attribute (when present)
            backs the persistent layer written to the MODULE.bazel.lock file.
        store: The in-memory store, should implement dict interface for `get`
            and `setdefault`.

    Returns:
        A cache struct with `get`, `setdefault` and `get_facts` methods.
    """
    in_memory = memory_cache(store)
    persisted = facts_cache(getattr(module_ctx, "facts", None))

    # The struct's lambdas close over `self` before it is fully assigned, hence
    # the buildifier suppression below.
    # buildifier: disable=uninitialized
    self = struct(
        _mcache = in_memory,
        _facts = persisted,
        setdefault = lambda key, parsed_result: _pypi_cache_setdefault(self, key, parsed_result),
        get = lambda key: _pypi_cache_get(self, key),
        get_facts = lambda: _pypi_cache_get_facts(self),
    )
    # buildifier: enable=uninitialized
    return self
def _pypi_cache_setdefault(self, key, parsed_result):
    """Store the value if it is not yet cached.

    The `real_url` part of the key is used for the in-memory layer (so that we
    never hit the same PyPI index twice), while the `index_url` part is used
    for the facts written to the MODULE.bazel.lock file.

    Args:
        self: {type}`struct` The self of this implementation.
        key: {type}`str` The cache key, an `(index_url, real_url, versions)` triple.
        parsed_result: {type}`struct` The result of `parse_simpleapi_html` function.

    Returns:
        The cached result.
    """
    index_url, real_url, versions = key

    # Always record the raw result in memory, keyed by the real URL.
    self._mcache.setdefault(real_url, parsed_result)

    if versions and self._facts:
        # Only the packages that were actually requested are persisted as
        # facts to keep the lock file small.
        trimmed = _filter_packages(parsed_result, versions)
        return self._facts.setdefault(index_url, trimmed)

    return parsed_result
def _pypi_cache_get(self, key):
    """Retrieve a previously parsed result from the cache.

    Args:
        self: {type}`struct` The self of this implementation.
        key: {type}`str` The cache key, an `(index_url, real_url, versions)` triple.

    Returns:
        The cached {type}`struct` or `None` when there is no hit.
    """
    index_url, real_url, versions = key

    # Consult the in-memory layer first so that we avoid parsing the contents
    # of the lock file when possible; trim the hit to only what is needed.
    hit = _filter_packages(self._mcache.get(real_url), versions)
    if self._facts == None:
        return hit

    if hit or not versions:
        return hit

    # Nothing in memory - fall back to the lockfile facts.
    return self._facts.get(index_url, versions)
def _pypi_cache_get_facts(self):
    """Return the facts dict to persist, or an empty dict when facts are unsupported."""
    return self._facts.facts if self._facts else {}
def memory_cache(cache = None):
    """An in-memory SimpleAPI cache for making fewer index calls.

    The `real_url` is used as the key in the cache functions on purpose in
    order to get the best possible cache hits.

    Args:
        cache: the storage to store things in memory; a fresh dict is used
            when omitted.

    Returns:
        struct with 2 methods, `get` and `setdefault`.
    """
    store = {} if cache == None else cache
    return struct(
        get = lambda real_url: store.get(real_url),
        setdefault = lambda real_url, value: store.setdefault(real_url, value),
    )
def _filter_packages(dists, requested_versions):
    """Trim a parsed index result down to the requested versions.

    Args:
        dists: The parsed result struct (with `whls` and `sdists` dicts keyed
            by sha256) or None.
        requested_versions: The versions to keep.

    Returns:
        The input unchanged when there is nothing to filter, a struct with
        only the matching distributions, or None when nothing matched.
    """
    if dists == None or not requested_versions:
        return dists

    kept_sdists = {}
    kept_whls = {}
    by_version = {}

    # The sdist and whl dicts have the same shape, filter them with one loop.
    for src, dst in ((dists.sdists, kept_sdists), (dists.whls, kept_whls)):
        for sha256, dist in src.items():
            if dist.version in requested_versions:
                dst[sha256] = dist
                by_version.setdefault(dist.version, []).append(sha256)

    if not kept_whls and not kept_sdists:
        # TODO @aignas 2026-03-08: add logging
        #print("WARN: no dists matched for versions {}".format(requested_versions))
        return None

    return struct(
        whls = kept_whls,
        sdists = kept_sdists,
        sha256s_by_version = {
            version: sorted(shas)
            for version, shas in by_version.items()
        },
    )
def facts_cache(known_facts, facts_version = _FACT_VERSION):
    """The facts cache.

    Entries are keyed by the `index_url` rather than the real URL on purpose:
    the real URL may contain credentials (e.g. is of form
    `https://<username>:<password>@<host>`) and must not end up in the lock
    file.

    Args:
        known_facts: An opaque object coming from {obj}`module_ctx.facts`.
        facts_version: {type}`str` the version of the facts schema, used for
            short-circuiting.

    Returns:
        None when `known_facts` is unavailable, otherwise a struct that has:
        * `get` method for getting values from the facts cache.
        * `setdefault` method for setting values in the cache.
        * `facts` attribute that should be passed to the
          {obj}`module_ctx.extension_metadata` to persist facts.
    """
    if known_facts == None:
        return None

    new_facts = {}
    return struct(
        get = lambda index_url, versions: _get_from_facts(
            new_facts,
            known_facts,
            index_url,
            versions,
            facts_version,
        ),
        setdefault = lambda url, value: _store_facts(new_facts, facts_version, url, value),
        known_facts = known_facts,
        facts = new_facts,
    )
def _get_from_facts(facts, known_facts, index_url, requested_versions, facts_version):
    """Reconstruct a parsed index result from the lock-file facts.

    Args:
        facts: {type}`dict` The facts that will be persisted for the next run;
            any facts that we end up using are re-stored here.
        known_facts: The facts read from the lock file (opaque, dict-like).
        index_url: {type}`str` The index URL of form `<root_url>/<distribution>`.
        requested_versions: The versions that must all be found for the facts
            to be considered complete.
        facts_version: {type}`str` the version of the facts schema we can parse.

    Returns:
        A struct mirroring `parse_simpleapi_html` output, or None when the
        facts are missing, incomplete, or of an unknown schema version.
    """
    if known_facts.get("fact_version") != facts_version:
        # cannot trust known facts, different version that we know how to parse
        return None

    known_sources = {}
    root_url, _, distribution = index_url.rstrip("/").rpartition("/")
    distribution = distribution.rstrip("/")
    root_url = root_url.rstrip("/")
    retrieved_versions = {}
    for url, sha256 in known_facts.get("dist_hashes", {}).get(root_url, {}).get(distribution, {}).items():
        # NOTE: `dist_filenames` is keyed by the URL (see `_store_facts`), so
        # the lookup must use `url`; looking it up by `sha256` would always
        # miss and we would wrongly derive the filename from the URL - the
        # exact case `dist_filenames` exists for.
        filename = known_facts.get("dist_filenames", {}).get(root_url, {}).get(distribution, {}).get(url)
        if not filename:
            _, _, filename = url.rpartition("/")
        version = version_from_filename(filename)
        if version not in requested_versions:
            # TODO @aignas 2026-01-21: do the check by requested shas at some point
            # We don't have sufficient info in the lock file, need to call the API
            #
            continue

        retrieved_versions[version] = True
        if filename.endswith(".whl"):
            dists = known_sources.setdefault("whls", {})
        else:
            dists = known_sources.setdefault("sdists", {})

        known_sources.setdefault("sha256s_by_version", {}).setdefault(version, []).append(sha256)

        # metadata_url/metadata_sha256 are not persisted as facts, so they are
        # always empty when reconstructing from the lock file.
        dists.setdefault(sha256, struct(
            sha256 = sha256,
            filename = filename,
            version = version,
            metadata_url = "",
            metadata_sha256 = "",
            url = url,
            yanked = known_facts.get("dist_yanked", {}).get(root_url, {}).get(distribution, {}).get(sha256),
        ))

    if not known_sources:
        # We found nothing in facts
        return None

    if len(requested_versions) != len(retrieved_versions):
        # If the results are incomplete, then return None, so that we can fetch sources from the
        # internet again.
        return None

    output = struct(
        whls = known_sources.get("whls", {}),
        sdists = known_sources.get("sdists", {}),
        sha256s_by_version = {
            k: sorted(v)
            for k, v in known_sources.get("sha256s_by_version", {}).items()
        },
    )

    # Persist these facts for the next run because we have used them.
    return _store_facts(facts, facts_version, index_url, output)
def _store_facts(facts, fact_version, index_url, value):
    """Store values as facts in the lock file.

    The main idea is to keep the lock file small: only what would otherwise
    need to be fetched from the internet is stored. Any derivative information
    that can be computed with pure Starlark functions should be computed in
    Starlark instead.

    Args:
        facts: {type}`dict` The facts dict to write into.
        fact_version: {type}`str` The schema version tag to record.
        index_url: {type}`str` The index URL of form `<root_url>/<distribution>`.
        value: The parsed result struct; falsy values are passed through
            without touching `facts`.

    Returns:
        The `value` unchanged.
    """
    if not value:
        return value

    facts["fact_version"] = fact_version
    root_url, _, distribution = index_url.rstrip("/").rpartition("/")
    distribution = distribution.rstrip("/")
    root_url = root_url.rstrip("/")

    # Navigate to the per-distribution dict for the given fact kind.
    bucket = lambda kind: facts.setdefault(kind, {}).setdefault(root_url, {}).setdefault(distribution, {})

    for sha256, dist in (value.sdists | value.whls).items():
        bucket("dist_hashes").setdefault(dist.url, sha256)
        if not dist.url.endswith(dist.filename):
            # The filename cannot be recovered from the URL, so persist it.
            bucket("dist_filenames").setdefault(dist.url, dist.filename)
        if dist.yanked != None:
            bucket("dist_yanked").setdefault(sha256, dist.yanked)

    return value