Skip to content
5 changes: 4 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,10 @@ END_UNRELEASED_TEMPLATE

{#v0-0-0-added}
### Added
* Nothing added.
* (pypi) Write SimpleAPI contents to the `MODULE.bazel.lock` file when using
{obj}`experimental_index_url`, which should speed up consecutive initializations and no
longer require network access if the cache is hydrated.
Implements [#2731](https://github.com/bazel-contrib/rules_python/issues/2731).

{#v1-9-0}
## [1.9.0] - 2026-02-21
Expand Down
3 changes: 3 additions & 0 deletions python/private/pypi/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,9 @@ bzl_library(
bzl_library(
name = "pypi_cache_bzl",
srcs = ["pypi_cache.bzl"],
deps = [
":version_from_filename_bzl",
],
)

bzl_library(
Expand Down
21 changes: 16 additions & 5 deletions python/private/pypi/extension.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ You cannot use both the additive_build_content and additive_build_content_file a
# dict[str repo, HubBuilder]
# See `hub_builder.bzl%hub_builder()` for `HubBuilder`
pip_hub_map = {}
simpleapi_cache = pypi_cache()
simpleapi_cache = pypi_cache(module_ctx = module_ctx)

for mod in module_ctx.modules:
for pip_attr in mod.tags.parse:
Expand Down Expand Up @@ -293,6 +293,7 @@ You cannot use both the additive_build_content and additive_build_content_file a
config = config,
exposed_packages = exposed_packages,
extra_aliases = extra_aliases,
facts = simpleapi_cache.get_facts(),
hub_group_map = hub_group_map,
hub_whl_map = hub_whl_map,
whl_libraries = whl_libraries,
Expand Down Expand Up @@ -372,7 +373,11 @@ def _pip_impl(module_ctx):
module_ctx: module contents
"""

mods = parse_modules(module_ctx, enable_pipstar = rp_config.enable_pipstar, enable_pipstar_extract = rp_config.enable_pipstar and rp_config.bazel_8_or_later)
mods = parse_modules(
module_ctx,
enable_pipstar = rp_config.enable_pipstar,
enable_pipstar_extract = rp_config.enable_pipstar and rp_config.bazel_8_or_later,
)

# Build all of the wheel modifications if the tag class is called.
_whl_mods_impl(mods.whl_mods)
Expand All @@ -394,9 +399,15 @@ def _pip_impl(module_ctx):
groups = mods.hub_group_map.get(hub_name),
)

return module_ctx.extension_metadata(
reproducible = True,
)
# The code is smart to not return facts if we don't support the mechanism for that.
# Hence we should not pass it to the metadata
if mods.facts:
return module_ctx.extension_metadata(
reproducible = True,
facts = mods.facts,
)
else:
return module_ctx.extension_metadata(reproducible = True)

_default_attrs = {
"arch_name": attr.string(
Expand Down
8 changes: 4 additions & 4 deletions python/private/pypi/hub_builder.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -395,11 +395,11 @@ def _set_get_index_urls(self, pip_attr):
index_url = pip_attr.experimental_index_url,
extra_index_urls = pip_attr.experimental_extra_index_urls or [],
index_url_overrides = pip_attr.experimental_index_url_overrides or {},
sources = [
d
for d in distributions
sources = {
d: versions
for d, versions in distributions.items()
if _use_downloader(self, python_version, d)
],
},
envsubst = pip_attr.envsubst,
# Auth related info
netrc = pip_attr.netrc,
Expand Down
18 changes: 10 additions & 8 deletions python/private/pypi/parse_requirements.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def parse_requirements(
os, arch combinations.
extra_pip_args (string list): Extra pip arguments to perform extra validations and to
be joined with args found in files.
get_index_urls: Callable[[ctx, list[str]], dict], a callable to get all
get_index_urls: Callable[[ctx, dict[str, list[str]]], dict], a callable to get all
of the distribution URLs from a PyPI index. Accepts ctx and
distribution names to query.
evaluate_markers: A function to use to evaluate the requirements.
Expand Down Expand Up @@ -170,15 +170,17 @@ def parse_requirements(

index_urls = {}
if get_index_urls:
distributions = {}
for reqs in requirements_by_platform.values():
for req in reqs.values():
if req.srcs.url:
continue

distributions.setdefault(req.distribution, []).append(req.srcs.version)

index_urls = get_index_urls(
ctx,
# Use list({}) as a way to have a set
list({
req.distribution: None
for reqs in requirements_by_platform.values()
for req in reqs.values()
if not req.srcs.url
}),
distributions,
)

ret = []
Expand Down
232 changes: 227 additions & 5 deletions python/private/pypi/pypi_cache.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,33 @@ In the future the same will be used to:
- Store PyPI index query results as facts in the MODULE.bazel.lock file
"""

def pypi_cache(store = None):
load(":version_from_filename.bzl", "version_from_filename")

_FACT_VERSION = "v1"

def pypi_cache(module_ctx = None, store = None):
"""The cache for PyPI index queries.
Currently the key is of the following structure:
(url, real_url)
(url, real_url, versions)
Args:
module_ctx: The module context
store: The in-memory store, should implement dict interface for get and setdefault
Returns:
A cache struct
"""
mcache = memory_cache(store)
fcache = facts_cache(getattr(module_ctx, "facts", None))

# buildifier: disable=uninitialized
self = struct(
_store = store or {},
_mcache = mcache,
_facts = fcache,
setdefault = lambda key, parsed_result: _pypi_cache_setdefault(self, key, parsed_result),
get = lambda key: _pypi_cache_get(self, key),
get_facts = lambda: _pypi_cache_get_facts(self),
)

# buildifier: enable=uninitialized
Expand All @@ -40,7 +55,14 @@ def _pypi_cache_setdefault(self, key, parsed_result):
Returns:
The `parse_result`.
"""
return self._store.setdefault(key, parsed_result)
index_url, real_url, versions = key
self._mcache.setdefault(real_url, parsed_result)
if not versions or not self._facts:
return parsed_result

# Filter the packages to only what is needed before writing to the facts cache
filtered = _filter_packages(parsed_result, versions)
return self._facts.setdefault(index_url, filtered)

def _pypi_cache_get(self, key):
"""Return the parsed result from the cache.
Expand All @@ -52,4 +74,204 @@ def _pypi_cache_get(self, key):
Returns:
The {type}`struct` or `None` based on if the result is in the cache or not.
"""
return self._store.get(key)
index_url, real_url, versions = key

# When retrieving from memory cache, filter down to only what is needed. If the
# cache is empty, we will attempt to read from facts, however, reading from memory
# first allows us to not parse the contents of the lock file that may add up.
cached = _filter_packages(self._mcache.get(real_url), versions)
if not self._facts:
return cached

if not cached and versions:
# Could not get from in-memory, read from lockfile facts
cached = self._facts.get(index_url, versions)

return cached

def _pypi_cache_get_facts(self):
    """Return the facts gathered so far, or an empty dict when facts are unsupported."""
    facts_cache = self._facts
    if facts_cache:
        return facts_cache.facts

    return {}

def memory_cache(cache = None):
    """In-memory cache of SimpleAPI responses so we make fewer index calls.

    We key the cache functions on the `real_url` on purpose in order to get the
    best possible cache hits.

    Args:
        cache: optional dict-like storage; a fresh dict is created when omitted.

    Returns:
        struct with 2 methods, `get` and `setdefault`.
    """
    store = {} if cache == None else cache

    return struct(
        get = lambda real_url: store.get(real_url),
        setdefault = lambda real_url, value: store.setdefault(real_url, value),
    )

def _filter_packages(dists, requested_versions):
    """Trim a parsed SimpleAPI result down to the requested versions.

    Args:
        dists: the parsed result (a struct with `whls` and `sdists` keyed by
            sha256) or None.
        requested_versions: the versions of interest; when empty the input is
            returned unchanged.

    Returns:
        A struct containing only the matching distributions, the input
        unchanged when there is nothing to filter, or None when no
        distribution matched.
    """
    if dists == None or not requested_versions:
        return dists

    by_version = {}
    kept = {"sdists": {}, "whls": {}}

    for kind in ["sdists", "whls"]:
        for sha256, dist in getattr(dists, kind).items():
            if dist.version not in requested_versions:
                continue

            kept[kind][sha256] = dist
            by_version.setdefault(dist.version, []).append(sha256)

    if not kept["sdists"] and not kept["whls"]:
        # TODO @aignas 2026-03-08: add logging
        #print("WARN: no dists matched for versions {}".format(requested_versions))
        return None

    return struct(
        whls = kept["whls"],
        sdists = kept["sdists"],
        sha256s_by_version = {
            version: sorted(shas)
            for version, shas in by_version.items()
        },
    )

def facts_cache(known_facts, facts_version = _FACT_VERSION):
    """Cache backed by facts persisted in the MODULE.bazel.lock file.

    The main thing to keep in mind is that we should not use the `real_url`
    here in case it contains credentials (e.g. is of the form
    `https://<username>:<password>@<host>`), which must not end up in the lock
    file.

    Args:
        known_facts: An opaque object coming from {obj}`module_ctx.facts`, or
            None when the mechanism is unsupported.
        facts_version: {type}`str` the version of the facts schema, used for
            short-circuiting.

    Returns:
        None when facts are unsupported, otherwise a struct that has:
        * `get` method for getting values from the facts cache.
        * `setdefault` method for setting values in the cache.
        * `facts` attribute that should be passed to the
          {obj}`module_ctx.extension_metadata` to persist facts.
    """
    if known_facts == None:
        return None

    collected = {}

    return struct(
        facts = collected,
        known_facts = known_facts,
        get = lambda index_url, versions: _get_from_facts(
            collected,
            known_facts,
            index_url,
            versions,
            facts_version,
        ),
        setdefault = lambda url, value: _store_facts(collected, facts_version, url, value),
    )

def _get_from_facts(facts, known_facts, index_url, requested_versions, facts_version):
    """Reconstruct a parsed SimpleAPI result for `index_url` from lock-file facts.

    Args:
        facts: the mutable dict of facts being collected for this run; hits are
            re-recorded here via `_store_facts` so they persist to the next run.
        known_facts: the facts read from the lock file ({obj}`module_ctx.facts`).
        index_url: {type}`str` the index URL for a single distribution; split
            into a root URL and a distribution name for the fact keys.
        requested_versions: the versions that the caller needs; a result is
            only returned when every requested version is found in the facts.
        facts_version: {type}`str` expected facts schema version.

    Returns:
        A struct with `whls`, `sdists` and `sha256s_by_version`, or None when
        the facts are missing, stale, or incomplete (forcing a network fetch).
    """
    if known_facts.get("fact_version") != facts_version:
        # cannot trust known facts, different version that we know how to parse
        return None

    known_sources = {}

    # `index_url` is `<root_url>/<distribution>`; the fact tables are nested by
    # root_url then distribution so that credentials-free keys are stored.
    root_url, _, distribution = index_url.rstrip("/").rpartition("/")
    distribution = distribution.rstrip("/")
    root_url = root_url.rstrip("/")

    retrieved_versions = {}

    # `dist_hashes` maps download URL -> sha256 for this distribution.
    for url, sha256 in known_facts.get("dist_hashes", {}).get(root_url, {}).get(distribution, {}).items():
        # NOTE(review): the filename fact is looked up by sha256 here — verify
        # that `_store_facts` writes `dist_filenames` under the same key.
        filename = known_facts.get("dist_filenames", {}).get(root_url, {}).get(distribution, {}).get(sha256)
        if not filename:
            # No explicit filename fact; fall back to the URL basename.
            _, _, filename = url.rpartition("/")

        version = version_from_filename(filename)
        if version not in requested_versions:
            # TODO @aignas 2026-01-21: do the check by requested shas at some point
            # We don't have sufficient info in the lock file, need to call the API
            #
            continue

        retrieved_versions[version] = True

        if filename.endswith(".whl"):
            dists = known_sources.setdefault("whls", {})
        else:
            dists = known_sources.setdefault("sdists", {})

        known_sources.setdefault("sha256s_by_version", {}).setdefault(version, []).append(sha256)

        # Rebuild the same struct shape that the SimpleAPI parser produces;
        # metadata URL/sha256 are not persisted as facts, hence empty strings.
        dists.setdefault(sha256, struct(
            sha256 = sha256,
            filename = filename,
            version = version,
            metadata_url = "",
            metadata_sha256 = "",
            url = url,
            yanked = known_facts.get("dist_yanked", {}).get(root_url, {}).get(distribution, {}).get(sha256),
        ))

    if not known_sources:
        # We found nothing in facts
        return None

    if len(requested_versions) != len(retrieved_versions):
        # If the results are incomplete, then return None, so that we can fetch sources from the
        # internet again.
        return None

    output = struct(
        whls = known_sources.get("whls", {}),
        sdists = known_sources.get("sdists", {}),
        sha256s_by_version = {
            k: sorted(v)
            for k, v in known_sources.get("sha256s_by_version", {}).items()
        },
    )

    # Persist these facts for the next run because we have used them.
    return _store_facts(facts, facts_version, index_url, output)

def _store_facts(facts, fact_version, index_url, value):
    """Store values as facts in the lock file.

    The main idea is to ensure that the lock file stays small and only stores
    what we would need to fetch from the internet. Any derivative information
    that can be computed from this using pure Starlark functions should be
    computed in Starlark instead of being persisted.

    We key the tables on `index_url` split into a root URL and a distribution
    name (never the `real_url`), so that credentials never end up in the lock
    file.

    Args:
        facts: the mutable dict that will be handed to
            {obj}`module_ctx.extension_metadata` as facts.
        fact_version: {type}`str` the facts schema version to record.
        index_url: {type}`str` the index URL for a single distribution, of the
            form `<root_url>/<distribution>`.
        value: the filtered parsed result (struct with `whls` and `sdists`
            keyed by sha256) or a falsy value, which is returned unchanged
            without storing anything.

    Returns:
        The input `value`.
    """
    if not value:
        return value

    facts["fact_version"] = fact_version

    root_url, _, distribution = index_url.rstrip("/").rpartition("/")
    distribution = distribution.rstrip("/")
    root_url = root_url.rstrip("/")

    for sha256, d in (value.sdists | value.whls).items():
        facts.setdefault("dist_hashes", {}).setdefault(root_url, {}).setdefault(distribution, {}).setdefault(d.url, sha256)
        if not d.url.endswith(d.filename):
            # BUGFIX: key the filename by sha256 (not by d.url) so that
            # `_get_from_facts`, which looks it up by sha256, can actually
            # retrieve it. This also matches how `dist_yanked` is keyed below.
            facts.setdefault("dist_filenames", {}).setdefault(root_url, {}).setdefault(distribution, {}).setdefault(sha256, d.filename)
        if d.yanked != None:
            facts.setdefault("dist_yanked", {}).setdefault(root_url, {}).setdefault(distribution, {}).setdefault(sha256, d.yanked)

    return value
Loading