Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions python/private/pypi/parse_requirements.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ def _package_srcs(
url = "",
filename = "",
sha256 = "",
yanked = False,
yanked = None,
)
req_line = r.srcs.requirement_line
else:
Expand Down Expand Up @@ -379,7 +379,7 @@ def _add_dists(*, requirement, index_urls, target_platform, logger = None):
url = requirement.srcs.url,
filename = requirement.srcs.filename,
sha256 = requirement.srcs.shas[0] if requirement.srcs.shas else "",
yanked = False,
yanked = None,
)

return dist, False
Expand All @@ -403,20 +403,20 @@ def _add_dists(*, requirement, index_urls, target_platform, logger = None):
# See https://packaging.python.org/en/latest/specifications/simple-repository-api/#adding-yank-support-to-the-simple-api

maybe_whl = index_urls.whls.get(sha256)
if maybe_whl and not maybe_whl.yanked:
if maybe_whl and maybe_whl.yanked == None:
whls.append(maybe_whl)
continue

maybe_sdist = index_urls.sdists.get(sha256)
if maybe_sdist and not maybe_sdist.yanked:
if maybe_sdist and maybe_sdist.yanked == None:
sdist = maybe_sdist
continue

logger.warn(lambda: "Could not find a whl or an sdist with sha256={}".format(sha256))

yanked = {}
for dist in whls + [sdist]:
if dist and dist.yanked:
if dist and dist.yanked != None:
yanked.setdefault(dist.yanked, []).append(dist.filename)
if yanked:
logger.warn(lambda: "\n".join([
Expand Down
198 changes: 147 additions & 51 deletions python/private/pypi/parse_simpleapi_html.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -26,81 +26,177 @@ def parse_simpleapi_html(*, content):

Returns:
A list of structs with:
* filename: The filename of the artifact.
* version: The version of the artifact.
* url: The URL to download the artifact.
* sha256: The sha256 of the artifact.
* metadata_sha256: The whl METADATA sha256 if we can download it. If this is
present, then the 'metadata_url' is also present. Defaults to "".
* metadata_url: The URL for the METADATA if we can download it. Defaults to "".
* filename: {type}`str` The filename of the artifact.
* version: {type}`str` The version of the artifact.
* url: {type}`str` The URL to download the artifact.
* sha256: {type}`str` The sha256 of the artifact.
* metadata_sha256: {type}`str` The whl METADATA sha256 if we can download it. If this is
present, then the 'metadata_url' is also present. Defaults to "".
* metadata_url: {type}`str` The URL for the METADATA if we can download it. Defaults to "".
* yanked: {type}`str | None` the yank reason if the package is yanked. If it is not yanked,
then it will be `None`. An empty string yank reason means that the package is yanked but
the reason is not provided.
"""
sdists = {}
whls = {}
lines = content.split("<a href=\"")

_, _, api_version = lines[0].partition("name=\"pypi:repository-version\" content=\"")
api_version, _, _ = api_version.partition("\"")
sha256s_by_version = {}

# We must assume the 1.0 if it is not present
# See https://packaging.python.org/en/latest/specifications/simple-repository-api/#clients
api_version = api_version or "1.0"
api_version = tuple([int(i) for i in api_version.split(".")])
# 1. Faster Version Extraction
# Search only the first 2KB for versioning metadata instead of splitting everything
api_version = (1, 0)
meta_idx = content.find('name="pypi:repository-version"')
if meta_idx != -1:
# Find 'content="' after the name attribute
v_start = content.find('content="', meta_idx)
if v_start != -1:
v_end = content.find('"', v_start + 9)
v_str = content[v_start + 9:v_end]
if v_str:
api_version = tuple([int(i) for i in v_str.split(".")])

if api_version >= (2, 0):
# We don't expect to have version 2.0 here, but have this check in place just in case.
# https://packaging.python.org/en/latest/specifications/simple-repository-api/#versioning-pypi-s-simple-api
fail("Unsupported API version: {}".format(api_version))

# Each line follows the following pattern
# <a href="https://...#sha256=..." attribute1="foo" ... attributeN="bar">filename</a><br />
sha256s_by_version = {}
for line in lines[1:]:
dist_url, _, tail = line.partition("#sha256=")
# 2. Iterate using find() to avoid huge list allocations from .split("<a ")
cursor = 0
for _ in range(1000000): # Safety break for Starlark
start_tag = content.find("<a ", cursor)
if start_tag == -1:
break

# Find the end of the opening tag and the closing </a>
tag_end = content.find(">", start_tag)
end_tag = content.find("</a>", tag_end)
if tag_end == -1 or end_tag == -1:
break

# Extract only the necessary slices
attr_part = content[start_tag + 3:tag_end]
filename = content[tag_end + 1:end_tag].strip()

# Update cursor for next iteration
cursor = end_tag + 4

# 3. Efficient Attribute Parsing
attrs = _parse_attrs(attr_part)
href = attrs.get("href", "")
if not href:
continue

sha256, _, tail = tail.partition("\"")
dist_url, _, sha256 = href.partition("#sha256=")

# See https://packaging.python.org/en/latest/specifications/simple-repository-api/#adding-yank-support-to-the-simple-api
yanked = "data-yanked" in line
# Handle Yanked status
yanked = None
if "data-yanked" in attrs:
yanked = _unescape_pypi_html(attrs["data-yanked"])

head, _, _ = tail.rpartition("</a>")
maybe_metadata, _, filename = head.rpartition(">")
version = version_from_filename(filename)
sha256s_by_version.setdefault(version, []).append(sha256)

# 4. Optimized Metadata Check (PEP 714)
metadata_sha256 = ""
metadata_url = ""
for metadata_marker in ["data-core-metadata", "data-dist-info-metadata"]:
metadata_marker = metadata_marker + "=\"sha256="
if metadata_marker in maybe_metadata:
# Implement https://peps.python.org/pep-0714/
_, _, tail = maybe_metadata.partition(metadata_marker)
metadata_sha256, _, _ = tail.partition("\"")
metadata_url = dist_url + ".metadata"
break

# Dist-info is more common in modern PyPI
m_val = attrs.get("data-dist-info-metadata") or attrs.get("data-core-metadata")
if m_val and m_val != "false":
_, _, metadata_sha256 = m_val.partition("sha256=")
metadata_url = dist_url + ".metadata"

# 5. Result object
dist = struct(
filename = filename,
version = version,
url = dist_url,
sha256 = sha256,
metadata_sha256 = metadata_sha256,
metadata_url = metadata_url,
yanked = yanked,
)

if filename.endswith(".whl"):
whls[sha256] = struct(
filename = filename,
version = version,
url = dist_url,
sha256 = sha256,
metadata_sha256 = metadata_sha256,
metadata_url = metadata_url,
yanked = yanked,
)
whls[sha256] = dist
else:
sdists[sha256] = struct(
filename = filename,
version = version,
url = dist_url,
sha256 = sha256,
metadata_sha256 = "",
metadata_url = "",
yanked = yanked,
)
sdists[sha256] = dist

return struct(
sdists = sdists,
whls = whls,
sha256s_by_version = sha256s_by_version,
)

def _parse_attrs(attr_string):
    """Parses the attributes found inside an HTML opening tag.

    Args:
        attr_string: {type}`str` The text between the tag name and the
            closing `>`, e.g. `href="https://..." data-yanked="reason"`.

    Returns:
        {type}`dict[str, str]` Attribute values keyed by attribute name.
        Quoted values are returned verbatim (HTML entities are not decoded).
        Attributes without a value, e.g. a bare `data-yanked`, map to `""`.
    """
    attrs = {}

    # Splitting on `"` yields alternating chunks: even indices hold attribute
    # names, odd indices hold the quoted values that follow them.
    parts = attr_string.split('"')
    count = len(parts)

    for i in range(0, count, 2):
        # An even chunk without a successor is the text after the last quote,
        # so it can only hold value-less attributes. When the quotes are
        # unbalanced the last chunk sits at an odd index and is never visited
        # here, which keeps a value from being mistaken for attribute names.
        has_value = i + 1 < count

        # Attributes may be separated by any whitespace, not only spaces.
        chunk = parts[i].replace("\t", " ").replace("\n", " ").replace("\r", " ")
        if has_value:
            # Drop the `=` that introduces the quoted value while tolerating
            # whitespace around it (`href = "..."`).
            chunk = chunk.strip().rstrip("=")
        tokens = [token for token in chunk.split(" ") if token]

        key = None
        if has_value:
            if not tokens:
                # A quoted value with no attribute name in front of it.
                continue
            key = tokens[-1]
            tokens = tokens[:-1]

        # Everything before the key is either a bare attribute or an unquoted
        # `name=value` pair.
        for token in tokens:
            name, _, value = token.partition("=")
            if name:
                attrs[name] = value

        # Assigned last so that a quoted value wins over a bare attribute of
        # the same name within the same chunk.
        if key != None:
            attrs[key] = parts[i + 1]

    return attrs

def _unescape_pypi_html(text):
    """Unescape HTML text.

    Decodes the HTML entities that Simple API index pages use inside attribute
    values such as `data-yanked` and `data-requires-python`.

    Args:
        text: {type}`str` The text to unescape.

    Returns:
        {type}`str` The text with the known entities replaced by the characters
        they stand for. Falsy input and text without any `&` is returned as is.
    """

    # Short circuit for the most common case: nothing to decode.
    if not text or "&" not in text:
        return text

    # Both the named and the numeric spellings are listed because different
    # index servers escape quotes differently (e.g. `&quot;` vs `&#34;`).
    #
    # "&amp;" must stay last: decoding it earlier would turn an escaped entity
    # such as "&amp;lt;" into "&lt;" and a later replacement would then decode
    # it a second time.
    for entity, char in [
        ("&gt;", ">"),
        ("&lt;", "<"),
        ("&quot;", '"'),
        ("&#34;", '"'),
        ("&#x22;", '"'),
        ("&#39;", "'"),
        ("&#x27;", "'"),
        ("&apos;", "'"),
        ("&#10;", "\n"),
        ("&#13;", "\r"),
        ("&amp;", "&"),
    ]:
        text = text.replace(entity, char)

    return text
8 changes: 4 additions & 4 deletions tests/pypi/hub_builder/hub_builder_tests.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -777,7 +777,7 @@ def _test_simple_get_index(env):
"plat_pkg": struct(
whls = {
"deadb44f": struct(
yanked = False,
yanked = None,
filename = "plat-pkg-0.0.4-py3-none-linux_x86_64.whl",
sha256 = "deadb44f",
url = "example2.org/index/plat_pkg/",
Expand All @@ -792,15 +792,15 @@ def _test_simple_get_index(env):
"simple": struct(
whls = {
"deadb00f": struct(
yanked = False,
yanked = None,
filename = "simple-0.0.1-py3-none-any.whl",
sha256 = "deadb00f",
url = "example2.org",
),
},
sdists = {
"deadbeef": struct(
yanked = False,
yanked = None,
filename = "simple-0.0.1.tar.gz",
sha256 = "deadbeef",
url = "example.org",
Expand All @@ -811,7 +811,7 @@ def _test_simple_get_index(env):
"some_other_pkg": struct(
whls = {
"deadb33f": struct(
yanked = False,
yanked = None,
filename = "some-other-pkg-0.0.1-py3-none-any.whl",
sha256 = "deadb33f",
url = "example2.org/index/some_other_pkg/",
Expand Down
Loading