Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions python/private/pypi/parse_requirements.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ def _package_srcs(
url = "",
filename = "",
sha256 = "",
yanked = False,
yanked = None,
)
req_line = r.srcs.requirement_line
else:
Expand Down Expand Up @@ -379,7 +379,7 @@ def _add_dists(*, requirement, index_urls, target_platform, logger = None):
url = requirement.srcs.url,
filename = requirement.srcs.filename,
sha256 = requirement.srcs.shas[0] if requirement.srcs.shas else "",
yanked = False,
yanked = None,
)

return dist, False
Expand All @@ -403,20 +403,20 @@ def _add_dists(*, requirement, index_urls, target_platform, logger = None):
# See https://packaging.python.org/en/latest/specifications/simple-repository-api/#adding-yank-support-to-the-simple-api

maybe_whl = index_urls.whls.get(sha256)
if maybe_whl and not maybe_whl.yanked:
if maybe_whl and maybe_whl.yanked == None:
whls.append(maybe_whl)
continue

maybe_sdist = index_urls.sdists.get(sha256)
if maybe_sdist and not maybe_sdist.yanked:
if maybe_sdist and maybe_sdist.yanked == None:
sdist = maybe_sdist
continue

logger.warn(lambda: "Could not find a whl or an sdist with sha256={}".format(sha256))

yanked = {}
for dist in whls + [sdist]:
if dist and dist.yanked:
if dist and dist.yanked != None:
yanked.setdefault(dist.yanked, []).append(dist.filename)
if yanked:
logger.warn(lambda: "\n".join([
Expand Down
217 changes: 163 additions & 54 deletions python/private/pypi/parse_simpleapi_html.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -18,89 +18,198 @@ Parse SimpleAPI HTML in Starlark.

load(":version_from_filename.bzl", "version_from_filename")

def parse_simpleapi_html(*, content):
def parse_simpleapi_html(*, content, parse_index = False):
"""Get the package URLs for given shas by parsing the Simple API HTML.

Args:
content(str): The Simple API HTML content.
content: {type}`str` The Simple API HTML content.
parse_index: {type}`bool` whether to parse the content as the index page of the PyPI index,
e.g. the `https://pypi.org/simple/`. This only has the URLs for the individual package.

Returns:
A list of structs with:
* filename: The filename of the artifact.
* version: The version of the artifact.
* url: The URL to download the artifact.
* sha256: The sha256 of the artifact.
* metadata_sha256: The whl METADATA sha256 if we can download it. If this is
present, then the 'metadata_url' is also present. Defaults to "".
* metadata_url: The URL for the METADATA if we can download it. Defaults to "".
If it is the index page, return the map of package to URL it can be queried from.
Otherwise, a list of structs with:
* filename: {type}`str` The filename of the artifact.
* version: {type}`str` The version of the artifact.
* url: {type}`str` The URL to download the artifact.
* sha256: {type}`str` The sha256 of the artifact.
* metadata_sha256: {type}`str` The whl METADATA sha256 if we can download it. If this is
present, then the 'metadata_url' is also present. Defaults to "".
* metadata_url: {type}`str` The URL for the METADATA if we can download it. Defaults to "".
* yanked: {type}`str | None` the yank reason if the package is yanked. If it is not yanked,
then it will be `None`. An empty string yank reason means that the package is yanked but
the reason is not provided.
"""
sdists = {}
whls = {}
lines = content.split("<a href=\"")

_, _, api_version = lines[0].partition("name=\"pypi:repository-version\" content=\"")
api_version, _, _ = api_version.partition("\"")
sha256s_by_version = {}

# We must assume the 1.0 if it is not present
# See https://packaging.python.org/en/latest/specifications/simple-repository-api/#clients
api_version = api_version or "1.0"
api_version = tuple([int(i) for i in api_version.split(".")])
# 1. Faster Version Extraction
# Locate the versioning <meta> tag with find() on the content instead of splitting everything
api_version = (1, 0)
meta_idx = content.find('name="pypi:repository-version"')
if meta_idx != -1:
# Find 'content="' after the name attribute
v_start = content.find('content="', meta_idx)
if v_start != -1:
v_end = content.find('"', v_start + 9)
v_str = content[v_start + 9:v_end]
if v_str:
api_version = tuple([int(i) for i in v_str.split(".")])

if api_version >= (2, 0):
# We don't expect to have version 2.0 here, but have this check in place just in case.
# https://packaging.python.org/en/latest/specifications/simple-repository-api/#versioning-pypi-s-simple-api
fail("Unsupported API version: {}".format(api_version))

# Each line follows the following pattern
# <a href="https://...#sha256=..." attribute1="foo" ... attributeN="bar">filename</a><br />
sha256s_by_version = {}
for line in lines[1:]:
dist_url, _, tail = line.partition("#sha256=")
packages = {}

# 2. Iterate using find() to avoid huge list allocations from .split("<a ")
cursor = 0
for _ in range(1000000): # Safety break for Starlark
start_tag = content.find("<a ", cursor)
if start_tag == -1:
break

# Find the end of the opening tag and the closing </a>
tag_end = content.find(">", start_tag)
end_tag = content.find("</a>", tag_end)
if tag_end == -1 or end_tag == -1:
break

# Extract only the necessary slices
filename = content[tag_end + 1:end_tag].strip()
attr_part = content[start_tag + 3:tag_end]

# Update cursor for next iteration
cursor = end_tag + 4

attrs = _parse_attrs(attr_part)
href = attrs.get("href", "")
if not href:
continue

sha256, _, tail = tail.partition("\"")
if parse_index:
pkg_name = filename
packages[pkg_name] = href
continue

# See https://packaging.python.org/en/latest/specifications/simple-repository-api/#adding-yank-support-to-the-simple-api
yanked = "data-yanked" in line
# 3. Efficient Attribute Parsing
dist_url, _, sha256 = href.partition("#sha256=")

# Handle Yanked status
yanked = None
if "data-yanked" in attrs:
yanked = _unescape_pypi_html(attrs["data-yanked"])

head, _, _ = tail.rpartition("</a>")
maybe_metadata, _, filename = head.rpartition(">")
version = version_from_filename(filename)
sha256s_by_version.setdefault(version, []).append(sha256)

# 4. Optimized Metadata Check (PEP 714)
metadata_sha256 = ""
metadata_url = ""
for metadata_marker in ["data-core-metadata", "data-dist-info-metadata"]:
metadata_marker = metadata_marker + "=\"sha256="
if metadata_marker in maybe_metadata:
# Implement https://peps.python.org/pep-0714/
_, _, tail = maybe_metadata.partition(metadata_marker)
metadata_sha256, _, _ = tail.partition("\"")
metadata_url = dist_url + ".metadata"
break

# Dist-info is more common in modern PyPI
m_val = attrs.get("data-dist-info-metadata") or attrs.get("data-core-metadata")
if m_val and m_val != "false":
_, _, metadata_sha256 = m_val.partition("sha256=")
metadata_url = dist_url + ".metadata"

# 5. Result object
dist = struct(
filename = filename,
version = version,
url = dist_url,
sha256 = sha256,
metadata_sha256 = metadata_sha256,
metadata_url = metadata_url,
yanked = yanked,
)

if filename.endswith(".whl"):
whls[sha256] = struct(
filename = filename,
version = version,
url = dist_url,
sha256 = sha256,
metadata_sha256 = metadata_sha256,
metadata_url = metadata_url,
yanked = yanked,
)
whls[sha256] = dist
else:
sdists[sha256] = struct(
filename = filename,
version = version,
url = dist_url,
sha256 = sha256,
metadata_sha256 = "",
metadata_url = "",
yanked = yanked,
)
sdists[sha256] = dist

if packages:
return packages

return struct(
sdists = sdists,
whls = whls,
sha256s_by_version = sha256s_by_version,
)

def _split_attr_names(text):
    """Split the text found outside of quoted values into attribute names.

    Args:
        text: {type}`str` a fragment of a tag that sits between quoted values,
            e.g. ` data-yanked href=`.

    Returns:
        {type}`list[str]` the attribute names in order of appearance, without
        the trailing `=` that introduces a quoted value.
    """

    # HTML allows tabs and newlines between attributes, so fold them into
    # spaces before splitting on a single separator.
    for ws in ["\t", "\n", "\r"]:
        text = text.replace(ws, " ")

    # Drop the `=` (and any spaces around it) that precedes a quoted value.
    text = text.strip().rstrip("= ")
    return [name for name in text.split(" ") if name]

def _parse_attrs(attr_string):
    """Parse the attributes of an HTML opening tag into a dict.

    The input is split on double quotes, so the pieces at even positions hold
    attribute names and the pieces at odd positions hold the quoted values.
    Only double-quoted values are recognised.

    Args:
        attr_string: {type}`str` the text between the tag name and the closing
            `>`, e.g. ` href="https://..." data-yanked`.

    Returns:
        {type}`dict[str, str]` mapping each attribute name to its value.
        Attributes without a value (boolean attributes) map to `""`.
    """
    attrs = {}
    parts = attr_string.split('"')

    for i in range(0, len(parts) - 1, 2):
        names = _split_attr_names(parts[i])
        if not names:
            continue

        # Only the last name owns the quoted value that follows; everything
        # before it is a value-less (boolean) attribute.
        for flag in names[:-1]:
            attrs[flag] = ""
        attrs[names[-1]] = parts[i + 1]

    # With balanced quotes `parts` has an odd length and its last element is
    # the text after the final value, which may hold trailing boolean
    # attributes. With an unterminated quote the last element is a value that
    # the loop above has already consumed, so it must not be re-read as names.
    if len(parts) % 2 == 1:
        for flag in _split_attr_names(parts[-1]):
            attrs[flag] = ""
    return attrs

def _unescape_pypi_html(text):
    """Decode the HTML entities that appear in Simple API attribute values.

    Only a fixed set of entities is handled: `&gt;`, `&lt;`, `&#39;`,
    `&#x27;`, `&#10;`, `&#13;`, `&quot;` and `&amp;`. Anything else is left
    untouched.

    Args:
        text: {type}`str` the escaped text.

    Returns:
        {type}`str` the text with the known entities replaced by the
        characters they stand for. Falsy input and input without any `&` is
        returned as is.
    """

    # Nothing to decode without an ampersand.
    if not text or "&" not in text:
        return text

    # `&amp;` must stay last: decoding it earlier would turn an escaped entity
    # such as `&amp;lt;` into `&lt;` and then wrongly into `<`.
    entities = [
        ("&gt;", ">"),
        ("&lt;", "<"),
        ("&#39;", "'"),
        ("&#x27;", "'"),
        ("&#10;", "\n"),
        ("&#13;", "\r"),
        ("&quot;", '"'),
        ("&amp;", "&"),
    ]
    for entity, char in entities:
        text = text.replace(entity, char)

    return text
Loading
Loading