bazel-contrib · aignas · Mar 9, 2026 · Mar 10, 2026 · Mar 10, 2026
@@ -267,7 +267,7 @@ def _package_srcs(
                     url = "",
                     filename = "",
                     sha256 = "",
-                    yanked = False,
+                    yanked = None,
                 )
                 req_line = r.srcs.requirement_line
             else:
@@ -379,7 +379,7 @@ def _add_dists(*, requirement, index_urls, target_platform, logger = None):
             url = requirement.srcs.url,
             filename = requirement.srcs.filename,
             sha256 = requirement.srcs.shas[0] if requirement.srcs.shas else "",
-            yanked = False,
+            yanked = None,
         )
 
         return dist, False
@@ -403,20 +403,20 @@ def _add_dists(*, requirement, index_urls, target_platform, logger = None):
         # See https://packaging.python.org/en/latest/specifications/simple-repository-api/#adding-yank-support-to-the-simple-api
 
         maybe_whl = index_urls.whls.get(sha256)
-        if maybe_whl and not maybe_whl.yanked:
+        if maybe_whl and maybe_whl.yanked == None:
             whls.append(maybe_whl)
             continue
 
         maybe_sdist = index_urls.sdists.get(sha256)
-        if maybe_sdist and not maybe_sdist.yanked:
+        if maybe_sdist and maybe_sdist.yanked == None:
             sdist = maybe_sdist
             continue
 
         logger.warn(lambda: "Could not find a whl or an sdist with sha256={}".format(sha256))
 
     yanked = {}
     for dist in whls + [sdist]:
-        if dist and dist.yanked:
+        if dist and dist.yanked != None:
             yanked.setdefault(dist.yanked, []).append(dist.filename)
     if yanked:
         logger.warn(lambda: "\n".join([

@@ -18,89 +18,198 @@ Parse SimpleAPI HTML in Starlark.
 
 load(":version_from_filename.bzl", "version_from_filename")
 
-def parse_simpleapi_html(*, content):
+def parse_simpleapi_html(*, content, parse_index = False):
     """Get the package URLs for given shas by parsing the Simple API HTML.
 
     Args:
-        content(str): The Simple API HTML content.
+        content: {type}`str` The Simple API HTML content.
+        parse_index: {type}`bool` whether to parse the content as the index page of the PyPI index,
+            e.g. the `https://pypi.org/simple/`. This only has the URLs for the individual package.
 
     Returns:
-        A list of structs with:
-        * filename: The filename of the artifact.
-        * version: The version of the artifact.
-        * url: The URL to download the artifact.
-        * sha256: The sha256 of the artifact.
-        * metadata_sha256: The whl METADATA sha256 if we can download it. If this is
-          present, then the 'metadata_url' is also present. Defaults to "".
-        * metadata_url: The URL for the METADATA if we can download it. Defaults to "".
+        If `parse_index` is set, a dict mapping each package name (the anchor text) to its
+        `href`; the dict is empty when the page has no usable links.
+        Otherwise, a struct with `sdists` and `whls` (dicts keyed by sha256) and
+        `sha256s_by_version` (version to list of sha256 values), where each dist is a struct with:
+          * filename: {type}`str` The filename of the artifact.
+          * version: {type}`str` The version of the artifact.
+          * url: {type}`str` The URL to download the artifact.
+          * sha256: {type}`str` The sha256 of the artifact.
+          * metadata_sha256: {type}`str` The whl METADATA sha256 if we can download it. If this is
+            present, then the 'metadata_url' is also present. Defaults to "".
+          * metadata_url: {type}`str` The URL for the METADATA if we can download it. Defaults to "".
+          * yanked: {type}`str | None` the yank reason if the package is yanked. If it is not yanked,
+              then it will be `None`. An empty string yank reason means that the package is yanked but
+              the reason is not provided.
     """
     sdists = {}
     whls = {}
-    lines = content.split("<a href=\"")
-
-    _, _, api_version = lines[0].partition("name=\"pypi:repository-version\" content=\"")
-    api_version, _, _ = api_version.partition("\"")
+    sha256s_by_version = {}
 
-    # We must assume the 1.0 if it is not present
-    # See https://packaging.python.org/en/latest/specifications/simple-repository-api/#clients
-    api_version = api_version or "1.0"
-    api_version = tuple([int(i) for i in api_version.split(".")])
+    # 1. Version Extraction
+    # Locate the meta tag with find() instead of splitting the whole document.
+    # The version defaults to 1.0 when the tag is absent, see
+    # https://packaging.python.org/en/latest/specifications/simple-repository-api/#clients
+    api_version = (1, 0)
+    meta_idx = content.find('name="pypi:repository-version"')
+    if meta_idx != -1:
+        # Find 'content="' after the name attribute
+        v_start = content.find('content="', meta_idx)
+        if v_start != -1:
+            # 9 == len('content="'), i.e. skip to the first character of the value.
+            v_end = content.find('"', v_start + 9)
+
+            # An unterminated attribute (e.g. a truncated page) is treated as "no version"
+            # instead of slicing with -1 and feeding garbage to int().
+            v_str = content[v_start + 9:v_end] if v_end != -1 else ""
+            if v_str:
+                api_version = tuple([int(i) for i in v_str.split(".")])
 
     if api_version >= (2, 0):
         # We don't expect to have version 2.0 here, but have this check in place just in case.
         # https://packaging.python.org/en/latest/specifications/simple-repository-api/#versioning-pypi-s-simple-api
         fail("Unsupported API version: {}".format(api_version))
 
-    # Each line follows the following pattern
-    # <a href="https://...#sha256=..." attribute1="foo" ... attributeN="bar">filename</a><br />
-    sha256s_by_version = {}
-    for line in lines[1:]:
-        dist_url, _, tail = line.partition("#sha256=")
+    # Only filled in when `parse_index` is set.
+    packages = {}
+
+    # 2. Iterate using find() to avoid huge list allocations from .split("<a ")
+    cursor = 0
+
+    # Starlark has no `while` loop, so a bounded `for` with explicit breaks stands in for one.
+    for _ in range(1000000):  # Safety break for Starlark
+        start_tag = content.find("<a ", cursor)
+        if start_tag == -1:
+            break
+
+        # Find the end of the opening tag and the closing </a>
+        # NOTE(review): a raw '>' inside an attribute value would end the tag early here.
+        tag_end = content.find(">", start_tag)
+        end_tag = content.find("</a>", tag_end)
+        if tag_end == -1 or end_tag == -1:
+            break
+
+        # Extract only the necessary slices
+        filename = content[tag_end + 1:end_tag].strip()
+        attr_part = content[start_tag + 3:tag_end]
+
+        # Update cursor for next iteration
+        cursor = end_tag + 4
+
+        attrs = _parse_attrs(attr_part)
+        href = attrs.get("href", "")
+        if not href:
+            continue
 
-        sha256, _, tail = tail.partition("\"")
+        if parse_index:
+            # On the index page the anchor text is the package name.
+            pkg_name = filename
+            packages[pkg_name] = href
+            continue
 
-        # See https://packaging.python.org/en/latest/specifications/simple-repository-api/#adding-yank-support-to-the-simple-api
-        yanked = "data-yanked" in line
+        # 3. Efficient Attribute Parsing
+        dist_url, _, sha256 = href.partition("#sha256=")
+
+        # Handle Yanked status
+        # `None` means "not yanked"; an empty string means "yanked without a reason".
+        yanked = None
+        if "data-yanked" in attrs:
+            yanked = _unescape_pypi_html(attrs["data-yanked"])
 
-        head, _, _ = tail.rpartition("</a>")
-        maybe_metadata, _, filename = head.rpartition(">")
         version = version_from_filename(filename)
         sha256s_by_version.setdefault(version, []).append(sha256)
 
+        # 4. Optimized Metadata Check (PEP 714)
         metadata_sha256 = ""
         metadata_url = ""
-        for metadata_marker in ["data-core-metadata", "data-dist-info-metadata"]:
-            metadata_marker = metadata_marker + "=\"sha256="
-            if metadata_marker in maybe_metadata:
-                # Implement https://peps.python.org/pep-0714/
-                _, _, tail = maybe_metadata.partition(metadata_marker)
-                metadata_sha256, _, _ = tail.partition("\"")
-                metadata_url = dist_url + ".metadata"
-                break
+
+        # Dist-info is more common in modern PyPI
+        m_val = attrs.get("data-dist-info-metadata") or attrs.get("data-core-metadata")
+        if m_val and m_val != "false":
+            _, _, metadata_sha256 = m_val.partition("sha256=")
+            metadata_url = dist_url + ".metadata"
+
+        # 5. Result object
+        dist = struct(
+            filename = filename,
+            version = version,
+            url = dist_url,
+            sha256 = sha256,
+            metadata_sha256 = metadata_sha256,
+            metadata_url = metadata_url,
+            yanked = yanked,
+        )
 
         if filename.endswith(".whl"):
-            whls[sha256] = struct(
-                filename = filename,
-                version = version,
-                url = dist_url,
-                sha256 = sha256,
-                metadata_sha256 = metadata_sha256,
-                metadata_url = metadata_url,
-                yanked = yanked,
-            )
+            whls[sha256] = dist
         else:
-            sdists[sha256] = struct(
-                filename = filename,
-                version = version,
-                url = dist_url,
-                sha256 = sha256,
-                metadata_sha256 = "",
-                metadata_url = "",
-                yanked = yanked,
-            )
+            sdists[sha256] = dist
+
+    # Decide on the requested mode rather than on the dict being non-empty, so that an
+    # index page without links still yields a dict and not the distribution struct.
+    if parse_index:
+        return packages
 
     return struct(
         sdists = sdists,
         whls = whls,
         sha256s_by_version = sha256s_by_version,
     )
+
+def _parse_attrs(attr_string):
+    """Turn the inside of an opening tag into a dict of its attributes.
+
+    The input is split on double quotes, so pieces at even positions hold
+    attribute names and pieces at odd positions hold the quoted values.
+
+    Args:
+        attr_string: {type}`str` The attribute text of a tag, without the
+            surrounding `<a ` and `>`.
+
+    Returns:
+        A dict of attribute name to value. Names that appear without a quoted
+        value are stored with an empty string.
+    """
+    result = {}
+    chunks = attr_string.split('"')
+
+    for idx in range(0, len(chunks) - 1, 2):
+        names = chunks[idx].strip()
+        if names:
+            tokens = names.split(" ")
+
+            # Everything before the last token has no value of its own.
+            for token in tokens[:-1]:
+                flag = token.strip()
+                if flag:
+                    result[flag] = ""
+
+            # The last token is the `name=` that owns the following quoted value.
+            result[tokens[-1].rstrip("=")] = chunks[idx + 1]
+
+    # Whatever follows the final quote can only be value-less names.
+    for flag in chunks[-1].strip().split(" "):
+        if flag:
+            result[flag] = ""
+    return result
+
+def _unescape_pypi_html(text):
+    """Decode the HTML entities this parser expects in attribute values.
+
+    Only a fixed set of entities is handled: `&gt;`, `&lt;`, `&#39;`,
+    `&#x27;`, `&#10;`, `&#13;`, `&quot;` and `&amp;`. Anything else is left
+    as it is.
+
+    Args:
+        text: {type}`str` The text to decode.
+
+    Returns:
+        The text with the known entities replaced. Falsy input and input
+        without any `&` is returned unchanged.
+    """
+
+    # Nothing to decode, which is the common case.
+    if not text or "&" not in text:
+        return text
+
+    # `&amp;` must stay last: decoding it earlier would turn e.g. `&amp;lt;`
+    # into `&lt;` and then into `<`, decoding the text twice.
+    replacements = [
+        ("&gt;", ">"),
+        ("&lt;", "<"),
+        ("&#39;", "'"),
+        ("&#x27;", "'"),
+        ("&#10;", "\n"),
+        ("&#13;", "\r"),
+        ("&quot;", '"'),
+        ("&amp;", "&"),
+    ]
+    for entity, plain in replacements:
+        text = text.replace(entity, plain)
+
+    return text