Skip to content

Commit 1c73fed

Browse files
committed
fix(pypi): return yank reason from SimpleAPI HTML
This makes the logic in the parser a little bit more sophisticated, but we also start handling the yank reason. This fixes the issue where the presence of the `data-yanked` attribute without a value would be interpreted as a yanked package. With this it should start working. This implementation assumes that tag attribute values contain HTML-escaped sequences, and it unescapes them when returning the strings. The possibilities that this gives us are: - Use the `data-requires-python` attribute to potentially discard any Python packages that are unsupported in the `select_whl` function. Work towards #260. Work towards #2731.
1 parent 8c2cae6 commit 1c73fed

5 files changed

Lines changed: 236 additions & 104 deletions

File tree

python/private/pypi/parse_requirements.bzl

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -267,7 +267,7 @@ def _package_srcs(
267267
url = "",
268268
filename = "",
269269
sha256 = "",
270-
yanked = False,
270+
yanked = None,
271271
)
272272
req_line = r.srcs.requirement_line
273273
else:
@@ -379,7 +379,7 @@ def _add_dists(*, requirement, index_urls, target_platform, logger = None):
379379
url = requirement.srcs.url,
380380
filename = requirement.srcs.filename,
381381
sha256 = requirement.srcs.shas[0] if requirement.srcs.shas else "",
382-
yanked = False,
382+
yanked = None,
383383
)
384384

385385
return dist, False
@@ -403,20 +403,20 @@ def _add_dists(*, requirement, index_urls, target_platform, logger = None):
403403
# See https://packaging.python.org/en/latest/specifications/simple-repository-api/#adding-yank-support-to-the-simple-api
404404

405405
maybe_whl = index_urls.whls.get(sha256)
406-
if maybe_whl and not maybe_whl.yanked:
406+
if maybe_whl and maybe_whl.yanked == None:
407407
whls.append(maybe_whl)
408408
continue
409409

410410
maybe_sdist = index_urls.sdists.get(sha256)
411-
if maybe_sdist and not maybe_sdist.yanked:
411+
if maybe_sdist and maybe_sdist.yanked == None:
412412
sdist = maybe_sdist
413413
continue
414414

415415
logger.warn(lambda: "Could not find a whl or an sdist with sha256={}".format(sha256))
416416

417417
yanked = {}
418418
for dist in whls + [sdist]:
419-
if dist and dist.yanked:
419+
if dist and dist.yanked != None:
420420
yanked.setdefault(dist.yanked, []).append(dist.filename)
421421
if yanked:
422422
logger.warn(lambda: "\n".join([

python/private/pypi/parse_simpleapi_html.bzl

Lines changed: 147 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -26,81 +26,177 @@ def parse_simpleapi_html(*, content):
2626
2727
Returns:
2828
A list of structs with:
29-
* filename: The filename of the artifact.
30-
* version: The version of the artifact.
31-
* url: The URL to download the artifact.
32-
* sha256: The sha256 of the artifact.
33-
* metadata_sha256: The whl METADATA sha256 if we can download it. If this is
34-
present, then the 'metadata_url' is also present. Defaults to "".
35-
* metadata_url: The URL for the METADATA if we can download it. Defaults to "".
29+
* filename: {type}`str` The filename of the artifact.
30+
* version: {type}`str` The version of the artifact.
31+
* url: {type}`str` The URL to download the artifact.
32+
* sha256: {type}`str` The sha256 of the artifact.
33+
* metadata_sha256: {type}`str` The whl METADATA sha256 if we can download it. If this is
34+
present, then the 'metadata_url' is also present. Defaults to "".
35+
* metadata_url: {type}`str` The URL for the METADATA if we can download it. Defaults to "".
36+
* yanked: {type}`str | None` the yank reason if the package is yanked. If it is not yanked,
37+
then it will be `None`. An empty string yank reason means that the package is yanked but
38+
the reason is not provided.
3639
"""
3740
sdists = {}
3841
whls = {}
39-
lines = content.split("<a href=\"")
40-
41-
_, _, api_version = lines[0].partition("name=\"pypi:repository-version\" content=\"")
42-
api_version, _, _ = api_version.partition("\"")
42+
sha256s_by_version = {}
4343

44-
# We must assume the 1.0 if it is not present
45-
# See https://packaging.python.org/en/latest/specifications/simple-repository-api/#clients
46-
api_version = api_version or "1.0"
47-
api_version = tuple([int(i) for i in api_version.split(".")])
44+
# 1. Faster Version Extraction
45+
# Search only the first 2KB for versioning metadata instead of splitting everything
46+
api_version = (1, 0)
47+
meta_idx = content.find('name="pypi:repository-version"')
48+
if meta_idx != -1:
49+
# Find 'content="' after the name attribute
50+
v_start = content.find('content="', meta_idx)
51+
if v_start != -1:
52+
v_end = content.find('"', v_start + 9)
53+
v_str = content[v_start + 9:v_end]
54+
if v_str:
55+
api_version = tuple([int(i) for i in v_str.split(".")])
4856

4957
if api_version >= (2, 0):
5058
# We don't expect to have version 2.0 here, but have this check in place just in case.
5159
# https://packaging.python.org/en/latest/specifications/simple-repository-api/#versioning-pypi-s-simple-api
5260
fail("Unsupported API version: {}".format(api_version))
5361

54-
# Each line follows the following pattern
55-
# <a href="https://...#sha256=..." attribute1="foo" ... attributeN="bar">filename</a><br />
56-
sha256s_by_version = {}
57-
for line in lines[1:]:
58-
dist_url, _, tail = line.partition("#sha256=")
62+
# 2. Iterate using find() to avoid huge list allocations from .split("<a ")
63+
cursor = 0
64+
for _ in range(1000000): # Safety break for Starlark
65+
start_tag = content.find("<a ", cursor)
66+
if start_tag == -1:
67+
break
68+
69+
# Find the end of the opening tag and the closing </a>
70+
tag_end = content.find(">", start_tag)
71+
end_tag = content.find("</a>", tag_end)
72+
if tag_end == -1 or end_tag == -1:
73+
break
74+
75+
# Extract only the necessary slices
76+
attr_part = content[start_tag + 3:tag_end]
77+
filename = content[tag_end + 1:end_tag].strip()
78+
79+
# Update cursor for next iteration
80+
cursor = end_tag + 4
81+
82+
# 3. Efficient Attribute Parsing
83+
attrs = _parse_attrs(attr_part)
84+
href = attrs.get("href", "")
85+
if not href:
86+
continue
5987

60-
sha256, _, tail = tail.partition("\"")
88+
dist_url, _, sha256 = href.partition("#sha256=")
6189

62-
# See https://packaging.python.org/en/latest/specifications/simple-repository-api/#adding-yank-support-to-the-simple-api
63-
yanked = "data-yanked" in line
90+
# Handle Yanked status
91+
yanked = None
92+
if "data-yanked" in attrs:
93+
yanked = _unescape_pypi_html(attrs["data-yanked"])
6494

65-
head, _, _ = tail.rpartition("</a>")
66-
maybe_metadata, _, filename = head.rpartition(">")
6795
version = version_from_filename(filename)
6896
sha256s_by_version.setdefault(version, []).append(sha256)
6997

98+
# 4. Optimized Metadata Check (PEP 714)
7099
metadata_sha256 = ""
71100
metadata_url = ""
72-
for metadata_marker in ["data-core-metadata", "data-dist-info-metadata"]:
73-
metadata_marker = metadata_marker + "=\"sha256="
74-
if metadata_marker in maybe_metadata:
75-
# Implement https://peps.python.org/pep-0714/
76-
_, _, tail = maybe_metadata.partition(metadata_marker)
77-
metadata_sha256, _, _ = tail.partition("\"")
78-
metadata_url = dist_url + ".metadata"
79-
break
101+
102+
# Dist-info is more common in modern PyPI
103+
m_val = attrs.get("data-dist-info-metadata") or attrs.get("data-core-metadata")
104+
if m_val and m_val != "false":
105+
_, _, metadata_sha256 = m_val.partition("sha256=")
106+
metadata_url = dist_url + ".metadata"
107+
108+
# 5. Result object
109+
dist = struct(
110+
filename = filename,
111+
version = version,
112+
url = dist_url,
113+
sha256 = sha256,
114+
metadata_sha256 = metadata_sha256,
115+
metadata_url = metadata_url,
116+
yanked = yanked,
117+
)
80118

81119
if filename.endswith(".whl"):
82-
whls[sha256] = struct(
83-
filename = filename,
84-
version = version,
85-
url = dist_url,
86-
sha256 = sha256,
87-
metadata_sha256 = metadata_sha256,
88-
metadata_url = metadata_url,
89-
yanked = yanked,
90-
)
120+
whls[sha256] = dist
91121
else:
92-
sdists[sha256] = struct(
93-
filename = filename,
94-
version = version,
95-
url = dist_url,
96-
sha256 = sha256,
97-
metadata_sha256 = "",
98-
metadata_url = "",
99-
yanked = yanked,
100-
)
122+
sdists[sha256] = dist
101123

102124
return struct(
103125
sdists = sdists,
104126
whls = whls,
105127
sha256s_by_version = sha256s_by_version,
106128
)
129+
130+
def _parse_attrs(attr_string):
    """Parses the attributes of an HTML opening tag into a dict.

    The input is split on double quotes, so the resulting segments alternate
    between attribute names (even indexes) and quoted values (odd indexes).
    Only double-quoted values are recognised.

    Args:
        attr_string: {type}`str` The text between the tag name and the
            closing `>`, e.g. `href="https://..." data-yanked`.

    Returns:
        {type}`dict[str, str]` A mapping of attribute name to its raw value
        (no HTML unescaping is done here). Attributes that have no value map
        to an empty string.
    """
    attrs = {}
    parts = attr_string.split('"')

    for i in range(0, len(parts) - 1, 2):
        names = _split_attr_names(parts[i])
        if not names:
            continue

        # The last token owns the quoted value that follows. HTML allows
        # whitespace around `=`, so the token may be `key=` or a bare `=`
        # preceded by the key.
        key = names.pop().rstrip("=")
        if not key and names:
            key = names.pop()

        # Everything before the key is a value-less (boolean) attribute.
        for flag in names:
            attrs[flag] = ""

        if key:
            attrs[key] = parts[i + 1]

    # Whatever follows the last closing quote can only be boolean attributes.
    for flag in _split_attr_names(parts[-1]):
        attrs[flag] = ""
    return attrs

def _split_attr_names(segment):
    """Splits an unquoted tag segment into tokens on any ASCII whitespace.

    Args:
        segment: {type}`str` The text found outside of a quoted value.

    Returns:
        {type}`list[str]` The non-empty tokens in their original order.
    """

    # Attributes may be separated by tabs or newlines, not just spaces, so
    # normalize them before splitting on a single separator.
    for ws in ("\t", "\n", "\r"):
        segment = segment.replace(ws, " ")
    return [name for name in segment.split(" ") if name]
158+
159+
def _unescape_pypi_html(text):
    """Unescape HTML text.

    Decodes the HTML entities that are expected in Simple API attribute
    values: the named forms of `<`, `>`, `"` and `&`, plus the numeric forms
    of the single quote, double quote, line feed and carriage return.

    Args:
        text: {type}`str` The text to unescape.

    Returns:
        A string with the entities replaced by the characters they stand for.
        Falsy input and input without any `&` is returned unchanged.
    """

    # Short circuit for the most common case: nothing to decode.
    if not text or "&" not in text:
        return text

    # The order matters: `&amp;` has to be handled last so that a sequence
    # such as `&amp;lt;` decodes to the literal text `&lt;` and not to `<`.
    replacements = [
        ("&gt;", ">"),
        ("&lt;", "<"),
        ("&#39;", "'"),
        ("&#x27;", "'"),
        # Template engines commonly emit the numeric form for double quotes.
        ("&#34;", '"'),
        ("&quot;", '"'),
        ("&#10;", "\n"),
        ("&#13;", "\r"),
        ("&amp;", "&"),
    ]
    for entity, char in replacements:
        text = text.replace(entity, char)

    return text

tests/pypi/hub_builder/hub_builder_tests.bzl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -777,7 +777,7 @@ def _test_simple_get_index(env):
777777
"plat_pkg": struct(
778778
whls = {
779779
"deadb44f": struct(
780-
yanked = False,
780+
yanked = None,
781781
filename = "plat-pkg-0.0.4-py3-none-linux_x86_64.whl",
782782
sha256 = "deadb44f",
783783
url = "example2.org/index/plat_pkg/",
@@ -792,15 +792,15 @@ def _test_simple_get_index(env):
792792
"simple": struct(
793793
whls = {
794794
"deadb00f": struct(
795-
yanked = False,
795+
yanked = None,
796796
filename = "simple-0.0.1-py3-none-any.whl",
797797
sha256 = "deadb00f",
798798
url = "example2.org",
799799
),
800800
},
801801
sdists = {
802802
"deadbeef": struct(
803-
yanked = False,
803+
yanked = None,
804804
filename = "simple-0.0.1.tar.gz",
805805
sha256 = "deadbeef",
806806
url = "example.org",
@@ -811,7 +811,7 @@ def _test_simple_get_index(env):
811811
"some_other_pkg": struct(
812812
whls = {
813813
"deadb33f": struct(
814-
yanked = False,
814+
yanked = None,
815815
filename = "some-other-pkg-0.0.1-py3-none-any.whl",
816816
sha256 = "deadb33f",
817817
url = "example2.org/index/some_other_pkg/",

0 commit comments

Comments
 (0)