@@ -26,81 +26,177 @@ def parse_simpleapi_html(*, content):
2626
2727 Returns:
2828 A list of structs with:
29- * filename: The filename of the artifact.
30- * version: The version of the artifact.
31- * url: The URL to download the artifact.
32- * sha256: The sha256 of the artifact.
33- * metadata_sha256: The whl METADATA sha256 if we can download it. If this is
34- present, then the 'metadata_url' is also present. Defaults to "".
35- * metadata_url: The URL for the METADATA if we can download it. Defaults to "".
29+ * filename: {type}`str` The filename of the artifact.
30+ * version: {type}`str` The version of the artifact.
31+ * url: {type}`str` The URL to download the artifact.
32+ * sha256: {type}`str` The sha256 of the artifact.
33+ * metadata_sha256: {type}`str` The whl METADATA sha256 if we can download it. If this is
34+ present, then the 'metadata_url' is also present. Defaults to "".
35+ * metadata_url: {type}`str` The URL for the METADATA if we can download it. Defaults to "".
36+ * yanked: {type}`str | None` the yank reason if the package is yanked. If it is not yanked,
37+ then it will be `None`. An empty string yank reason means that the package is yanked but
38+ the reason is not provided.
3639 """
3740 sdists = {}
3841 whls = {}
39- lines = content .split ("<a href=\" " )
40-
41- _ , _ , api_version = lines [0 ].partition ("name=\" pypi:repository-version\" content=\" " )
42- api_version , _ , _ = api_version .partition ("\" " )
42+ sha256s_by_version = {}
4343
44- # We must assume the 1.0 if it is not present
45- # See https://packaging.python.org/en/latest/specifications/simple-repository-api/#clients
46- api_version = api_version or "1.0"
47- api_version = tuple ([int (i ) for i in api_version .split ("." )])
44+ # 1. Faster Version Extraction
45+ # Locate the repository-version meta tag with find() instead of splitting the whole document.
46+ api_version = (1 , 0 )
47+ meta_idx = content .find ('name="pypi:repository-version"' )
48+ if meta_idx != - 1 :
49+ # Find 'content="' after the name attribute
50+ v_start = content .find ('content="' , meta_idx )
51+ if v_start != - 1 :
52+ v_end = content .find ('"' , v_start + 9 )
53+ v_str = content [v_start + 9 :v_end ]
54+ if v_str :
55+ api_version = tuple ([int (i ) for i in v_str .split ("." )])
4856
4957 if api_version >= (2 , 0 ):
5058 # We don't expect to have version 2.0 here, but have this check in place just in case.
5159 # https://packaging.python.org/en/latest/specifications/simple-repository-api/#versioning-pypi-s-simple-api
5260 fail ("Unsupported API version: {}" .format (api_version ))
5361
54- # Each line follows the following pattern
55- # <a href="https://...#sha256=..." attribute1="foo" ... attributeN="bar">filename</a><br />
56- sha256s_by_version = {}
57- for line in lines [1 :]:
58- dist_url , _ , tail = line .partition ("#sha256=" )
62+ # 2. Iterate using find() to avoid huge list allocations from .split("<a ")
63+ cursor = 0
64+ for _ in range (1000000 ): # Safety break for Starlark
65+ start_tag = content .find ("<a " , cursor )
66+ if start_tag == - 1 :
67+ break
68+
69+ # Find the end of the opening tag and the closing </a>
70+ tag_end = content .find (">" , start_tag )
71+ end_tag = content .find ("</a>" , tag_end )
72+ if tag_end == - 1 or end_tag == - 1 :
73+ break
74+
75+ # Extract only the necessary slices
76+ attr_part = content [start_tag + 3 :tag_end ]
77+ filename = content [tag_end + 1 :end_tag ].strip ()
78+
79+ # Update cursor for next iteration
80+ cursor = end_tag + 4
81+
82+ # 3. Efficient Attribute Parsing
83+ attrs = _parse_attrs (attr_part )
84+ href = attrs .get ("href" , "" )
85+ if not href :
86+ continue
5987
60- sha256 , _ , tail = tail .partition ("\" " )
88+ dist_url , _ , sha256 = href .partition ("#sha256= " )
6189
62- # See https://packaging.python.org/en/latest/specifications/simple-repository-api/#adding-yank-support-to-the-simple-api
63- yanked = "data-yanked" in line
90+ # Handle Yanked status
91+ yanked = None
92+ if "data-yanked" in attrs :
93+ yanked = _unescape_pypi_html (attrs ["data-yanked" ])
6494
65- head , _ , _ = tail .rpartition ("</a>" )
66- maybe_metadata , _ , filename = head .rpartition (">" )
6795 version = version_from_filename (filename )
6896 sha256s_by_version .setdefault (version , []).append (sha256 )
6997
98+ # 4. Optimized Metadata Check (PEP 714)
7099 metadata_sha256 = ""
71100 metadata_url = ""
72- for metadata_marker in ["data-core-metadata" , "data-dist-info-metadata" ]:
73- metadata_marker = metadata_marker + "=\" sha256="
74- if metadata_marker in maybe_metadata :
75- # Implement https://peps.python.org/pep-0714/
76- _ , _ , tail = maybe_metadata .partition (metadata_marker )
77- metadata_sha256 , _ , _ = tail .partition ("\" " )
78- metadata_url = dist_url + ".metadata"
79- break
101+
102+ # Dist-info is more common in modern PyPI
103+ m_val = attrs .get ("data-dist-info-metadata" ) or attrs .get ("data-core-metadata" )
104+ if m_val and m_val != "false" :
105+ _ , _ , metadata_sha256 = m_val .partition ("sha256=" )
106+ metadata_url = dist_url + ".metadata"
107+
108+ # 5. Result object
109+ dist = struct (
110+ filename = filename ,
111+ version = version ,
112+ url = dist_url ,
113+ sha256 = sha256 ,
114+ metadata_sha256 = metadata_sha256 ,
115+ metadata_url = metadata_url ,
116+ yanked = yanked ,
117+ )
80118
81119 if filename .endswith (".whl" ):
82- whls [sha256 ] = struct (
83- filename = filename ,
84- version = version ,
85- url = dist_url ,
86- sha256 = sha256 ,
87- metadata_sha256 = metadata_sha256 ,
88- metadata_url = metadata_url ,
89- yanked = yanked ,
90- )
120+ whls [sha256 ] = dist
91121 else :
92- sdists [sha256 ] = struct (
93- filename = filename ,
94- version = version ,
95- url = dist_url ,
96- sha256 = sha256 ,
97- metadata_sha256 = "" ,
98- metadata_url = "" ,
99- yanked = yanked ,
100- )
122+ sdists [sha256 ] = dist
101123
102124 return struct (
103125 sdists = sdists ,
104126 whls = whls ,
105127 sha256s_by_version = sha256s_by_version ,
106128 )
129+
def _parse_attrs(attr_string):
    """Parses the attributes of an HTML opening tag into a dict.

    The input is split on double quotes, so the resulting chunks alternate
    between attribute-name text (even indices) and quoted values (odd indices).
    Only double-quoted values are recognised. Values are returned verbatim,
    i.e. still HTML-escaped.

    Args:
        attr_string: {type}`str` The text between the tag name and the closing
            `>` of the tag, e.g. `href="..." data-yanked`.

    Returns:
        {type}`dict[str, str]` The attribute names mapped to their values.
        Attributes without a value (boolean attributes) map to "".
    """
    attrs = {}
    chunks = attr_string.split('"')
    count = len(chunks)

    for i in range(0, count - 1, 2):
        # Attribute names may be separated by tabs or newlines when the tag is
        # wrapped over several lines. Normalise to single spaces because
        # `split` needs an explicit separator in Bazel's Starlark.
        text = chunks[i].replace("\t", " ").replace("\r", " ").replace("\n", " ")

        # Drop the `=` that precedes the quoted value; tolerate `key = "v"`.
        names = [n for n in text.strip().rstrip("=").split(" ") if n]
        if not names:
            continue

        # Everything before the last name has no value of its own.
        for flag in names[:-1]:
            attrs[flag] = ""

        attrs[names[-1]] = chunks[i + 1]

    # With balanced quotes the number of chunks is odd and the final chunk is
    # trailing name text. With an unterminated quote the final chunk is a value
    # that the loop above has already consumed, so it must not be re-read as
    # boolean attributes.
    if count % 2 == 1:
        tail = chunks[-1].replace("\t", " ").replace("\r", " ").replace("\n", " ")
        for flag in tail.split(" "):
            if flag:
                attrs[flag] = ""

    return attrs
158+
def _unescape_pypi_html(text):
    """Unescape HTML text.

    Decodes the HTML character references handled below: `&gt;`, `&lt;`,
    `&quot;`, `&amp;` and the numeric references for the apostrophe, line feed
    and carriage return. Any other reference is left untouched.

    Args:
        text: {type}`str` The text to unescape.

    Returns:
        {type}`str` The text with the supported references decoded. Falsy input
        (empty string or `None`) is returned unchanged.
    """

    # Short circuit: without an ampersand there is nothing to decode.
    if not text or "&" not in text:
        return text

    # The comparison operators come first as they are the most frequent
    # (version constraints).
    if "&gt;" in text:
        text = text.replace("&gt;", ">")
    if "&lt;" in text:
        text = text.replace("&lt;", "<")

    # Numeric references share the "&#" prefix, so a single check guards all
    # four scans.
    if "&#" in text:
        if "&#39;" in text:
            text = text.replace("&#39;", "'")
        if "&#x27;" in text:
            text = text.replace("&#x27;", "'")
        if "&#10;" in text:
            text = text.replace("&#10;", "\n")
        if "&#13;" in text:
            text = text.replace("&#13;", "\r")

    if "&quot;" in text:
        text = text.replace("&quot;", '"')

    # The ampersand must be decoded last: otherwise "&amp;lt;" would first
    # become "&lt;" and then be wrongly decoded a second time into "<".
    if "&amp;" in text:
        text = text.replace("&amp;", "&")

    return text
0 commit comments