Skip to content

Commit b70b9c0

Browse files
committed
fix: handle left-angle-bracket that's not a tag (#733)
This handles the case where there's a `<` followed by one or more characters such that it looks like the beginning of a start tag, then a space, and then something that actually is a start tag. Something like: ``` <tag <b>text</b> ``` This fixes it by identifying the situation and then shoving everything after the space back into the character stream so it gets parsed again.
1 parent 4dcf8d6 commit b70b9c0

1 file changed

Lines changed: 31 additions & 0 deletions

File tree

bleach/html5lib_shim.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -322,6 +322,7 @@ def __iter__(self):
322322
last_error_token = None
323323

324324
for token in super().__iter__():
325+
print(token, last_error_token)
325326
if last_error_token is not None:
326327
if (
327328
last_error_token["data"] == "invalid-character-in-attribute-name"
@@ -346,6 +347,36 @@ def __iter__(self):
346347
last_error_token = None
347348
yield token
348349

350+
elif (
351+
last_error_token["data"] in (
352+
"invalid-character-in-attribute-name",
353+
"invalid-character-after-attribute-name",
354+
)
355+
and token["type"] == TAG_TOKEN_TYPE_CHARACTERS
356+
and token.get("data")
357+
and " " in token["data"]
358+
):
359+
# token["data"] has something that starts with a left angle
360+
# bracket, then has some characters followed by a space
361+
# followed by another left angle bracket and ending with
362+
# a right angle bracket. That part could be a real tag, so
363+
# we don't want it to get treated as Characters. For
364+
# example, something in this shape: <nottag <...>
365+
# If so, we want to take off the first bit that is
366+
# definitely not a tag and reparse the rest.
367+
head, rest = token["data"].split(" ", 1)
368+
if rest.strip().startswith("<"):
369+
# yield the not-a-tag plus the space we split on
370+
token["data"] = head + " "
371+
yield token
372+
373+
# shove the rest back in the stream for the parser to look
374+
# at
375+
for c in reversed(rest):
376+
self.stream.unget(c)
377+
else:
378+
yield token
379+
349380
elif (
350381
last_error_token["data"] == "expected-closing-tag-but-got-char"
351382
and self.parser.tags is not None

0 commit comments

Comments
 (0)