-
-
Notifications
You must be signed in to change notification settings - Fork 34.5k
gh-118350: Add escapable-raw-text mode to html parser #121770
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
420af54
1241a65
e7f11a0
bd63490
da868db
d17b409
d8cc255
a36070a
43804bb
70b8e5d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -26,6 +26,7 @@ | |
|
|
||
| starttagopen = re.compile('<[a-zA-Z]') | ||
| piclose = re.compile('>') | ||
| escapable_raw_text_close = re.compile('</(title|textarea)>', re.I) | ||
| commentclose = re.compile(r'--\s*>') | ||
| # Note: | ||
| # 1) if you change tagfind/attrfind remember to update locatestarttagend too; | ||
|
|
@@ -82,6 +83,7 @@ class HTMLParser(_markupbase.ParserBase): | |
| """ | ||
|
|
||
| CDATA_CONTENT_ELEMENTS = ("script", "style") | ||
| ESCAPABLE_RAW_TEXT_ELEMENTS = ("title", "textarea") | ||
|
|
||
| def __init__(self, *, convert_charrefs=True): | ||
| """Initialize and reset this instance. | ||
|
|
@@ -99,6 +101,7 @@ def reset(self): | |
| self.lasttag = '???' | ||
| self.interesting = interesting_normal | ||
| self.cdata_elem = None | ||
| self.escapable_raw_text_elem = None | ||
| super().reset() | ||
|
|
||
| def feed(self, data): | ||
|
|
@@ -120,6 +123,14 @@ def get_starttag_text(self): | |
| """Return full source of start tag: '<...>'.""" | ||
| return self.__starttag_text | ||
|
|
||
| def set_escapable_raw_text_mode(self, elem): | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Since the behavior for raw text elements and escapable raw text elements is so similar, and they cannot be nested, why not use
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @serhiy-storchaka I can do that. |
||
| self.escapable_raw_text_elem = elem.lower() | ||
| self.interesting = re.compile(r'</\s*%s\s*>' % self.escapable_raw_text_elem, re.I) | ||
|
|
||
| def clear_escapable_raw_text_mode(self): | ||
| self.interesting = interesting_normal | ||
| self.escapable_raw_text_elem = None | ||
|
|
||
| def set_cdata_mode(self, elem): | ||
| self.cdata_elem = elem.lower() | ||
| self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I) | ||
|
|
@@ -136,7 +147,7 @@ def goahead(self, end): | |
| i = 0 | ||
| n = len(rawdata) | ||
| while i < n: | ||
| if self.convert_charrefs and not self.cdata_elem: | ||
| if self.convert_charrefs and not self.cdata_elem and not self.escapable_raw_text_elem: | ||
| j = rawdata.find('<', i) | ||
| if j < 0: | ||
| # if we can't find the next <, either we are at the end | ||
|
|
@@ -155,11 +166,13 @@ def goahead(self, end): | |
| if match: | ||
| j = match.start() | ||
| else: | ||
| if self.escapable_raw_text_elem: | ||
| break | ||
| if self.cdata_elem: | ||
| break | ||
| j = n | ||
| if i < j: | ||
| if self.convert_charrefs and not self.cdata_elem: | ||
| if self.convert_charrefs and not self.cdata_elem and not self.escapable_raw_text_elem: | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is incorrect. Charrefs should be resolved in an escapable raw text element, except for an ambiguous ampersand. We also need tests for |
||
| self.handle_data(unescape(rawdata[i:j])) | ||
| else: | ||
| self.handle_data(rawdata[i:j]) | ||
|
|
@@ -336,6 +349,8 @@ def parse_starttag(self, i): | |
| self.handle_startendtag(tag, attrs) | ||
| else: | ||
| self.handle_starttag(tag, attrs) | ||
| if tag in self.ESCAPABLE_RAW_TEXT_ELEMENTS: | ||
| self.set_escapable_raw_text_mode(tag) | ||
| if tag in self.CDATA_CONTENT_ELEMENTS: | ||
| self.set_cdata_mode(tag) | ||
| return endpos | ||
|
|
@@ -411,8 +426,14 @@ def parse_endtag(self, i): | |
| self.handle_data(rawdata[i:gtpos]) | ||
| return gtpos | ||
|
|
||
| if self.escapable_raw_text_elem is not None: # title or textarea | ||
| if elem != self.escapable_raw_text_elem: | ||
| self.handle_data(rawdata[i:gtpos]) | ||
| return gtpos | ||
|
|
||
| self.handle_endtag(elem) | ||
| self.clear_cdata_mode() | ||
| self.clear_escapable_raw_text_mode() | ||
| return gtpos | ||
|
|
||
| # Overridable -- finish processing of start+end tag: <tag.../> | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -285,7 +285,7 @@ def test_cdata_content(self): | |
| #'foo = </\nscript>', | ||
| #'foo = </ script>', | ||
| ] | ||
| elements = ['script', 'style', 'SCRIPT', 'STYLE', 'Script', 'Style'] | ||
| elements = ['script', 'style', 'SCRIPT', 'TEXTAREA', 'Script', 'Textarea'] | ||
| for content in contents: | ||
| for element in elements: | ||
| element_lower = element.lower() | ||
|
|
@@ -317,6 +317,58 @@ def get_events(self): | |
| ("endtag", element_lower)], | ||
| collector=Collector(convert_charrefs=False)) | ||
|
|
||
| def test_escapable_raw_text_content(self): | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. How does this test differ from test_cdata_content? BTW, most examples use JavaScript syntax, and are only relevant for |
||
| contents = [ | ||
| '<h2>This is a header</h2>', | ||
| 'Rebelious<h1>Heading' | ||
| '<!-- not a comment --> &not-an-entity-ref;', | ||
| "<not a='start tag'>", | ||
| '<a href="" /> <p> <span></span>', | ||
| 'foo = "</scr" + "ipt>";', | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why test this in the title and textarea elements?
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Also add examples of valid character references and an ambiguous ampersand. |
||
| 'foo = "</TITLE" + ">";', | ||
| 'foo = <\n/title> ', | ||
| '<!-- document.write("</scr" + "ipt>"); -->', | ||
| '\n//<![CDATA[\n' | ||
| '\n<!-- //\nvar foo = 3.14;\n// -->\n', | ||
| 'foo = "</sty" + "le>";', | ||
| '<!-- \u2603 -->', | ||
| # these two should be invalid according to the HTML 5 spec, | ||
| # section 8.1.2.2 | ||
| #'foo = </\nscript>', | ||
| #'foo = </ script>', | ||
| ] | ||
| elements = ['title', 'textarea', 'TITLE', 'TEXTAREA', 'Title', 'Textarea'] | ||
| for content in contents: | ||
| for element in elements: | ||
| element_lower = element.lower() | ||
| s = '<{element}>{content}</{element}>'.format(element=element, | ||
| content=content) | ||
| self._run_check(s, [("starttag", element_lower, []), | ||
| ("data", content), | ||
| ("endtag", element_lower)]) | ||
|
|
||
| def test_escapable_raw_text_with_closing_tags(self): | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is this right? The test name is test_escapable_raw_text_with_closing_tags, but it tests the script element. It looks very similar to test_cdata_with_closing_tags. |
||
| # see issue #13358 | ||
| # make sure that HTMLParser calls handle_data only once for each CDATA. | ||
| # The normal event collector normalizes the events in get_events, | ||
| # so we override it to return the original list of events. | ||
| class Collector(EventCollector): | ||
| def get_events(self): | ||
| return self.events | ||
|
|
||
| content = """<!-- not a comment --> &not-an-entity-ref; | ||
| <a href="" /> </p><p> <span></span></style> | ||
| '</script' + '>'""" | ||
| for element in [' script', 'script ', ' script ', | ||
| '\nscript', 'script\n', '\nscript\n']: | ||
| element_lower = element.lower().strip() | ||
| s = '<script>{content}</{element}>'.format(element=element, | ||
| content=content) | ||
| self._run_check(s, [("starttag", element_lower, []), | ||
| ("data", content), | ||
| ("endtag", element_lower)], | ||
| collector=Collector(convert_charrefs=False)) | ||
|
|
||
| def test_comments(self): | ||
| html = ("<!-- I'm a valid comment -->" | ||
| '<!--me too!-->' | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is it even used?