Skip to content

Commit 8ee9fbd

Browse files
committed
Fix additional < followed by characters and EOF issues (#728)
This fixes two cases:
* "<some thing thing", where "thing" appears twice, which raises a duplicate-attribute parser error because the parser treats the second "thing" as a repeated attribute name
* "<some thing thing2 ", where the trailing space raises an expected-end-of-tag-but-got-eof parser error
In both cases, we want the data to be treated as character data, not as a tag.
1 parent 648a97d commit 8ee9fbd

File tree

2 files changed

+16
-3
lines changed

2 files changed

+16
-3
lines changed

bleach/html5lib_shim.py

+12-3
Original file line numberDiff line numberDiff line change
@@ -396,16 +396,25 @@ def __iter__(self):
396396
# name that abruptly ends, but we should treat that like
397397
# character data
398398
yield {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": self.stream.get_tag()}
399+
399400
elif last_error_token["data"] in (
401+
"duplicate-attribute",
400402
"eof-in-attribute-name",
401403
"eof-in-attribute-value-no-quotes",
404+
"expected-end-of-tag-but-got-eof",
402405
):
403406
# Handle the case where the text being parsed ends with <
404-
# followed by a series of characters and then space and then
405-
# more characters. It's treated as a tag name followed by an
407+
# followed by characters and then space and then:
408+
#
409+
# * more characters
410+
# * more characters repeated with a space between (e.g. "abc abc")
411+
# * more characters and then a space and then an EOF (e.g. "abc def ")
412+
#
413+
# These cases are treated as a tag name followed by an
406414
# attribute that abruptly ends, but we should treat that like
407-
# character data.
415+
# character data instead.
408416
yield {"type": TAG_TOKEN_TYPE_CHARACTERS, "data": self.stream.get_tag()}
417+
409418
else:
410419
yield last_error_token
411420

tests/test_clean.py

+4
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,10 @@ def test_bare_entities_get_escaped_correctly(text, expected):
167167
("<some thing", "&lt;some thing"),
168168
# this is an eof-in-attribute-value-no-quotes parser error
169169
("<some thing=foo", "&lt;some thing=foo"),
170+
# this is a duplicate-attribute parser error
171+
("<some thing thing", "&lt;some thing thing"),
172+
# this is an expected-end-of-tag-but-got-eof parser error
173+
("<some thing thing2 ", "&lt;some thing thing2 "),
170174
],
171175
)
172176
def test_lessthan_escaping(text, expected):

0 commit comments

Comments
 (0)