2
0
mirror of https://github.com/pyrogram/pyrogram synced 2025-08-28 04:48:06 +00:00

Make the HTML parser more sound

This commit is contained in:
Dan 2019-06-25 07:41:48 +02:00
parent 07bc7e39df
commit a086964e85

View File

@@ -95,7 +95,16 @@ class Parser(HTMLParser):
         self.text += data
 
     def handle_endtag(self, tag):
-        self.entities.append(self.tag_entities[tag].pop())
+        try:
+            self.entities.append(self.tag_entities[tag].pop())
+        except (KeyError, IndexError):
+            line, offset = self.getpos()
+            offset += 1
+
+            raise ValueError("Unmatched closing tag </{}> at line {}:{}".format(tag, line, offset))
+        else:
+            if not self.tag_entities[tag]:
+                self.tag_entities.pop(tag)
 
     def error(self, message):
         pass
@@ -112,6 +121,14 @@ class HTML:
         parser.feed(text)
         parser.close()
 
+        if parser.tag_entities:
+            unclosed_tags = []
+
+            for tag, entities in parser.tag_entities.items():
+                unclosed_tags.append("<{}> (x{})".format(tag, len(entities)))
+
+            raise ValueError("Unclosed tags: {}".format(", ".join(unclosed_tags)))
+
         # TODO: OrderedDict to be removed in Python 3.6
         return OrderedDict([
             ("message", utils.remove_surrogates(parser.text)),