From 2ed000381d60bf8da9b0bec57693425b24b50c9d Mon Sep 17 00:00:00 2001 From: Andrea Princic <48788808+Princic-1837592@users.noreply.github.com> Date: Tue, 6 Dec 2022 18:29:27 +0100 Subject: [PATCH] Update the HTML logic to output well-formed elements (#1155) * unparsing html entities with deque * unparsing using a stack (recursive) --- pyrogram/parser/html.py | 55 ++++++++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 17 deletions(-) diff --git a/pyrogram/parser/html.py b/pyrogram/parser/html.py index 7ed2a5be..4afeea2b 100644 --- a/pyrogram/parser/html.py +++ b/pyrogram/parser/html.py @@ -155,11 +155,10 @@ class HTML: @staticmethod def unparse(text: str, entities: list): - text = utils.add_surrogates(text) - - entities_offsets = [] - - for entity in entities: + def parse_one(entity): + """ + Parses a single entity and returns (start_tag, start), (end_tag, end) + """ entity_type = entity.type start = entity.offset end = start + entity.length @@ -199,21 +198,43 @@ class HTML: start_tag = f'' end_tag = "" else: - continue + return - entities_offsets.append((start_tag, start,)) - entities_offsets.append((end_tag, end,)) + return (start_tag, start), (end_tag, end) - entities_offsets = map( - lambda x: x[1], - sorted( - enumerate(entities_offsets), - key=lambda x: (x[1][1], x[0]), - reverse=True - ) - ) + def recursive(entity_i: int) -> int: + """ + Takes the index of the entity to start parsing from, returns the number of parsed entities inside it. + Uses entities_offsets as a stack, pushing (start_tag, start) first, then parsing nested entities, + and finally pushing (end_tag, end) to the stack. + No need to sort at the end. + """ + this = parse_one(entities[entity_i]) + if this is None: + return 1 + (start_tag, start), (end_tag, end) = this + entities_offsets.append((start_tag, start)) + internal_i = entity_i + 1 + # while the next entity is inside the current one, keep parsing + while internal_i < len(entities) and entities[internal_i].offset < end: + internal_i += recursive(internal_i) + entities_offsets.append((end_tag, end)) + return internal_i - entity_i - for entity, offset in entities_offsets: + text = utils.add_surrogates(text) + + entities_offsets = [] + + # probably useless because entities are already sorted by telegram + entities.sort(key=lambda e: (e.offset, -e.length)) + + # main loop for first-level entities + i = 0 + while i < len(entities): + i += recursive(i) + + # no need to sort, but still add entities starting from the end + for entity, offset in reversed(entities_offsets): text = text[:offset] + entity + text[offset:] return utils.remove_surrogates(text)