From 8102a7fe82323988f45aaaf32fdb75db1169dfd5 Mon Sep 17 00:00:00 2001 From: Dan <14043624+delivrance@users.noreply.github.com> Date: Mon, 24 Jun 2019 01:46:20 +0200 Subject: [PATCH 01/16] Update FUNDING.yml --- .github/FUNDING.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml index f34f615a..3437aeae 100644 --- a/.github/FUNDING.yml +++ b/.github/FUNDING.yml @@ -1,2 +1,2 @@ -github: delivrance +# github: delivrance custom: https://docs.pyrogram.org/support-pyrogram From e7c49c6a1b98148a792beecdafff79e9030a277e Mon Sep 17 00:00:00 2001 From: Dan <14043624+delivrance@users.noreply.github.com> Date: Mon, 24 Jun 2019 10:07:28 +0200 Subject: [PATCH 02/16] Revamp HTML and Markdown parsers to allow multiple nested entities --- pyrogram/client/style/html.py | 197 ++++++++++++++---------------- pyrogram/client/style/markdown.py | 158 +++++++----------------- 2 files changed, 137 insertions(+), 218 deletions(-) diff --git a/pyrogram/client/style/html.py b/pyrogram/client/style/html.py index b42114a8..82921f4c 100644 --- a/pyrogram/client/style/html.py +++ b/pyrogram/client/style/html.py @@ -16,127 +16,110 @@ # You should have received a copy of the GNU Lesser General Public License # along with Pyrogram. If not, see . +import html import re from collections import OrderedDict +from html.parser import HTMLParser import pyrogram -from pyrogram.api.types import ( - MessageEntityBold as Bold, - MessageEntityItalic as Italic, - MessageEntityCode as Code, - MessageEntityTextUrl as Url, - MessageEntityPre as Pre, - MessageEntityUnderline as Underline, - MessageEntityStrike as Strike, - MessageEntityBlockquote as Blockquote, - MessageEntityMentionName as MentionInvalid, - InputMessageEntityMentionName as Mention, -) +from pyrogram.api import types from pyrogram.errors import PeerIdInvalid from . import utils -class HTML: - HTML_RE = re.compile(r"<(\w+)(?: href=([\"'])([^<]+)\2)?>([^>]+)") +class Parser(HTMLParser): MENTION_RE = re.compile(r"tg://user\?id=(\d+)") + def __init__(self, client: "pyrogram.BaseClient"): + super().__init__() + + self.client = client + + self.text = "" + self.entities = [] + self.temp_entities = [] + self.tags = [] + + def handle_starttag(self, tag, attrs): + attrs = dict(attrs) + extra = {} + + if tag in ["b", "strong"]: + entity = types.MessageEntityBold + elif tag in ["i", "em"]: + entity = types.MessageEntityItalic + elif tag == "u": + entity = types.MessageEntityUnderline + elif tag in ["s", "del", "strike"]: + entity = types.MessageEntityStrike + elif tag == "blockquote": + entity = types.MessageEntityBlockquote + elif tag == "code": + entity = types.MessageEntityCode + elif tag == "pre": + entity = types.MessageEntityPre + extra["language"] = "" + elif tag == "a": + url = attrs.get("href", "") + + mention = Parser.MENTION_RE.match(url) + + if mention: + user_id = int(mention.group(1)) + + try: + user = self.client.resolve_peer(user_id) + except PeerIdInvalid: + entity = types.MessageEntityMentionName + extra["user_id"] = user_id + else: + entity = types.InputMessageEntityMentionName + extra["user_id"] = user + else: + entity = types.MessageEntityTextUrl + extra["url"] = url + else: + return + + self.tags.append(tag) + self.temp_entities.append(entity(offset=len(self.text), length=0, **extra)) + + def handle_data(self, data): + data = html.unescape(data) + + for entity in self.temp_entities: + entity.length += len(data) + + self.text += data + + def handle_endtag(self, tag): + start_tag = self.tags.pop() + + if start_tag != tag: + line, offset = self.getpos() + offset += 1 + + raise ValueError("Expected end tag , but found at {}:{}".format(start_tag, tag, line, offset)) + + self.entities.append(self.temp_entities.pop()) + + def error(self, message): + pass + + +class HTML: def __init__(self, client: "pyrogram.BaseClient" = None): self.client = client - def parse(self, message: str): - entities = [] - message = utils.add_surrogates(str(message or "")) - offset = 0 + def parse(self, text: str): + text = utils.add_surrogates(str(text or "").strip()) - for match in self.HTML_RE.finditer(message): - start = match.start() - offset - style, url, body = match.group(1, 3, 4) + parser = Parser(self.client) + parser.feed(text) + print(parser.entities) - if url: - mention = self.MENTION_RE.match(url) - - if mention: - user_id = int(mention.group(1)) - - try: - input_user = self.client.resolve_peer(user_id) - except PeerIdInvalid: - input_user = None - - entity = ( - Mention(offset=start, length=len(body), user_id=input_user) - if input_user else MentionInvalid(offset=start, length=len(body), user_id=user_id) - ) - else: - entity = Url(offset=start, length=len(body), url=url) - else: - if style == "b" or style == "strong": - entity = Bold(offset=start, length=len(body)) - elif style == "i" or style == "em": - entity = Italic(offset=start, length=len(body)) - elif style == "code": - entity = Code(offset=start, length=len(body)) - elif style == "pre": - entity = Pre(offset=start, length=len(body), language="") - elif style == "u": - entity = Underline(offset=start, length=len(body)) - elif style in ["strike", "s", "del"]: - entity = Strike(offset=start, length=len(body)) - elif style == "blockquote": - entity = Blockquote(offset=start, length=len(body)) - else: - continue - - entities.append(entity) - message = message.replace(match.group(), body) - offset += len(style) * 2 + 5 + (len(url) + 8 if url else 0) - - # TODO: OrderedDict to be removed in Python3.6 + # TODO: OrderedDict to be removed in Python 3.6 return OrderedDict([ - ("message", utils.remove_surrogates(message)), - ("entities", entities) + ("message", utils.remove_surrogates(parser.text)), + ("entities", parser.entities) ]) - - def unparse(self, message: str, entities: list): - message = utils.add_surrogates(message).strip() - offset = 0 - - for entity in entities: - start = entity.offset + offset - type = entity.type - url = entity.url - user = entity.user - sub = message[start: start + entity.length] - - if type == "bold": - style = "b" - elif type == "italic": - style = "i" - elif type == "code": - style = "code" - elif type == "pre": - style = "pre" - elif type == "underline": - style = "u" - elif type == "strike": - style = "s" - elif type == "blockquote": - style = "blockquote" - elif type == "text_link": - offset += 15 + len(url) - message = message[:start] + message[start:].replace( - sub, "{}".format(url, sub), 1) - continue - elif type == "text_mention": - offset += 28 + len(str(user.id)) - message = message[:start] + message[start:].replace( - sub, "{}".format(user.id, sub), 1) - continue - else: - continue - - offset += len(style) * 2 + 5 - message = message[:start] + message[start:].replace( - sub, "<{0}>{1}".format(style, sub), 1) - - return utils.remove_surrogates(message) diff --git a/pyrogram/client/style/markdown.py b/pyrogram/client/style/markdown.py index 9dded1f3..26effe5c 100644 --- a/pyrogram/client/style/markdown.py +++ b/pyrogram/client/style/markdown.py @@ -17,22 +17,9 @@ # along with Pyrogram. If not, see . import re -from collections import OrderedDict import pyrogram -from pyrogram.api.types import ( - MessageEntityBold as Bold, - MessageEntityItalic as Italic, - MessageEntityCode as Code, - MessageEntityTextUrl as Url, - MessageEntityPre as Pre, - MessageEntityUnderline as Underline, - MessageEntityStrike as Strike, - MessageEntityMentionName as MentionInvalid, - InputMessageEntityMentionName as Mention -) -from pyrogram.errors import PeerIdInvalid -from . import utils +from .html import HTML class Markdown: @@ -43,10 +30,10 @@ class Markdown: CODE_DELIMITER = "`" PRE_DELIMITER = "```" - MARKDOWN_RE = re.compile(r"({d})([\w\W]*?)\1|\[([^[]+?)\]\(([^(]+?)\)".format( + MARKDOWN_RE = re.compile(r"({d})".format( d="|".join( ["".join(i) for i in [ - ["\{}".format(j) for j in i] + [r"\{}".format(j) for j in i] for i in [ PRE_DELIMITER, CODE_DELIMITER, @@ -56,107 +43,56 @@ class Markdown: BOLD_DELIMITER ] ]] - ) - )) - MENTION_RE = re.compile(r"tg://user\?id=(\d+)") + ))) - def __init__(self, client: "pyrogram.BaseClient" = None): - self.client = client + URL_RE = re.compile(r"\[([^[]+)]\(([^(]+)\)") - def parse(self, message: str): - message = utils.add_surrogates(str(message or "")).strip() - entities = [] + def __init__(self, client: "pyrogram.BaseClient"): + self.html = HTML(client) + + def parse(self, text: str): offset = 0 + delimiters = set() - for match in self.MARKDOWN_RE.finditer(message): - start = match.start() - offset - style, body, text, url = match.groups() + for i, match in enumerate(re.finditer(Markdown.MARKDOWN_RE, text)): + start, stop = match.span() + delimiter = match.group(1) - if url: - mention = self.MENTION_RE.match(url) - - if mention: - user_id = int(mention.group(1)) - - try: - input_user = self.client.resolve_peer(user_id) - except PeerIdInvalid: - input_user = None - - entity = ( - Mention(offset=start, length=len(text), user_id=input_user) - if input_user else MentionInvalid(offset=start, length=len(text), user_id=user_id) - ) - else: - entity = Url(offset=start, length=len(text), url=url) - - body = text - offset += len(url) + 4 - else: - if style == self.BOLD_DELIMITER: - entity = Bold(offset=start, length=len(body)) - elif style == self.ITALIC_DELIMITER: - entity = Italic(offset=start, length=len(body)) - elif style == self.UNDERLINE_DELIMITER: - entity = Underline(offset=start, length=len(body)) - elif style == self.STRIKE_DELIMITER: - entity = Strike(offset=start, length=len(body)) - elif style == self.CODE_DELIMITER: - entity = Code(offset=start, length=len(body)) - elif style == self.PRE_DELIMITER: - entity = Pre(offset=start, length=len(body), language="") - else: - continue - - offset += len(style) * 2 - - entities.append(entity) - message = message.replace(match.group(), body) - - # TODO: OrderedDict to be removed in Python3.6 - return OrderedDict([ - ("message", utils.remove_surrogates(message)), - ("entities", entities) - ]) - - def unparse(self, message: str, entities: list): - message = utils.add_surrogates(message).strip() - offset = 0 - - for entity in entities: - start = entity.offset + offset - type = entity.type - url = entity.url - user = entity.user - sub = message[start: start + entity.length] - - if type == "bold": - style = self.BOLD_DELIMITER - elif type == "italic": - style = self.ITALIC_DELIMITER - elif type == "underline": - style = self.UNDERLINE_DELIMITER - elif type == "strike": - style = self.STRIKE_DELIMITER - elif type == "code": - style = self.CODE_DELIMITER - elif type == "pre": - style = self.PRE_DELIMITER - elif type == "text_link": - offset += 4 + len(url) - message = message[:start] + message[start:].replace( - sub, "[{}]({})".format(sub, url), 1) - continue - elif type == "text_mention": - offset += 17 + len(str(user.id)) - message = message[:start] + message[start:].replace( - sub, "[{}](tg://user?id={})".format(sub, user.id), 1) - continue + if delimiter == Markdown.BOLD_DELIMITER: + tag = "b" + elif delimiter == Markdown.ITALIC_DELIMITER: + tag = "i" + elif delimiter == Markdown.UNDERLINE_DELIMITER: + tag = "u" + elif delimiter == Markdown.STRIKE_DELIMITER: + tag = "s" + elif delimiter == Markdown.CODE_DELIMITER: + tag = "code" + elif delimiter == Markdown.PRE_DELIMITER: + tag = "pre" else: continue - offset += len(style) * 2 - message = message[:start] + message[start:].replace( - sub, "{0}{1}{0}".format(style, sub), 1) + if delimiter not in delimiters: + delimiters.add(delimiter) + tag = "<{}>".format(tag) + else: + delimiters.remove(delimiter) + tag = "".format(tag) - return utils.remove_surrogates(message) + text = text[:start + offset] + tag + text[stop + offset:] + + offset += len(tag) - len(delimiter) + + offset = 0 + + for match in re.finditer(Markdown.URL_RE, text): + start, stop = match.span() + full = match.group(0) + body, url = match.groups() + replace = '{}'.format(url, body) + + text = text[:start + offset] + replace + text[stop + offset:] + offset += len(replace) - len(full) + + return self.html.parse(text) From d6900cde9f5adca32fb80624ceb88331af1c0b85 Mon Sep 17 00:00:00 2001 From: Dan <14043624+delivrance@users.noreply.github.com> Date: Mon, 24 Jun 2019 10:11:21 +0200 Subject: [PATCH 03/16] Remove debug print() --- pyrogram/client/style/html.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pyrogram/client/style/html.py b/pyrogram/client/style/html.py index 82921f4c..81d7ce9b 100644 --- a/pyrogram/client/style/html.py +++ b/pyrogram/client/style/html.py @@ -116,7 +116,6 @@ class HTML: parser = Parser(self.client) parser.feed(text) - print(parser.entities) # TODO: OrderedDict to be removed in Python 3.6 return OrderedDict([ From f12cee5d94f42c4b5d7e669e33e1aa85de9ea0ce Mon Sep 17 00:00:00 2001 From: Dan <14043624+delivrance@users.noreply.github.com> Date: Mon, 24 Jun 2019 10:54:58 +0200 Subject: [PATCH 04/16] Automatically escape URL bodies when using markdown --- pyrogram/client/style/markdown.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pyrogram/client/style/markdown.py b/pyrogram/client/style/markdown.py index 26effe5c..001fc60f 100644 --- a/pyrogram/client/style/markdown.py +++ b/pyrogram/client/style/markdown.py @@ -16,6 +16,7 @@ # You should have received a copy of the GNU Lesser General Public License # along with Pyrogram. If not, see . +import html import re import pyrogram @@ -89,10 +90,14 @@ class Markdown: for match in re.finditer(Markdown.URL_RE, text): start, stop = match.span() full = match.group(0) + body, url = match.groups() + body = html.escape(body) + replace = '{}'.format(url, body) text = text[:start + offset] + replace + text[stop + offset:] + offset += len(replace) - len(full) return self.html.parse(text) From 8e0182633f8036d23afe6de98e3f391f80a2074c Mon Sep 17 00:00:00 2001 From: Dan <14043624+delivrance@users.noreply.github.com> Date: Mon, 24 Jun 2019 13:35:58 +0200 Subject: [PATCH 05/16] Ignore any other style when inside a fixed-width style --- pyrogram/client/style/markdown.py | 66 +++++++++++++++++-------------- 1 file changed, 37 insertions(+), 29 deletions(-) diff --git a/pyrogram/client/style/markdown.py b/pyrogram/client/style/markdown.py index 001fc60f..520008a8 100644 --- a/pyrogram/client/style/markdown.py +++ b/pyrogram/client/style/markdown.py @@ -24,66 +24,76 @@ from .html import HTML class Markdown: - BOLD_DELIMITER = "**" - ITALIC_DELIMITER = "__" - UNDERLINE_DELIMITER = "--" - STRIKE_DELIMITER = "~~" - CODE_DELIMITER = "`" - PRE_DELIMITER = "```" + BOLD_DELIM = "**" + ITALIC_DELIM = "__" + UNDERLINE_DELIM = "--" + STRIKE_DELIM = "~~" + CODE_DELIM = "`" + PRE_DELIM = "```" MARKDOWN_RE = re.compile(r"({d})".format( d="|".join( ["".join(i) for i in [ [r"\{}".format(j) for j in i] for i in [ - PRE_DELIMITER, - CODE_DELIMITER, - STRIKE_DELIMITER, - UNDERLINE_DELIMITER, - ITALIC_DELIMITER, - BOLD_DELIMITER + PRE_DELIM, + CODE_DELIM, + STRIKE_DELIM, + UNDERLINE_DELIM, + ITALIC_DELIM, + BOLD_DELIM ] ]] ))) URL_RE = re.compile(r"\[([^[]+)]\(([^(]+)\)") + OPENING_TAG = "<{}>" + CLOSING_TAG = "" + URL_MARKUP = '{}' + FIXED_WIDTH_DELIMS = [CODE_DELIM, PRE_DELIM] + def __init__(self, client: "pyrogram.BaseClient"): self.html = HTML(client) def parse(self, text: str): + text = html.escape(text) + offset = 0 - delimiters = set() + delims = set() for i, match in enumerate(re.finditer(Markdown.MARKDOWN_RE, text)): start, stop = match.span() - delimiter = match.group(1) + delim = match.group(1) - if delimiter == Markdown.BOLD_DELIMITER: + if delim == Markdown.BOLD_DELIM: tag = "b" - elif delimiter == Markdown.ITALIC_DELIMITER: + elif delim == Markdown.ITALIC_DELIM: tag = "i" - elif delimiter == Markdown.UNDERLINE_DELIMITER: + elif delim == Markdown.UNDERLINE_DELIM: tag = "u" - elif delimiter == Markdown.STRIKE_DELIMITER: + elif delim == Markdown.STRIKE_DELIM: tag = "s" - elif delimiter == Markdown.CODE_DELIMITER: + elif delim == Markdown.CODE_DELIM: tag = "code" - elif delimiter == Markdown.PRE_DELIMITER: + elif delim == Markdown.PRE_DELIM: tag = "pre" else: continue - if delimiter not in delimiters: - delimiters.add(delimiter) - tag = "<{}>".format(tag) + if delim not in Markdown.FIXED_WIDTH_DELIMS and any(x in delims for x in Markdown.FIXED_WIDTH_DELIMS): + continue + + if delim not in delims: + delims.add(delim) + tag = Markdown.OPENING_TAG.format(tag) else: - delimiters.remove(delimiter) - tag = "".format(tag) + delims.remove(delim) + tag = Markdown.CLOSING_TAG.format(tag) text = text[:start + offset] + tag + text[stop + offset:] - offset += len(tag) - len(delimiter) + offset += len(tag) - len(delim) offset = 0 @@ -92,9 +102,7 @@ class Markdown: full = match.group(0) body, url = match.groups() - body = html.escape(body) - - replace = '{}'.format(url, body) + replace = Markdown.URL_MARKUP.format(url, body) text = text[:start + offset] + replace + text[stop + offset:] From cac0bcabf915ff3e7a06a7d7be2a21da257050e5 Mon Sep 17 00:00:00 2001 From: Dan <14043624+delivrance@users.noreply.github.com> Date: Mon, 24 Jun 2019 13:36:27 +0200 Subject: [PATCH 06/16] Fix HTML parsing breaking with no tags --- pyrogram/client/style/html.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pyrogram/client/style/html.py b/pyrogram/client/style/html.py index 81d7ce9b..df7c64ff 100644 --- a/pyrogram/client/style/html.py +++ b/pyrogram/client/style/html.py @@ -93,7 +93,10 @@ class Parser(HTMLParser): self.text += data def handle_endtag(self, tag): - start_tag = self.tags.pop() + try: + start_tag = self.tags.pop() + except IndexError: + return if start_tag != tag: line, offset = self.getpos() @@ -113,6 +116,7 @@ class HTML: def parse(self, text: str): text = utils.add_surrogates(str(text or "").strip()) + text = "

{}

".format(text) parser = Parser(self.client) parser.feed(text) From a27dc575e413968f8814a7fb365bf7f2ada9aaa6 Mon Sep 17 00:00:00 2001 From: Dan <14043624+delivrance@users.noreply.github.com> Date: Mon, 24 Jun 2019 14:17:46 +0200 Subject: [PATCH 07/16] Actually fix the HTML Parser feeding by calling .close() when done --- pyrogram/client/style/html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyrogram/client/style/html.py b/pyrogram/client/style/html.py index df7c64ff..1748fa58 100644 --- a/pyrogram/client/style/html.py +++ b/pyrogram/client/style/html.py @@ -116,10 +116,10 @@ class HTML: def parse(self, text: str): text = utils.add_surrogates(str(text or "").strip()) - text = "

{}

".format(text) parser = Parser(self.client) parser.feed(text) + parser.close() # TODO: OrderedDict to be removed in Python 3.6 return OrderedDict([ From e7457de947524a11c9690f98bddd7b2295d08e0d Mon Sep 17 00:00:00 2001 From: Dan <14043624+delivrance@users.noreply.github.com> Date: Mon, 24 Jun 2019 14:25:09 +0200 Subject: [PATCH 08/16] Add MSGID_DECREASE_RETRY 5xx-class error --- compiler/error/source/500_INTERNAL_SERVER_ERROR.tsv | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/compiler/error/source/500_INTERNAL_SERVER_ERROR.tsv b/compiler/error/source/500_INTERNAL_SERVER_ERROR.tsv index 446fe908..4bbea8ea 100644 --- a/compiler/error/source/500_INTERNAL_SERVER_ERROR.tsv +++ b/compiler/error/source/500_INTERNAL_SERVER_ERROR.tsv @@ -9,4 +9,5 @@ RANDOM_ID_DUPLICATE Telegram is having internal problems. Please try again later WORKER_BUSY_TOO_LONG_RETRY Telegram is having internal problems. Please try again later INTERDC_X_CALL_ERROR Telegram is having internal problems at DC{x}. Please try again later INTERDC_X_CALL_RICH_ERROR Telegram is having internal problems at DC{x}. Please try again later -FOLDER_DEAC_AUTOFIX_ALL Telegram is having internal problems. Please try again later \ No newline at end of file +FOLDER_DEAC_AUTOFIX_ALL Telegram is having internal problems. Please try again later +MSGID_DECREASE_RETRY Telegram is having internal problems. Please try again later \ No newline at end of file From cd1e41b130297d3517c4e3cf0d14f2f81b2801a7 Mon Sep 17 00:00:00 2001 From: Dan <14043624+delivrance@users.noreply.github.com> Date: Mon, 24 Jun 2019 14:33:17 +0200 Subject: [PATCH 09/16] Delete style utils.py and move its content inside html.py The HTML parser is now the only one that makes use of those util methods --- pyrogram/client/style/html.py | 23 ++++++++++++++++++--- pyrogram/client/style/utils.py | 37 ---------------------------------- 2 files changed, 20 insertions(+), 40 deletions(-) delete mode 100644 pyrogram/client/style/utils.py diff --git a/pyrogram/client/style/html.py b/pyrogram/client/style/html.py index 1748fa58..9376f793 100644 --- a/pyrogram/client/style/html.py +++ b/pyrogram/client/style/html.py @@ -20,11 +20,11 @@ import html import re from collections import OrderedDict from html.parser import HTMLParser +from struct import unpack import pyrogram from pyrogram.api import types from pyrogram.errors import PeerIdInvalid -from . import utils class Parser(HTMLParser): @@ -111,11 +111,28 @@ class Parser(HTMLParser): class HTML: + # SMP = Supplementary Multilingual Plane: https://en.wikipedia.org/wiki/Plane_(Unicode)#Overview + SMP_RE = re.compile(r"[\U00010000-\U0010FFFF]") + def __init__(self, client: "pyrogram.BaseClient" = None): self.client = client + @staticmethod + def add_surrogates(text): + # Replace each SMP code point with a surrogate pair + return HTML.SMP_RE.sub( + lambda match: # Split SMP in two surrogates + "".join(chr(i) for i in unpack(" -# -# This file is part of Pyrogram. -# -# Pyrogram is free software: you can redistribute it and/or modify -# it under the terms of the GNU Lesser General Public License as published -# by the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# Pyrogram is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Lesser General Public License for more details. -# -# You should have received a copy of the GNU Lesser General Public License -# along with Pyrogram. If not, see . - -import re -from struct import unpack - -# SMP = Supplementary Multilingual Plane: https://en.wikipedia.org/wiki/Plane_(Unicode)#Overview -SMP_RE = re.compile(r"[\U00010000-\U0010FFFF]") - - -def add_surrogates(text): - # Replace each SMP code point with a surrogate pair - return SMP_RE.sub( - lambda match: # Split SMP in two surrogates - "".join(chr(i) for i in unpack(" Date: Tue, 25 Jun 2019 05:47:57 +0200 Subject: [PATCH 10/16] Revert "Delete style utils.py and move its content inside html.py The HTML parser is now the only one that makes use of those util methods" This reverts commit cd1e41b1 --- pyrogram/client/style/html.py | 23 +++------------------ pyrogram/client/style/utils.py | 37 ++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 20 deletions(-) create mode 100644 pyrogram/client/style/utils.py diff --git a/pyrogram/client/style/html.py b/pyrogram/client/style/html.py index 9376f793..1748fa58 100644 --- a/pyrogram/client/style/html.py +++ b/pyrogram/client/style/html.py @@ -20,11 +20,11 @@ import html import re from collections import OrderedDict from html.parser import HTMLParser -from struct import unpack import pyrogram from pyrogram.api import types from pyrogram.errors import PeerIdInvalid +from . import utils class Parser(HTMLParser): @@ -111,28 +111,11 @@ class Parser(HTMLParser): class HTML: - # SMP = Supplementary Multilingual Plane: https://en.wikipedia.org/wiki/Plane_(Unicode)#Overview - SMP_RE = re.compile(r"[\U00010000-\U0010FFFF]") - def __init__(self, client: "pyrogram.BaseClient" = None): self.client = client - @staticmethod - def add_surrogates(text): - # Replace each SMP code point with a surrogate pair - return HTML.SMP_RE.sub( - lambda match: # Split SMP in two surrogates - "".join(chr(i) for i in unpack(" +# +# This file is part of Pyrogram. +# +# Pyrogram is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published +# by the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Pyrogram is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with Pyrogram. If not, see . + +import re +from struct import unpack + +# SMP = Supplementary Multilingual Plane: https://en.wikipedia.org/wiki/Plane_(Unicode)#Overview +SMP_RE = re.compile(r"[\U00010000-\U0010FFFF]") + + +def add_surrogates(text): + # Replace each SMP code point with a surrogate pair + return SMP_RE.sub( + lambda match: # Split SMP in two surrogates + "".join(chr(i) for i in unpack(" Date: Tue, 25 Jun 2019 05:53:41 +0200 Subject: [PATCH 11/16] Make slicing text messages & captions work properly with entity offsets --- pyrogram/client/types/messages_and_media/message.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pyrogram/client/types/messages_and_media/message.py b/pyrogram/client/types/messages_and_media/message.py index 52e8f473..34080d7f 100644 --- a/pyrogram/client/types/messages_and_media/message.py +++ b/pyrogram/client/types/messages_and_media/message.py @@ -31,6 +31,7 @@ from ..object import Object from ..update import Update from ..user_and_chats.chat import Chat from ..user_and_chats.user import User +from ...style import utils class Str(str): @@ -58,6 +59,9 @@ class Str(str): def html(self): return self._client.html.unparse(self, self._entities) + def __getitem__(self, item): + return utils.remove_surrogates(utils.add_surrogates(self)[item]) + class Message(Object, Update): """A message. From 07bc7e39df8f9e66514bc78b3b6896e8a0d70d8d Mon Sep 17 00:00:00 2001 From: Dan <14043624+delivrance@users.noreply.github.com> Date: Tue, 25 Jun 2019 07:08:38 +0200 Subject: [PATCH 12/16] Allow entities to overlap, like: bold and italic --- pyrogram/client/style/html.py | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/pyrogram/client/style/html.py b/pyrogram/client/style/html.py index 1748fa58..f861412b 100644 --- a/pyrogram/client/style/html.py +++ b/pyrogram/client/style/html.py @@ -37,8 +37,7 @@ class Parser(HTMLParser): self.text = "" self.entities = [] - self.temp_entities = [] - self.tags = [] + self.tag_entities = {} def handle_starttag(self, tag, attrs): attrs = dict(attrs) @@ -81,30 +80,22 @@ class Parser(HTMLParser): else: return - self.tags.append(tag) - self.temp_entities.append(entity(offset=len(self.text), length=0, **extra)) + if tag not in self.tag_entities: + self.tag_entities[tag] = [] + + self.tag_entities[tag].append(entity(offset=len(self.text), length=0, **extra)) def handle_data(self, data): data = html.unescape(data) - for entity in self.temp_entities: - entity.length += len(data) + for entities in self.tag_entities.values(): + for entity in entities: + entity.length += len(data) self.text += data def handle_endtag(self, tag): - try: - start_tag = self.tags.pop() - except IndexError: - return - - if start_tag != tag: - line, offset = self.getpos() - offset += 1 - - raise ValueError("Expected end tag , but found at {}:{}".format(start_tag, tag, line, offset)) - - self.entities.append(self.temp_entities.pop()) + self.entities.append(self.tag_entities[tag].pop()) def error(self, message): pass From a086964e85851b25ae351cca73ac26c496799642 Mon Sep 17 00:00:00 2001 From: Dan <14043624+delivrance@users.noreply.github.com> Date: Tue, 25 Jun 2019 07:41:48 +0200 Subject: [PATCH 13/16] Make the HTML parser more sound --- pyrogram/client/style/html.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/pyrogram/client/style/html.py b/pyrogram/client/style/html.py index f861412b..579ed7ec 100644 --- a/pyrogram/client/style/html.py +++ b/pyrogram/client/style/html.py @@ -95,7 +95,16 @@ class Parser(HTMLParser): self.text += data def handle_endtag(self, tag): - self.entities.append(self.tag_entities[tag].pop()) + try: + self.entities.append(self.tag_entities[tag].pop()) + except (KeyError, IndexError): + line, offset = self.getpos() + offset += 1 + + raise ValueError("Unmatched closing tag at line {}:{}".format(tag, line, offset)) + else: + if not self.tag_entities[tag]: + self.tag_entities.pop(tag) def error(self, message): pass @@ -112,6 +121,14 @@ class HTML: parser.feed(text) parser.close() + if parser.tag_entities: + unclosed_tags = [] + + for tag, entities in parser.tag_entities.items(): + unclosed_tags.append("<{}> (x{})".format(tag, len(entities))) + + raise ValueError("Unclosed tags: {}".format(", ".join(unclosed_tags))) + # TODO: OrderedDict to be removed in Python 3.6 return OrderedDict([ ("message", utils.remove_surrogates(parser.text)), From 168fce09da2d5b26f24187970a425ae875a49527 Mon Sep 17 00:00:00 2001 From: Dan <14043624+delivrance@users.noreply.github.com> Date: Tue, 25 Jun 2019 10:24:19 +0200 Subject: [PATCH 14/16] Implement HTML.unparse and Markdown.unparse --- pyrogram/client/style/html.py | 44 +++++++++++++++++++ pyrogram/client/style/markdown.py | 72 +++++++++++++++++++++++++------ 2 files changed, 103 insertions(+), 13 deletions(-) diff --git a/pyrogram/client/style/html.py b/pyrogram/client/style/html.py index 579ed7ec..17a5daa6 100644 --- a/pyrogram/client/style/html.py +++ b/pyrogram/client/style/html.py @@ -134,3 +134,47 @@ class HTML: ("message", utils.remove_surrogates(parser.text)), ("entities", parser.entities) ]) + + @staticmethod + def unparse(text: str, entities: list): + text = utils.add_surrogates(text) + copy = text + + for entity in entities: + start = entity.offset + end = start + entity.length + + type = entity.type + + url = entity.url + user = entity.user + + sub = copy[start:end] + + if type == "bold": + style = "b" + elif type == "italic": + style = "i" + elif type == "underline": + style = "u" + elif type == "strike": + style = "s" + elif type == "code": + style = "code" + elif type == "pre": + style = "pre" + elif type == "blockquote": + style = "blockquote" + elif type == "text_link": + text = text[:start] + text[start:].replace(sub, '{}'.format(url, sub), 1) + continue + elif type == "text_mention": + text = text[:start] + text[start:].replace( + sub, '{}'.format(user.id, sub), 1) + continue + else: + continue + + text = text[:start] + text[start:].replace(sub, "<{0}>{1}".format(style, sub), 1) + + return utils.remove_surrogates(text) diff --git a/pyrogram/client/style/markdown.py b/pyrogram/client/style/markdown.py index 520008a8..93d8fc9a 100644 --- a/pyrogram/client/style/markdown.py +++ b/pyrogram/client/style/markdown.py @@ -20,17 +20,18 @@ import html import re import pyrogram +from . import utils from .html import HTML +BOLD_DELIM = "**" +ITALIC_DELIM = "__" +UNDERLINE_DELIM = "--" +STRIKE_DELIM = "~~" +CODE_DELIM = "`" +PRE_DELIM = "```" + class Markdown: - BOLD_DELIM = "**" - ITALIC_DELIM = "__" - UNDERLINE_DELIM = "--" - STRIKE_DELIM = "~~" - CODE_DELIM = "`" - PRE_DELIM = "```" - MARKDOWN_RE = re.compile(r"({d})".format( d="|".join( ["".join(i) for i in [ @@ -66,17 +67,17 @@ class Markdown: start, stop = match.span() delim = match.group(1) - if delim == Markdown.BOLD_DELIM: + if delim == BOLD_DELIM: tag = "b" - elif delim == Markdown.ITALIC_DELIM: + elif delim == ITALIC_DELIM: tag = "i" - elif delim == Markdown.UNDERLINE_DELIM: + elif delim == UNDERLINE_DELIM: tag = "u" - elif delim == Markdown.STRIKE_DELIM: + elif delim == STRIKE_DELIM: tag = "s" - elif delim == Markdown.CODE_DELIM: + elif delim == CODE_DELIM: tag = "code" - elif delim == Markdown.PRE_DELIM: + elif delim == PRE_DELIM: tag = "pre" else: continue @@ -109,3 +110,48 @@ class Markdown: offset += len(replace) - len(full) return self.html.parse(text) + + @staticmethod + def unparse(text: str, entities: list): + text = utils.add_surrogates(text) + copy = text + + for entity in entities: + start = entity.offset + end = start + entity.length + + type = entity.type + + url = entity.url + user = entity.user + + sub = copy[start:end] + + if type == "bold": + style = BOLD_DELIM + elif type == "italic": + style = ITALIC_DELIM + elif type == "underline": + style = UNDERLINE_DELIM + elif type == "strike": + style = STRIKE_DELIM + elif type == "code": + style = CODE_DELIM + elif type == "pre": + style = PRE_DELIM + # TODO: Blockquote for MD + # elif type == "blockquote": + # style = ... + elif type == "text_link": + text = text[:start] + text[start:].replace(sub, '[{1}]({0})'.format(url, sub), 1) + continue + elif type == "text_mention": + text = text[:start] + text[start:].replace( + sub, '[{1}](tg://user?id={0})'.format(user.id, sub), 1) + continue + else: + continue + + text = text[:start] + text[start:].replace(sub, "{0}{1}{0}".format(style, sub), 1) + + return utils.remove_surrogates(text) From 32ca805f6be0295f53ff9d4e20c817ad7183e949 Mon Sep 17 00:00:00 2001 From: Dan <14043624+delivrance@users.noreply.github.com> Date: Tue, 25 Jun 2019 10:25:21 +0200 Subject: [PATCH 15/16] Update message.py --- .../types/messages_and_media/message.py | 24 +++++++------------ 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/pyrogram/client/types/messages_and_media/message.py b/pyrogram/client/types/messages_and_media/message.py index 34080d7f..16726a47 100644 --- a/pyrogram/client/types/messages_and_media/message.py +++ b/pyrogram/client/types/messages_and_media/message.py @@ -31,33 +31,27 @@ from ..object import Object from ..update import Update from ..user_and_chats.chat import Chat from ..user_and_chats.user import User -from ...style import utils +from ...style import utils, Markdown, HTML class Str(str): def __init__(self, *args): super().__init__() - self._client = None - self._entities = None + self.entities = None - def init(self, client, entities): - self._client = client - self._entities = entities + def init(self, entities): + self.entities = entities return self - @property - def text(self): - return self - @property def markdown(self): - return self._client.markdown.unparse(self, self._entities) + return Markdown.unparse(self, self.entities) @property def html(self): - return self._client.html.unparse(self, self._entities) + return HTML.unparse(self, self.entities) def __getitem__(self, item): return utils.remove_surrogates(utils.add_surrogates(self)[item]) @@ -490,7 +484,7 @@ class Message(Object, Update): if isinstance(message, types.Message): entities = [MessageEntity._parse(client, entity, users) for entity in message.entities] - entities = list(filter(lambda x: x is not None, entities)) + entities = pyrogram.List(filter(lambda x: x is not None, entities)) forward_from = None forward_sender_name = None @@ -607,8 +601,8 @@ class Message(Object, Update): date=message.date, chat=Chat._parse(client, message, users, chats), from_user=User._parse(client, users.get(message.from_id, None)), - text=Str(message.message).init(client, entities) or None if media is None else None, - caption=Str(message.message).init(client, entities) or None if media is not None else None, + text=Str(message.message).init(entities) or None if media is None else None, + caption=Str(message.message).init(entities) or None if media is not None else None, entities=entities or None if media is None else None, caption_entities=entities or None if media is not None else None, author_signature=message.post_author, From 7490f6cfa3b8b401c00316bdd5a17389a1b6ee01 Mon Sep 17 00:00:00 2001 From: Dan <14043624+delivrance@users.noreply.github.com> Date: Tue, 25 Jun 2019 11:47:45 +0200 Subject: [PATCH 16/16] Update the HTML parser: make it easy for asyncio to deal with mentions We can't await coroutines inside HTMLParser overridden methods, such as handle_starttag, because they can't be async. This commit moves the resolve_peer call into the parse method of the HTML class, which can be defined async. --- pyrogram/client/style/html.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/pyrogram/client/style/html.py b/pyrogram/client/style/html.py index 17a5daa6..5617cb54 100644 --- a/pyrogram/client/style/html.py +++ b/pyrogram/client/style/html.py @@ -64,16 +64,8 @@ class Parser(HTMLParser): mention = Parser.MENTION_RE.match(url) if mention: - user_id = int(mention.group(1)) - - try: - user = self.client.resolve_peer(user_id) - except PeerIdInvalid: - entity = types.MessageEntityMentionName - extra["user_id"] = user_id - else: - entity = types.InputMessageEntityMentionName - extra["user_id"] = user + entity = types.InputMessageEntityMentionName + extra["user_id"] = int(mention.group(1)) else: entity = types.MessageEntityTextUrl extra["url"] = url @@ -129,10 +121,21 @@ class HTML: raise ValueError("Unclosed tags: {}".format(", ".join(unclosed_tags))) + entities = [] + + for entity in parser.entities: + if isinstance(entity, types.InputMessageEntityMentionName): + try: + entity.user_id = self.client.resolve_peer(entity.user_id) + except PeerIdInvalid: + continue + + entities.append(entity) + # TODO: OrderedDict to be removed in Python 3.6 return OrderedDict([ ("message", utils.remove_surrogates(parser.text)), - ("entities", parser.entities) + ("entities", entities) ]) @staticmethod