From 8102a7fe82323988f45aaaf32fdb75db1169dfd5 Mon Sep 17 00:00:00 2001
From: Dan <14043624+delivrance@users.noreply.github.com>
Date: Mon, 24 Jun 2019 01:46:20 +0200
Subject: [PATCH 01/16] Update FUNDING.yml
---
.github/FUNDING.yml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml
index f34f615a..3437aeae 100644
--- a/.github/FUNDING.yml
+++ b/.github/FUNDING.yml
@@ -1,2 +1,2 @@
-github: delivrance
+# github: delivrance
custom: https://docs.pyrogram.org/support-pyrogram
From e7c49c6a1b98148a792beecdafff79e9030a277e Mon Sep 17 00:00:00 2001
From: Dan <14043624+delivrance@users.noreply.github.com>
Date: Mon, 24 Jun 2019 10:07:28 +0200
Subject: [PATCH 02/16] Revamp HTML and Markdown parsers to allow multiple
nested entities
---
pyrogram/client/style/html.py | 197 ++++++++++++++----------------
pyrogram/client/style/markdown.py | 158 +++++++-----------------
2 files changed, 137 insertions(+), 218 deletions(-)
diff --git a/pyrogram/client/style/html.py b/pyrogram/client/style/html.py
index b42114a8..82921f4c 100644
--- a/pyrogram/client/style/html.py
+++ b/pyrogram/client/style/html.py
@@ -16,127 +16,110 @@
# You should have received a copy of the GNU Lesser General Public License
# along with Pyrogram. If not, see <http://www.gnu.org/licenses/>.
+import html
import re
from collections import OrderedDict
+from html.parser import HTMLParser
import pyrogram
-from pyrogram.api.types import (
- MessageEntityBold as Bold,
- MessageEntityItalic as Italic,
- MessageEntityCode as Code,
- MessageEntityTextUrl as Url,
- MessageEntityPre as Pre,
- MessageEntityUnderline as Underline,
- MessageEntityStrike as Strike,
- MessageEntityBlockquote as Blockquote,
- MessageEntityMentionName as MentionInvalid,
- InputMessageEntityMentionName as Mention,
-)
+from pyrogram.api import types
from pyrogram.errors import PeerIdInvalid
from . import utils
-class HTML:
- HTML_RE = re.compile(r"<(\w+)(?: href=([\"'])([^<]+)\2)?>([^>]+)\1>")
+class Parser(HTMLParser):
MENTION_RE = re.compile(r"tg://user\?id=(\d+)")
+ def __init__(self, client: "pyrogram.BaseClient"):
+ super().__init__()
+
+ self.client = client
+
+ self.text = ""
+ self.entities = []
+ self.temp_entities = []
+ self.tags = []
+
+ def handle_starttag(self, tag, attrs):
+ attrs = dict(attrs)
+ extra = {}
+
+ if tag in ["b", "strong"]:
+ entity = types.MessageEntityBold
+ elif tag in ["i", "em"]:
+ entity = types.MessageEntityItalic
+ elif tag == "u":
+ entity = types.MessageEntityUnderline
+ elif tag in ["s", "del", "strike"]:
+ entity = types.MessageEntityStrike
+ elif tag == "blockquote":
+ entity = types.MessageEntityBlockquote
+ elif tag == "code":
+ entity = types.MessageEntityCode
+ elif tag == "pre":
+ entity = types.MessageEntityPre
+ extra["language"] = ""
+ elif tag == "a":
+ url = attrs.get("href", "")
+
+ mention = Parser.MENTION_RE.match(url)
+
+ if mention:
+ user_id = int(mention.group(1))
+
+ try:
+ user = self.client.resolve_peer(user_id)
+ except PeerIdInvalid:
+ entity = types.MessageEntityMentionName
+ extra["user_id"] = user_id
+ else:
+ entity = types.InputMessageEntityMentionName
+ extra["user_id"] = user
+ else:
+ entity = types.MessageEntityTextUrl
+ extra["url"] = url
+ else:
+ return
+
+ self.tags.append(tag)
+ self.temp_entities.append(entity(offset=len(self.text), length=0, **extra))
+
+ def handle_data(self, data):
+ data = html.unescape(data)
+
+ for entity in self.temp_entities:
+ entity.length += len(data)
+
+ self.text += data
+
+ def handle_endtag(self, tag):
+ start_tag = self.tags.pop()
+
+ if start_tag != tag:
+ line, offset = self.getpos()
+ offset += 1
+
+ raise ValueError("Expected end tag {}>, but found {}> at {}:{}".format(start_tag, tag, line, offset))
+
+ self.entities.append(self.temp_entities.pop())
+
+ def error(self, message):
+ pass
+
+
+class HTML:
def __init__(self, client: "pyrogram.BaseClient" = None):
self.client = client
- def parse(self, message: str):
- entities = []
- message = utils.add_surrogates(str(message or ""))
- offset = 0
+ def parse(self, text: str):
+ text = utils.add_surrogates(str(text or "").strip())
- for match in self.HTML_RE.finditer(message):
- start = match.start() - offset
- style, url, body = match.group(1, 3, 4)
+ parser = Parser(self.client)
+ parser.feed(text)
+ print(parser.entities)
- if url:
- mention = self.MENTION_RE.match(url)
-
- if mention:
- user_id = int(mention.group(1))
-
- try:
- input_user = self.client.resolve_peer(user_id)
- except PeerIdInvalid:
- input_user = None
-
- entity = (
- Mention(offset=start, length=len(body), user_id=input_user)
- if input_user else MentionInvalid(offset=start, length=len(body), user_id=user_id)
- )
- else:
- entity = Url(offset=start, length=len(body), url=url)
- else:
- if style == "b" or style == "strong":
- entity = Bold(offset=start, length=len(body))
- elif style == "i" or style == "em":
- entity = Italic(offset=start, length=len(body))
- elif style == "code":
- entity = Code(offset=start, length=len(body))
- elif style == "pre":
- entity = Pre(offset=start, length=len(body), language="")
- elif style == "u":
- entity = Underline(offset=start, length=len(body))
- elif style in ["strike", "s", "del"]:
- entity = Strike(offset=start, length=len(body))
- elif style == "blockquote":
- entity = Blockquote(offset=start, length=len(body))
- else:
- continue
-
- entities.append(entity)
- message = message.replace(match.group(), body)
- offset += len(style) * 2 + 5 + (len(url) + 8 if url else 0)
-
- # TODO: OrderedDict to be removed in Python3.6
+ # TODO: OrderedDict to be removed in Python 3.6
return OrderedDict([
- ("message", utils.remove_surrogates(message)),
- ("entities", entities)
+ ("message", utils.remove_surrogates(parser.text)),
+ ("entities", parser.entities)
])
-
- def unparse(self, message: str, entities: list):
- message = utils.add_surrogates(message).strip()
- offset = 0
-
- for entity in entities:
- start = entity.offset + offset
- type = entity.type
- url = entity.url
- user = entity.user
- sub = message[start: start + entity.length]
-
- if type == "bold":
- style = "b"
- elif type == "italic":
- style = "i"
- elif type == "code":
- style = "code"
- elif type == "pre":
- style = "pre"
- elif type == "underline":
- style = "u"
- elif type == "strike":
- style = "s"
- elif type == "blockquote":
- style = "blockquote"
- elif type == "text_link":
- offset += 15 + len(url)
- message = message[:start] + message[start:].replace(
- sub, "{}".format(url, sub), 1)
- continue
- elif type == "text_mention":
- offset += 28 + len(str(user.id))
- message = message[:start] + message[start:].replace(
- sub, "{}".format(user.id, sub), 1)
- continue
- else:
- continue
-
- offset += len(style) * 2 + 5
- message = message[:start] + message[start:].replace(
- sub, "<{0}>{1}{0}>".format(style, sub), 1)
-
- return utils.remove_surrogates(message)
diff --git a/pyrogram/client/style/markdown.py b/pyrogram/client/style/markdown.py
index 9dded1f3..26effe5c 100644
--- a/pyrogram/client/style/markdown.py
+++ b/pyrogram/client/style/markdown.py
@@ -17,22 +17,9 @@
# along with Pyrogram. If not, see <http://www.gnu.org/licenses/>.
import re
-from collections import OrderedDict
import pyrogram
-from pyrogram.api.types import (
- MessageEntityBold as Bold,
- MessageEntityItalic as Italic,
- MessageEntityCode as Code,
- MessageEntityTextUrl as Url,
- MessageEntityPre as Pre,
- MessageEntityUnderline as Underline,
- MessageEntityStrike as Strike,
- MessageEntityMentionName as MentionInvalid,
- InputMessageEntityMentionName as Mention
-)
-from pyrogram.errors import PeerIdInvalid
-from . import utils
+from .html import HTML
class Markdown:
@@ -43,10 +30,10 @@ class Markdown:
CODE_DELIMITER = "`"
PRE_DELIMITER = "```"
- MARKDOWN_RE = re.compile(r"({d})([\w\W]*?)\1|\[([^[]+?)\]\(([^(]+?)\)".format(
+ MARKDOWN_RE = re.compile(r"({d})".format(
d="|".join(
["".join(i) for i in [
- ["\{}".format(j) for j in i]
+ [r"\{}".format(j) for j in i]
for i in [
PRE_DELIMITER,
CODE_DELIMITER,
@@ -56,107 +43,56 @@ class Markdown:
BOLD_DELIMITER
]
]]
- )
- ))
- MENTION_RE = re.compile(r"tg://user\?id=(\d+)")
+ )))
- def __init__(self, client: "pyrogram.BaseClient" = None):
- self.client = client
+ URL_RE = re.compile(r"\[([^[]+)]\(([^(]+)\)")
- def parse(self, message: str):
- message = utils.add_surrogates(str(message or "")).strip()
- entities = []
+ def __init__(self, client: "pyrogram.BaseClient"):
+ self.html = HTML(client)
+
+ def parse(self, text: str):
offset = 0
+ delimiters = set()
- for match in self.MARKDOWN_RE.finditer(message):
- start = match.start() - offset
- style, body, text, url = match.groups()
+ for i, match in enumerate(re.finditer(Markdown.MARKDOWN_RE, text)):
+ start, stop = match.span()
+ delimiter = match.group(1)
- if url:
- mention = self.MENTION_RE.match(url)
-
- if mention:
- user_id = int(mention.group(1))
-
- try:
- input_user = self.client.resolve_peer(user_id)
- except PeerIdInvalid:
- input_user = None
-
- entity = (
- Mention(offset=start, length=len(text), user_id=input_user)
- if input_user else MentionInvalid(offset=start, length=len(text), user_id=user_id)
- )
- else:
- entity = Url(offset=start, length=len(text), url=url)
-
- body = text
- offset += len(url) + 4
- else:
- if style == self.BOLD_DELIMITER:
- entity = Bold(offset=start, length=len(body))
- elif style == self.ITALIC_DELIMITER:
- entity = Italic(offset=start, length=len(body))
- elif style == self.UNDERLINE_DELIMITER:
- entity = Underline(offset=start, length=len(body))
- elif style == self.STRIKE_DELIMITER:
- entity = Strike(offset=start, length=len(body))
- elif style == self.CODE_DELIMITER:
- entity = Code(offset=start, length=len(body))
- elif style == self.PRE_DELIMITER:
- entity = Pre(offset=start, length=len(body), language="")
- else:
- continue
-
- offset += len(style) * 2
-
- entities.append(entity)
- message = message.replace(match.group(), body)
-
- # TODO: OrderedDict to be removed in Python3.6
- return OrderedDict([
- ("message", utils.remove_surrogates(message)),
- ("entities", entities)
- ])
-
- def unparse(self, message: str, entities: list):
- message = utils.add_surrogates(message).strip()
- offset = 0
-
- for entity in entities:
- start = entity.offset + offset
- type = entity.type
- url = entity.url
- user = entity.user
- sub = message[start: start + entity.length]
-
- if type == "bold":
- style = self.BOLD_DELIMITER
- elif type == "italic":
- style = self.ITALIC_DELIMITER
- elif type == "underline":
- style = self.UNDERLINE_DELIMITER
- elif type == "strike":
- style = self.STRIKE_DELIMITER
- elif type == "code":
- style = self.CODE_DELIMITER
- elif type == "pre":
- style = self.PRE_DELIMITER
- elif type == "text_link":
- offset += 4 + len(url)
- message = message[:start] + message[start:].replace(
- sub, "[{}]({})".format(sub, url), 1)
- continue
- elif type == "text_mention":
- offset += 17 + len(str(user.id))
- message = message[:start] + message[start:].replace(
- sub, "[{}](tg://user?id={})".format(sub, user.id), 1)
- continue
+ if delimiter == Markdown.BOLD_DELIMITER:
+ tag = "b"
+ elif delimiter == Markdown.ITALIC_DELIMITER:
+ tag = "i"
+ elif delimiter == Markdown.UNDERLINE_DELIMITER:
+ tag = "u"
+ elif delimiter == Markdown.STRIKE_DELIMITER:
+ tag = "s"
+ elif delimiter == Markdown.CODE_DELIMITER:
+ tag = "code"
+ elif delimiter == Markdown.PRE_DELIMITER:
+ tag = "pre"
else:
continue
- offset += len(style) * 2
- message = message[:start] + message[start:].replace(
- sub, "{0}{1}{0}".format(style, sub), 1)
+ if delimiter not in delimiters:
+ delimiters.add(delimiter)
+ tag = "<{}>".format(tag)
+ else:
+ delimiters.remove(delimiter)
+ tag = "{}>".format(tag)
- return utils.remove_surrogates(message)
+ text = text[:start + offset] + tag + text[stop + offset:]
+
+ offset += len(tag) - len(delimiter)
+
+ offset = 0
+
+ for match in re.finditer(Markdown.URL_RE, text):
+ start, stop = match.span()
+ full = match.group(0)
+ body, url = match.groups()
+ replace = '{}'.format(url, body)
+
+ text = text[:start + offset] + replace + text[stop + offset:]
+ offset += len(replace) - len(full)
+
+ return self.html.parse(text)
From d6900cde9f5adca32fb80624ceb88331af1c0b85 Mon Sep 17 00:00:00 2001
From: Dan <14043624+delivrance@users.noreply.github.com>
Date: Mon, 24 Jun 2019 10:11:21 +0200
Subject: [PATCH 03/16] Remove debug print()
---
pyrogram/client/style/html.py | 1 -
1 file changed, 1 deletion(-)
diff --git a/pyrogram/client/style/html.py b/pyrogram/client/style/html.py
index 82921f4c..81d7ce9b 100644
--- a/pyrogram/client/style/html.py
+++ b/pyrogram/client/style/html.py
@@ -116,7 +116,6 @@ class HTML:
parser = Parser(self.client)
parser.feed(text)
- print(parser.entities)
# TODO: OrderedDict to be removed in Python 3.6
return OrderedDict([
From f12cee5d94f42c4b5d7e669e33e1aa85de9ea0ce Mon Sep 17 00:00:00 2001
From: Dan <14043624+delivrance@users.noreply.github.com>
Date: Mon, 24 Jun 2019 10:54:58 +0200
Subject: [PATCH 04/16] Automatically escape URL bodies when using markdown
---
pyrogram/client/style/markdown.py | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/pyrogram/client/style/markdown.py b/pyrogram/client/style/markdown.py
index 26effe5c..001fc60f 100644
--- a/pyrogram/client/style/markdown.py
+++ b/pyrogram/client/style/markdown.py
@@ -16,6 +16,7 @@
# You should have received a copy of the GNU Lesser General Public License
# along with Pyrogram. If not, see <http://www.gnu.org/licenses/>.
+import html
import re
import pyrogram
@@ -89,10 +90,14 @@ class Markdown:
for match in re.finditer(Markdown.URL_RE, text):
start, stop = match.span()
full = match.group(0)
+
body, url = match.groups()
+ body = html.escape(body)
+
replace = '{}'.format(url, body)
text = text[:start + offset] + replace + text[stop + offset:]
+
offset += len(replace) - len(full)
return self.html.parse(text)
From 8e0182633f8036d23afe6de98e3f391f80a2074c Mon Sep 17 00:00:00 2001
From: Dan <14043624+delivrance@users.noreply.github.com>
Date: Mon, 24 Jun 2019 13:35:58 +0200
Subject: [PATCH 05/16] Ignore any other style when inside a fixed-width style
---
pyrogram/client/style/markdown.py | 66 +++++++++++++++++--------------
1 file changed, 37 insertions(+), 29 deletions(-)
diff --git a/pyrogram/client/style/markdown.py b/pyrogram/client/style/markdown.py
index 001fc60f..520008a8 100644
--- a/pyrogram/client/style/markdown.py
+++ b/pyrogram/client/style/markdown.py
@@ -24,66 +24,76 @@ from .html import HTML
class Markdown:
- BOLD_DELIMITER = "**"
- ITALIC_DELIMITER = "__"
- UNDERLINE_DELIMITER = "--"
- STRIKE_DELIMITER = "~~"
- CODE_DELIMITER = "`"
- PRE_DELIMITER = "```"
+ BOLD_DELIM = "**"
+ ITALIC_DELIM = "__"
+ UNDERLINE_DELIM = "--"
+ STRIKE_DELIM = "~~"
+ CODE_DELIM = "`"
+ PRE_DELIM = "```"
MARKDOWN_RE = re.compile(r"({d})".format(
d="|".join(
["".join(i) for i in [
[r"\{}".format(j) for j in i]
for i in [
- PRE_DELIMITER,
- CODE_DELIMITER,
- STRIKE_DELIMITER,
- UNDERLINE_DELIMITER,
- ITALIC_DELIMITER,
- BOLD_DELIMITER
+ PRE_DELIM,
+ CODE_DELIM,
+ STRIKE_DELIM,
+ UNDERLINE_DELIM,
+ ITALIC_DELIM,
+ BOLD_DELIM
]
]]
)))
URL_RE = re.compile(r"\[([^[]+)]\(([^(]+)\)")
+ OPENING_TAG = "<{}>"
+ CLOSING_TAG = "{}>"
+ URL_MARKUP = '{}'
+ FIXED_WIDTH_DELIMS = [CODE_DELIM, PRE_DELIM]
+
def __init__(self, client: "pyrogram.BaseClient"):
self.html = HTML(client)
def parse(self, text: str):
+ text = html.escape(text)
+
offset = 0
- delimiters = set()
+ delims = set()
for i, match in enumerate(re.finditer(Markdown.MARKDOWN_RE, text)):
start, stop = match.span()
- delimiter = match.group(1)
+ delim = match.group(1)
- if delimiter == Markdown.BOLD_DELIMITER:
+ if delim == Markdown.BOLD_DELIM:
tag = "b"
- elif delimiter == Markdown.ITALIC_DELIMITER:
+ elif delim == Markdown.ITALIC_DELIM:
tag = "i"
- elif delimiter == Markdown.UNDERLINE_DELIMITER:
+ elif delim == Markdown.UNDERLINE_DELIM:
tag = "u"
- elif delimiter == Markdown.STRIKE_DELIMITER:
+ elif delim == Markdown.STRIKE_DELIM:
tag = "s"
- elif delimiter == Markdown.CODE_DELIMITER:
+ elif delim == Markdown.CODE_DELIM:
tag = "code"
- elif delimiter == Markdown.PRE_DELIMITER:
+ elif delim == Markdown.PRE_DELIM:
tag = "pre"
else:
continue
- if delimiter not in delimiters:
- delimiters.add(delimiter)
- tag = "<{}>".format(tag)
+ if delim not in Markdown.FIXED_WIDTH_DELIMS and any(x in delims for x in Markdown.FIXED_WIDTH_DELIMS):
+ continue
+
+ if delim not in delims:
+ delims.add(delim)
+ tag = Markdown.OPENING_TAG.format(tag)
else:
- delimiters.remove(delimiter)
- tag = "{}>".format(tag)
+ delims.remove(delim)
+ tag = Markdown.CLOSING_TAG.format(tag)
text = text[:start + offset] + tag + text[stop + offset:]
- offset += len(tag) - len(delimiter)
+ offset += len(tag) - len(delim)
offset = 0
@@ -92,9 +102,7 @@ class Markdown:
full = match.group(0)
body, url = match.groups()
- body = html.escape(body)
-
- replace = '{}'.format(url, body)
+ replace = Markdown.URL_MARKUP.format(url, body)
text = text[:start + offset] + replace + text[stop + offset:]
From cac0bcabf915ff3e7a06a7d7be2a21da257050e5 Mon Sep 17 00:00:00 2001
From: Dan <14043624+delivrance@users.noreply.github.com>
Date: Mon, 24 Jun 2019 13:36:27 +0200
Subject: [PATCH 06/16] Fix HTML parsing breaking with no tags
---
pyrogram/client/style/html.py | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/pyrogram/client/style/html.py b/pyrogram/client/style/html.py
index 81d7ce9b..df7c64ff 100644
--- a/pyrogram/client/style/html.py
+++ b/pyrogram/client/style/html.py
@@ -93,7 +93,10 @@ class Parser(HTMLParser):
self.text += data
def handle_endtag(self, tag):
- start_tag = self.tags.pop()
+ try:
+ start_tag = self.tags.pop()
+ except IndexError:
+ return
if start_tag != tag:
line, offset = self.getpos()
@@ -113,6 +116,7 @@ class HTML:
def parse(self, text: str):
text = utils.add_surrogates(str(text or "").strip())
+ text = "
{}
".format(text)
parser = Parser(self.client)
parser.feed(text)
From a27dc575e413968f8814a7fb365bf7f2ada9aaa6 Mon Sep 17 00:00:00 2001
From: Dan <14043624+delivrance@users.noreply.github.com>
Date: Mon, 24 Jun 2019 14:17:46 +0200
Subject: [PATCH 07/16] Actually fix the HTML Parser feeding by calling
.close() when done
---
pyrogram/client/style/html.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pyrogram/client/style/html.py b/pyrogram/client/style/html.py
index df7c64ff..1748fa58 100644
--- a/pyrogram/client/style/html.py
+++ b/pyrogram/client/style/html.py
@@ -116,10 +116,10 @@ class HTML:
def parse(self, text: str):
text = utils.add_surrogates(str(text or "").strip())
- text = "{}
".format(text)
parser = Parser(self.client)
parser.feed(text)
+ parser.close()
# TODO: OrderedDict to be removed in Python 3.6
return OrderedDict([
From e7457de947524a11c9690f98bddd7b2295d08e0d Mon Sep 17 00:00:00 2001
From: Dan <14043624+delivrance@users.noreply.github.com>
Date: Mon, 24 Jun 2019 14:25:09 +0200
Subject: [PATCH 08/16] Add MSGID_DECREASE_RETRY 5xx-class error
---
compiler/error/source/500_INTERNAL_SERVER_ERROR.tsv | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/compiler/error/source/500_INTERNAL_SERVER_ERROR.tsv b/compiler/error/source/500_INTERNAL_SERVER_ERROR.tsv
index 446fe908..4bbea8ea 100644
--- a/compiler/error/source/500_INTERNAL_SERVER_ERROR.tsv
+++ b/compiler/error/source/500_INTERNAL_SERVER_ERROR.tsv
@@ -9,4 +9,5 @@ RANDOM_ID_DUPLICATE Telegram is having internal problems. Please try again later
WORKER_BUSY_TOO_LONG_RETRY Telegram is having internal problems. Please try again later
INTERDC_X_CALL_ERROR Telegram is having internal problems at DC{x}. Please try again later
INTERDC_X_CALL_RICH_ERROR Telegram is having internal problems at DC{x}. Please try again later
-FOLDER_DEAC_AUTOFIX_ALL Telegram is having internal problems. Please try again later
\ No newline at end of file
+FOLDER_DEAC_AUTOFIX_ALL Telegram is having internal problems. Please try again later
+MSGID_DECREASE_RETRY Telegram is having internal problems. Please try again later
\ No newline at end of file
From cd1e41b130297d3517c4e3cf0d14f2f81b2801a7 Mon Sep 17 00:00:00 2001
From: Dan <14043624+delivrance@users.noreply.github.com>
Date: Mon, 24 Jun 2019 14:33:17 +0200
Subject: [PATCH 09/16] Delete style utils.py and move its content inside
html.py The HTML parser is now the only one that makes use of those util
methods
---
pyrogram/client/style/html.py | 23 ++++++++++++++++++---
pyrogram/client/style/utils.py | 37 ----------------------------------
2 files changed, 20 insertions(+), 40 deletions(-)
delete mode 100644 pyrogram/client/style/utils.py
diff --git a/pyrogram/client/style/html.py b/pyrogram/client/style/html.py
index 1748fa58..9376f793 100644
--- a/pyrogram/client/style/html.py
+++ b/pyrogram/client/style/html.py
@@ -20,11 +20,11 @@ import html
import re
from collections import OrderedDict
from html.parser import HTMLParser
+from struct import unpack
import pyrogram
from pyrogram.api import types
from pyrogram.errors import PeerIdInvalid
-from . import utils
class Parser(HTMLParser):
@@ -111,11 +111,28 @@ class Parser(HTMLParser):
class HTML:
+ # SMP = Supplementary Multilingual Plane: https://en.wikipedia.org/wiki/Plane_(Unicode)#Overview
+ SMP_RE = re.compile(r"[\U00010000-\U0010FFFF]")
+
def __init__(self, client: "pyrogram.BaseClient" = None):
self.client = client
+ @staticmethod
+ def add_surrogates(text):
+ # Replace each SMP code point with a surrogate pair
+ return HTML.SMP_RE.sub(
+ lambda match: # Split SMP in two surrogates
+ "".join(chr(i) for i in unpack("
-#
-# This file is part of Pyrogram.
-#
-# Pyrogram is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Lesser General Public License as published
-# by the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# Pyrogram is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Lesser General Public License for more details.
-#
-# You should have received a copy of the GNU Lesser General Public License
-# along with Pyrogram. If not, see <http://www.gnu.org/licenses/>.
-
-import re
-from struct import unpack
-
-# SMP = Supplementary Multilingual Plane: https://en.wikipedia.org/wiki/Plane_(Unicode)#Overview
-SMP_RE = re.compile(r"[\U00010000-\U0010FFFF]")
-
-
-def add_surrogates(text):
- # Replace each SMP code point with a surrogate pair
- return SMP_RE.sub(
- lambda match: # Split SMP in two surrogates
- "".join(chr(i) for i in unpack("
Date: Tue, 25 Jun 2019 05:47:57 +0200
Subject: [PATCH 10/16] Revert "Delete style utils.py and move its content
inside html.py The HTML parser is now the only one that makes use of those
util methods"
This reverts commit cd1e41b1
---
pyrogram/client/style/html.py | 23 +++------------------
pyrogram/client/style/utils.py | 37 ++++++++++++++++++++++++++++++++++
2 files changed, 40 insertions(+), 20 deletions(-)
create mode 100644 pyrogram/client/style/utils.py
diff --git a/pyrogram/client/style/html.py b/pyrogram/client/style/html.py
index 9376f793..1748fa58 100644
--- a/pyrogram/client/style/html.py
+++ b/pyrogram/client/style/html.py
@@ -20,11 +20,11 @@ import html
import re
from collections import OrderedDict
from html.parser import HTMLParser
-from struct import unpack
import pyrogram
from pyrogram.api import types
from pyrogram.errors import PeerIdInvalid
+from . import utils
class Parser(HTMLParser):
@@ -111,28 +111,11 @@ class Parser(HTMLParser):
class HTML:
- # SMP = Supplementary Multilingual Plane: https://en.wikipedia.org/wiki/Plane_(Unicode)#Overview
- SMP_RE = re.compile(r"[\U00010000-\U0010FFFF]")
-
def __init__(self, client: "pyrogram.BaseClient" = None):
self.client = client
- @staticmethod
- def add_surrogates(text):
- # Replace each SMP code point with a surrogate pair
- return HTML.SMP_RE.sub(
- lambda match: # Split SMP in two surrogates
- "".join(chr(i) for i in unpack("
+#
+# This file is part of Pyrogram.
+#
+# Pyrogram is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published
+# by the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Pyrogram is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with Pyrogram. If not, see <http://www.gnu.org/licenses/>.
+
+import re
+from struct import unpack
+
+# SMP = Supplementary Multilingual Plane: https://en.wikipedia.org/wiki/Plane_(Unicode)#Overview
+SMP_RE = re.compile(r"[\U00010000-\U0010FFFF]")
+
+
+def add_surrogates(text):
+ # Replace each SMP code point with a surrogate pair
+ return SMP_RE.sub(
+ lambda match: # Split SMP in two surrogates
+ "".join(chr(i) for i in unpack("
Date: Tue, 25 Jun 2019 05:53:41 +0200
Subject: [PATCH 11/16] Make slicing text messages & captions work properly
with entity offsets
---
pyrogram/client/types/messages_and_media/message.py | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/pyrogram/client/types/messages_and_media/message.py b/pyrogram/client/types/messages_and_media/message.py
index 52e8f473..34080d7f 100644
--- a/pyrogram/client/types/messages_and_media/message.py
+++ b/pyrogram/client/types/messages_and_media/message.py
@@ -31,6 +31,7 @@ from ..object import Object
from ..update import Update
from ..user_and_chats.chat import Chat
from ..user_and_chats.user import User
+from ...style import utils
class Str(str):
@@ -58,6 +59,9 @@ class Str(str):
def html(self):
return self._client.html.unparse(self, self._entities)
+ def __getitem__(self, item):
+ return utils.remove_surrogates(utils.add_surrogates(self)[item])
+
class Message(Object, Update):
"""A message.
From 07bc7e39df8f9e66514bc78b3b6896e8a0d70d8d Mon Sep 17 00:00:00 2001
From: Dan <14043624+delivrance@users.noreply.github.com>
Date: Tue, 25 Jun 2019 07:08:38 +0200
Subject: [PATCH 12/16] Allow entities to overlap, like: bold and
italic
---
pyrogram/client/style/html.py | 27 +++++++++------------------
1 file changed, 9 insertions(+), 18 deletions(-)
diff --git a/pyrogram/client/style/html.py b/pyrogram/client/style/html.py
index 1748fa58..f861412b 100644
--- a/pyrogram/client/style/html.py
+++ b/pyrogram/client/style/html.py
@@ -37,8 +37,7 @@ class Parser(HTMLParser):
self.text = ""
self.entities = []
- self.temp_entities = []
- self.tags = []
+ self.tag_entities = {}
def handle_starttag(self, tag, attrs):
attrs = dict(attrs)
@@ -81,30 +80,22 @@ class Parser(HTMLParser):
else:
return
- self.tags.append(tag)
- self.temp_entities.append(entity(offset=len(self.text), length=0, **extra))
+ if tag not in self.tag_entities:
+ self.tag_entities[tag] = []
+
+ self.tag_entities[tag].append(entity(offset=len(self.text), length=0, **extra))
def handle_data(self, data):
data = html.unescape(data)
- for entity in self.temp_entities:
- entity.length += len(data)
+ for entities in self.tag_entities.values():
+ for entity in entities:
+ entity.length += len(data)
self.text += data
def handle_endtag(self, tag):
- try:
- start_tag = self.tags.pop()
- except IndexError:
- return
-
- if start_tag != tag:
- line, offset = self.getpos()
- offset += 1
-
- raise ValueError("Expected end tag {}>, but found {}> at {}:{}".format(start_tag, tag, line, offset))
-
- self.entities.append(self.temp_entities.pop())
+ self.entities.append(self.tag_entities[tag].pop())
def error(self, message):
pass
From a086964e85851b25ae351cca73ac26c496799642 Mon Sep 17 00:00:00 2001
From: Dan <14043624+delivrance@users.noreply.github.com>
Date: Tue, 25 Jun 2019 07:41:48 +0200
Subject: [PATCH 13/16] Make the HTML parser more sound
---
pyrogram/client/style/html.py | 19 ++++++++++++++++++-
1 file changed, 18 insertions(+), 1 deletion(-)
diff --git a/pyrogram/client/style/html.py b/pyrogram/client/style/html.py
index f861412b..579ed7ec 100644
--- a/pyrogram/client/style/html.py
+++ b/pyrogram/client/style/html.py
@@ -95,7 +95,16 @@ class Parser(HTMLParser):
self.text += data
def handle_endtag(self, tag):
- self.entities.append(self.tag_entities[tag].pop())
+ try:
+ self.entities.append(self.tag_entities[tag].pop())
+ except (KeyError, IndexError):
+ line, offset = self.getpos()
+ offset += 1
+
+ raise ValueError("Unmatched closing tag {}> at line {}:{}".format(tag, line, offset))
+ else:
+ if not self.tag_entities[tag]:
+ self.tag_entities.pop(tag)
def error(self, message):
pass
@@ -112,6 +121,14 @@ class HTML:
parser.feed(text)
parser.close()
+ if parser.tag_entities:
+ unclosed_tags = []
+
+ for tag, entities in parser.tag_entities.items():
+ unclosed_tags.append("<{}> (x{})".format(tag, len(entities)))
+
+ raise ValueError("Unclosed tags: {}".format(", ".join(unclosed_tags)))
+
# TODO: OrderedDict to be removed in Python 3.6
return OrderedDict([
("message", utils.remove_surrogates(parser.text)),
From 168fce09da2d5b26f24187970a425ae875a49527 Mon Sep 17 00:00:00 2001
From: Dan <14043624+delivrance@users.noreply.github.com>
Date: Tue, 25 Jun 2019 10:24:19 +0200
Subject: [PATCH 14/16] Implement HTML.unparse and Markdown.unparse
---
pyrogram/client/style/html.py | 44 +++++++++++++++++++
pyrogram/client/style/markdown.py | 72 +++++++++++++++++++++++++------
2 files changed, 103 insertions(+), 13 deletions(-)
diff --git a/pyrogram/client/style/html.py b/pyrogram/client/style/html.py
index 579ed7ec..17a5daa6 100644
--- a/pyrogram/client/style/html.py
+++ b/pyrogram/client/style/html.py
@@ -134,3 +134,47 @@ class HTML:
("message", utils.remove_surrogates(parser.text)),
("entities", parser.entities)
])
+
+ @staticmethod
+ def unparse(text: str, entities: list):
+ text = utils.add_surrogates(text)
+ copy = text
+
+ for entity in entities:
+ start = entity.offset
+ end = start + entity.length
+
+ type = entity.type
+
+ url = entity.url
+ user = entity.user
+
+ sub = copy[start:end]
+
+ if type == "bold":
+ style = "b"
+ elif type == "italic":
+ style = "i"
+ elif type == "underline":
+ style = "u"
+ elif type == "strike":
+ style = "s"
+ elif type == "code":
+ style = "code"
+ elif type == "pre":
+ style = "pre"
+ elif type == "blockquote":
+ style = "blockquote"
+ elif type == "text_link":
+ text = text[:start] + text[start:].replace(sub, '{}'.format(url, sub), 1)
+ continue
+ elif type == "text_mention":
+ text = text[:start] + text[start:].replace(
+ sub, '{}'.format(user.id, sub), 1)
+ continue
+ else:
+ continue
+
+ text = text[:start] + text[start:].replace(sub, "<{0}>{1}{0}>".format(style, sub), 1)
+
+ return utils.remove_surrogates(text)
diff --git a/pyrogram/client/style/markdown.py b/pyrogram/client/style/markdown.py
index 520008a8..93d8fc9a 100644
--- a/pyrogram/client/style/markdown.py
+++ b/pyrogram/client/style/markdown.py
@@ -20,17 +20,18 @@ import html
import re
import pyrogram
+from . import utils
from .html import HTML
+BOLD_DELIM = "**"
+ITALIC_DELIM = "__"
+UNDERLINE_DELIM = "--"
+STRIKE_DELIM = "~~"
+CODE_DELIM = "`"
+PRE_DELIM = "```"
+
class Markdown:
- BOLD_DELIM = "**"
- ITALIC_DELIM = "__"
- UNDERLINE_DELIM = "--"
- STRIKE_DELIM = "~~"
- CODE_DELIM = "`"
- PRE_DELIM = "```"
-
MARKDOWN_RE = re.compile(r"({d})".format(
d="|".join(
["".join(i) for i in [
@@ -66,17 +67,17 @@ class Markdown:
start, stop = match.span()
delim = match.group(1)
- if delim == Markdown.BOLD_DELIM:
+ if delim == BOLD_DELIM:
tag = "b"
- elif delim == Markdown.ITALIC_DELIM:
+ elif delim == ITALIC_DELIM:
tag = "i"
- elif delim == Markdown.UNDERLINE_DELIM:
+ elif delim == UNDERLINE_DELIM:
tag = "u"
- elif delim == Markdown.STRIKE_DELIM:
+ elif delim == STRIKE_DELIM:
tag = "s"
- elif delim == Markdown.CODE_DELIM:
+ elif delim == CODE_DELIM:
tag = "code"
- elif delim == Markdown.PRE_DELIM:
+ elif delim == PRE_DELIM:
tag = "pre"
else:
continue
@@ -109,3 +110,48 @@ class Markdown:
offset += len(replace) - len(full)
return self.html.parse(text)
+
+    @staticmethod
+    def unparse(text: str, entities: list):
+        """Rebuild Markdown source from plain ``text`` and its ``entities``.
+
+        Delimiters are inserted at each entity's own offset/length instead of
+        searching for the entity's text, so a substring that occurs more than
+        once is never decorated at the wrong place. Entities of an unsupported
+        type (or with no length) are skipped; ``entities`` may be None.
+        """
+        delims = {
+            "bold": BOLD_DELIM, "italic": ITALIC_DELIM, "underline": UNDERLINE_DELIM,
+            "strike": STRIKE_DELIM, "code": CODE_DELIM, "pre": PRE_DELIM
+        }  # TODO: Blockquote for MD
+        # Entity offsets index into the surrogate-expanded text.
+        text = utils.add_surrogates(text)
+        marks = []
+
+        for index, entity in enumerate(entities or []):
+            start = entity.offset
+            end = start + entity.length
+
+            if end <= start:
+                continue
+            elif entity.type in delims:
+                opening = closing = delims[entity.type]
+            elif entity.type == "text_link":
+                opening, closing = "[", "]({})".format(entity.url)
+            elif entity.type == "text_mention":
+                opening, closing = "[", "](tg://user?id={})".format(entity.user.id)
+            else:
+                continue
+
+            # Sort keys keep nesting valid: at one position closings precede
+            # openings, outer entities open first and inner ones close first.
+            marks.append((start, 1, -end, index, opening))
+            marks.append((end, 0, -start, -index, closing))
+
+        pieces, last = [], 0
+        for position, _, _, _, mark in sorted(marks):
+            pieces.append(text[last:position] + mark)
+            last = position
+        pieces.append(text[last:])
+
+        return utils.remove_surrogates("".join(pieces))
From 32ca805f6be0295f53ff9d4e20c817ad7183e949 Mon Sep 17 00:00:00 2001
From: Dan <14043624+delivrance@users.noreply.github.com>
Date: Tue, 25 Jun 2019 10:25:21 +0200
Subject: [PATCH 15/16] Update message.py
---
.../types/messages_and_media/message.py | 24 +++++++------------
1 file changed, 9 insertions(+), 15 deletions(-)
diff --git a/pyrogram/client/types/messages_and_media/message.py b/pyrogram/client/types/messages_and_media/message.py
index 34080d7f..16726a47 100644
--- a/pyrogram/client/types/messages_and_media/message.py
+++ b/pyrogram/client/types/messages_and_media/message.py
@@ -31,33 +31,27 @@ from ..object import Object
from ..update import Update
from ..user_and_chats.chat import Chat
from ..user_and_chats.user import User
-from ...style import utils
+from ...style import utils, Markdown, HTML
class Str(str):
def __init__(self, *args):
super().__init__()
- self._client = None
- self._entities = None
+ self.entities = None
- def init(self, client, entities):
- self._client = client
- self._entities = entities
+ def init(self, entities):
+ self.entities = entities
return self
- @property
- def text(self):
- return self
-
@property
def markdown(self):
- return self._client.markdown.unparse(self, self._entities)
+ return Markdown.unparse(self, self.entities)
@property
def html(self):
- return self._client.html.unparse(self, self._entities)
+ return HTML.unparse(self, self.entities)
def __getitem__(self, item):
return utils.remove_surrogates(utils.add_surrogates(self)[item])
@@ -490,7 +484,7 @@ class Message(Object, Update):
if isinstance(message, types.Message):
entities = [MessageEntity._parse(client, entity, users) for entity in message.entities]
- entities = list(filter(lambda x: x is not None, entities))
+ entities = pyrogram.List(filter(lambda x: x is not None, entities))
forward_from = None
forward_sender_name = None
@@ -607,8 +601,8 @@ class Message(Object, Update):
date=message.date,
chat=Chat._parse(client, message, users, chats),
from_user=User._parse(client, users.get(message.from_id, None)),
- text=Str(message.message).init(client, entities) or None if media is None else None,
- caption=Str(message.message).init(client, entities) or None if media is not None else None,
+ text=Str(message.message).init(entities) or None if media is None else None,
+ caption=Str(message.message).init(entities) or None if media is not None else None,
entities=entities or None if media is None else None,
caption_entities=entities or None if media is not None else None,
author_signature=message.post_author,
From 7490f6cfa3b8b401c00316bdd5a17389a1b6ee01 Mon Sep 17 00:00:00 2001
From: Dan <14043624+delivrance@users.noreply.github.com>
Date: Tue, 25 Jun 2019 11:47:45 +0200
Subject: [PATCH 16/16] Update the HTML parser: make it easy for asyncio to
deal with mentions. We can't await coroutines inside HTMLParser overridden
methods, such as handle_starttag, because they can't be async. This commit
moves the resolve_peer call into the parse method of the HTML class, which
can be defined async.
---
pyrogram/client/style/html.py | 25 ++++++++++++++-----------
1 file changed, 14 insertions(+), 11 deletions(-)
diff --git a/pyrogram/client/style/html.py b/pyrogram/client/style/html.py
index 17a5daa6..5617cb54 100644
--- a/pyrogram/client/style/html.py
+++ b/pyrogram/client/style/html.py
@@ -64,16 +64,8 @@ class Parser(HTMLParser):
mention = Parser.MENTION_RE.match(url)
if mention:
- user_id = int(mention.group(1))
-
- try:
- user = self.client.resolve_peer(user_id)
- except PeerIdInvalid:
- entity = types.MessageEntityMentionName
- extra["user_id"] = user_id
- else:
- entity = types.InputMessageEntityMentionName
- extra["user_id"] = user
+ entity = types.InputMessageEntityMentionName
+ extra["user_id"] = int(mention.group(1))
else:
entity = types.MessageEntityTextUrl
extra["url"] = url
@@ -129,10 +121,21 @@ class HTML:
raise ValueError("Unclosed tags: {}".format(", ".join(unclosed_tags)))
+ entities = []
+
+ for entity in parser.entities:
+ if isinstance(entity, types.InputMessageEntityMentionName):
+ try:
+ entity.user_id = self.client.resolve_peer(entity.user_id)
+ except PeerIdInvalid:
+ continue
+
+ entities.append(entity)
+
# TODO: OrderedDict to be removed in Python 3.6
return OrderedDict([
("message", utils.remove_surrogates(parser.text)),
- ("entities", parser.entities)
+ ("entities", entities)
])
@staticmethod