2
0
mirror of https://github.com/pyrogram/pyrogram synced 2025-08-28 12:57:52 +00:00

Revamp HTML and Markdown parsers to allow multiple nested entities

This commit is contained in:
Dan 2019-06-24 10:07:28 +02:00
parent 648f37cf6d
commit e7c49c6a1b
2 changed files with 137 additions and 218 deletions

View File

@ -16,127 +16,110 @@
# You should have received a copy of the GNU Lesser General Public License # You should have received a copy of the GNU Lesser General Public License
# along with Pyrogram. If not, see <http://www.gnu.org/licenses/>. # along with Pyrogram. If not, see <http://www.gnu.org/licenses/>.
import html
import re import re
from collections import OrderedDict from collections import OrderedDict
from html.parser import HTMLParser
import pyrogram import pyrogram
from pyrogram.api.types import ( from pyrogram.api import types
MessageEntityBold as Bold,
MessageEntityItalic as Italic,
MessageEntityCode as Code,
MessageEntityTextUrl as Url,
MessageEntityPre as Pre,
MessageEntityUnderline as Underline,
MessageEntityStrike as Strike,
MessageEntityBlockquote as Blockquote,
MessageEntityMentionName as MentionInvalid,
InputMessageEntityMentionName as Mention,
)
from pyrogram.errors import PeerIdInvalid from pyrogram.errors import PeerIdInvalid
from . import utils from . import utils
class HTML: class Parser(HTMLParser):
HTML_RE = re.compile(r"<(\w+)(?: href=([\"'])([^<]+)\2)?>([^>]+)</\1>")
MENTION_RE = re.compile(r"tg://user\?id=(\d+)") MENTION_RE = re.compile(r"tg://user\?id=(\d+)")
def __init__(self, client: "pyrogram.BaseClient" = None): def __init__(self, client: "pyrogram.BaseClient"):
super().__init__()
self.client = client self.client = client
def parse(self, message: str): self.text = ""
entities = [] self.entities = []
message = utils.add_surrogates(str(message or "")) self.temp_entities = []
offset = 0 self.tags = []
for match in self.HTML_RE.finditer(message): def handle_starttag(self, tag, attrs):
start = match.start() - offset attrs = dict(attrs)
style, url, body = match.group(1, 3, 4) extra = {}
if url: if tag in ["b", "strong"]:
mention = self.MENTION_RE.match(url) entity = types.MessageEntityBold
elif tag in ["i", "em"]:
entity = types.MessageEntityItalic
elif tag == "u":
entity = types.MessageEntityUnderline
elif tag in ["s", "del", "strike"]:
entity = types.MessageEntityStrike
elif tag == "blockquote":
entity = types.MessageEntityBlockquote
elif tag == "code":
entity = types.MessageEntityCode
elif tag == "pre":
entity = types.MessageEntityPre
extra["language"] = ""
elif tag == "a":
url = attrs.get("href", "")
mention = Parser.MENTION_RE.match(url)
if mention: if mention:
user_id = int(mention.group(1)) user_id = int(mention.group(1))
try: try:
input_user = self.client.resolve_peer(user_id) user = self.client.resolve_peer(user_id)
except PeerIdInvalid: except PeerIdInvalid:
input_user = None entity = types.MessageEntityMentionName
extra["user_id"] = user_id
entity = (
Mention(offset=start, length=len(body), user_id=input_user)
if input_user else MentionInvalid(offset=start, length=len(body), user_id=user_id)
)
else: else:
entity = Url(offset=start, length=len(body), url=url) entity = types.InputMessageEntityMentionName
extra["user_id"] = user
else: else:
if style == "b" or style == "strong": entity = types.MessageEntityTextUrl
entity = Bold(offset=start, length=len(body)) extra["url"] = url
elif style == "i" or style == "em":
entity = Italic(offset=start, length=len(body))
elif style == "code":
entity = Code(offset=start, length=len(body))
elif style == "pre":
entity = Pre(offset=start, length=len(body), language="")
elif style == "u":
entity = Underline(offset=start, length=len(body))
elif style in ["strike", "s", "del"]:
entity = Strike(offset=start, length=len(body))
elif style == "blockquote":
entity = Blockquote(offset=start, length=len(body))
else: else:
continue return
entities.append(entity) self.tags.append(tag)
message = message.replace(match.group(), body) self.temp_entities.append(entity(offset=len(self.text), length=0, **extra))
offset += len(style) * 2 + 5 + (len(url) + 8 if url else 0)
# TODO: OrderedDict to be removed in Python3.6 def handle_data(self, data):
data = html.unescape(data)
for entity in self.temp_entities:
entity.length += len(data)
self.text += data
def handle_endtag(self, tag):
    """Close the innermost open formatting tag and finalize its entity.

    Pops the most recent entry from ``self.tags`` and moves the matching
    entity from ``self.temp_entities`` to ``self.entities``.

    End tags that ``handle_starttag`` never pushes (anything outside the
    supported formatting tags) are ignored, so unsupported markup such as
    ``<span>`` nested inside ``<b>`` no longer corrupts the tag stack.

    Raises:
        ValueError: if ``tag`` is a supported tag but does not match the
            innermost open tag, or if no tag is open at all.
    """
    # Must stay in sync with the tags handle_starttag creates entities for;
    # only those tags are ever pushed onto self.tags.
    supported_tags = (
        "b", "strong",
        "i", "em",
        "u",
        "s", "del", "strike",
        "blockquote",
        "code",
        "pre",
        "a",
    )

    if tag not in supported_tags:
        # The start tag was skipped, so there is nothing to pop for it.
        return

    if not self.tags:
        # Previously this surfaced as a bare IndexError from list.pop().
        line, offset = self.getpos()
        offset += 1
        raise ValueError("Unexpected end tag </{}> at {}:{}".format(tag, line, offset))

    start_tag = self.tags.pop()

    if start_tag != tag:
        line, offset = self.getpos()
        # Report the column as 1-based.
        offset += 1

        raise ValueError("Expected end tag </{}>, but found </{}> at {}:{}".format(start_tag, tag, line, offset))

    self.entities.append(self.temp_entities.pop())
def error(self, message):
    """Deliberately ignore the reported ``message`` and do nothing.

    NOTE(review): presumably overrides the base parser's error hook so that
    malformed markup does not abort parsing; confirm against the base class.
    """
    return None
class HTML:
def __init__(self, client: "pyrogram.BaseClient" = None):
self.client = client
def parse(self, text: str):
text = utils.add_surrogates(str(text or "").strip())
parser = Parser(self.client)
parser.feed(text)
print(parser.entities)
# TODO: OrderedDict to be removed in Python 3.6
return OrderedDict([ return OrderedDict([
("message", utils.remove_surrogates(message)), ("message", utils.remove_surrogates(parser.text)),
("entities", entities) ("entities", parser.entities)
]) ])
def unparse(self, message: str, entities: list):
    """Render ``message`` back to HTML markup using ``entities``.

    Each entity is expected to expose ``offset``, ``length``, ``type``,
    ``url`` and ``user`` attributes (all of them are read for every entity).
    Entities whose ``type`` is not one of the handled kinds are skipped.

    NOTE(review): the running shift only accounts for markup inserted by
    earlier entities, so this appears to assume entities arrive ordered by
    offset and do not overlap; confirm against callers.

    Returns:
        The message text with tags inserted, after
        ``utils.remove_surrogates``.
    """
    # Entity type -> plain tag name for the simple "<tag>...</tag>" cases.
    plain_tags = {
        "bold": "b",
        "italic": "i",
        "code": "code",
        "pre": "pre",
        "underline": "u",
        "strike": "s",
        "blockquote": "blockquote",
    }

    text = utils.add_surrogates(message).strip()
    shift = 0

    for entity in entities:
        begin = entity.offset + shift
        kind = entity.type
        link = entity.url
        mentioned = entity.user
        chunk = text[begin: begin + entity.length]

        if kind in plain_tags:
            tag = plain_tags[kind]
            # "<tag>" + "</tag>" adds two tag names plus five bracket chars.
            growth = len(tag) * 2 + 5
            markup = "<{0}>{1}</{0}>".format(tag, chunk)
        elif kind == "text_link":
            growth = 15 + len(link)
            markup = "<a href=\"{}\">{}</a>".format(link, chunk)
        elif kind == "text_mention":
            growth = 28 + len(str(mentioned.id))
            markup = "<a href=\"tg://user?id={}\">{}</a>".format(mentioned.id, chunk)
        else:
            continue

        shift += growth
        # Replace only the first occurrence at/after the entity start.
        text = text[:begin] + text[begin:].replace(chunk, markup, 1)

    return utils.remove_surrogates(text)

View File

@ -17,22 +17,9 @@
# along with Pyrogram. If not, see <http://www.gnu.org/licenses/>. # along with Pyrogram. If not, see <http://www.gnu.org/licenses/>.
import re import re
from collections import OrderedDict
import pyrogram import pyrogram
from pyrogram.api.types import ( from .html import HTML
MessageEntityBold as Bold,
MessageEntityItalic as Italic,
MessageEntityCode as Code,
MessageEntityTextUrl as Url,
MessageEntityPre as Pre,
MessageEntityUnderline as Underline,
MessageEntityStrike as Strike,
MessageEntityMentionName as MentionInvalid,
InputMessageEntityMentionName as Mention
)
from pyrogram.errors import PeerIdInvalid
from . import utils
class Markdown: class Markdown:
@ -43,10 +30,10 @@ class Markdown:
CODE_DELIMITER = "`" CODE_DELIMITER = "`"
PRE_DELIMITER = "```" PRE_DELIMITER = "```"
MARKDOWN_RE = re.compile(r"({d})([\w\W]*?)\1|\[([^[]+?)\]\(([^(]+?)\)".format( MARKDOWN_RE = re.compile(r"({d})".format(
d="|".join( d="|".join(
["".join(i) for i in [ ["".join(i) for i in [
["\{}".format(j) for j in i] [r"\{}".format(j) for j in i]
for i in [ for i in [
PRE_DELIMITER, PRE_DELIMITER,
CODE_DELIMITER, CODE_DELIMITER,
@ -56,107 +43,56 @@ class Markdown:
BOLD_DELIMITER BOLD_DELIMITER
] ]
]] ]]
) )))
))
MENTION_RE = re.compile(r"tg://user\?id=(\d+)")
def __init__(self, client: "pyrogram.BaseClient" = None): URL_RE = re.compile(r"\[([^[]+)]\(([^(]+)\)")
self.client = client
def __init__(self, client: "pyrogram.BaseClient"):
self.html = HTML(client)
def parse(self, text: str):
offset = 0
delimiters = set()
for i, match in enumerate(re.finditer(Markdown.MARKDOWN_RE, text)):
start, stop = match.span()
delimiter = match.group(1)
if delimiter == Markdown.BOLD_DELIMITER:
tag = "b"
elif delimiter == Markdown.ITALIC_DELIMITER:
tag = "i"
elif delimiter == Markdown.UNDERLINE_DELIMITER:
tag = "u"
elif delimiter == Markdown.STRIKE_DELIMITER:
tag = "s"
elif delimiter == Markdown.CODE_DELIMITER:
tag = "code"
elif delimiter == Markdown.PRE_DELIMITER:
tag = "pre"
else:
continue
if delimiter not in delimiters:
delimiters.add(delimiter)
tag = "<{}>".format(tag)
else:
delimiters.remove(delimiter)
tag = "</{}>".format(tag)
text = text[:start + offset] + tag + text[stop + offset:]
offset += len(tag) - len(delimiter)
def parse(self, message: str):
message = utils.add_surrogates(str(message or "")).strip()
entities = []
offset = 0 offset = 0
for match in self.MARKDOWN_RE.finditer(message): for match in re.finditer(Markdown.URL_RE, text):
start = match.start() - offset start, stop = match.span()
style, body, text, url = match.groups() full = match.group(0)
body, url = match.groups()
replace = '<a href="{}">{}</a>'.format(url, body)
if url: text = text[:start + offset] + replace + text[stop + offset:]
mention = self.MENTION_RE.match(url) offset += len(replace) - len(full)
if mention: return self.html.parse(text)
user_id = int(mention.group(1))
try:
input_user = self.client.resolve_peer(user_id)
except PeerIdInvalid:
input_user = None
entity = (
Mention(offset=start, length=len(text), user_id=input_user)
if input_user else MentionInvalid(offset=start, length=len(text), user_id=user_id)
)
else:
entity = Url(offset=start, length=len(text), url=url)
body = text
offset += len(url) + 4
else:
if style == self.BOLD_DELIMITER:
entity = Bold(offset=start, length=len(body))
elif style == self.ITALIC_DELIMITER:
entity = Italic(offset=start, length=len(body))
elif style == self.UNDERLINE_DELIMITER:
entity = Underline(offset=start, length=len(body))
elif style == self.STRIKE_DELIMITER:
entity = Strike(offset=start, length=len(body))
elif style == self.CODE_DELIMITER:
entity = Code(offset=start, length=len(body))
elif style == self.PRE_DELIMITER:
entity = Pre(offset=start, length=len(body), language="")
else:
continue
offset += len(style) * 2
entities.append(entity)
message = message.replace(match.group(), body)
# TODO: OrderedDict to be removed in Python3.6
return OrderedDict([
("message", utils.remove_surrogates(message)),
("entities", entities)
])
def unparse(self, message: str, entities: list):
    """Render ``message`` back to Markdown markup using ``entities``.

    Each entity is expected to expose ``offset``, ``length``, ``type``,
    ``url`` and ``user`` attributes (all of them are read for every entity).
    Entities whose ``type`` is not one of the handled kinds are skipped.

    NOTE(review): the running shift only accounts for markup inserted by
    earlier entities, so this appears to assume entities arrive ordered by
    offset and do not overlap; confirm against callers.

    Returns:
        The message text with delimiters inserted, after
        ``utils.remove_surrogates``.
    """
    # Entity type -> delimiter wrapped around both sides of the text.
    delimiters = {
        "bold": self.BOLD_DELIMITER,
        "italic": self.ITALIC_DELIMITER,
        "underline": self.UNDERLINE_DELIMITER,
        "strike": self.STRIKE_DELIMITER,
        "code": self.CODE_DELIMITER,
        "pre": self.PRE_DELIMITER,
    }

    text = utils.add_surrogates(message).strip()
    shift = 0

    for entity in entities:
        begin = entity.offset + shift
        kind = entity.type
        link = entity.url
        mentioned = entity.user
        chunk = text[begin: begin + entity.length]

        if kind in delimiters:
            mark = delimiters[kind]
            # The delimiter is emitted once before and once after the text.
            growth = len(mark) * 2
            markup = "{0}{1}{0}".format(mark, chunk)
        elif kind == "text_link":
            growth = 4 + len(link)
            markup = "[{}]({})".format(chunk, link)
        elif kind == "text_mention":
            growth = 17 + len(str(mentioned.id))
            markup = "[{}](tg://user?id={})".format(chunk, mentioned.id)
        else:
            continue

        shift += growth
        # Replace only the first occurrence at/after the entity start.
        text = text[:begin] + text[begin:].replace(chunk, markup, 1)

    return utils.remove_surrogates(text)