From e1b2fc7043b29f126828093cdd60e699f92a8731 Mon Sep 17 00:00:00 2001 From: Dan <14043624+delivrance@users.noreply.github.com> Date: Thu, 15 Feb 2018 11:24:56 +0100 Subject: [PATCH] Revamp markdown parser --- pyrogram/client/style/markdown.py | 115 ++++++++++++++---------------- 1 file changed, 52 insertions(+), 63 deletions(-) diff --git a/pyrogram/client/style/markdown.py b/pyrogram/client/style/markdown.py index 8e0c9462..e39ac876 100644 --- a/pyrogram/client/style/markdown.py +++ b/pyrogram/client/style/markdown.py @@ -24,95 +24,84 @@ from pyrogram.api.types import ( MessageEntityCode as Code, MessageEntityTextUrl as Url, MessageEntityPre as Pre, + MessageEntityMentionName as MentionInvalid, InputMessageEntityMentionName as Mention ) from . import utils class Markdown: - INLINE_DELIMITERS = { - "**": Bold, - "__": Italic, - "`": Code - } + BOLD_DELIMITER = "**" + ITALIC_DELIMITER = "__" + CODE_DELIMITER = "`" + PRE_DELIMITER = "```" - # ``` python - # for i in range(10): - # print(i) - # ``` - PRE_RE = r"(?P
```(?P.*)\n(?P (.|\n)*)\n```)" - - # [url](github.com) - URL_RE = r"(?P
(\[(?P .+?)\]\((?P .+?)\)))" - - # [name](tg://user?id=123456789) - MENTION_RE = r"(?P (\[(?P .+?)\]\(tg:\/\/user\?id=(?P \d+?)\)))" - - # **bold** - # __italic__ - # `code` - INLINE_RE = r"(?P (?P {d})(?P.+?)(?P {d}))".format( + MARKDOWN_RE = re.compile(r"```([\w ]*)\n([\w\W]*)(?:\n|)```|\[([^[(]+)\]\(([^])]+)\)|({d})(.+?)\5".format( d="|".join( ["".join(i) for i in [ ["\{}".format(j) for j in i] - for i in sorted( # Sort delimiters by length - INLINE_DELIMITERS.keys(), - key=lambda k: len(k), # Or: key=len - reverse=True - ) + for i in [ + PRE_DELIMITER, + CODE_DELIMITER, + ITALIC_DELIMITER, + BOLD_DELIMITER + ] ]] ) - ) + )) + MENTION_RE = re.compile(r"tg://user\?id=(\d+)") - MARKDOWN_RE = re.compile("|".join([PRE_RE, MENTION_RE, URL_RE, INLINE_RE])) - - def __init__(self, peers_by_id): + def __init__(self, peers_by_id: dict): self.peers_by_id = peers_by_id - def parse(self, text): + def parse(self, message: str): entities = [] - text = utils.add_surrogates(text) + message = utils.add_surrogates(message).strip() offset = 0 - for match in self.MARKDOWN_RE.finditer(text): + for match in self.MARKDOWN_RE.finditer(message): start = match.start() - offset + lang, pre, text, url, style, body = match.groups() - if match.group("pre"): - pattern = match.group("pre") - lang = match.group("lang") - replace = match.group("code") - entity = Pre(start, len(replace), lang.strip()) - offset += len(lang) + 8 - elif match.group("url"): - pattern = match.group("url") - replace = match.group("url_text") - path = match.group("url_path") - entity = Url(start, len(replace), path) - offset += len(path) + 4 - elif match.group("mention"): - pattern = match.group("mention") - replace = match.group("mention_text") - user_id = match.group("user_id") - entity = Mention(start, len(replace), self.peers_by_id[int(user_id)]) - offset += len(user_id) + 17 - elif match.group("inline"): - pattern = match.group("inline") - replace = match.group("body") - start_delimiter = match.group("start_delimiter") - end_delimiter = match.group("end_delimiter") + if pre: + body = pre = pre.strip() + entity = Pre(start, len(pre), lang.strip() or "") + offset += len(lang) + len(self.PRE_DELIMITER) * 2 + elif url: + mention = self.MENTION_RE.match(url) - if start_delimiter != end_delimiter: + if mention: + user_id = int(mention.group(1)) + input_user = self.peers_by_id.get(user_id, None) + + entity = ( + Mention(start, len(text), input_user) + if input_user + else MentionInvalid(start, len(text), user_id) + ) + else: + entity = Url(start, len(text), url) + + body = text + offset += len(url) + 4 + else: + if style == self.BOLD_DELIMITER: + entity = Bold(start, len(body)) + elif style == self.ITALIC_DELIMITER: + entity = Italic(start, len(body)) + elif style == self.CODE_DELIMITER: + entity = Code(start, len(body)) + elif style == self.PRE_DELIMITER: + entity = Pre(start, len(body), "") + else: continue - entity = self.INLINE_DELIMITERS[start_delimiter](start, len(replace)) - offset += len(start_delimiter) * 2 - else: - continue + offset += len(style) * 2 entities.append(entity) - text = text.replace(pattern, replace) + message = message.replace(match.group(), body) return dict( - message=utils.remove_surrogates(text), + message=utils.remove_surrogates(message), entities=entities )