""" Built-in span-level token classes. """ import re from typing import Pattern import attr from mistletoe import nested_tokenizer from mistletoe.base_elements import Position, SpanToken from mistletoe.parse_context import get_parse_context from mistletoe.attr_doc import autodoc """ Tokens to be included in the parsing process, in the order specified. """ __all__ = [ "EscapeSequence", "AutoLink", "CoreTokens", "InlineCode", "LineBreak", "RawText", ] [docs]class CoreTokens(SpanToken): precedence = 3 @classmethod def read(cls, match: Pattern): # TODO this needs to be made more general (so tokens can be in diffent modules) return globals()[match.type].read(match) @classmethod def find(cls, string): return nested_tokenizer.find_nested_tokenizer(string) [docs]class Strong(SpanToken): """ Strong tokens: `**some text**` or `__some text__`, read in `CoreTokens.read` :param content: raw string content of the token :param children: list of child tokens """ [docs]class Emphasis(SpanToken): """ Emphasis tokens `*some text*` or `_some text_`, read in `CoreTokens.read` :param content: raw string content of the token :param children: list of child tokens """ [docs]@autodoc @attr.s(kw_only=True, slots=True) class InlineCode(SpanToken): """ Inline code tokens: \\`some code\\`, read in `CoreTokens.read` """ pattern = re.compile(r"(?<!\\|`)(?:\\\\)*(`+)(?!`)(.+?)(?<!`)\1(?!`)", re.DOTALL) parse_inner = False parse_group = 2 children: list = attr.ib( repr=False, metadata={"doc": "a single RawText node for alternative text."} ) position: Position = attr.ib( default=None, repr=False, metadata={"doc": "Line position in source text"} ) @classmethod def read(cls, match: Pattern): content = match.group(cls.parse_group) return cls(children=(RawText(" ".join(re.split("[ \n]+", content.strip()))),)) @classmethod def find(cls, string): matches = get_parse_context().nesting_matches.pop("InlineCode", []) return matches [docs]@autodoc @attr.s(kw_only=True, slots=True) class Image(SpanToken): """ Image tokens, with inline targets: "![alt](src "title")", read in `CoreTokens.read` """ src: str = attr.ib(metadata={"doc": "image source"}) title: str = attr.ib(default=None, metadata={"doc": "image title"}) children: list = attr.ib( factory=list, repr=False, metadata={"doc": "alternative text."} ) position: Position = attr.ib( default=None, repr=False, metadata={"doc": "Line position in source text"} ) @classmethod def read(cls, match: Pattern): return cls(src=match.group(2).strip(), title=match.group(3)) [docs]@autodoc @attr.s(kw_only=True, slots=True) class Link(SpanToken): """ Link tokens, with inline targets: "[name](target)", read in `CoreTokens.read` """ target: str = attr.ib(metadata={"doc": "link target"}) title: str = attr.ib(default=None, metadata={"doc": "link title"}) children: list = attr.ib(factory=list, repr=False, metadata={"doc": "link text."}) position: Position = attr.ib( default=None, repr=False, metadata={"doc": "Line position in source text"} ) @classmethod def read(cls, match: Pattern): return cls( target=EscapeSequence.strip(match.group(2).strip()), title=EscapeSequence.strip(match.group(3)), ) [docs]@autodoc @attr.s(kw_only=True, slots=True) class AutoLink(SpanToken): """ Autolink tokens. ("<http://www.google.com>") """ pattern = re.compile( r"(?<!\\)(?:\\\\)*<([A-Za-z][A-Za-z0-9+.-]{1,31}:[^ <>]*?|[A-Za-z0-9.!#$%&'*+/=?^_`{|}~-]+@[A-Za-z0-9](?:[A-Za-z0-9-]{0,61}[A-Za-z0-9])?(?:\.[A-Za-z0-9](?:[A-Za-z0-9-]{0,61}[A-Za-z0-9])?)*)>" # noqa: E501 ) parse_inner = False target: str = attr.ib(metadata={"doc": "link target"}) mailto: bool = attr.ib(default=False, metadata={"doc": "if the link is an email"}) children: list = attr.ib( repr=False, metadata={"doc": "a single RawText node for alternative text."} ) position: Position = attr.ib( default=None, repr=False, metadata={"doc": "Line position in source text"} ) @classmethod def read(cls, match: Pattern): content = match.group(cls.parse_group) return cls( children=(RawText(content),), target=content, mailto="@" in content and "mailto" not in content.casefold(), ) [docs]@autodoc @attr.s(kw_only=True, slots=True) class EscapeSequence(SpanToken): """ Escape sequences. ("\\\\*") This should be set first in the token parse list. """ pattern = re.compile(r"\\([!\"#$%&'()*+,-./:;<=>?@\[\\\]^_`{|}~])") parse_inner = False precedence = 2 children: list = attr.ib( repr=False, metadata={"doc": "a single RawText node for alternative text."} ) position: Position = attr.ib( default=None, repr=False, metadata={"doc": "Line position in source text"} ) @classmethod def read(cls, match: Pattern): return cls(children=(RawText(match.group(cls.parse_group)),)) @classmethod def strip(cls, string): return cls.pattern.sub(r"\1", string) [docs]@autodoc @attr.s(kw_only=True, slots=True) class LineBreak(SpanToken): """ Hard or soft line breaks. """ pattern = re.compile(r"( *|\\)\n") parse_inner = False parse_group = 0 content: bool = attr.ib(default="", repr=False, metadata={"doc": "raw content."}) soft: bool = attr.ib(metadata={"doc": "if the break is soft or hard."}) position: Position = attr.ib( default=None, repr=False, metadata={"doc": "Line position in source text"} ) @classmethod def read(cls, match: Pattern): content = match.group(1) return cls(soft=not content.startswith((" ", "\\"))) [docs]@autodoc @attr.s(slots=True) class RawText(SpanToken): """ Raw text. A leaf node. RawText is the only token that accepts a string for its `read` method, instead of a match object. Also, all recursions should bottom out here. """ content: bool = attr.ib( repr=False, metadata={"doc": "raw string content of the token"} ) position: Position = attr.ib( default=None, repr=False, metadata={"doc": "Line position in source text"} ) @classmethod def read(cls, content: str): return cls(content=content) _tags = { "address", "article", "aside", "base", "basefont", "blockquote", "body", "caption", "center", "col", "colgroup", "dd", "details", "dialog", "dir", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "frame", "frameset", "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hr", "html", "iframe", "legend", "li", "link", "main", "menu", "menuitem", "meta", "nav", "noframes", "ol", "optgroup", "option", "p", "param", "section", "source", "summary", "table", "tbody", "td", "tfoot", "th", "thead", "title", "tr", "track", "ul", } _tag = r"[A-Za-z][A-Za-z0-9-]*" _attrs = r'(?:\s+[A-Za-z_:][A-Za-z0-9_.:-]*(?:\s*=\s*(?:[^ "\'=<>`]+|\'[^\']*?\'|"[^\"]*?"))?)*' # noqa: E501 _open_tag = r"(?<!\\)<" + _tag + _attrs + r"\s*/?>" _closing_tag = r"(?<!\\)</" + _tag + r"\s*>" _comment = r"(?<!\\)<!--(?!>|->)(?:(?!--).)+?(?<!-)-->" _instruction = r"(?<!\\)<\?.+?\?>" _declaration = r"(?<!\\)<![A-Z].+?>" _cdata = r"(?<!\\)<!\[CDATA.+?\]\]>" [docs]class HTMLSpan(SpanToken): """ Span-level HTML tokens. :param content: raw string content of the token :param children: list of child tokens """ pattern = re.compile( "|".join( [_open_tag, _closing_tag, _comment, _instruction, _declaration, _cdata] ), re.DOTALL, ) parse_inner = False parse_group = 0