# Source code for mistletoe.span_tokens

"""
Built-in span-level token classes.
"""
import re
from typing import Pattern

import attr

from mistletoe import nested_tokenizer
from mistletoe.base_elements import Position, SpanToken
from mistletoe.parse_context import get_parse_context
from mistletoe.attr_doc import autodoc

"""
Tokens to be included in the parsing process, in the order specified.
"""
# NOTE(review): "AutoLink" is listed here, but no class with that name is
# defined in the visible part of this module -- confirm it exists elsewhere.
# Conversely, Strong, Emphasis, Image and HTMLSpan are defined in this module
# but are not listed; presumably they are reached through CoreTokens -- verify.
__all__ = [
    "EscapeSequence",
    "AutoLink",
    "CoreTokens",
    "InlineCode",
    "LineBreak",
    "RawText",
]


class CoreTokens(SpanToken):
    """
    Dispatching token: matches are located by the nested tokenizer and each
    one is handed to the token class named by the match itself.
    """

    precedence = 3

    @classmethod
    def find(cls, string):
        """Return whatever `nested_tokenizer.find_nested_tokenizer` finds.

        :param string: the text to search
        """
        return nested_tokenizer.find_nested_tokenizer(string)

    @classmethod
    def read(cls, match: Pattern):
        """Build the token for `match` via the class named by `match.type`.

        The class is looked up by name in this module's globals, and its own
        `read` classmethod is called with the same match object.

        :param match: match object exposing a `type` attribute
        """
        # TODO this needs to be made more general
        # (so tokens can be in different modules)
        token_class = globals()[match.type]
        return token_class.read(match)
class Strong(SpanToken):
    """
    Strong emphasis, written as `**some text**` or `__some text__`.

    Tokens of this type are read in `CoreTokens.read`.

    :param content: raw string content of the token
    :param children: list of child tokens
    """
class Emphasis(SpanToken):
    """
    Emphasis, written as `*some text*` or `_some text_`.

    Tokens of this type are read in `CoreTokens.read`.

    :param content: raw string content of the token
    :param children: list of child tokens
    """
@autodoc
@attr.s(kw_only=True, slots=True)
class InlineCode(SpanToken):
    """
    Inline code tokens: \\`some code\\`, read in `CoreTokens.read`
    """

    # Group 1 is the opening backtick run, which must be closed by an
    # identical run; group 2 is the code between the two runs.
    pattern = re.compile(r"(?<!\\|`)(?:\\\\)*(`+)(?!`)(.+?)(?<!`)\1(?!`)", re.DOTALL)
    parse_inner = False
    parse_group = 2

    children: list = attr.ib(
        repr=False, metadata={"doc": "a single RawText node for alternative text."}
    )
    position: Position = attr.ib(
        default=None, repr=False, metadata={"doc": "Line position in source text"}
    )

    @classmethod
    def read(cls, match: Pattern):
        """Create the token from a match, normalising inner whitespace.

        The matched code is stripped, every run of spaces and newlines is
        collapsed to one space, and the result is wrapped in a single
        `RawText` child.

        :param match: match object whose group `parse_group` holds the code
        """
        code = match.group(cls.parse_group)
        pieces = re.split("[ \n]+", code.strip())
        return cls(children=(RawText(" ".join(pieces)),))

    @classmethod
    def find(cls, string):
        """Return the matches stored for this token in the parse context.

        The "InlineCode" entry is removed from the context's
        `nesting_matches`; an empty list is returned when there is no entry.
        `string` itself is not inspected.
        """
        context = get_parse_context()
        return context.nesting_matches.pop("InlineCode", [])
@autodoc
@attr.s(kw_only=True, slots=True)
class Image(SpanToken):
    """
    Image tokens, with inline targets: "![alt](src "title")",
    read in `CoreTokens.read`
    """

    src: str = attr.ib(metadata={"doc": "image source"})
    title: str = attr.ib(default=None, metadata={"doc": "image title"})
    children: list = attr.ib(
        factory=list, repr=False, metadata={"doc": "alternative text."}
    )
    position: Position = attr.ib(
        default=None, repr=False, metadata={"doc": "Line position in source text"}
    )

    @classmethod
    def read(cls, match: Pattern):
        """Create the token from a match.

        Group 2 (stripped of surrounding whitespace) becomes `src` and
        group 3 becomes `title`.

        :param match: match object providing groups 2 and 3
        """
        source = match.group(2).strip()
        caption = match.group(3)
        return cls(src=source, title=caption)
@autodoc
@attr.s(kw_only=True, slots=True)
class EscapeSequence(SpanToken):
    """
    Escape sequences. ("\\\\*")

    This should be set first in the token parse list.
    """

    # A backslash followed by one punctuation character (captured as group 1).
    pattern = re.compile(r"\\([!\"#$%&'()*+,-./:;<=>?@\[\\\]^_`{|}~])")
    parse_inner = False
    precedence = 2

    children: list = attr.ib(
        repr=False, metadata={"doc": "a single RawText node for alternative text."}
    )
    position: Position = attr.ib(
        default=None, repr=False, metadata={"doc": "Line position in source text"}
    )

    @classmethod
    def read(cls, match: Pattern):
        """Create the token, wrapping the matched group in one `RawText` child.

        The group is selected by `parse_group`, which this class does not
        set itself.

        :param match: match object produced by `pattern`
        """
        escaped = match.group(cls.parse_group)
        return cls(children=(RawText(escaped),))

    @classmethod
    def strip(cls, string):
        """Return `string` with every escape sequence replaced by the
        character it escapes (the leading backslash is dropped).
        """
        return re.sub(cls.pattern, r"\1", string)
@autodoc
@attr.s(kw_only=True, slots=True)
class LineBreak(SpanToken):
    """
    Hard or soft line breaks.

    A break is hard when the newline is preceded by two or more spaces or by
    a backslash; every other newline is a soft break.
    """

    # Group 1 captures what directly precedes the newline: a (possibly empty)
    # run of spaces, or a single backslash.
    pattern = re.compile(r"( *|\\)\n")
    parse_inner = False
    parse_group = 0

    content: str = attr.ib(
        default="", repr=False, metadata={"doc": "raw content."}
    )
    soft: bool = attr.ib(metadata={"doc": "if the break is soft or hard."})
    position: Position = attr.ib(
        default=None, repr=False, metadata={"doc": "Line position in source text"}
    )

    @classmethod
    def read(cls, match: Pattern):
        """Create the token from a match, classifying it as soft or hard.

        :param match: match object produced by `pattern`
        :return: a token whose `soft` is False only for a hard break
        """
        lead = match.group(1)
        # A single trailing space must NOT produce a hard break; the prefix
        # is spelled as a repetition so the count of two is unmistakable.
        hard_prefixes = (" " * 2, "\\")
        return cls(soft=not lead.startswith(hard_prefixes))
@autodoc
@attr.s(slots=True)
class RawText(SpanToken):
    """
    Raw text. A leaf node.

    RawText is the only token that accepts a string for its `read` method,
    instead of a match object. Also, all recursions should bottom out here.
    """

    # Annotated as `str`: the field holds the raw string passed to `read`.
    content: str = attr.ib(
        repr=False, metadata={"doc": "raw string content of the token"}
    )
    position: Position = attr.ib(
        default=None, repr=False, metadata={"doc": "Line position in source text"}
    )

    @classmethod
    def read(cls, content: str):
        """Create the token directly from a string.

        :param content: the raw text to store on the token
        """
        return cls(content=content)
# HTML tag names kept as a lookup set.
_tags = {
    "address", "article", "aside",
    "base", "basefont", "blockquote", "body",
    "caption", "center", "col", "colgroup",
    "dd", "details", "dialog", "dir", "div", "dl", "dt",
    "fieldset", "figcaption", "figure", "footer", "form", "frame", "frameset",
    "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hr", "html",
    "iframe",
    "legend", "li", "link",
    "main", "menu", "menuitem", "meta",
    "nav", "noframes",
    "ol", "optgroup", "option",
    "p", "param",
    "section", "source", "summary",
    "table", "tbody", "td", "tfoot", "th", "thead", "title", "tr", "track",
    "ul",
}

# Regex source fragments for inline HTML. Each complete construct begins with
# a negative lookbehind so that a "<" preceded by a backslash is not matched.

# Tag name: a letter followed by letters, digits or hyphens.
_tag = r"[A-Za-z][A-Za-z0-9-]*"
# Zero or more attributes, each optionally given an unquoted, single-quoted
# or double-quoted value.
_attrs = r'(?:\s+[A-Za-z_:][A-Za-z0-9_.:-]*(?:\s*=\s*(?:[^ "\'=<>`]+|\'[^\']*?\'|"[^\"]*?"))?)*'  # noqa: E501
# Opening (or self-closing) tag.
_open_tag = "".join((r"(?<!\\)<", _tag, _attrs, r"\s*/?>"))
# Closing tag.
_closing_tag = "".join((r"(?<!\\)</", _tag, r"\s*>"))
# Comment: must not start with ">" or "->", contain "--", or end with "--->".
_comment = r"(?<!\\)<!--(?!>|->)(?:(?!--).)+?(?<!-)-->"
# Processing instruction: <? ... ?>
_instruction = r"(?<!\\)<\?.+?\?>"
# Declaration: "<!" followed by an uppercase letter, up to the next ">".
_declaration = r"(?<!\\)<![A-Z].+?>"
# CDATA section, terminated by "]]>".
_cdata = r"(?<!\\)<!\[CDATA.+?\]\]>"
class HTMLSpan(SpanToken):
    """
    Span-level HTML tokens.

    The pattern is an alternation of the module's open-tag, closing-tag,
    comment, processing-instruction, declaration and CDATA fragments, and the
    whole match (group 0) is taken as the token content without parsing
    inside it.

    :param content: raw string content of the token
    :param children: list of child tokens
    """

    pattern = re.compile(
        "|".join(
            (_open_tag, _closing_tag, _comment, _instruction, _declaration, _cdata)
        ),
        flags=re.DOTALL,
    )
    parse_inner = False
    parse_group = 0