# Source code for mistletoe.block_tokens

"""
Built-in block-level token classes.
"""
import re
from typing import Dict, Optional, Union
from typing import List as ListType

import attr

import mistletoe.block_tokenizer as tokenizer
from mistletoe import span_tokens
from mistletoe.nested_tokenizer import (
    follows,
    shift_whitespace,
    whitespace,
    is_control_char,
    normalize_label,
)
from mistletoe.parse_context import get_parse_context
from mistletoe.base_elements import (
    Token,
    BlockToken,
    Position,
    SpanContainer,
    SourceLines,
)
from mistletoe.attr_doc import autodoc


"""
Tokens to be included in the parsing process, in the order specified.
"""
__all__ = [
    "BlockCode",
    "Heading",
    "Quote",
    "CodeFence",
    "ThematicBreak",
    "List",
    "LinkDefinition",
    "Paragraph",
]


@autodoc
@attr.s(slots=True, kw_only=True)
class FrontMatter(BlockToken):
    """Front matter YAML block, on the first line of the document. ::

        ---
        a: b
        c: d
        ---

    NOTE: The content of the block should be valid YAML, but its parsing
    (and hence syntax testing) is deferred to the renderers. This is so
    that, given 'bad' YAML, the rest of the document will still be parsed,
    and then the renderers can apply their own error reporting.

    Not included in the parsing process, but called by `Document.read`,
    if `front_matter=True`, and stored on `Document.front_matter` in the
    syntax tree.
    """

    content: Union[str, dict] = attr.ib(
        repr=False, metadata={"doc": "Source text (should be valid YAML)"}
    )
    position: Position = attr.ib(
        default=None, metadata={"doc": "Line position in source text"}
    )

    def get_data(self) -> dict:
        """Return the de-serialized front matter data (requires pyyaml)."""
        if isinstance(self.content, str):
            import yaml

            return yaml.safe_load(self.content) or {}
        return self.content

    @classmethod
    def start(cls, line: str) -> bool:
        # handled by Document
        return False

    @classmethod
    def read(cls, lines: SourceLines):
        start_line = lines.lineno + 1
        next(lines)  # skip first ``---``
        line_buffer = []
        next_line = lines.peek()
        while not (next_line is None or next_line.startswith("---")):
            line_buffer.append(next(lines))
            next_line = lines.peek()
        if next_line is not None:
            next(lines)  # move past closing ``---``
            log_warning = False
        else:
            log_warning = True
        position = Position.from_source_lines(lines, start_line=start_line)
        if log_warning:
            get_parse_context().logger.warning(
                "{} No closing `---` was found for initial metadata block".format(
                    position.make_loc_str()
                )
            )
        return cls(content="".join(line_buffer), position=position)
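
# Usage sketch (illustrative only, not part of the module): ``FrontMatter`` is
# normally produced by ``Document.read(..., front_matter=True)``, but it can be
# exercised directly.  The shown results assume pyyaml is installed.
#
#     lines = SourceLines(["---\n", "title: demo\n", "---\n", "content\n"])
#     fm = FrontMatter.read(lines)
#     fm.content     # "title: demo\n"
#     fm.get_data()  # {"title": "demo"}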


@autodoc
@attr.s(slots=False, kw_only=True)
class Document(BlockToken):
    """Document container."""

    children: ListType[Token] = attr.ib(
        repr=lambda c: str(len(c)), metadata={"doc": "Child tokens list"}
    )
    link_definitions: dict = attr.ib(
        factory=dict,
        repr=lambda d: str(len(d)),
        metadata={"doc": "Mapping of keys to (url, title)"},
    )
    footnotes: Dict[str, Token] = attr.ib(
        factory=dict,
        repr=lambda d: str(len(d)),
        metadata={"doc": "Footnote tokens mapped to their target names"},
    )
    footref_order: list = attr.ib(
        factory=list,
        repr=lambda d: str(len(d)),
        metadata={
            "doc": (
                "A list of footnote targets, "
                "in the order they are referenced in the document."
            )
        },
    )
    front_matter: Optional[FrontMatter] = attr.ib(
        default=None, metadata={"doc": "Front matter YAML block"}
    )

    # TODO add is_nested parameter?
    # or have a subclass of document specifically for nesting?

    @classmethod
    def read(
        cls,
        lines: Union[str, ListType[str], SourceLines],
        reset_definitions: bool = True,
        skip_tokens: list = ("LinkDefinition", "Footnote"),
        front_matter: bool = False,
    ):
        """Read a document.

        :param lines: Lines to parse
        :param reset_definitions: remove any previously stored definitions
            in the global context (see ``ParseContext.reset_definitions()``).
        :param skip_tokens: do not store these ``token.name`` in the syntax tree.
            These are usually tokens that store themselves in the global context.
        :param front_matter: search for an initial YAML front matter block
            (note this is not strictly CommonMark compliant)
        """
        if reset_definitions:
            get_parse_context().reset_definitions()

        if not isinstance(lines, SourceLines):
            lines = SourceLines(lines, standardize_ends=True)

        # TODO can we do this in a way where we are checking
        # FrontMatter in get_parse_context().block_tokens?
        # then it would be easier to add/remove it in the renderers
        front_matter_token = None
        if front_matter and lines.peek() and lines.peek().startswith("---"):
            front_matter_token = FrontMatter.read(lines)

        children = tokenizer.tokenize_main(lines=lines, skip_tokens=skip_tokens)
        foot_defs = get_parse_context().foot_definitions
        return cls(
            children=children,
            front_matter=front_matter_token,
            link_definitions=get_parse_context().link_definitions,
            footnotes=foot_defs,
            footref_order=[
                t for t in get_parse_context().foot_references if t in foot_defs
            ],
        )
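
# Usage sketch (illustrative only): ``Document.read`` accepts a string, a list
# of lines, or a ``SourceLines`` instance.  The exact child token types depend
# on the block tokens registered in the current parse context.
#
#     doc = Document.read("# Title\n\nSome *text*\n")
#     doc.children          # e.g. [Heading(...), Paragraph(...)]
#     doc.link_definitions  # {} unless ``[label]: url`` definitions were found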


@autodoc
@attr.s(slots=True, kw_only=True)
class Heading(BlockToken):
    """Heading token. (["### some heading ###\\n"])

    Boundary between span-level and block-level tokens.
    """

    level: int = attr.ib(metadata={"doc": "Heading level"})
    children: ListType[Token] = attr.ib(
        repr=lambda c: str(len(c)), metadata={"doc": "Child tokens list"}
    )
    position: Position = attr.ib(
        default=None, metadata={"doc": "Line position in source text"}
    )

    pattern = re.compile(r" {0,3}(#{1,6})(?:\n|\s+?(.*?)(?:\n|\s+?#+\s*?$))")

    @classmethod
    def start(cls, line):
        match_obj = cls.pattern.match(line)
        if match_obj is None:
            return False
        cls.level = len(match_obj.group(1))
        cls.content = (match_obj.group(2) or "").strip()
        if set(cls.content) == {"#"}:
            cls.content = ""
        return True

    @classmethod
    def read(cls, lines, expand_spans=False):
        next(lines)
        children = SpanContainer(cls.content)
        if expand_spans:
            children = children.expand()
        return cls(
            level=cls.level,
            children=children,
            position=Position.from_source_lines(lines),
        )
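
# Pattern sketch: a few lines against ``Heading.start`` (ATX rules - at most
# three leading spaces and one to six ``#`` characters):
#
#     Heading.start("## Subtitle\n")     # True  (level 2)
#     Heading.start("####### deep\n")    # False (more than six '#')
#     Heading.start("    # indented\n")  # False (four spaces -> code block)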


@autodoc
@attr.s(slots=True, kw_only=True)
class SetextHeading(BlockToken):
    """Setext headings.

    Not included in the parsing process, but returned by `Paragraph.read`.
    """

    level: int = attr.ib(metadata={"doc": "Heading level"})
    children: ListType[Token] = attr.ib(
        repr=lambda c: str(len(c)), metadata={"doc": "Child tokens list"}
    )
    position: Position = attr.ib(
        default=None, metadata={"doc": "Line position in source text"}
    )

    @classmethod
    def start(cls, line):
        raise NotImplementedError()

    @classmethod
    def read(cls, lines):
        raise NotImplementedError()


@autodoc
@attr.s(slots=True, kw_only=True)
class Quote(BlockToken):
    """Quote token. (`["> # heading\\n", "> paragraph\\n"]`)."""

    children: ListType[Token] = attr.ib(
        repr=lambda c: str(len(c)), metadata={"doc": "Child tokens list"}
    )
    position: Position = attr.ib(
        default=None, metadata={"doc": "Line position in source text"}
    )

    @staticmethod
    def start(line):
        stripped = line.lstrip(" ")
        if len(line) - len(stripped) > 3:
            return False
        return stripped.startswith(">")

    @classmethod
    def transition(cls, next_line):
        return (
            next_line is None
            or next_line.strip() == ""
            or Heading.start(next_line)
            or CodeFence.start(next_line)
            or ThematicBreak.start(next_line)
            or List.start(next_line)
        )

    @classmethod
    def read(cls, lines):
        # first line
        start_line = lines.lineno + 1
        line = cls.convert_leading_tabs(next(lines).lstrip()).split(">", 1)[1]
        if len(line) > 0 and line[0] == " ":
            line = line[1:]
        line_buffer = [line]

        # set booleans
        in_code_fence = CodeFence.start(line)
        in_block_code = BlockCode.start(line)
        blank_line = line.strip() == ""

        # loop
        next_line = lines.peek()
        while not cls.transition(next_line):
            stripped = cls.convert_leading_tabs(next_line.lstrip())
            prepend = 0
            if stripped[0] == ">":
                # has leader, not lazy continuation
                prepend += 1
                if stripped[1] == " ":
                    prepend += 1
                stripped = stripped[prepend:]
                in_code_fence = CodeFence.start(stripped)
                in_block_code = BlockCode.start(stripped)
                blank_line = stripped.strip() == ""
                line_buffer.append(stripped)
            elif in_code_fence or in_block_code or blank_line:
                # not paragraph continuation text
                break
            else:
                # lazy continuation, preserve whitespace
                line_buffer.append(next_line)
            next(lines)
            next_line = lines.peek()

        # block-level tokens are parsed here, so that link_definitions
        # in quotes can be recognized before span-level tokenizing.
        Paragraph.parse_setext = False
        try:
            child_tokens = tokenizer.tokenize_block(
                SourceLines(line_buffer, start_line=start_line)
            )
        finally:
            Paragraph.parse_setext = True
        return cls(
            children=child_tokens,
            position=Position.from_source_lines(lines, start_line=start_line),
        )

    @staticmethod
    def convert_leading_tabs(string):
        string = string.replace(">\t", ">   ", 1)
        count = 0
        for i, c in enumerate(string):
            if c == "\t":
                count += 4
            elif c == " ":
                count += 1
            else:
                break
        if i == 0:
            return string
        return ">" + " " * count + string[i:]
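
# Quote sketch (illustrative only): ``read`` strips one ``>`` leader (plus an
# optional following space) per line, then re-tokenizes the remainder as
# block-level source, so nested tokens are recognized inside the quote.
#
#     quote = Quote.read(SourceLines(["> # heading\n", "> paragraph\n"]))
#     quote.children  # e.g. [Heading(...), Paragraph(...)]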


@autodoc
@attr.s(slots=True, kw_only=True)
class Paragraph(BlockToken):
    """Paragraph token. (`["some\\n", "continuous\\n", "lines\\n"]`)

    Boundary between span-level and block-level tokens.
    """

    children: ListType[Token] = attr.ib(
        repr=lambda c: str(len(c)), metadata={"doc": "Child tokens list"}
    )
    position: Position = attr.ib(
        default=None, metadata={"doc": "Line position in source text"}
    )

    _setext_pattern = re.compile(r" {0,3}(=|-)+ *$")
    parse_setext = True  # can be disabled by Quote

    @staticmethod
    def start(line):
        return line.strip() != ""

    @classmethod
    def is_setext_heading(cls, line):
        return cls._setext_pattern.match(line)

    @classmethod
    def transition(cls, next_line):
        return (
            next_line is None
            or next_line.strip() == ""
            or Heading.start(next_line)
            or CodeFence.start(next_line)
            or Quote.start(next_line)
        )

    @classmethod
    def parse_list_marker(cls, next_line):
        return ListItem.parse_marker(next_line)

    @classmethod
    def read(cls, lines, expand_spans=False):
        line_buffer = [next(lines)]
        start_line = lines.lineno
        next_line = lines.peek()
        while not cls.transition(next_line):
            # check if next_line starts List
            list_pair = cls.parse_list_marker(next_line)
            if len(next_line) - len(next_line.lstrip()) < 4 and list_pair is not None:
                prepend, leader = list_pair
                # non-empty list item
                if next_line[:prepend].endswith(" "):
                    # unordered list, or ordered list starting from 1
                    if not leader[:-1].isdigit() or leader[:-1] == "1":
                        break
            # check if next_line starts HTMLBlock other than type 7
            html_block = HTMLBlock.start(next_line)
            if html_block and html_block != 7:
                break
            # check if we see a setext underline
            if cls.parse_setext and cls.is_setext_heading(next_line):
                line_buffer.append(next(lines))
                level = 1 if line_buffer.pop().lstrip().startswith("=") else 2
                children = SpanContainer(
                    "\n".join([line.strip() for line in line_buffer])
                )
                if expand_spans:
                    children = children.expand()
                return SetextHeading(
                    children=children,
                    level=level,
                    position=Position.from_source_lines(lines, start_line=start_line),
                )
            # check if we have a ThematicBreak (has to be after setext)
            if ThematicBreak.start(next_line):
                break
            # no other tokens, we're good
            line_buffer.append(next(lines))
            next_line = lines.peek()
        content = "".join([line.lstrip() for line in line_buffer]).strip()
        children = SpanContainer(content)
        if expand_spans:
            children = children.expand()
        return cls(
            children=children,
            position=Position.from_source_lines(lines, start_line=start_line),
        )
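
# Setext sketch (illustrative only): a paragraph followed by a ``=``/``-``
# underline is returned as a ``SetextHeading`` rather than a ``Paragraph``:
#
#     token = Paragraph.read(SourceLines(["A title\n", "=======\n"]))
#     isinstance(token, SetextHeading)  # True, with token.level == 1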


@autodoc
@attr.s(slots=True, kw_only=True)
class BlockCode(BlockToken):
    """Indented code."""

    children: ListType[Token] = attr.ib(
        repr=lambda c: str(len(c)), metadata={"doc": "Child tokens list"}
    )
    language: str = attr.ib(
        default="", metadata={"doc": "The code language (for syntax highlighting)"}
    )
    position: Position = attr.ib(
        default=None, metadata={"doc": "Line position in source text"}
    )

    @staticmethod
    def start(line):
        return line.replace("\t", "    ", 1).startswith("    ")

    @classmethod
    def read(cls, lines):
        start_line = lines.lineno
        line_buffer = []
        for line in lines:
            if line.strip() == "":
                line_buffer.append(line.lstrip(" ") if len(line) < 5 else line[4:])
                continue
            if not line.replace("\t", "    ", 1).startswith("    "):
                lines.backstep()
                break
            line_buffer.append(cls.strip(line))
        children = (span_tokens.RawText("".join(line_buffer).strip("\n") + "\n"),)
        return cls(
            children=children,
            language="",
            position=Position.from_source_lines(lines, start_line=start_line),
        )

    @staticmethod
    def strip(string):
        count = 0
        for i, c in enumerate(string):
            if c == "\t":
                return string[i + 1 :]
            elif c == " ":
                count += 1
            else:
                break
            if count == 4:
                return string[i + 1 :]
        return string
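
# Indentation sketch: four spaces (or one tab) open a ``BlockCode``, and
# ``strip`` removes exactly one indent level per line:
#
#     BlockCode.start("    x = 1\n")  # True
#     BlockCode.strip("    x = 1\n")  # "x = 1\n"
#     BlockCode.strip("\tx = 1\n")    # "x = 1\n"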


@autodoc
@attr.s(slots=True, kw_only=True)
class CodeFence(BlockToken):
    """Code fence. (["```sh\\n", "rm -rf /", ..., "```"])

    Boundary between span-level and block-level tokens.

    See <https://spec.commonmark.org/0.29/#fenced-code-blocks>
    """

    children: ListType[Token] = attr.ib(
        repr=lambda c: str(len(c)), metadata={"doc": "Child tokens list"}
    )
    language: str = attr.ib(
        default="", metadata={"doc": "The code language (for syntax highlighting)"}
    )
    arguments: str = attr.ib(
        default="", metadata={"doc": "Any string occurring after the language"}
    )
    position: Position = attr.ib(
        default=None, metadata={"doc": "Line position in source text"}
    )

    # Tildes and backticks cannot be mixed.
    pattern_tick = re.compile(r"^( {0,3})(`{3,}) *([^`\s]*) *([^`]*)$")
    pattern_tilde = re.compile(r"^( {0,3})(~{3,}) *([^~\s]*) *([^~]*)$")

    _open_info = None

    @classmethod
    def start(cls, line):
        match_obj = cls.pattern_tick.match(line)
        if not match_obj:
            match_obj = cls.pattern_tilde.match(line)
        if not match_obj:
            return False
        prepend, leader, lang, arguments = match_obj.groups()
        if leader[0] in lang or leader[0] in line[match_obj.end() :]:
            return False
        cls._open_info = len(prepend), leader, lang, arguments
        return True

    @classmethod
    def read(cls, lines):
        start_line = lines.lineno + 1
        next(lines)
        line_buffer = []
        for line in lines:
            stripped_line = line.lstrip(" ")
            diff = len(line) - len(stripped_line)
            if (
                stripped_line.startswith(cls._open_info[1])
                and len(stripped_line.split(maxsplit=1)) == 1
                and diff < 4
            ):
                break
            if diff > cls._open_info[0]:
                stripped_line = " " * (diff - cls._open_info[0]) + stripped_line
            line_buffer.append(stripped_line)
        language = span_tokens.EscapeSequence.strip(cls._open_info[2])
        arg_lines = cls._open_info[3].splitlines() or [""]
        arguments = span_tokens.EscapeSequence.strip(arg_lines[0])
        children = (span_tokens.RawText("".join(line_buffer)),)
        return cls(
            children=children,
            language=language,
            arguments=arguments,
            position=Position.from_source_lines(lines, start_line=start_line),
        )
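
# Fence sketch (illustrative only): ``start`` caches the fence info in
# ``_open_info``; ``read`` then consumes lines until a closing fence of the
# same character and at least the same length:
#
#     lines = SourceLines(["```python\n", "print('hi')\n", "```\n"])
#     CodeFence.start(lines.peek())   # True
#     CodeFence.read(lines).language  # "python"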


@autodoc
@attr.s(slots=True, kw_only=True)
class List(BlockToken):
    """List token (unordered or ordered)"""

    children: ListType[Token] = attr.ib(
        repr=lambda c: str(len(c)), metadata={"doc": "Child tokens list"}
    )
    loose: bool = attr.ib(
        metadata={"doc": "Whether list items are separated by blank lines"}
    )
    start_at: Optional[int] = attr.ib(
        metadata={"doc": "None if unordered, starting number if ordered."}
    )
    position: Position = attr.ib(
        default=None, metadata={"doc": "Line position in source text"}
    )

    _pattern = re.compile(r" {0,3}(?:\d{0,9}[.)]|[+\-*])(?:[ \t]*$|[ \t]+)")

    @classmethod
    def start(cls, line):
        return cls._pattern.match(line)

    @classmethod
    def read(cls, lines):
        start_line = lines.lineno
        leader = None
        next_marker = None
        children = []
        while True:
            item = ListItem.read(lines, next_marker)
            next_marker = item.next_marker
            item_leader = item.leader
            if leader is None:
                leader = item_leader
            elif not cls.same_marker_type(leader, item_leader):
                lines.reset()
                break
            children.append(item)
            if next_marker is None:
                break

        if children:
            # Only consider the last list item loose
            # if there's more than one element
            last_parse_buffer = children[-1]
            last_parse_buffer.loose = (
                len(last_parse_buffer.children) > 1 and last_parse_buffer.loose
            )

        loose = any(item.loose for item in children)
        leader = children[0].leader
        start = None
        if len(leader) != 1:
            start = int(leader[:-1])
        return cls(
            children=children,
            loose=loose,
            start_at=start,
            position=Position.from_source_lines(lines, start_line=start_line),
        )

    @staticmethod
    def same_marker_type(leader, other):
        if len(leader) == 1:
            return leader == other
        return (
            leader[:-1].isdigit() and other[:-1].isdigit() and leader[-1] == other[-1]
        )
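
# Marker sketch: a list continues only while items share a marker type, so
# ``same_marker_type`` compares bullets literally and ordinals by delimiter:
#
#     List.same_marker_type("-", "-")    # True
#     List.same_marker_type("-", "*")    # False (different bullet)
#     List.same_marker_type("1.", "2.")  # True  (both ordered with '.')
#     List.same_marker_type("1.", "1)")  # False (different delimiter)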


@autodoc
@attr.s(slots=True, kw_only=True)
class ListItem(BlockToken):
    """List items.

    Not included in the parsing process, but called by List.
    """

    children: ListType[Token] = attr.ib(
        repr=lambda c: str(len(c)), metadata={"doc": "Child tokens list"}
    )
    loose: bool = attr.ib(
        metadata={"doc": "Whether list items are separated by blank lines"}
    )
    leader: str = attr.ib(metadata={"doc": "The prefix number or bullet point."})
    prepend: int = attr.ib(metadata={"doc": ""})
    next_marker = attr.ib(default=None, metadata={"doc": ""})
    position: Position = attr.ib(
        default=None, metadata={"doc": "Line position in source text"}
    )

    _pattern = re.compile(r"\s*(\d{0,9}[.)]|[+\-*])(\s*$|\s+)")

    @staticmethod
    def in_continuation(line, prepend):
        return line.strip() == "" or len(line) - len(line.lstrip()) >= prepend

    @staticmethod
    def transition(next_line):
        return (
            Heading.start(next_line)
            or Quote.start(next_line)
            or CodeFence.start(next_line)
            or ThematicBreak.start(next_line)
        )

    @classmethod
    def parse_marker(cls, line):
        """
        Returns a pair (prepend, leader) if the line has a valid leader.
        """
        match_obj = cls._pattern.match(line)
        if match_obj is None:
            return None  # no valid leader
        leader = match_obj.group(1)
        content = match_obj.group(0).replace(leader + "\t", leader + "   ", 1)
        # reassign prepend and leader
        prepend = len(content)
        if prepend == len(line.rstrip("\n")):
            prepend = match_obj.end(1) + 1
        else:
            spaces = match_obj.group(2)
            if spaces.startswith("\t"):
                spaces = spaces.replace("\t", "   ", 1)
            spaces = spaces.replace("\t", "    ")
            n_spaces = len(spaces)
            if n_spaces > 4:
                prepend = match_obj.end(1) + 1
        return prepend, leader

    @classmethod
    def read(cls, lines, prev_marker=None):
        next_marker = None
        lines.anchor()
        prepend = -1
        leader = None
        start_line = lines.lineno
        line_buffer = []

        # first line
        line = next(lines)
        prepend, leader = prev_marker if prev_marker else cls.parse_marker(line)
        line = line.replace(leader + "\t", leader + "   ", 1).replace("\t", "    ")
        empty_first_line = line[prepend:].strip() == ""
        if not empty_first_line:
            line_buffer.append(line[prepend:])
        next_line = lines.peek()
        if empty_first_line and next_line is not None and next_line.strip() == "":
            child_tokens = tokenizer.tokenize_block(
                SourceLines([next(lines)], start_line=lines.lineno)
            )
            next_line = lines.peek()
            if next_line is not None:
                marker_info = cls.parse_marker(next_line)
                if marker_info is not None:
                    next_marker = marker_info
            return cls(
                children=child_tokens,
                loose=child_tokens.loose,
                prepend=prepend,
                leader=leader,
                next_marker=next_marker,
                position=Position.from_source_lines(lines, start_line=start_line),
            )

        # loop
        newline = 0
        while True:
            # no more lines
            if next_line is None:
                # strip off newlines
                if newline:
                    lines.backstep()
                    del line_buffer[-newline:]
                break
            next_line = next_line.replace("\t", "    ")
            # not in continuation
            if not cls.in_continuation(next_line, prepend):
                # directly followed by another token
                if cls.transition(next_line):
                    if newline:
                        lines.backstep()
                        del line_buffer[-newline:]
                    break
                # next_line is a new list item
                marker_info = cls.parse_marker(next_line)
                if marker_info is not None:
                    next_marker = marker_info
                    break
                # not another item, has newlines -> not continuation
                if newline:
                    lines.backstep()
                    del line_buffer[-newline:]
                    break
            next(lines)
            line = next_line
            stripped = line.lstrip(" ")
            diff = len(line) - len(stripped)
            if diff > prepend:
                stripped = " " * (diff - prepend) + stripped
            line_buffer.append(stripped)
            newline = newline + 1 if next_line.strip() == "" else 0
            next_line = lines.peek()

        child_tokens = tokenizer.tokenize_block(
            SourceLines(line_buffer, start_line=start_line)
        )
        return cls(
            children=child_tokens,
            loose=child_tokens.loose,
            prepend=prepend,
            leader=leader,
            next_marker=next_marker,
            position=Position.from_source_lines(lines, start_line=start_line),
        )
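
# Leader sketch: ``parse_marker`` returns ``(prepend, leader)``, where
# ``prepend`` is the column at which the item's content starts:
#
#     ListItem.parse_marker("- item\n")    # (2, "-")
#     ListItem.parse_marker("10. item\n")  # (4, "10.")
#     ListItem.parse_marker("plain\n")     # None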


@autodoc
@attr.s(slots=True, kw_only=True)
class LinkDefinition(BlockToken):
    """LinkDefinition token: `[ref]: url "title"`

    These are stored in `Document.link_definitions` in the final syntax tree.
    """

    # TODO this should only store one definition, then they can be stored as a dict
    # in parse_context.list_definitions
    definitions: list = attr.ib(metadata={"doc": "list of (label, dest, title)"})
    position: Position = attr.ib(
        default=None, metadata={"doc": "Line position in source text"}
    )

    label_pattern = re.compile(r"[ \n]{0,3}\[(.+?)\]", re.DOTALL)

    @classmethod
    def start(cls, line):
        return line.lstrip().startswith("[")

    @classmethod
    def read(cls, lines: SourceLines):
        line_buffer = []
        start_line = lines.lineno + 1
        next_line = lines.peek()
        while next_line is not None and next_line.strip() != "":
            line_buffer.append(next(lines))
            next_line = lines.peek()
        string = "".join(line_buffer)
        offset = 0
        matches = []
        while offset < len(string) - 1:
            match_info = cls.match_reference(lines, string, offset)
            if match_info is None:
                break
            offset, match = match_info
            matches.append(match)
        position = Position.from_source_lines(lines, start_line=start_line)
        cls.append_link_definitions(matches, position)
        return cls(position=position, definitions=matches) if matches else None

    @classmethod
    def match_reference(cls, lines, string, offset):
        match_info = cls.match_link_label(string, offset)
        if not match_info:
            cls.backtrack(lines, string, offset)
            return None
        _, label_end, label = match_info

        if not follows(string, label_end - 1, ":"):
            cls.backtrack(lines, string, offset)
            return None

        match_info = cls.match_link_dest(string, label_end)
        if not match_info:
            cls.backtrack(lines, string, offset)
            return None
        _, dest_end, dest = match_info

        match_info = cls.match_link_title(string, dest_end)
        if not match_info:
            cls.backtrack(lines, string, dest_end)
            return None
        _, title_end, title = match_info

        return title_end, (label, dest, title)

    @classmethod
    def match_link_label(cls, string, offset):
        start = -1
        end = -1
        escaped = False
        for i, c in enumerate(string[offset:], start=offset):
            if c == "\\" and not escaped:
                escaped = True
            elif c == "[" and not escaped:
                if start == -1:
                    start = i
                else:
                    return None
            elif c == "]" and not escaped:
                end = i
                label = string[start + 1 : end]
                if label.strip() != "":
                    return start, end + 1, label
                return None
            elif escaped:
                escaped = False
        return None

    @classmethod
    def match_link_dest(cls, string, offset):
        offset = shift_whitespace(string, offset + 1)
        if offset == len(string):
            return None
        if string[offset] == "<":
            escaped = False
            for i, c in enumerate(string[offset + 1 :], start=offset + 1):
                if c == "\\" and not escaped:
                    escaped = True
                elif c == " " or c == "\n" or (c == "<" and not escaped):
                    return None
                elif c == ">" and not escaped:
                    return offset, i + 1, string[offset + 1 : i]
                elif escaped:
                    escaped = False
            return None
        else:
            escaped = False
            count = 0
            for i, c in enumerate(string[offset:], start=offset):
                if c == "\\" and not escaped:
                    escaped = True
                elif c in whitespace:
                    break
                elif not escaped:
                    if c == "(":
                        count += 1
                    elif c == ")":
                        count -= 1
                    elif is_control_char(c):
                        return None
                elif escaped:
                    escaped = False
            if count != 0:
                return None
            return offset, i, string[offset:i]

    @classmethod
    def match_link_title(cls, string, offset):
        new_offset = shift_whitespace(string, offset)
        if (
            new_offset == len(string)
            or "\n" in string[offset:new_offset]
            and string[new_offset] == "["
        ):
            return offset, new_offset, ""
        if string[new_offset] == '"':
            closing = '"'
        elif string[new_offset] == "'":
            closing = "'"
        elif string[new_offset] == "(":
            closing = ")"
        elif "\n" in string[offset:new_offset]:
            return offset, offset, ""
        else:
            return None
        offset = new_offset
        escaped = False
        for i, c in enumerate(string[offset + 1 :], start=offset + 1):
            if c == "\\" and not escaped:
                escaped = True
            elif c == closing and not escaped:
                new_offset = shift_whitespace(string, i + 1)
                if "\n" not in string[i + 1 : new_offset]:
                    return None
                return offset, new_offset, string[offset + 1 : i]
            elif escaped:
                escaped = False
        return None

    @staticmethod
    def append_link_definitions(matches, position):
        for key, dest, title in matches:
            key = normalize_label(key)
            dest = span_tokens.EscapeSequence.strip(dest.strip())
            title = span_tokens.EscapeSequence.strip(title)
            link_definitions = get_parse_context().link_definitions
            if key not in link_definitions:
                link_definitions[key] = dest, title
            else:
                get_parse_context().logger.warning(
                    "{} ignoring duplicate link definition '{}'".format(
                        position.make_loc_str(), key
                    )
                )

    @staticmethod
    def backtrack(lines, string, offset):
        lines._index -= string[offset + 1 :].count("\n")
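
# Definition sketch (illustrative only): parsed definitions are normalized and
# stored in the global parse context rather than on the syntax tree:
#
#     LinkDefinition.read(SourceLines(['[Foo]: /url "a title"\n']))
#     get_parse_context().link_definitions  # e.g. {"foo": ("/url", "a title")}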


@autodoc
@attr.s(slots=True, kw_only=True)
class ThematicBreak(BlockToken):
    """Thematic break token (a.k.a. horizontal rule.)"""

    position: Position = attr.ib(
        default=None, metadata={"doc": "Line position in source text"}
    )

    _pattern = re.compile(r" {0,3}(?:([-_*])\s*?)(?:\1\s*?){2,}$")

    @classmethod
    def start(cls, line):
        return cls._pattern.match(line)

    @classmethod
    def read(cls, lines):
        next(lines)
        return cls(position=Position.from_source_lines(lines))
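
# Break sketch: three or more of the same marker, optionally interleaved with
# whitespace, match ``ThematicBreak``:
#
#     bool(ThematicBreak.start("***\n"))    # True
#     bool(ThematicBreak.start("- - -\n"))  # True
#     bool(ThematicBreak.start("**\n"))     # False (only two markers)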


@autodoc
@attr.s(slots=True, kw_only=True)
class HTMLBlock(BlockToken):
    """Block-level HTML token."""

    content: str = attr.ib(
        repr=False, metadata={"doc": "literal strings rendered as-is"}
    )
    position: Position = attr.ib(
        default=None, metadata={"doc": "Line position in source text"}
    )

    _end_cond = None
    multiblock = re.compile(r"<(script|pre|style)[ >\n]")
    predefined = re.compile(r"<\/?(.+?)(?:\/?>|[ \n])")
    custom_tag = re.compile(
        r"(?:" + "|".join((span_tokens._open_tag, span_tokens._closing_tag)) + r")\s*$"
    )

    @classmethod
    def start(cls, line):
        stripped = line.lstrip()
        if len(line) - len(stripped) >= 4:
            return False
        # rule 1: <pre>, <script> or <style> tags, allow newlines in block
        match_obj = cls.multiblock.match(stripped)
        if match_obj is not None:
            cls._end_cond = "</{}>".format(match_obj.group(1).casefold())
            return 1
        # rule 2: html comment tags, allow newlines in block
        if stripped.startswith("<!--"):
            cls._end_cond = "-->"
            return 2
        # rule 3: tags that start with <?, allow newlines in block
        if stripped.startswith("<?"):
            cls._end_cond = "?>"
            return 3
        # rule 4: tags that start with <!, allow newlines in block
        if stripped.startswith("<!") and stripped[2].isupper():
            cls._end_cond = ">"
            return 4
        # rule 5: CDATA declaration, allow newlines in block
        if stripped.startswith("<![CDATA["):
            cls._end_cond = "]]>"
            return 5
        # rule 6: predefined tags (see span_tokens._tags), read until newline
        match_obj = cls.predefined.match(stripped)
        if match_obj is not None and match_obj.group(1).casefold() in span_tokens._tags:
            cls._end_cond = None
            return 6
        # rule 7: custom tags, read until newline
        match_obj = cls.custom_tag.match(stripped)
        if match_obj is not None:
            cls._end_cond = None
            return 7
        return False

    @classmethod
    def read(cls, lines):
        # note: stop condition can trigger on the starting line
        start_line = lines.lineno
        line_buffer = []
        for line in lines:
            line_buffer.append(line)
            if cls._end_cond is not None:
                if cls._end_cond in line.casefold():
                    break
            elif line.strip() == "":
                line_buffer.pop()
                break
        return cls(
            content="".join(line_buffer).rstrip("\n"),
            position=Position.from_source_lines(lines, start_line=start_line),
        )
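
# Rule sketch: ``HTMLBlock.start`` returns the matching CommonMark HTML-block
# rule number (1-7), which the tokenizer treats as truthy:
#
#     HTMLBlock.start("<pre>\n")          # 1 (block ends at "</pre>")
#     HTMLBlock.start("<!-- note -->\n")  # 2 (block ends at "-->")
#     HTMLBlock.start("<div>\n")          # 6 (block ends at a blank line)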