Source code for pymend.docstring_parser.numpydoc

"""Numpydoc-style docstring parsing.

See
---
https://numpydoc.readthedocs.io/en/latest/format.html
"""

import inspect
import itertools
import re
from collections.abc import Iterable, Iterator
from textwrap import dedent
from typing import Optional, TypeVar

from typing_extensions import override

from .common import (
    Docstring,
    DocstringDeprecated,
    DocstringExample,
    DocstringMeta,
    DocstringParam,
    DocstringRaises,
    DocstringReturns,
    DocstringStyle,
    DocstringYields,
    KeyReturnDict,
    MainSections,
    ParseError,
    RenderingStyle,
    clean_str,
    split_description,
)

# Generic element type used in the signature of the ``_pairwise`` helper.
_T = TypeVar("_T")


def _pairwise(
    iterable: Iterable[_T], end: Optional[_T] = None
) -> Iterator[tuple[_T, Optional[_T]]]:
    """Yield each element together with its successor.

    The last element has no successor, so it is paired with ``end`` instead.
    An empty iterable produces no pairs at all.

    Parameters
    ----------
    iterable : Iterable[_T]
        Elements to walk over.
    end : Optional[_T]
        Partner used for the final element (Default value = None)

    Returns
    -------
    Iterator[tuple[_T, Optional[_T]]]
        Iterator over ``(current, following)`` pairs.
    """
    current_items, following_items = itertools.tee(iterable, 2)
    # Shift the second copy forward by one so both copies are offset by one.
    next(following_items, None)
    # The shifted copy is exactly one element shorter (or both are empty),
    # so the fill value is used at most once: for the final element.
    return itertools.zip_longest(current_items, following_items, fillvalue=end)


# Matches every line that starts with a non-whitespace character, i.e. an
# unindented line. ``_KVSection.parse`` treats each such line as an item key
# and the text up to the next match as that item's value.
KV_REGEX = re.compile(r"^[^\s].*$", flags=re.MULTILINE)
# Splits a parameter key line into ``name`` and an optional ``type``. The type
# is only recognized when the colon is preceded by whitespace ("name : type").
PARAM_KEY_REGEX = re.compile(r"^(?P<name>.*?)(?:\s+:\s*(?P<type>.*?))?$")
# Matches a type string ending in ", optional" or "(optional)" and captures
# the part in front of that suffix as ``type``.
PARAM_OPTIONAL_REGEX = re.compile(r"(?P<type>.*?)(?:, optional|\(optional\))$")

# numpydoc format has no formal grammar for this,
# but we can make some educated guesses...
# Looks for "default"/"Default" (not directly preceded by a non-space
# character), optionally followed by " is ", " = ", ": " or "s to ", and
# captures the following run of word characters, "-" and "." as ``value``.
PARAM_DEFAULT_REGEX = re.compile(
    r"(?<!\S)[Dd]efault(?: is | = |: |s to |)\s*(?P<value>[\w\-\.]*\w)"
)

# Splits a return/yield key line into an optional ``name`` (the text in front
# of a colon) and the ``type``.
RETURN_KEY_REGEX = re.compile(r"^(?:(?P<name>.*?)\s*:\s*)?(?P<type>.*?)$")



[docs]
class Section:
    """Numpydoc section parser.

    Base implementation: the whole section body is treated as free-form
    text and turned into a single ``DocstringMeta``.
    """

    def __init__(self, title: str, key: str) -> None:
        """Initialize a section.

        Parameters
        ----------
        title : str
            section title. For most sections, this is a heading like
            "Parameters" which appears on its own line, underlined by
            hyphens ('-') on the following line.
        key : str
            meta key string. In the parsed ``DocstringMeta`` instance this
            will be the first element of the ``args`` attribute list.
        """
        self.title = title
        self.key = key

    @property
    def title_pattern(self) -> str:
        """Regular expression pattern matching this section's header.

        The pattern matches the ``title`` attribute (captured in an unnamed
        group) on a line of its own, followed by a line made of exactly
        ``len(title)`` hyphens. The ``^``/``$`` anchors are intended to be
        used with ``re.MULTILINE``.

        Returns
        -------
        str
            Regex pattern as a string.
        """
        # NOTE: the title is interpolated verbatim (not escaped), so regex
        # metacharacters in a title are interpreted as regex syntax.
        underline = "-" * len(self.title)
        return rf"^({self.title})\s*?\n{underline}\s*$"

    def parse(self, text: str) -> Iterable[DocstringMeta]:
        """Parse ``DocstringMeta`` objects from the body of this section.

        Parameters
        ----------
        text : str
            section body text. Should be cleaned with
            ``inspect.cleandoc`` before parsing.

        Yields
        ------
        DocstringMeta
            object from this section body.
        """
        yield DocstringMeta([self.key], description=clean_str(text))




class _KVSection(Section):
    """Base parser for numpydoc sections with key-value syntax.

    E.g. sections that look like this:
        key
            value
        key2 : type
            values can also span...
            ... multiple lines
    """

    def _parse_item(self, key: str, value: str) -> DocstringMeta:
        """Turn one key/value pair of the section into a meta object.

        Subclasses must override this hook.

        Parameters
        ----------
        key : str
            The unindented key line of the item.
        value : str
            The cleaned text that follows the key line.

        Raises
        ------
        NotImplementedError
            Always, unless overridden by a child class.
        """
        raise NotImplementedError

    @override
    def parse(self, text: str) -> Iterable[DocstringMeta]:
        """Split the section body into items and parse each of them.

        Every line matched by ``KV_REGEX`` is a key. The text between the end
        of that line and the start of the next key (or the end of the body)
        is its value.

        Parameters
        ----------
        text : str
            Section body to parse.

        Yields
        ------
        DocstringMeta
            One parsed item per key line.
        """
        key_matches = KV_REGEX.finditer(text)
        for current, following in _pairwise(key_matches):
            # ``following`` is None for the last key; slicing up to None
            # takes everything to the end of the body.
            value_end = None if following is None else following.start()
            raw_value = text[current.end() : value_end]
            yield self._parse_item(
                key=current.group(), value=inspect.cleandoc(raw_value)
            )


class _SphinxSection(Section):
    """Base parser for numpydoc sections with sphinx-style syntax.

    E.g. sections that look like this:
        .. title:: something
            possibly over multiple lines
    """

    @property
    @override
    def title_pattern(self) -> str:
        """Header pattern for a sphinx directive such as ``.. title::``.

        The title is captured in an unnamed group. Unlike the base class, no
        underline is expected.

        Returns
        -------
        str
            Regex pattern as a string.
        """
        directive_pattern = rf"^\.\.\s*({self.title})\s*::"
        return directive_pattern



[docs]
class ParamSection(_KVSection):
    """Parser for numpydoc parameter sections.

    E.g. any section that looks like this:
        arg_name
            arg_description
        arg_2 : type, optional
            descriptions can also span...
            ... multiple lines
    """

    @override
    def _parse_item(self, key: str, value: str) -> DocstringParam:
        """Build a ``DocstringParam`` from one entry of the section.

        Parameters
        ----------
        key : str
            Key line of the entry: the parameter name, optionally followed
            by `` : type`` and an "optional" marker.
        value : str
            Description of the entry. It is also searched for a default value.

        Returns
        -------
        DocstringParam
            Parsed representation of the parameter item.

        Raises
        ------
        ParseError
            If the key line does not match the expected layout.
        ParseError
            If the captured parameter name is not a string.
        """
        key_match = PARAM_KEY_REGEX.match(key)
        if key_match is None:
            msg = f"Could not parse param key on line `{key}`"
            raise ParseError(msg)

        arg_name = key_match.group("name")
        if not isinstance(arg_name, str):
            msg = (
                f"Did not get a string when capturing mandatory section"
                f" 'arg_name' for key line `{key}`. Got `{arg_name}` instead."
            )
            raise ParseError(msg)

        # Optionality can only be decided when a type was given at all,
        # otherwise it stays unknown (None).
        type_name = key_match.group("type")
        is_optional = None
        if isinstance(type_name, str):
            optional_match = PARAM_OPTIONAL_REGEX.match(type_name)
            is_optional = optional_match is not None
            if optional_match is not None:
                # Strip the "optional" marker from the reported type.
                type_name = optional_match.group("type")
        else:
            type_name = None

        default_match = PARAM_DEFAULT_REGEX.search(value) if value != "" else None
        default = None if default_match is None else default_match.group("value")

        return DocstringParam(
            args=[self.key, arg_name],
            description=clean_str(value),
            arg_name=arg_name,
            type_name=type_name,
            is_optional=is_optional,
            default=default,
        )




[docs]
class RaisesSection(_KVSection):
    """Parser for numpydoc raises sections.

    E.g. any section that looks like this:
        ValueError
            A description of what might raise ValueError
    """

    @override
    def _parse_item(self, key: str, value: str) -> DocstringRaises:
        """Build a ``DocstringRaises`` from one entry of the section.

        Parameters
        ----------
        key : str
            Key line of the entry, usually the name of the exception.
        value : str
            Description of the entry.

        Returns
        -------
        DocstringRaises
            Parsed representation of the raises item.
        """
        # An empty key is reported as "no type" rather than an empty string.
        exception_type = None if key == "" else key
        description = clean_str(value)
        return DocstringRaises(
            args=[self.key, key],
            description=description,
            type_name=exception_type,
        )




[docs]
class ReturnsSection(_KVSection):
    """Parser for numpydoc returns sections.

    E.g. any section that looks like this:
        return_name : type
            A description of this returned value
        another_type
            Return names are optional, types are required
    """

    is_generator = False

    @override
    def _parse_item(self, key: str, value: str) -> DocstringReturns:
        """Build a ``DocstringReturns`` from one entry of the section.

        Parameters
        ----------
        key : str
            Key line of the entry (a type, optionally preceded by ``name :``)
        value : str
            Description of the returned value.

        Returns
        -------
        DocstringReturns
            Parsed representation of the return item.
        """
        return_name = type_name = None
        key_match = RETURN_KEY_REGEX.match(key)
        if key_match is not None:
            return_name, type_name = key_match.group("name", "type")

        return DocstringReturns(
            args=[self.key],
            description=clean_str(value),
            type_name=type_name,
            is_generator=self.is_generator,
            return_name=return_name,
        )




[docs]
class YieldsSection(_KVSection):
    """Parser for numpydoc generator "yields" sections."""

    is_generator = True

    @override
    def _parse_item(self, key: str, value: str) -> DocstringYields:
        """Build a ``DocstringYields`` from one entry of the section.

        Parameters
        ----------
        key : str
            Key line of the entry (a type, optionally preceded by ``name :``)
        value : str
            Description of the yielded value.

        Returns
        -------
        DocstringYields
            Parsed representation of the yield item.
        """
        parsed_key = RETURN_KEY_REGEX.match(key)
        if parsed_key is None:
            yield_name = None
            type_name = None
        else:
            yield_name = parsed_key.group("name")
            type_name = parsed_key.group("type")

        description = clean_str(value)
        return DocstringYields(
            args=[self.key],
            description=description,
            type_name=type_name,
            is_generator=self.is_generator,
            yield_name=yield_name,
        )




[docs]
class DeprecationSection(_SphinxSection):
    """Parser for numpydoc "deprecation warning" sections.

    E.g. any section that looks like this:
        .. deprecated:: 1.6.0
            This description has
            multiple lines!
    """

    @override
    def parse(self, text: str) -> Iterable[DocstringDeprecated]:
        """Parse ``DocstringDeprecated`` objects from the body of this section.

        The first line of ``text`` is the version. Everything after the first
        newline, if there is one, is the description.

        Parameters
        ----------
        text : str
            Text of the deprecation section.

        Yields
        ------
        DocstringDeprecated
            Parsed representation of the deprecation item. Its description is
            ``None`` when ``text`` consists of a single line.
        """
        # ``partition`` always yields a (possibly empty) string for the version,
        # so no ``None`` check is needed. A missing version is an empty string.
        version, newline, remainder = text.partition("\n")
        # Without a newline there is no description part at all.
        desc = clean_str(inspect.cleandoc(remainder)) if newline else None

        yield DocstringDeprecated(
            args=[self.key], description=desc, version=clean_str(version)
        )





[docs]
class ExamplesSection(Section):
    """Parser for numpydoc examples sections.

    E.g. any section that looks like this:

        Optional description for the following example. Always preceded
        and followed by an empty line. Except for the first description.

        >>> import numpy.matlib
        >>> np.matlib.empty((2, 2))    # filled with random data
        matrix([[  6.76425276e-320,   9.79033856e-307], # random
                [  7.39337286e-309,   3.22135945e-309]])

        Description for the second example.

        >>> d = np.zeros((5,2))
        >>> for i in range(5):
        ...   for j in range(2):
        ...     for k in range(3):
        ...       for n in range(4):
        ...         d[i,j] += a[k,n,i] * b[n,k,j]
        >>> c == d
        array([[ True,  True],
            [ True,  True],
            [ True,  True],
            [ True,  True],
            [ True,  True]])
    """

    @override
    def parse(self, text: str) -> Iterable[DocstringExample]:
        """Parse ``DocstringExample`` objects from the body of this section.

        The body is consumed as repeated groups of: blank lines, an optional
        description paragraph (lines not starting with ``>>>``), blank lines,
        and a snippet that runs until the next blank line.

        Parameters
        ----------
        text : str
            section body text. Should be cleaned with
            ``inspect.cleandoc`` before parsing.

        Yields
        ------
        DocstringExample
            One object per description/snippet group. ``snippet`` is ``None``
            when the group has no snippet lines. An empty body still yields
            one example with an empty description and no snippet.
        """
        # The lines are stored in reverse so that the next unread line is
        # always at the back of the list and can be popped cheaply.
        # Could also use a deque.
        # ---
        # A blank sentinel is appended, which makes it the first line popped,
        # so the loop body runs at least once even for an empty body.
        lines = [*reversed(dedent(text).strip().splitlines()), "\n"]
        while lines:
            snippet_lines: list[str] = []
            description_lines: list[str] = []
            # Empty lines before the description
            while lines and lines[-1].strip() == "":
                lines.pop()
            # Description. Should not start with ">>>". If that were the case
            # then there was no description.
            while lines and lines[-1].strip() != "" and not lines[-1].startswith(">>>"):
                description_lines.append(lines.pop())
            # Empty lines after description
            while lines and lines[-1].strip() == "":
                lines.pop()
            # Here the actual example starts.
            # We take any line.
            # The code part starts with ">>>" or "..."
            # but the result part can be anything.
            # Just keeping until an empty line which should indicate the next example.
            while lines and lines[-1].strip() != "":
                snippet_lines.append(lines.pop())
            yield DocstringExample(
                [self.key],
                snippet="\n".join(snippet_lines) if snippet_lines else None,
                description="\n".join(description_lines),
            )




# Sections recognized when ``NumpydocParser`` is created without an explicit
# section list. Each entry pairs an accepted heading with the meta key its
# items are stored under; several headings (e.g. singular/plural spellings)
# share the same key.
DEFAULT_SECTIONS = [
    ParamSection("Parameters", "param"),
    ParamSection("Params", "param"),
    ParamSection("Arguments", "param"),
    ParamSection("Args", "param"),
    ParamSection("Other Parameters", "other_param"),
    ParamSection("Other Params", "other_param"),
    ParamSection("Other Arguments", "other_param"),
    ParamSection("Other Args", "other_param"),
    ParamSection("Receives", "receives"),
    ParamSection("Receive", "receives"),
    RaisesSection("Raises", "raises"),
    RaisesSection("Raise", "raises"),
    RaisesSection("Warns", "warns"),
    RaisesSection("Warn", "warns"),
    ParamSection("Attributes", "attribute"),
    ParamSection("Attribute", "attribute"),
    ParamSection("Methods", "method"),
    ParamSection("Method", "method"),
    ReturnsSection("Returns", "returns"),
    ReturnsSection("Return", "returns"),
    YieldsSection("Yields", "yields"),
    YieldsSection("Yield", "yields"),
    ExamplesSection("Examples", "examples"),
    ExamplesSection("Example", "examples"),
    Section("Warnings", "warnings"),
    Section("Warning", "warnings"),
    Section("See Also", "see_also"),
    Section("Related", "see_also"),
    Section("Notes", "notes"),
    Section("Note", "notes"),
    Section("References", "references"),
    Section("Reference", "references"),
    # Sphinx-style directive (".. deprecated::") rather than an underlined heading.
    DeprecationSection("deprecated", "deprecation"),
]



[docs]
class NumpydocParser:
    """Parser for numpydoc-style docstrings."""

    def __init__(self, sections: Optional[Iterable[Section]] = None) -> None:
        """Set up sections.

        Parameters
        ----------
        sections : Optional[Iterable[Section]]
            Recognized sections or None to defaults.
        """
        # Sections are looked up by the exact title text matched in a docstring.
        self.sections = {s.title: s for s in (sections or DEFAULT_SECTIONS)}
        # Maps section keys to the actually used titles.
        self.section_titles: dict[str, str] = {}
        self._setup()

    def _setup(self) -> None:
        """Set up parser title regex.

        Combines the header patterns of all registered sections into a single
        alternation, compiled with ``re.MULTILINE``.
        """
        self.titles_re = re.compile(
            r"|".join(s.title_pattern for s in self.sections.values()),
            flags=re.MULTILINE,
        )

    def add_section(self, section: Section) -> None:
        """Add or replace a section.

        A section with the same title as an existing one replaces it. The
        title regex is rebuilt afterwards.

        Parameters
        ----------
        section : Section
            The new section.
        """
        self.sections[section.title] = section
        self._setup()

    def canonical_titles(self) -> KeyReturnDict[str, str]:
        """Map known section titles to the titles actually used.

        For every registered title whose section key was recorded in
        ``section_titles`` by ``parse``, the result maps that title to the
        recorded one. Titles whose key was not recorded are left out.

        Returns
        -------
        KeyReturnDict[str, str]
            Mapping from registered title to the title used for the same key.
        """
        return KeyReturnDict(
            {
                title: self.section_titles[key]
                for title in self.sections
                if (key := self.sections[title].key) in self.section_titles
            }
        )

    def parse(self, text: Optional[str]) -> Docstring:
        """Parse the numpy-style docstring into its components.

        Parameters
        ----------
        text : Optional[str]
            docstring text

        Returns
        -------
        Docstring
            parsed docstring

        Raises
        ------
        ParseError
            If multiple titles are found for the same section.
        """
        # Titles are tracked per docstring. Without this reset, reusing one
        # parser instance for a second docstring would report its sections
        # as duplicates of the ones seen in the previous docstring.
        self.section_titles = {}

        ret = Docstring(style=DocstringStyle.NUMPYDOC)
        if not text:
            return ret

        # Clean according to PEP-0257
        text = inspect.cleandoc(text)

        # Everything before the first section header is the description.
        if match := self.titles_re.search(text):
            desc_chunk = text[: match.start()]
            meta_chunk = text[match.start() :]
        else:
            desc_chunk = text
            meta_chunk = ""

        # Break description into short and long parts
        split_description(ret, desc_chunk)

        for match, nextmatch in _pairwise(self.titles_re.finditer(meta_chunk)):
            # Only the group of the alternative that matched is not None.
            title = next(g for g in match.groups() if g is not None)
            factory = self.sections[title]
            key = factory.key
            if key in self.section_titles:
                msg = (
                    "Duplicated titles for identical section:"
                    f" {title} and {self.section_titles[key]}"
                )
                raise ParseError(msg)
            self.section_titles[key] = title

            # section chunk starts after the header,
            # ends at the start of the next header
            start = match.end()
            end = nextmatch.start() if nextmatch is not None else None
            ret.meta.extend(factory.parse(meta_chunk[start:end]))

        ret.section_titles = self.canonical_titles()
        return ret





[docs]
def parse(text: Optional[str]) -> Docstring:
    """Parse a numpy-style docstring with a fresh default parser.

    Parameters
    ----------
    text : Optional[str]
        docstring text

    Returns
    -------
    Docstring
        parsed docstring
    """
    default_parser = NumpydocParser()
    return default_parser.parse(text)




[docs]
def process_examples(examples: list[DocstringExample], parts: list[str]) -> None:
    """Append the rendered "Examples" section to ``parts``.

    Nothing is appended when ``examples`` is empty. Otherwise the heading and
    its underline are added, then each example's description and snippet,
    with blank entries separating them, and finally one trailing blank entry.

    Parameters
    ----------
    examples : list[DocstringExample]
        DocstringExamples to add to parts.
    parts : list[str]
        List of strings representing the final output of compose().
        Modified in place.
    """
    if not examples:
        return

    heading = "Examples"
    parts.extend([heading, "-" * len(heading)])
    for index, example in enumerate(examples):
        # Every example but the first is separated from its predecessor.
        if index > 0:
            parts.append("")
        has_snippet = bool(example.snippet)
        if example.description:
            parts.append(example.description)
            # A blank entry only goes between a description and its snippet.
            # Without a snippet, the next example adds the separator itself.
            if has_snippet:
                parts.append("")
        if has_snippet:
            parts.append(example.snippet)
    parts.append("")




[docs]
def compose(  # noqa: PLR0915, PLR0912
    # pylint: disable=W0613,R0915,R0912
    docstring: Docstring,
    rendering_style: RenderingStyle = RenderingStyle.COMPACT,  # noqa: ARG001
    indent: str = "    ",
) -> str:
    """Render a parsed docstring into docstring text.

    Sections are emitted in a fixed order: descriptions and deprecation
    notice, then Parameters, Attributes, Methods, Returns, Yields, Receives,
    Other Parameters, Raises, Warns, Examples, and finally all remaining meta
    entries in the order they appear in ``docstring.meta``.

    Parameters
    ----------
    docstring : Docstring
        parsed docstring representation
    rendering_style : RenderingStyle
        the style to render docstrings (Default value = RenderingStyle.COMPACT)
        Currently not used by this function.
    indent : str
        the characters used as indentation in the docstring string
        (Default value = '    ')

    Returns
    -------
    str
        docstring text
    """
    # Titles recorded while parsing are only reused when the docstring was
    # numpydoc to begin with. Otherwise an empty mapping is used.
    titles: KeyReturnDict[str, str] = (
        docstring.section_titles
        if docstring.style == DocstringStyle.NUMPYDOC
        else KeyReturnDict()
    )

    def process_one(one: MainSections) -> None:
        """Build the output text for one entry in a section.

        Parameters
        ----------
        one : MainSections
            Docstring for which to build the raw text.
        """
        if isinstance(one, DocstringParam):
            head = one.arg_name
        elif isinstance(one, DocstringReturns):
            head = one.return_name
        elif isinstance(one, DocstringYields):
            head = one.yield_name
        else:
            head = None

        # Header line is "name : type", just the type, just the name,
        # or a placeholder when neither is available.
        if one.type_name and head:
            head += f" : {one.type_name}"
        elif one.type_name:
            head = one.type_name
        elif not head:
            head = "__missing_required_field__"

        if isinstance(one, DocstringParam) and one.is_optional:
            head += ", optional"

        if one.description:
            # Description lines are indented below the header line.
            body = f"\n{indent}".join([head, *one.description.splitlines()])
            parts.append(body)
        else:
            parts.append(head)

    def process_sect(name: str, args: list[MainSections]) -> None:
        """Build the output for a docstring section.

        Parameters
        ----------
        name : str
            Section for which to build the output.
        args : list[MainSections]
            List of individual elements of that section.
        """
        name = titles[name]
        if args:
            parts.append(name)
            parts.append("-" * len(name))
            for arg in args:
                process_one(arg)
            parts.append("")

    parts: list[str] = []
    if docstring.short_description:
        parts.append(docstring.short_description)
    if docstring.blank_after_short_description:
        parts.append("")

    if docstring.deprecation:
        first = ".. deprecated::"
        if docstring.deprecation.version:
            first += f" {docstring.deprecation.version}"
        if docstring.deprecation.description:
            rest = docstring.deprecation.description.splitlines()
        else:
            rest = []
        sep = f"\n{indent}"
        parts.append(sep.join([first, *rest]))

    if docstring.long_description:
        parts.append(docstring.long_description)
    if docstring.blank_after_long_description:
        parts.append("")

    process_sect(
        "Parameters",
        [item for item in docstring.params or [] if item.args[0] == "param"],
    )

    process_sect(
        "Attributes",
        [item for item in docstring.params or [] if item.args[0] == "attribute"],
    )

    process_sect(
        "Methods",
        [item for item in docstring.params or [] if item.args[0] == "method"],
    )

    process_sect(
        "Returns",
        list(docstring.many_returns or []),
    )

    process_sect(
        "Yields",
        list(docstring.many_yields or []),
    )

    if docstring.returns and not docstring.many_returns:
        ret = docstring.returns
        # Choose the heading from the entry's generator flag. Testing ``ret``
        # itself is always true inside this branch and would always pick
        # "Yields".
        parts.append(titles["Yields" if ret.is_generator else "Returns"])
        parts.append("-" * len(parts[-1]))
        process_one(ret)

    process_sect(
        "Receives",
        [item for item in docstring.params or [] if item.args[0] == "receives"],
    )

    process_sect(
        "Other Parameters",
        [item for item in docstring.params or [] if item.args[0] == "other_param"],
    )

    process_sect(
        "Raises",
        [item for item in docstring.raises or [] if item.args[0] == "raises"],
    )

    process_sect(
        "Warns",
        [item for item in docstring.raises or [] if item.args[0] == "warns"],
    )

    process_examples(docstring.examples, parts)

    for meta in docstring.meta:
        if isinstance(
            meta,
            (
                DocstringDeprecated,
                DocstringParam,
                DocstringReturns,
                DocstringRaises,
                DocstringYields,
                DocstringExample,
            ),
        ):
            continue  # Already handled
        # Turn the meta key back into a heading: underscores become spaces
        # so that e.g. "see_also" maps to "See Also" (not "Seealso").
        title = titles[meta.args[0].replace("_", " ").title()]
        parts.append(title)
        parts.append("-" * len(title))

        if meta.description:
            parts.append(meta.description)
        parts.append("")

    # Drop trailing blank entries so the text does not end in empty lines.
    while parts and not parts[-1]:
        parts.pop()

    return "\n".join(parts)