# Source code for pymend.docstring_parser.numpydoc

"""Numpydoc-style docstring parsing.

See
---
https://numpydoc.readthedocs.io/en/latest/format.html
"""

import inspect
import itertools
import re
from collections.abc import Iterable, Iterator
from textwrap import dedent
from typing import Optional, TypeVar

from typing_extensions import override

from .common import (
    Docstring,
    DocstringDeprecated,
    DocstringExample,
    DocstringMeta,
    DocstringParam,
    DocstringRaises,
    DocstringReturns,
    DocstringStyle,
    DocstringYields,
    KeyReturnDict,
    MainSections,
    ParseError,
    RenderingStyle,
    clean_str,
    split_description,
)

_T = TypeVar("_T")


def _pairwise(
    iterable: Iterable[_T], end: Optional[_T] = None
) -> Iterator[tuple[_T, Optional[_T]]]:
    """Iterate over successive pairs with overhang for last element.

    Parameters
    ----------
    iterable : Iterable[_T]
        Iterable to iterate over.
    end : Optional[_T]
        Value for the overhang (Default value = None)

    Returns
    -------
    Iterator[tuple[_T, Optional[_T]]]
        Iterator yielding the successive pairs.
    """
    left, right = itertools.tee(iterable)
    next(right, None)
    return zip(left, itertools.chain(right, [end]))


# Matches every complete line whose first character is not whitespace.
# `_KVSection.parse` treats each such line as the key of a new item and the
# text up to the next match as that item's value.
KV_REGEX = re.compile(r"^[^\s].*$", flags=re.MULTILINE)
# Splits a parameter key line into `name` and an optional `type`.
# The type is only recognised when the colon is preceded by whitespace
# ("name : type"); otherwise the whole line ends up in `name`.
PARAM_KEY_REGEX = re.compile(r"^(?P<name>.*?)(?:\s+:\s*(?P<type>.*?))?$")
# Matches a type string that ends in ", optional" or "(optional)" and
# captures the part before that marker as `type`.
PARAM_OPTIONAL_REGEX = re.compile(r"(?P<type>.*?)(?:, optional|\(optional\))$")

# numpydoc format has no formal grammar for this,
# but we can make some educated guesses...
# Looks for "default"/"Default" at the start of a word, optionally followed
# by " is ", " = ", ": " or "s to ", and captures the token after it as
# `value` (word characters, "-" and ".", ending in a word character).
PARAM_DEFAULT_REGEX = re.compile(
    r"(?<!\S)[Dd]efault(?: is | = |: |s to |)\s*(?P<value>[\w\-\.]*\w)"
)

# Splits a returns/yields key line into an optional `name` (the text before
# the first colon) and the `type` (everything after it, or the whole line
# when there is no colon).
RETURN_KEY_REGEX = re.compile(r"^(?:(?P<name>.*?)\s*:\s*)?(?P<type>.*?)$")


class Section:
    """Numpydoc section parser."""

    def __init__(self, title: str, key: str) -> None:
        """Store the heading text and the meta key of this section.

        Parameters
        ----------
        title : str
            Section title. For most sections this is a heading such as
            "Parameters" that sits on its own line and is underlined with
            hyphens ('-') on the following line.
        key : str
            Meta key string. ``parse`` passes it as the first element of
            the ``args`` list of the ``DocstringMeta`` it creates.
        """
        self.title = title
        self.key = key

    @property
    def title_pattern(self) -> str:
        """Regular expression pattern matching this section's header.

        The pattern captures the ``title`` attribute in an unnamed group and
        requires the following line to consist of ``len(title)`` hyphens,
        optionally followed by whitespace.

        Returns
        -------
        str
            Regex pattern as a string.
        """
        underline = "-" * len(self.title)
        return rf"^({self.title})\s*?\n{underline}\s*$"

    def parse(self, text: str) -> Iterable[DocstringMeta]:
        """Parse ``DocstringMeta`` objects from the body of this section.

        Parameters
        ----------
        text : str
            Section body text. Should be cleaned with ``inspect.cleandoc``
            before parsing.

        Yields
        ------
        DocstringMeta
            A single entry built from ``[self.key]`` and the cleaned text.
        """
        meta = DocstringMeta([self.key], description=clean_str(text))
        yield meta
class _KVSection(Section):
    """Base parser for numpydoc sections with key-value syntax.

    E.g. sections that look like this:
        key
            value
        key2 : type
            values can also span...
            ... multiple lines
    """

    def _parse_item(self, key: str, value: str) -> DocstringMeta:
        """Turn one key/value pair into a meta object (abstract hook).

        Parameters
        ----------
        key : str
            Key line of the item.
        value : str
            Body text belonging to the item.

        Raises
        ------
        NotImplementedError
            Always; subclasses have to provide the implementation.
        """
        raise NotImplementedError

    @override
    def parse(self, text: str) -> Iterable[DocstringMeta]:
        """Parse all items in the docstring text.

        Parameters
        ----------
        text : str
            Section body to split into items.

        Yields
        ------
        DocstringMeta
            One parsed item per key line found by ``KV_REGEX``.
        """
        for key_match, following in _pairwise(KV_REGEX.finditer(text)):
            # The value runs from the end of this key line up to the next
            # key line, or to the end of the text for the last item.
            body_start = key_match.end()
            body_end = None if following is None else following.start()
            raw_body = text[body_start:body_end]
            yield self._parse_item(
                key=key_match.group(), value=inspect.cleandoc(raw_body)
            )


class _SphinxSection(Section):
    """Base parser for numpydoc sections with sphinx-style syntax.

    E.g. sections that look like this:
        .. title:: something
            possibly over multiple lines
    """

    @property
    @override
    def title_pattern(self) -> str:
        """Title pattern used by sphinx sections.

        Matches ``..``, optional whitespace, the title (captured in an
        unnamed group), optional whitespace and ``::``.

        Returns
        -------
        str
            Regex pattern as a string.
        """
        return rf"^\.\.\s*({self.title})\s*::"
class ParamSection(_KVSection):
    """Parser for numpydoc parameter sections.

    E.g. any section that looks like this:
        arg_name
            arg_description
        arg_2 : type, optional
            descriptions can also span...
            ... multiple lines
    """

    @override
    def _parse_item(self, key: str, value: str) -> DocstringParam:
        """Parse item from a parameter section.

        Parameters
        ----------
        key : str
            Key line of the item. Holds the parameter name and optionally
            its type, which may carry an "optional" marker.
        value : str
            Description of the item. Searched for a default value.

        Returns
        -------
        DocstringParam
            Parsed representation of the parameter item.

        Raises
        ------
        ParseError
            If the key line does not match ``PARAM_KEY_REGEX``.
        ParseError
            If the captured parameter name is not a string.
        """
        key_match = PARAM_KEY_REGEX.match(key)
        if key_match is None:
            msg = f"Could not parse param key on line `{key}`"
            raise ParseError(msg)

        arg_name = key_match.group("name")
        if not isinstance(arg_name, str):
            msg = (
                f"Did not get a string when capturing mandatory section"
                f" 'arg_name' for key line `{key}`. Got `{arg_name}` instead."
            )
            raise ParseError(msg)

        # Without a type on the key line both the type and the optional
        # flag stay unknown (None) rather than False.
        type_name: Optional[str] = None
        is_optional: Optional[bool] = None
        raw_type = key_match.group("type")
        if isinstance(raw_type, str):
            optional_match = PARAM_OPTIONAL_REGEX.match(raw_type)
            is_optional = optional_match is not None
            type_name = (
                optional_match.group("type")
                if optional_match is not None
                else raw_type
            )

        default_match = PARAM_DEFAULT_REGEX.search(value) if value != "" else None
        default = default_match.group("value") if default_match is not None else None

        return DocstringParam(
            args=[self.key, arg_name],
            description=clean_str(value),
            arg_name=arg_name,
            type_name=type_name,
            is_optional=is_optional,
            default=default,
        )
class RaisesSection(_KVSection):
    """Parser for numpydoc raises sections.

    E.g. any section that looks like this:
        ValueError
            A description of what might raise ValueError
    """

    @override
    def _parse_item(self, key: str, value: str) -> DocstringRaises:
        """Parse an item in the raises section.

        Parameters
        ----------
        key : str
            Key line of the item. Usually the name of the raised exception.
        value : str
            Description of the item.

        Returns
        -------
        DocstringRaises
            Parsed representation of the raises item.
        """
        # An empty key line means no exception type was given.
        exception_type = None if key == "" else key
        return DocstringRaises(
            args=[self.key, key],
            description=clean_str(value),
            type_name=exception_type,
        )
class ReturnsSection(_KVSection):
    """Parser for numpydoc returns sections.

    E.g. any section that looks like this:
        return_name : type
            A description of this returned value
        another_type
            Return names are optional, types are required
    """

    is_generator = False

    @override
    def _parse_item(self, key: str, value: str) -> DocstringReturns:
        """Parse an item from the return section.

        Parameters
        ----------
        key : str
            Key line of the item (usually type, possibly name + type).
        value : str
            Description of the return value.

        Returns
        -------
        DocstringReturns
            Parsed representation of the return item.
        """
        key_match = RETURN_KEY_REGEX.match(key)
        if key_match is None:
            return_name, type_name = None, None
        else:
            return_name, type_name = key_match.group("name", "type")

        return DocstringReturns(
            args=[self.key],
            description=clean_str(value),
            type_name=type_name,
            is_generator=self.is_generator,
            return_name=return_name,
        )
class YieldsSection(_KVSection):
    """Parser for numpydoc generator "yields" sections."""

    is_generator = True

    @override
    def _parse_item(self, key: str, value: str) -> DocstringYields:
        """Parse an item from the yield section.

        Parameters
        ----------
        key : str
            Key line of the item (usually type, possibly name + type).
        value : str
            Description of the yielded value.

        Returns
        -------
        DocstringYields
            Parsed representation of the yield item.
        """
        key_match = RETURN_KEY_REGEX.match(key)
        if key_match is None:
            yield_name, type_name = None, None
        else:
            yield_name, type_name = key_match.group("name", "type")

        return DocstringYields(
            args=[self.key],
            description=clean_str(value),
            type_name=type_name,
            is_generator=self.is_generator,
            yield_name=yield_name,
        )
class DeprecationSection(_SphinxSection):
    """Parser for numpydoc "deprecation warning" sections.

    E.g. any section that looks like this:
        .. deprecated:: 1.6.0
            This description has
            multiple lines!
    """

    @override
    def parse(self, text: str) -> Iterable[DocstringDeprecated]:
        """Parse ``DocstringDeprecated`` objects from the body of this section.

        Parameters
        ----------
        text : str
            Text of the deprecation section. The first line is taken as the
            version, everything after the first newline as the description.

        Yields
        ------
        DocstringDeprecated
            Parsed representation of the deprecation item.

        Raises
        ------
        ParseError
            If the parsed version number was unexpectedly `None`.
            Usually a lack of version number would be represented
            by an empty string.
        """
        pieces = text.split(sep="\n", maxsplit=1)
        version = pieces[0] if pieces else None
        desc = pieces[1] if len(pieces) > 1 else None

        if version is None:
            msg = (
                f"Got `None` while parsing version number "
                f"in deprecated section `{text}`."
            )
            raise ParseError(msg)

        if desc is not None:
            desc = clean_str(inspect.cleandoc(desc))

        yield DocstringDeprecated(
            args=[self.key], description=desc, version=clean_str(version)
        )
class ExamplesSection(Section):
    """Parser for numpydoc examples sections.

    E.g. any section that looks like this:

        Optional description for the following example. Always preceded
        and followed by an empty line. Except for the first description.

        >>> import numpy.matlib
        >>> np.matlib.empty((2, 2))    # filled with random data
        matrix([[  6.76425276e-320,   9.79033856e-307], # random
                [  7.39337286e-309,   3.22135945e-309]])

        Description for the second example.

        >>> d = np.zeros((5,2))
        >>> for i in range(5):
        ...   for j in range(2):
        ...     for k in range(3):
        ...       for n in range(4):
        ...         d[i,j] += a[k,n,i] * b[n,k,j]
        >>> c == d
        array([[ True,  True],
               [ True,  True],
               [ True,  True],
               [ True,  True],
               [ True,  True]])
    """

    @override
    def parse(self, text: str) -> Iterable[DocstringExample]:
        """Parse ``DocstringExample`` objects from the body of this section.

        Parameters
        ----------
        text : str
            Section body text. Should be cleaned with ``inspect.cleandoc``
            before parsing.

        Yields
        ------
        DocstringExample
            One entry per example: an optional description followed by an
            optional snippet, separated from its neighbours by blank lines.
        """
        # A leading blank entry lets the first example go through the same
        # "skip blank lines" step as every later one.
        rows = ["\n", *dedent(text).strip().splitlines()]
        total = len(rows)
        pos = 0
        while pos < total:
            # Blank lines in front of the description.
            while pos < total and rows[pos].strip() == "":
                pos += 1

            # Description: non-blank lines up to the first ">>>" line.
            # If the first line already starts with ">>>" there is none.
            description_start = pos
            while (
                pos < total
                and rows[pos].strip() != ""
                and not rows[pos].startswith(">>>")
            ):
                pos += 1
            description_lines = rows[description_start:pos]

            # Blank lines between description and snippet.
            while pos < total and rows[pos].strip() == "":
                pos += 1

            # Snippet: code lines and their output, whatever they look like,
            # up to the next blank line which separates the next example.
            snippet_start = pos
            while pos < total and rows[pos].strip() != "":
                pos += 1
            snippet_lines = rows[snippet_start:pos]

            yield DocstringExample(
                [self.key],
                snippet="\n".join(snippet_lines) if snippet_lines else None,
                description="\n".join(description_lines),
            )
# Sections recognised by `NumpydocParser` when no custom list is supplied.
# Each entry pairs a heading title with the meta key it is stored under;
# several titles (e.g. plural and singular spellings) share one key.
DEFAULT_SECTIONS = [
    # Parameter-like sections.
    ParamSection("Parameters", "param"),
    ParamSection("Params", "param"),
    ParamSection("Arguments", "param"),
    ParamSection("Args", "param"),
    ParamSection("Other Parameters", "other_param"),
    ParamSection("Other Params", "other_param"),
    ParamSection("Other Arguments", "other_param"),
    ParamSection("Other Args", "other_param"),
    ParamSection("Receives", "receives"),
    ParamSection("Receive", "receives"),
    # Exceptions and warnings.
    RaisesSection("Raises", "raises"),
    RaisesSection("Raise", "raises"),
    RaisesSection("Warns", "warns"),
    RaisesSection("Warn", "warns"),
    # Class members, parsed with the parameter syntax.
    ParamSection("Attributes", "attribute"),
    ParamSection("Attribute", "attribute"),
    ParamSection("Methods", "method"),
    ParamSection("Method", "method"),
    # Return and yield values.
    ReturnsSection("Returns", "returns"),
    ReturnsSection("Return", "returns"),
    YieldsSection("Yields", "yields"),
    YieldsSection("Yield", "yields"),
    # Examples.
    ExamplesSection("Examples", "examples"),
    ExamplesSection("Example", "examples"),
    # Free-text sections.
    Section("Warnings", "warnings"),
    Section("Warning", "warnings"),
    Section("See Also", "see_also"),
    Section("Related", "see_also"),
    Section("Notes", "notes"),
    Section("Note", "notes"),
    Section("References", "references"),
    Section("Reference", "references"),
    # Sphinx-style directive.
    DeprecationSection("deprecated", "deprecation"),
]
class NumpydocParser:
    """Parser for numpydoc-style docstrings."""

    def __init__(self, sections: Optional[Iterable[Section]] = None) -> None:
        """Set up sections.

        Parameters
        ----------
        sections : Optional[Iterable[Section]]
            Recognized sections, or None (or anything falsy) to use
            ``DEFAULT_SECTIONS``.
        """
        self.sections = {s.title: s for s in (sections or DEFAULT_SECTIONS)}
        # Maps section keys to the titles actually used in the docstring
        # handled by the most recent `parse` call.
        self.section_titles: dict[str, str] = {}
        self._setup()

    def _setup(self) -> None:
        """Set up parser title regex.

        Joins the title patterns of all registered sections into a single
        alternation compiled with ``re.MULTILINE``.
        """
        self.titles_re = re.compile(
            r"|".join(s.title_pattern for s in self.sections.values()),
            flags=re.MULTILINE,
        )

    def add_section(self, section: Section) -> None:
        """Add or replace a section.

        Parameters
        ----------
        section : Section
            The new section. A section already registered under the same
            title is replaced.
        """
        self.sections[section.title] = section
        self._setup()

    def canonical_titles(self) -> KeyReturnDict[str, str]:
        """Map every known title to the title used in the parsed docstring.

        Only titles whose section key was encountered during the most
        recent `parse` call are included. All titles sharing that key map
        to the one spelling that was actually used.

        Returns
        -------
        KeyReturnDict[str, str]
            Mapping from registered section title to the used title.
        """
        return KeyReturnDict(
            {
                title: self.section_titles[section.key]
                for title, section in self.sections.items()
                if section.key in self.section_titles
            }
        )

    def parse(self, text: Optional[str]) -> Docstring:
        """Parse the numpy-style docstring into its components.

        Parameters
        ----------
        text : Optional[str]
            docstring text

        Returns
        -------
        Docstring
            parsed docstring

        Raises
        ------
        ParseError
            If multiple titles are found for the same section
            within one docstring.
        """
        # Titles recorded by an earlier call must not leak into this one,
        # otherwise reusing the parser reports false "Duplicated titles"
        # errors for any section both docstrings contain.
        self.section_titles = {}

        ret = Docstring(style=DocstringStyle.NUMPYDOC)
        if not text:
            return ret

        # Clean according to PEP-0257
        text = inspect.cleandoc(text)

        # Everything before the first section header is the description.
        if match := self.titles_re.search(text):
            desc_chunk = text[: match.start()]
            meta_chunk = text[match.start() :]
        else:
            desc_chunk = text
            meta_chunk = ""

        # Break description into short and long parts
        split_description(ret, desc_chunk)

        for match, nextmatch in _pairwise(self.titles_re.finditer(meta_chunk)):
            # Exactly one alternative of the combined regex matched, so
            # exactly one group holds the title text.
            title = next(g for g in match.groups() if g is not None)
            factory = self.sections[title]
            key = factory.key
            if key in self.section_titles:
                msg = (
                    "Duplicated titles for identical section:"
                    f" {title} and {self.section_titles[key]}"
                )
                raise ParseError(msg)
            self.section_titles[key] = title

            # section chunk starts after the header,
            # ends at the start of the next header
            start = match.end()
            end = nextmatch.start() if nextmatch is not None else None
            ret.meta.extend(factory.parse(meta_chunk[start:end]))

        ret.section_titles = self.canonical_titles()
        return ret
def parse(text: Optional[str]) -> Docstring:
    """Parse the numpy-style docstring into its components.

    A fresh ``NumpydocParser`` with the default sections is created for
    every call.

    Parameters
    ----------
    text : Optional[str]
        docstring text

    Returns
    -------
    Docstring
        parsed docstring
    """
    parser = NumpydocParser()
    return parser.parse(text)
def process_examples(examples: list[DocstringExample], parts: list[str]) -> None:
    """Add string representation of examples section to parts.

    Nothing is appended when `examples` is empty. Otherwise the "Examples"
    heading and its underline are appended, followed by every example and
    one closing blank entry.

    Parameters
    ----------
    examples : list[DocstringExample]
        DocstringExamples to add to parts.
    parts : list[str]
        List of strings representing the final output of compose().
        Modified in place.
    """
    if not examples:
        return

    heading = "Examples"
    parts.extend([heading, "-" * len(heading)])
    for index, example in enumerate(examples):
        # Every example but the first is separated from its predecessor
        # by a blank line.
        if index > 0:
            parts.append("")
        has_snippet = bool(example.snippet)
        if example.description:
            parts.append(example.description)
            # The blank line after a description is only needed when a
            # snippet follows; otherwise the next example supplies it.
            if has_snippet:
                parts.append("")
        if has_snippet:
            parts.append(example.snippet)
    parts.append("")
def compose(  # noqa: PLR0915, PLR0912 # pylint: disable=W0613,R0915,R0912
    docstring: Docstring,
    rendering_style: RenderingStyle = RenderingStyle.COMPACT,  # noqa: ARG001
    indent: str = " ",
) -> str:
    """Render a parsed docstring into docstring text.

    Parameters
    ----------
    docstring : Docstring
        parsed docstring representation
    rendering_style : RenderingStyle
        the style to render docstrings. Currently unused.
        (Default value = RenderingStyle.COMPACT)
    indent : str
        the characters used as indentation in the docstring string
        (Default value = ' ')

    Returns
    -------
    str
        docstring text
    """
    # NOTE(review): the default `indent` is kept as found; confirm its
    # intended width, numpydoc item descriptions are conventionally
    # indented by four spaces.

    # Titles as they were spelled in the parsed docstring. For docstrings of
    # another style an empty mapping is used.
    # NOTE(review): presumably KeyReturnDict returns the key itself for
    # missing entries, so unknown titles are emitted unchanged - confirm.
    titles: KeyReturnDict[str, str] = (
        docstring.section_titles
        if docstring.style == DocstringStyle.NUMPYDOC
        else KeyReturnDict()
    )

    def process_one(one: MainSections) -> None:
        """Build the output text for one entry in a section.

        Parameters
        ----------
        one : MainSections
            Docstring entry for which to build the raw text.
        """
        if isinstance(one, DocstringParam):
            head = one.arg_name
        elif isinstance(one, DocstringReturns):
            head = one.return_name
        elif isinstance(one, DocstringYields):
            head = one.yield_name
        else:
            head = None

        if one.type_name and head:
            head += f" : {one.type_name}"
        elif one.type_name:
            head = one.type_name
        elif not head:
            # Neither name nor type available; emit a visible placeholder.
            head = "__missing_required_field__"

        if isinstance(one, DocstringParam) and one.is_optional:
            head += ", optional"

        if one.description:
            body = f"\n{indent}".join([head, *one.description.splitlines()])
            parts.append(body)
        else:
            parts.append(head)

    def process_sect(name: str, args: list[MainSections]) -> None:
        """Build the output for a docstring section.

        Parameters
        ----------
        name : str
            Section for which to build the output.
        args : list[MainSections]
            List of individual elements of that section.
            Nothing is emitted when it is empty.
        """
        name = titles[name]
        if args:
            parts.append(name)
            parts.append("-" * len(name))
            for arg in args:
                process_one(arg)
            parts.append("")

    parts: list[str] = []
    if docstring.short_description:
        parts.append(docstring.short_description)
    if docstring.blank_after_short_description:
        parts.append("")

    if docstring.deprecation:
        first = ".. deprecated::"
        if docstring.deprecation.version:
            first += f" {docstring.deprecation.version}"
        if docstring.deprecation.description:
            rest = docstring.deprecation.description.splitlines()
        else:
            rest = []
        sep = f"\n{indent}"
        parts.append(sep.join([first, *rest]))

    if docstring.long_description:
        parts.append(docstring.long_description)
    if docstring.blank_after_long_description:
        parts.append("")

    process_sect(
        "Parameters",
        [item for item in docstring.params or [] if item.args[0] == "param"],
    )

    process_sect(
        "Attributes",
        [item for item in docstring.params or [] if item.args[0] == "attribute"],
    )

    process_sect(
        "Methods",
        [item for item in docstring.params or [] if item.args[0] == "method"],
    )

    process_sect(
        "Returns",
        list(docstring.many_returns or []),
    )

    process_sect(
        "Yields",
        list(docstring.many_yields or []),
    )

    if docstring.returns and not docstring.many_returns:
        ret = docstring.returns
        # Choose the heading from the generator flag of the entry instead of
        # the truthiness of the entry itself, and go through process_sect so
        # the section ends with a blank line like all the others.
        process_sect("Yields" if ret.is_generator else "Returns", [ret])

    process_sect(
        "Receives",
        [item for item in docstring.params or [] if item.args[0] == "receives"],
    )

    process_sect(
        "Other Parameters",
        [item for item in docstring.params or [] if item.args[0] == "other_param"],
    )

    process_sect(
        "Raises",
        [item for item in docstring.raises or [] if item.args[0] == "raises"],
    )

    process_sect(
        "Warns",
        [item for item in docstring.raises or [] if item.args[0] == "warns"],
    )

    process_examples(docstring.examples, parts)

    for meta in docstring.meta:
        if isinstance(
            meta,
            (
                DocstringDeprecated,
                DocstringParam,
                DocstringReturns,
                DocstringRaises,
                DocstringYields,
                DocstringExample,
            ),
        ):
            continue  # Already handled

        # Turn the meta key back into its heading: underscores separate
        # words, so "see_also" has to become "See Also", not "Seealso".
        title = titles[meta.args[0].replace("_", " ").title()]
        parts.append(title)
        parts.append("-" * len(title))

        if meta.description:
            parts.append(meta.description)
        parts.append("")

    # Drop trailing blank entries so the text does not end in empty lines.
    while parts and not parts[-1]:
        parts.pop()

    return "\n".join(parts)