# Source code for pymend.docstring_parser.google

"""Google-style docstring parsing."""

import inspect
import re
from collections import OrderedDict
from collections.abc import Mapping, Sequence
from enum import IntEnum
from typing import NamedTuple, Optional

from .common import (
    EXAMPLES_KEYWORDS,
    PARAM_KEYWORDS,
    RAISES_KEYWORDS,
    RETURNS_KEYWORDS,
    YIELDS_KEYWORDS,
    Docstring,
    DocstringExample,
    DocstringMeta,
    DocstringParam,
    DocstringRaises,
    DocstringReturns,
    DocstringStyle,
    DocstringYields,
    KeyReturnDict,
    MainSections,
    ParseError,
    RenderingStyle,
    append_description,
    split_description,
)



[docs]
class SectionType(IntEnum):
    """Kinds of docstring sections, by how their body is split into entries."""

    SINGULAR = 0
    """The whole body is a single entry, e.g. an examples section."""

    MULTIPLE = 1
    """The body holds several entries, e.g. a params section."""

    SINGULAR_OR_MULTIPLE = 2
    """The body may take either form, e.g. a returns or yields section."""




[docs]
class Section(NamedTuple):
    """Description of one recognized docstring section."""

    # Heading text as it is written in the docstring, e.g. "Args".
    title: str
    # Internal key; several titles may share one key (e.g. "Args"/"Params").
    key: str
    # How the body of the section is split into entries.
    type_info: SectionType



# Splits a "name (type)" spec into the name (group 1) and the text inside the
# parentheses (group 2).
GOOGLE_TYPED_ARG_REGEX = re.compile(r"\s*(.+?)\s*\(\s*(.*[^\s]+)\s*\)")
# Captures the default value (group 1) from a description of the form
# "<text>. Defaults to <value>.".
GOOGLE_ARG_DESC_REGEX = re.compile(r".*\. Defaults to (.+)\.")
# Recognizes the "<spec>:" head that starts an entry of a multi-entry section.
# It is applied with ``.match``, i.e. anchored at the start of the entry text.
MULTIPLE_PATTERN = re.compile(
    # Match anything that has leading whitespace and then contiguous non-whitespace
    # (non colon) character followed by a colon.
    #  somecontiguoustype: some description
    r"(\s*[^:\s]+:)"
    # Match anything that has some contiguous text, then something in parens,
    # immediately followed by a colon.
    r"|(\s*[^:\s]+\s+\(.+\):)"
    # Allow whitespace if we have a closing ] before the colon, optionally with a )
    # some var name (list[int, int]): some description
    r"|([^:]*\]:.*)"
    # Allow for arbitrary chaining of pipe character for type annotations int | str
    # Where the individual types are allowed to have spaces as long as they start
    # and end without one ([^\s|][^\|]*[^\s|])
    r"|(\s*[^\s|][^\|]*[^\s|](\s*\|\s*[^\s|][^\|]*[^\s|])+:)"
)

# Sections recognized when no custom list is given to the parser.
# Several titles may map onto the same key (e.g. "Args" and "Parameters").
DEFAULT_SECTIONS = [
    Section(title, key, kind)
    for title, key, kind in (
        # Parameter-like sections.
        ("Arguments", "param", SectionType.MULTIPLE),
        ("Args", "param", SectionType.MULTIPLE),
        ("Parameters", "param", SectionType.MULTIPLE),
        ("Params", "param", SectionType.MULTIPLE),
        # Exception sections.
        ("Raises", "raises", SectionType.MULTIPLE),
        ("Exceptions", "raises", SectionType.MULTIPLE),
        ("Except", "raises", SectionType.MULTIPLE),
        # Other multi-entry sections.
        ("Attributes", "attribute", SectionType.MULTIPLE),
        ("Generics", "generics", SectionType.MULTIPLE),
        # Free-form sections.
        ("Example", "examples", SectionType.SINGULAR),
        ("Examples", "examples", SectionType.SINGULAR),
        # Sections that may be written either way.
        ("Returns", "returns", SectionType.SINGULAR_OR_MULTIPLE),
        ("Yields", "yields", SectionType.SINGULAR_OR_MULTIPLE),
    )
]



[docs]
class GoogleParser:
    """Parser for Google-style docstrings."""

    def __init__(
        self, sections: Optional[list[Section]] = None, *, title_colon: bool = True
    ) -> None:
        """Set up sections.

        Parameters
        ----------
        sections : Optional[list[Section]]
            Recognized sections or None to defaults.
        title_colon : bool
            Require colon after section title. (Default value = True)
        """
        if not sections:
            sections = DEFAULT_SECTIONS
        self.sections = {s.title: s for s in sections}
        # Maps section keys to the actually used titles.
        self.section_titles: dict[str, str] = {}
        self.title_colon = title_colon
        self._setup()


[docs]
    def canonical_titles(self) -> KeyReturnDict[str, str]:
        """Map each known section title to the title actually in use for its key.

        Only titles whose section key has an entry in ``self.section_titles``
        are included.

        Returns
        -------
        KeyReturnDict[str, str]
            Mapping from every such configured title to the title recorded
            for the same section key.
        """
        used_titles: dict[str, str] = {}
        for title, section in self.sections.items():
            if section.key in self.section_titles:
                used_titles[title] = self.section_titles[section.key]
        return KeyReturnDict(used_titles)


    def _setup(self) -> None:
        """Set up parser with the colon type and title regex."""
        colon = ":" if self.title_colon else ""
        self.titles_re = re.compile(
            "^("
            + "|".join(f"({t})" for t in self.sections)
            + ")"
            + colon
            + "[ \t\r\f\v]*$",
            flags=re.MULTILINE,
        )

    @staticmethod
    def _build_single_meta(section: Section, desc: str) -> DocstringMeta:
        """Wrap the body of a single-entry section in the matching meta type.

        Parameters
        ----------
        section : Section
            The section that is being processed.
        desc : str
            Text of the section body.

        Returns
        -------
        DocstringMeta
            Returns, yields, raises or examples element depending on the
            section key; a plain ``DocstringMeta`` for any other key.

        Raises
        ------
        ParseError
            If the section is a parameter section, which always needs a
            parameter name and therefore cannot be a single entry.
        """
        key = section.key
        if key in RETURNS_KEYWORDS:
            return DocstringReturns(
                args=[key], description=desc, type_name=None, is_generator=False
            )
        if key in YIELDS_KEYWORDS:
            return DocstringYields(
                args=[key], description=desc, type_name=None, is_generator=True
            )
        if key in RAISES_KEYWORDS:
            return DocstringRaises(args=[key], description=desc, type_name=None)
        if key in EXAMPLES_KEYWORDS:
            return DocstringExample(args=[key], snippet=None, description=desc)
        if key not in PARAM_KEYWORDS:
            return DocstringMeta(args=[key], description=desc)
        msg = "Expected parameter name."
        raise ParseError(msg)

    def _prepare_multi_meta(self, section: Section, text: str) -> tuple[str, str]:
        """Validate an entry and split it at its first colon.

        Parameters
        ----------
        section : Section
            The section that is being processed.
        text : str
            Text of one entry of the section.

        Returns
        -------
        before : str
            Everything in front of the first colon.
        desc : str
            The description after the colon: one leading space removed,
            continuation lines dedented, surrounding newlines stripped.

        Raises
        ------
        ParseError
            If the text does not match ``MULTIPLE_PATTERN``.
        ParseError
            If there is no colon in the text.
        """
        if MULTIPLE_PATTERN.match(text) is None:
            msg = (
                "Could not match multi pattern to split "
                f"chunk part {text!r} for section {section.title}."
            )
            raise ParseError(msg)

        before, colon, desc = text.partition(":")
        if not colon:
            msg = f"Expected a colon in {text!r} for title {section.title}."
            raise ParseError(msg)

        # Drop the single space that conventionally follows the colon.
        desc = desc.removeprefix(" ")
        # The first line keeps its text as is; only the continuation lines
        # carry indentation that has to be normalized.
        first_line, newline, rest = desc.partition("\n")
        if newline:
            desc = f"{first_line}\n{inspect.cleandoc(rest)}"
        return before, desc.strip("\n")

    def _build_multi_meta(self, section: Section, text: str) -> DocstringMeta:
        """Build the docstring element for one entry of a multi-entry section.

        Parameters
        ----------
        section : Section
            The section that is being processed.
        text : str
            Text of the entry, in the form ``<spec>: <description>``.

        Returns
        -------
        DocstringMeta
            Param, returns, yields or raises element depending on the section
            key; a plain ``DocstringMeta`` for any other key.

        Raises
        ------
        ParseError
            If the entry is malformed, e.g. it lacks a colon ':'.
        """
        spec, description = self._prepare_multi_meta(section, text)
        key = section.key

        if key in PARAM_KEYWORDS:
            typed = GOOGLE_TYPED_ARG_REGEX.match(spec)
            if typed is None:
                # No "(type)" part: optionality is unknown rather than False.
                name, type_name, optional = spec, None, None
            else:
                name, type_name = typed.group(1, 2)
                if type_name.endswith(", optional"):
                    type_name, optional = type_name[: -len(", optional")], True
                elif type_name.endswith("?"):
                    type_name, optional = type_name[:-1], True
                else:
                    optional = False

            default_match = GOOGLE_ARG_DESC_REGEX.match(description)
            return DocstringParam(
                args=[key, spec],
                description=description,
                arg_name=name,
                type_name=type_name,
                is_optional=optional,
                default=default_match.group(1) if default_match else None,
            )

        is_returns = key in RETURNS_KEYWORDS
        if is_returns or key in YIELDS_KEYWORDS:
            typed = GOOGLE_TYPED_ARG_REGEX.match(spec)
            # Without a "(type)" part the whole spec is taken as the type.
            name, type_name = typed.group(1, 2) if typed else (None, spec)
            if is_returns:
                return DocstringReturns(
                    args=[key, name or type_name],
                    description=description,
                    return_name=name,
                    type_name=type_name,
                    is_generator=False,
                )
            return DocstringYields(
                args=[key, name or type_name],
                description=description,
                yield_name=name,
                type_name=type_name,
                is_generator=True,
            )

        if key in RAISES_KEYWORDS:
            return DocstringRaises(
                args=[key, spec], description=description, type_name=spec
            )
        return DocstringMeta(args=[key, spec], description=description)


[docs]
    def add_section(self, section: Section) -> None:
        """Add or replace a section.

        Parameters
        ----------
        section : Section
            The new section.
        """
        self.sections[section.title] = section
        self._setup()


    def _split_sections(self, meta_chunk: str) -> Mapping[str, str]:
        """Split the cunk into sections as determined by the titles..

        Parameters
        ----------
        meta_chunk : str
            Part of the docstring NOT holding the description.

        Returns
        -------
        Mapping[str, str]
            Mapping between sectrion title and part of the docstring that deals with it.

        Raises
        ------
        ParseError
            If multiple titles are found for the same section.
        """
        chunks: Mapping[str, str] = OrderedDict()
        matches = list(self.titles_re.finditer(meta_chunk))
        if not matches:
            return chunks
        splits = [
            (matches[j].end(), matches[j + 1].start()) for j in range(len(matches) - 1)
        ]
        splits.append((matches[-1].end(), len(meta_chunk)))
        for j, (start, end) in enumerate(splits):
            title = matches[j].group(1)
            if title not in self.sections:
                continue
            key = self.sections[title].key
            if key in self.section_titles:
                msg = (
                    "Duplicated titles for identical section:"
                    f" {title} and {self.section_titles[key]}"
                )
                raise ParseError(msg)
            self.section_titles[key] = title

            # Clear Any Unknown Meta
            # Ref: https://github.com/rr-/docstring_parser/issues/29
            meta_details = meta_chunk[start:end]
            unknown_meta = re.search(r"\n\S", meta_details)
            if unknown_meta is not None:
                meta_details = meta_details[: unknown_meta.start()]

            chunks[title] = meta_details.strip("\n")
        return chunks

    def _determine_indent(self, chunk: str) -> str:
        """Determine indent.

        Parameters
        ----------
        chunk : str
            Chunk to determine the indent for.

        Returns
        -------
        str
            String representing the indent.

        Raises
        ------
        ParseError
            If no indent could be determined.
        """
        indent_match = re.search(r"^\s*", chunk)
        if not indent_match:
            msg = f"Can't infer indent from '{chunk}'"
            raise ParseError(msg)
        return indent_match.group()

    def _get_chunks(self, text: str) -> tuple[str, str]:
        """Split docstring into description and meta part.

        Parameters
        ----------
        text : str
            Docstring text to split.

        Returns
        -------
        tuple[str, str]
            Docstring representing the description and the rest.
        """
        if match := self.titles_re.search(text):
            return text[: match.start()], text[match.start() :]
        return text, ""

    def _get_multi_chunk_splits(
        self, chunk: str, title: str, indent: str
    ) -> list[tuple[int, int]]:
        """Get the starting and ending position for each element of a multi chunk.

        Parameters
        ----------
        chunk : str
            Full chunk to split.
        title : str
            Title of the section represented by the chunk.
        indent : str
            Indent before each element of the chunk.

        Returns
        -------
        list[tuple[int, int]]
            List of all start and end positions of each element of the chunk.

        Raises
        ------
        ParseError
            If no entry could be found with the expected indent.
        """
        # Split based on lines which have exactly that indent
        c_matches = list(re.finditer(rf"^{indent}(?=\S)", chunk, flags=re.MULTILINE))
        if not c_matches:
            msg = (
                f'No specification for "{title}": "{chunk}"'
                " Maybe check your indentation?"
            )
            raise ParseError(msg)
        c_splits = [
            (c_cur.end(), c_next.start())
            for c_cur, c_next in zip(c_matches, c_matches[1:])
        ]
        c_splits.append((c_matches[-1].end(), len(chunk)))
        return c_splits


[docs]
    def parse(self, text: Optional[str]) -> Docstring:
        """Parse the Google-style docstring into its components.

        Parameters
        ----------
        text : Optional[str]
            docstring text

        Returns
        -------
        Docstring
            parsed docstring

        Raises
        ------
        ParseError
            If no specification could be found for a title, chunk pair.
        """
        result = Docstring(style=DocstringStyle.GOOGLE)
        if not text:
            return result

        # Clean according to PEP-0257, then separate the free-text
        # description from the part that starts at the first section title.
        description, remainder = self._get_chunks(inspect.cleandoc(text))
        split_description(result, description)

        section_bodies = self._split_sections(remainder)
        if not section_bodies:
            return result

        for title, chunk in section_bodies.items():
            indent = self._determine_indent(chunk)
            section = self.sections[title]
            kind = section.type_info

            if kind == SectionType.SINGULAR:
                # The whole body is one entry.
                whole = inspect.cleandoc(chunk)
                result.meta.append(self._build_single_meta(section, whole))
                continue

            # Entries start on the lines that have exactly the body's indent.
            spans = self._get_multi_chunk_splits(chunk, title, indent)
            entries = [chunk[start:end].strip("\n") for start, end in spans]

            if kind == SectionType.MULTIPLE:
                for entry in entries:
                    result.meta.append(self._build_multi_meta(section, entry))
                continue

            # SectionType.SINGULAR_OR_MULTIPLE: first try to read every line
            # group as its own entry ...
            try:
                built = [self._build_multi_meta(section, entry) for entry in entries]
            except ParseError:
                # ... and otherwise treat the whole body as one entry, typed
                # if it starts with a "<spec>:" head and free-form if not.
                whole = inspect.cleandoc(chunk)
                if MULTIPLE_PATTERN.match(whole):
                    result.meta.append(self._build_multi_meta(section, whole))
                else:
                    result.meta.append(self._build_single_meta(section, whole))
            else:
                result.meta.extend(built)

        result.section_titles = self.canonical_titles()
        return result





[docs]
def parse(text: Optional[str]) -> Docstring:
    """Parse a Google-style docstring using a parser with default settings.

    Parameters
    ----------
    text : Optional[str]
        docstring text

    Returns
    -------
    Docstring
        parsed docstring
    """
    parser = GoogleParser()
    return parser.parse(text)




[docs]
def compose(  # noqa: PLR0915
    docstring: Docstring,
    rendering_style: RenderingStyle = RenderingStyle.COMPACT,
    indent: str = "    ",
) -> str:
    """Render a parsed docstring into Google-style docstring text.

    Sections are emitted in a fixed order: description, Args, Attributes,
    Generics, Returns, Yields, Raises, followed by all remaining meta
    elements in the order they are stored.

    Parameters
    ----------
    docstring : Docstring
        parsed docstring representation
    rendering_style : RenderingStyle
        the style to render docstrings (Default value = RenderingStyle.COMPACT)
    indent : str
        the characters used as indentation in the
        docstring string (Default value = '    ')

    Returns
    -------
    str
        docstring text
    """
    # Reuse the section titles recorded on the docstring, but only if it was
    # parsed from Google style; titles of other styles do not apply here.
    # NOTE(review): KeyReturnDict presumably hands back the key itself for
    # unknown titles - confirm against its definition.
    titles: KeyReturnDict[str, str] = (
        docstring.section_titles
        if docstring.style == DocstringStyle.GOOGLE
        else KeyReturnDict()
    )
    # Output blocks; joined with newlines at the very end.
    parts: list[str] = []

    def process_one(one: MainSections) -> None:
        """Build the output text for one entry in a section.

        Parameters
        ----------
        one : MainSections
            Docstring for which to build the raw text.
        """
        # The head is "name (type):", "type:" or "name:", whatever is known.
        head = ""

        if isinstance(one, DocstringParam):
            head += one.arg_name or ""
        elif isinstance(one, DocstringReturns):
            head += one.return_name or ""
        elif isinstance(one, DocstringYields):
            head += one.yield_name or ""

        # Only parameters carry an optional marker; its spelling depends on
        # the rendering style.
        if isinstance(one, DocstringParam) and one.is_optional:
            optional = (
                "?" if rendering_style == RenderingStyle.COMPACT else ", optional"
            )
        else:
            optional = ""

        if one.type_name and head:
            head += f" ({one.type_name}{optional}):"
        elif one.type_name:
            head += f"{one.type_name}{optional}:"
        elif head:
            head += ":"

        if head:
            head = indent + head

        if one.description and rendering_style == RenderingStyle.EXPANDED:
            # Expanded: the description starts on the line below the head.
            body = f"\n{indent}{indent}".join([head, *one.description.splitlines()])
            parts.append(body)
        elif one.description:
            # Otherwise the first description line shares the head's line.
            (first, *rest) = one.description.splitlines()
            body = f"\n{indent}{indent}".join(
                [(f"{head} {first}" if head else f"{indent}{first}"), *rest]
            )
            parts.append(body)
        else:
            parts.append(head)

    def process_sect(name: str, args: Sequence[MainSections]) -> None:
        """Build the output for a docstring section.

        Nothing is emitted for a section without entries.

        Parameters
        ----------
        name : str
            Section for which to build the output.
        args : Sequence[MainSections]
            List of individual elements of that section.
        """
        name = titles[name]
        if args:
            parts.append(f"{name}:")
            for arg in args:
                process_one(arg)
            # Blank line separating this section from the next one.
            parts.append("")

    append_description(docstring, parts)

    process_sect("Args", [p for p in docstring.params or [] if p.args[0] == "param"])

    process_sect(
        "Attributes",
        [p for p in docstring.params or [] if p.args[0] == "attribute"],
    )

    process_sect(
        "Generics",
        [p for p in docstring.params or [] if p.args[0] == "generics"],
    )

    process_sect(
        "Returns",
        docstring.many_returns,
    )

    process_sect("Yields", docstring.many_yields)

    process_sect("Raises", docstring.raises or [])

    # Fallback for a docstring that exposes a single ``returns`` element that
    # is not listed in ``many_returns``. Render it like any other section:
    # the title is chosen by the element's generator flag, gets its colon and
    # no underline (dashed underlines belong to NumPy style, not Google).
    # NOTE(review): with the parser in this module this branch looks
    # unreachable - confirm against Docstring.returns / many_returns.
    if docstring.returns and not docstring.many_returns:
        ret = docstring.returns
        process_sect("Yields" if ret.is_generator else "Returns", [ret])

    for meta in docstring.meta:
        if isinstance(
            meta, (DocstringParam, DocstringReturns, DocstringRaises, DocstringYields)
        ):
            continue  # Already handled
        # The title is derived from the meta key, e.g. "examples" -> "Examples".
        parts.append(f'{titles[meta.args[0].replace("_", "").title()]}:')
        if meta.description:
            lines = [(indent + line).rstrip() for line in meta.description.splitlines()]
            parts.append("\n".join(lines))
        parts.append("")

    # Drop trailing blank entries so the text does not end in empty lines.
    while parts and not parts[-1]:
        parts.pop()

    return "\n".join(parts)