Search code examples
python reflection docstring sphinx-napoleon

Extract Google style docstring into dataclass using Sphinx Napoleon


I am trying to programmatically ingest ("reflect") Google style docstrings. I am using sphinx.ext.napoleon, as seemingly not many tools do this. I am following this example with the below function:

from sphinx.ext.napoleon import Config, GoogleDocstring


# Example target: only the Google-style docstring below matters; the function
# deliberately has no body beyond it.
def foo(arg: int | None = 5) -> None:
    """Stub summary.

    Args:
        arg(int): Optional integer defaulted to 5.
    """


docstring = GoogleDocstring(foo.__doc__)
print(docstring)

However, my usage doesn't automagically convert the printed output to reST style like the Sphinx example does.

So this leads me to my question. How can one programmatically ingest the summary, extended description, arg names, and arg descriptions from a Google Style docstring? Ideally they are converted into some sort of data structure (e.g. dict or dataclass).


Solution

  • Instead of reading foo.__doc__ directly, use the built-in inspect module to get the docstring:

    import inspect
    
    docstring = GoogleDocstring(inspect.getdoc(foo))
    print(docstring)
    

    This prints the following:

    Stub summary.
    
    :param arg: Optional integer defaulted to 5.
    :type arg: int
    

    The difference between inspect.getdoc(foo) and foo.__doc__ seems to be indentation:

    print(foo.__doc__)
    
    Stub summary.
    
        Args:
            arg(int): Optional integer defaulted to 5.
    
    print(inspect.getdoc(foo))
    
    Stub summary.
    
    Args:
        arg(int): Optional integer defaulted to 5.
    

    To use the __doc__ attribute directly, first pass it through Sphinx's prepare_docstring function:

    from sphinx.util.docstrings import prepare_docstring
    
    docstring = GoogleDocstring(prepare_docstring(foo.__doc__))
    print(docstring)
    

    Then you can either write your own parser or use a third-party library such as doctrans or docstring_parser. For the sake of a simple example, I've taken the solution below from the doctrans source. Because doctrans supports more than is required here, and I didn't want to install it and pollute the system, I've used the code directly:

    import re
    import sys
    
    # One ``:param <name>: <doc>`` field.  ``doc`` is matched lazily and stops
    # right before the next ``:param`` / ``:return`` / ``:raises`` field, or at
    # the end of the text; DOTALL lets it span several lines.
    PARAM_REGEX = re.compile(
        r":param (?P<name>[\*\w]+): (?P<doc>.*?)"
        r"(?:(?=:param)|(?=:return)|(?=:raises)|\Z)",
        flags=re.DOTALL,
    )
    # Everything that follows ``:return:`` / ``:returns:`` up to the end of the text.
    RETURNS_REGEX = re.compile(":returns?: (?P<doc>.*)", flags=re.DOTALL)
    # First occurrence of a ``:param`` or ``:return(s)`` field marker.
    PARAM_OR_RETURNS_REGEX = re.compile(":(?:param|returns?)")
    
    
    def trim(docstring):
        """Normalise docstring indentation following the PEP-257 algorithm.

        Tabs are expanded, the indentation shared by every non-blank line after
        the first is removed, and blank lines at both ends are dropped.  Unlike
        the PEP-257 reference function, an empty final line is appended whenever
        the input contained a newline, so multi-line results end with a newline.

        :param docstring: raw docstring text; may be empty or ``None``
        :returns: the cleaned docstring, or ``""`` for falsy input
        """
        if not docstring:
            return ""

        # Tabs become spaces first so indentation widths are comparable.
        first, *rest = docstring.expandtabs().splitlines()

        # Shared margin = smallest indent among non-blank continuation lines.
        margins = [len(row) - len(row.lstrip()) for row in rest if row.lstrip()]

        cleaned = [first.strip()]
        if margins:
            margin = min(margins)
            cleaned.extend(row[margin:].rstrip() for row in rest)

        # Drop blank lines from the end, then from the start.
        while cleaned and not cleaned[-1]:
            del cleaned[-1]
        while cleaned and not cleaned[0]:
            del cleaned[0]

        # Callers expect multi-line docstrings to end with a line break.
        if "\n" in docstring:
            cleaned.append("")

        return "\n".join(cleaned)
    
    
    def reindent(string):
        """Strip the text as a whole, then strip every newline-separated line.

        :param string: text to normalise
        :returns: the lines, each without leading/trailing whitespace, re-joined
                  with newlines
        """
        body = string.strip()
        return "\n".join(map(str.strip, body.split("\n")))
    
    
    def doc_to_type_doc(name, doc):
        """Separate a parameter's text into its description and its type.

        The text is trimmed and read line by line.  A line starting with
        ``:type`` opens the type section and must name the same parameter;
        every later line is treated as a continuation of the type.  A type
        wrapped in triple backticks has the backticks removed.

        :param name: parameter name the ``:type`` line is expected to refer to
        :param doc: text captured for the parameter (description plus an
                    optional ``:type <name>: <type>`` line)
        :returns: ``{"doc": ...}`` with an additional ``"typ"`` key when a type
                  was found
        """
        description, type_lines = [], []
        for row in trim(doc).splitlines():
            if not row.startswith(":type"):
                # Once a type line has been seen, later lines belong to the type.
                (type_lines if type_lines else description).append(row)
                continue

            remainder = row[len(":type ") :]
            colon_at = remainder.find(":")
            found_name = remainder[:colon_at]
            assert name == found_name, f"{name!r} != {found_name!r}"
            # Skip the colon and the single space that follows it.
            value = remainder[colon_at + 2 :]
            if value.startswith("```") and value.endswith("```"):
                value = value[3:-3]
            type_lines.append(value)

        result = {"doc": "\n".join(description)}
        if type_lines:
            result["typ"] = "\n".join(type_lines)
        return result
    
    
    def parse_docstring(docstring):
        """Parse a reST-style docstring into its components.

        The first line becomes the short description.  The remaining text up to
        the first ``:param`` / ``:return(s)`` field becomes the long description;
        the fields after it are parsed into parameters and the return value.

        :param docstring: docstring text using ``:param`` / ``:type`` /
                          ``:returns:`` / ``:rtype:`` fields; may be empty
        :returns: a dictionary of form
                  {
                      'short_description': ...,
                      'long_description': ...,
                      'params': [{'name': ..., 'doc': ..., 'typ': ...}, ...],
                      'returns': {'name': ..., 'typ': ...}
                  }
                  ``'typ'`` keys are present only when a type was found, and
                  ``'returns'`` is ``""`` when no return text was found.
        """
        short_description = long_description = returns = ""
        params = []

        if docstring:
            docstring = trim(docstring.lstrip("\n"))

            # First line is the summary; everything else is handled below.
            lines = docstring.split("\n", 1)
            short_description = lines[0]

            if len(lines) > 1:
                long_description = lines[1].strip()

                params_returns_desc = None

                # The long description ends where the first field begins.
                match = PARAM_OR_RETURNS_REGEX.search(long_description)
                if match:
                    long_desc_end = match.start()
                    params_returns_desc = long_description[long_desc_end:].strip()
                    long_description = long_description[:long_desc_end].rstrip()

                if params_returns_desc:
                    params = [
                        dict(name=name, **doc_to_type_doc(name, doc))
                        for name, doc in PARAM_REGEX.findall(params_returns_desc)
                    ]

                    match = RETURNS_REGEX.search(params_returns_desc)
                    if match:
                        returns = reindent(match.group("doc"))
                    if returns:
                        # Split on the explicit ``:rtype:`` marker rather than on
                        # the first colon, so a return description that itself
                        # contains a colon is not mistaken for the type.
                        desc, marker, typ = returns.partition(":rtype:")
                        r_dict = {"name": desc.rstrip()}
                        if marker:
                            typ = typ.strip()
                            # Types may be fenced in triple backticks; unwrap them.
                            if typ.startswith("```") and typ.endswith("```"):
                                typ = typ[3:-3]
                            r_dict["typ"] = typ
                        returns = r_dict

        return {
            "short_description": short_description,
            "long_description": long_description,
            "params": params,
            "returns": returns,
        }
    
    
    # Join the reST lines produced by GoogleDocstring back into one string and
    # parse them.  The returned dict is discarded here, so print or assign it
    # when running this as a script rather than in a REPL.
    parse_docstring("\n".join(docstring.lines()))