add whitespace checking script for pydocs

psifertex · psifertex · commit 6ef2486e4ae1 · 2025-11-10T23:22:35.000-05:00
diff --git a/scripts/check_docstring_formatting.py b/scripts/check_docstring_formatting.py
@@ -0,0 +1,210 @@
+#!/usr/bin/env python3
+"""
+Script to identify docstring formatting issues in Python files.
+
+Checks for:
+1. Bullet lists (lines starting with *, -, +) without blank line before them
+2. Numbered lists (lines starting with digits and .) without blank line before them
+3. Code blocks (lines starting with >>>) without blank line before them
+4. reStructuredText directives (lines starting with ..) without blank line before them
+
+These are common reStructuredText/Sphinx formatting issues that can cause
+documentation to render incorrectly.
+
+The script attempts to avoid false positives by:
+- Skipping content inside literal blocks (after :: markers)
+- Ignoring items that follow Sphinx field markers (:param:, :Example:, etc.)
+- Handling Python interactive session output (lines between >>> prompts)
+- Recognizing indented continuations
+
+Known limitations:
+- May flag some valid trailing >>> prompts in code examples
+- Line numbers are approximate (offset from docstring start)
+- Some complex nested structures may not be handled perfectly
+
+Usage:
+  python check_docstring_formatting.py [directory]
+
+If no directory is specified, defaults to ../python relative to this script.
+"""
+
+import os
+import re
+import ast
+import sys
+from pathlib import Path
+
+
+def get_docstrings_from_file(filepath):
+    """Extract all docstrings from a Python file with their line numbers.
+
+    Args:
+        filepath: Path of the Python source file; it is read as UTF-8.
+
+    Returns:
+        A list of ``(line_num, docstring, node_type)`` tuples, one for every
+        module, class, function or async function that has a non-empty
+        docstring. ``line_num`` is the 1-based line on which the docstring
+        literal starts, ``docstring`` is the raw text (``clean=False``, so
+        indentation is preserved) and ``node_type`` is the AST class name,
+        e.g. ``"FunctionDef"``.
+
+        An empty list is returned when the file cannot be read or parsed;
+        the failure is reported on stderr instead of being raised.
+    """
+    try:
+        with open(filepath, 'r', encoding='utf-8') as f:
+            content = f.read()
+        tree = ast.parse(content, filename=str(filepath))
+    except (OSError, SyntaxError, ValueError, RecursionError) as e:
+        # ValueError also covers UnicodeDecodeError (bad encoding) and the
+        # "null bytes" error raised by older interpreters. Always report the
+        # problem: a skipped file must not look like a clean file.
+        print(f"Error parsing {filepath}: {e}", file=sys.stderr)
+        return []
+
+    docstrings = []
+    for node in ast.walk(tree):
+        # Only these node types can carry a docstring; ast.get_docstring()
+        # raises TypeError for anything else.
+        if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef, ast.Module)):
+            continue
+
+        docstring = ast.get_docstring(node, clean=False)
+        if docstring:
+            # A docstring is always the first statement of the body, so its
+            # own line number is the start of the literal. This also holds
+            # for modules, where a shebang or comments may precede it.
+            line_num = node.body[0].lineno
+            docstrings.append((line_num, docstring, type(node).__name__))
+
+    return docstrings
+
+
+def check_docstring_formatting(docstring):
+    """
+    Check for formatting issues in a docstring.
+
+    A line that starts a bullet list, a numbered list, a doctest prompt or a
+    reStructuredText directive is reported when the line directly before it
+    is non-blank, unless that previous line:
+
+    - is itself a list item, prompt or directive,
+    - is a Sphinx field such as ``:param x:`` or ``:Example:``,
+    - is less indented than the current line (indented continuation), or
+    - belongs to the same running doctest example (prompt, ``...``
+      continuation or expected output) as the current prompt.
+
+    Lines inside a literal block (introduced by a line ending in ``::``) are
+    not checked at all.
+
+    Returns a list of (line_offset, issue_description) tuples, where
+    line_offset is the 1-based index of the offending line within the
+    docstring text.
+    """
+    issues = []
+    lines = docstring.split('\n')
+
+    # Patterns that should have a blank line before them
+    patterns = [
+        (re.compile(r'^\s*[\*\-\+]\s+'), 'bullet list item'),
+        (re.compile(r'^\s*\d+\.\s+'), 'numbered list item'),
+        (re.compile(r'^\s*>>>'), 'code block'),
+        (re.compile(r'^\s*\.\.\s+'), 'reStructuredText directive'),
+    ]
+
+    # Sphinx field patterns that can contain code blocks or lists
+    sphinx_field_pattern = re.compile(r'^\s*:[A-Za-z_][A-Za-z0-9_]*:')
+
+    # Track if we're in a literal block (started by ::)
+    in_literal_block = False
+    literal_block_indent = 0
+
+    # Track if we're inside a doctest example: it starts at a >>> prompt and
+    # runs (through ... continuations and expected output) to the next blank
+    # line.
+    in_doctest = lines[0].lstrip().startswith('>>>')
+
+    # The first line is always part of the opening, so start at the second.
+    for i in range(1, len(lines)):
+        line = lines[i]
+        prev_line = lines[i - 1]
+        current_indent = len(line) - len(line.lstrip())
+        stripped = line.strip()
+
+        # Remember the doctest state as it was *before* this line, then
+        # update it for the lines that follow.
+        if not stripped:
+            in_doctest = False
+        continues_doctest = in_doctest
+        if stripped.startswith('>>>'):
+            in_doctest = True
+
+        # Check if previous line ended with :: (literal block marker)
+        if prev_line.rstrip().endswith('::'):
+            in_literal_block = True
+            literal_block_indent = len(prev_line) - len(prev_line.lstrip())
+
+        # If we're in a literal block and dedented, we're out
+        if in_literal_block and stripped and current_indent <= literal_block_indent:
+            in_literal_block = False
+
+        # Skip checks if we're inside a literal block; its content must not
+        # leak doctest state into the text that follows it.
+        if in_literal_block:
+            in_doctest = False
+            continue
+
+        for pattern, description in patterns:
+            if not pattern.match(line):
+                continue
+
+            if prev_line.strip():
+                # Previous line is also a list item / prompt / directive
+                is_prev_list = any(p.match(prev_line) for p, _ in patterns)
+
+                # Previous line is a Sphinx field (:Example:, :param:, ...)
+                is_sphinx_field = sphinx_field_pattern.match(prev_line)
+
+                # Current line is more indented than the previous one, so it
+                # is most likely nested under it
+                prev_indent = len(prev_line) - len(prev_line.lstrip())
+                is_indented_continuation = current_indent > prev_indent
+
+                # A new prompt right after the output or a ... continuation
+                # of the previous command is part of the same example
+                is_doctest_continuation = description == 'code block' and continues_doctest
+
+                if not (is_prev_list or is_sphinx_field
+                        or is_indented_continuation or is_doctest_continuation):
+                    issues.append((i + 1, f"{description} without blank line before it"))
+            break  # Only the first matching pattern is considered per line
+
+    return issues
+
+
+def find_python_files(root_dir):
+    """Find all Python files in the given directory.
+
+    Searches ``root_dir`` recursively and returns a list of ``Path`` objects
+    for every regular file whose name ends in ``.py``. Directories that
+    happen to be named ``*.py`` are skipped, since they cannot be opened and
+    parsed as source files.
+    """
+    return [path for path in Path(root_dir).rglob('*.py') if path.is_file()]
+
+
+def main():
+    """Check every Python file under a directory and print a report.
+
+    The directory is taken from the first command-line argument; without one
+    it defaults to the ``python`` directory next to this script's parent
+    directory. Exits with status 1 (via ``sys.exit``) when the directory is
+    missing.
+
+    Returns:
+        0 when no issues were found, 1 otherwise (used as the exit status).
+    """
+    # Default to checking the python directory relative to this script
+    script_dir = Path(__file__).parent
+    python_dir = script_dir.parent / 'python'
+
+    if len(sys.argv) > 1:
+        python_dir = Path(sys.argv[1])
+
+    # is_dir() rather than exists(): a regular file is not a valid root and
+    # would otherwise be "checked" and reported as clean.
+    if not python_dir.is_dir():
+        print(f"Error: Directory {python_dir} does not exist", file=sys.stderr)
+        sys.exit(1)
+
+    print(f"Checking Python files in: {python_dir}")
+    print("=" * 80)
+
+    files_with_issues = 0
+    total_issues = 0
+
+    for py_file in sorted(find_python_files(python_dir)):
+        docstrings = get_docstrings_from_file(py_file)
+        file_issues = []
+
+        for doc_line_num, docstring, node_type in docstrings:
+            for line_offset, issue_desc in check_docstring_formatting(docstring):
+                # line_offset is the 1-based line within the docstring, so
+                # the docstring's first line is doc_line_num itself; subtract
+                # one to avoid reporting every issue one line too low in the
+                # file. Still printed with "~" because the docstring start
+                # line handed to us may itself be approximate.
+                abs_line = doc_line_num + line_offset - 1
+                file_issues.append((abs_line, issue_desc, node_type))
+
+        if file_issues:
+            files_with_issues += 1
+            total_issues += len(file_issues)
+
+            # Make path relative to python_dir's parent for cleaner output
+            rel_path = py_file.relative_to(python_dir.parent)
+            print(f"\n{rel_path}:")
+
+            for line_num, issue_desc, node_type in sorted(file_issues):
+                print(f"  Line ~{line_num} ({node_type}): {issue_desc}")
+
+    print("\n" + "=" * 80)
+    print(f"Summary: Found {total_issues} issues in {files_with_issues} files")
+
+    return 0 if total_issues == 0 else 1
+
+
+# Run the checker only when executed as a script; main()'s return value
+# (0 = clean, 1 = issues found) becomes the process exit status.
+if __name__ == '__main__':
+    raise SystemExit(main())