Super Update

CN-RBL · CN-RBL · commit ab37279ad33a · 2026-02-15T12:59:49.000+08:00
我嘞个逗，完成了ptools
diff --git a/file/ptools/main.py b/file/ptools/main.py
@@ -11,6 +11,8 @@
 import os
 from lxml import html as html2
 from lxml import etree
+import re
+from io import StringIO
 
 __version__: str = "Beta 0.1"
 
@@ -102,9 +104,11 @@ def main() -> int:
             title = title[0].text_content() if title else "Untitled"
             template_content = template_content.replace("%%title%%", title)
             html = template_content.replace("%%content%%", html)
+        # 先写入原始HTML
         with open(output_path, "w", encoding="utf-8") as f:
             f.write(html)
-                # pretty print
+
+        # 询问是否格式化（此时文件已关闭，可安全读取）
         pretty_input: str = console.input(
             "Is it necessary to format the output HTML file?(Y/N): "
         )
@@ -113,53 +117,112 @@ def main() -> int:
                 raw_html = f.read()
 
             def pretty_print_html(html_str: str) -> str:
-                """智能格式化HTML，保留完整结构和DOCTYPE。"""
-                from io import StringIO
+                """Format HTML: keep the DOCTYPE and any text before it (e.g. comments), indent with 4 spaces, keep the pre/code opening tags on one line, and process the custom %%c:class%% markers."""
+                # Inner helper: strip the first %%c:...%% marker from text and add its class to the owning element
+                def process_text(text, owner):
+                    # Leave markers untouched when the owner is a pre/code element
+                    if owner is not None and owner.tag in ('pre', 'code'):
+                        return text
+                    pattern = r'%%c:([^%]+)%%'
+                    match = re.search(pattern, text)
+                    if match:
+                        class_name = match.group(1).strip()
+                        # Remove the marker (only the first occurrence)
+                        new_text = re.sub(pattern, '', text, count=1)
+                        # Append the class to owner's existing class attribute, or set it if there is none
+                        if owner is not None:
+                            existing = owner.get('class', '')
+                            if existing:
+                                owner.set('class', f"{existing} {class_name}")
+                            else:
+                                owner.set('class', class_name)
+                        return new_text
+                    return text
+
+                # Recursively walk the element tree, processing markers in text and tail
+                def process_markup(element, skip=False):
+                    # Process the element's own text (unless skipping)
+                    if not skip:
+                        if element.text and '%%' in element.text:
+                            element.text = process_text(element.text, element)
+                    # Recurse into children; children of a pre/code element are visited with skip=True
+                    child_skip = skip or element.tag in ('pre', 'code')
+                    for child in element:
+                        process_markup(child, skip=child_skip)
+                    # Process the element's tail (done regardless of skip; process_text checks whether the parent is pre/code)
+                    if element.tail and '%%' in element.tail:
+                        parent = element.getparent()
+                        if parent is not None:
+                            element.tail = process_text(element.tail, parent)
+
+                # 1. Split out the DOCTYPE and anything before it (e.g. comments)
+                doctype_match = re.search(r'(<!DOCTYPE[^>]*>)', html_str, re.IGNORECASE)
+                if doctype_match:
+                    doctype = doctype_match.group(1)
+                    before_doctype = html_str[:doctype_match.start()]  # text before the DOCTYPE (comments etc.)
+                    after_doctype = html_str[doctype_match.end():]     # everything after the DOCTYPE
+                else:
+                    doctype = ''
+                    before_doctype = ''
+                    after_doctype = html_str
+
                 try:
-                    # 使用 etree.HTMLParser 解析，它能保留 DOCTYPE
-                    parser = etree.HTMLParser(remove_blank_text=False)  # 保留空白以便格式化
-                    tree = etree.parse(StringIO(html_str), parser)
-                    doctype = tree.docinfo.doctype if tree.docinfo.doctype else ''
-                    root = tree.getroot()
-                    # 格式化根元素
-                    formatted_root = etree.tostring(
-                        root,
-                        encoding='unicode',
-                        pretty_print=True,
-                        method='html'
-                    )
-                    # 如果存在 DOCTYPE，则拼接到前面
-                    if doctype:
-                        return doctype + '\n' + formatted_root
-                    else:
-                        return formatted_root
+                    # 2. Parse the remainder as a full HTML document (document_fromstring fills in a missing html/body)
+                    root = html2.document_fromstring(after_doctype)
+
+                    # 3. Indent each nesting level with 4 spaces
+                    etree.indent(root, space="    ")
+
+                    # 4. Adjust <pre><code>: keep both opening tags on the same line
+                    for pre in root.xpath('.//pre'):
+                        if len(pre) > 0 and pre[0].tag == 'code':
+                            pre.text = None          # drop the indentation text inside pre itself
+                            pre[0].tail = '\n'       # newline after the code element
+
+                    # 5. Process the custom %%c:class%% markers
+                    process_markup(root)
+
+                    # 6. Serialize to a string (no pretty_print; indentation was already applied by etree.indent)
+                    formatted_root = etree.tostring(root, encoding='unicode', method='html')
+
+                    # 7. Concatenate: leading text + DOCTYPE + newline + formatted document
+                    return before_doctype + doctype + '\n' + formatted_root  # NOTE(review): the newline is emitted even when no DOCTYPE was found
+
                 except Exception as e:
-                    # 如果解析为完整文档失败（例如纯片段），回退到片段处理
                     logging.warning(f"完整文档解析失败，尝试片段模式: {e}")
+                    # Fallback: re-parse the original string with fragments_fromstring (intended to avoid losing content)
                     try:
                         fragments = html2.fragments_fromstring(html_str)
                         pretty_parts = []
                         for frag in fragments:
                             if isinstance(frag, str):
                                 pretty_parts.append(frag)
                             else:
+                                # Try to indent element fragments as well (best effort)
+                                try:
+                                    etree.indent(frag, space="    ")
+                                except:  # NOTE(review): bare except also catches KeyboardInterrupt/SystemExit
+                                    pass
+                                # Adjust the pre/code layout, same as in the full-document path
+                                for pre in frag.xpath('.//pre'):
+                                    if len(pre) > 0 and pre[0].tag == 'code':
+                                        pre.text = None
+                                        pre[0].tail = '\n'
+                                # Process the custom markers
+                                process_markup(frag)
                                 pretty_parts.append(
-                                    etree.tostring(
-                                        frag,
-                                        encoding='unicode',
-                                        pretty_print=True,
-                                        method='html'
-                                    )
+                                    etree.tostring(frag, encoding='unicode', method='html')
                                 )
                         return ''.join(pretty_parts)
                     except Exception as e2:
-                        logging.error(f"片段解析也失败，返回原始内容: {e2}")
+                        logging.error(f"格式化失败，保留原始内容: {e2}")
                         return html_str
 
             pretty_html = pretty_print_html(raw_html)
             with open(output_path, "w", encoding="utf-8") as f:
                 f.write(pretty_html)
             console.print(pretty_html)
+
         logging.info(f'Converted "{path}" to "{output_path}". OK!')
     logging.info("Finished main process.")
     return 0