Skip to content

Commit ab37279

Browse files
committed
Super Update
我嘞个逗,完成了ptools
1 parent 0341a97 commit ab37279

1 file changed

Lines changed: 91 additions & 28 deletions

File tree

file/ptools/main.py

Lines changed: 91 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111
import os
1212
from lxml import html as html2
1313
from lxml import etree
14+
import re
15+
from io import StringIO
1416

1517
__version__: str = "Beta 0.1"
1618

@@ -102,9 +104,11 @@ def main() -> int:
102104
title = title[0].text_content() if title else "Untitled"
103105
template_content = template_content.replace("%%title%%", title)
104106
html = template_content.replace("%%content%%", html)
107+
# 先写入原始HTML
105108
with open(output_path, "w", encoding="utf-8") as f:
106109
f.write(html)
107-
# pretty print
110+
111+
# 询问是否格式化(此时文件已关闭,可安全读取)
108112
pretty_input: str = console.input(
109113
"Is it necessary to format the output HTML file?(Y/N): "
110114
)
@@ -113,53 +117,112 @@ def main() -> int:
113117
raw_html = f.read()
114118

115119
def pretty_print_html(html_str: str) -> str:
    """Pretty-print an HTML document and apply ``%%c:class%%`` markers.

    The DOCTYPE (and anything preceding it) is cut off with a regular
    expression and re-attached verbatim; the remainder is parsed with
    ``lxml.html``, indented with four spaces, and serialized again.

    A ``%%c:name%%`` marker found in an element's text, or in the tail text
    that follows one of its children, is removed and ``name`` is appended to
    that element's ``class`` attribute. Only the first marker in each text
    node is consumed. Nothing inside ``<pre>`` or ``<code>`` is scanned.

    Args:
        html_str: The HTML source to format.

    Returns:
        The formatted HTML. If document parsing fails, the input is
        re-processed fragment by fragment; if that fails as well, the
        original ``html_str`` is returned unchanged.
    """
    verbatim_tags = ('pre', 'code')
    marker_re = re.compile(r'%%c:([^%]+)%%')
    indent_unit = "    "

    def process_text(text, owner):
        """Remove the first marker from *text* and add its class to *owner*."""
        # Markers inside pre/code are literal content, never directives.
        if owner is not None and owner.tag in verbatim_tags:
            return text
        match = marker_re.search(text)
        if match is None:
            return text
        class_name = match.group(1).strip()
        # Skip blank names and classes that are already present so the
        # attribute never ends up empty or with duplicates.
        if owner is not None and class_name:
            existing = owner.get('class', '')
            if not existing:
                owner.set('class', class_name)
            elif class_name not in existing.split():
                owner.set('class', f"{existing} {class_name}")
        # Only the first marker is removed, by design.
        return marker_re.sub('', text, count=1)

    def process_markup(element, skip=False):
        """Walk the tree and apply markers found in text and tail nodes.

        ``skip`` is True when some ancestor of *element* is pre/code. The
        tail of *element* lives in its parent's content, so the same flag
        also decides whether the tail may be processed.
        """
        # Comments and processing instructions expose a non-string tag.
        # NOTE(review): they are assumed to reject attribute writes, so
        # their own text is left alone - confirm against the lxml docs.
        if isinstance(element.tag, str):
            if not skip and element.text and '%%' in element.text:
                element.text = process_text(element.text, element)
            child_skip = skip or element.tag in verbatim_tags
            for child in element:
                process_markup(child, skip=child_skip)
        if not skip and element.tail and '%%' in element.tail:
            parent = element.getparent()
            if parent is not None:
                element.tail = process_text(element.tail, parent)

    def indent_preserving_pre(root):
        """Indent *root* without touching whitespace inside <pre> blocks."""
        # etree.indent() overwrites whitespace-only text/tail nodes, which
        # would alter preformatted content; remember those nodes first.
        saved = []
        for pre in root.iter('pre'):
            saved.append((pre, pre.text, None, False))
            saved.extend(
                (node, node.text, node.tail, True)
                for node in pre.iterdescendants()
            )
        etree.indent(root, space=indent_unit)
        for node, text, tail, with_tail in saved:
            if node.text != text:
                node.text = text
            if with_tail and node.tail != tail:
                node.tail = tail

    def join_pre_code(root):
        """Keep ``<pre><code>`` on one line and break after ``</code>``."""
        # iter() also visits root itself, so a bare <pre> fragment is handled.
        for pre in root.iter('pre'):
            if len(pre) > 0 and pre[0].tag == 'code':
                # Only whitespace is replaced; real text beside <code> stays.
                if not (pre.text or '').strip():
                    pre.text = None
                if not (pre[0].tail or '').strip():
                    pre[0].tail = '\n'

    # 1. Split off the DOCTYPE and whatever precedes it (e.g. comments).
    doctype_match = re.search(r'(<!DOCTYPE[^>]*>)', html_str, re.IGNORECASE)
    if doctype_match:
        doctype = doctype_match.group(1)
        before_doctype = html_str[:doctype_match.start()]
        after_doctype = html_str[doctype_match.end():]
    else:
        doctype = ''
        before_doctype = ''
        after_doctype = html_str

    try:
        # 2. Parse the rest as a full document (html/body are auto-completed).
        root = html2.document_fromstring(after_doctype)

        # 3. Indent with four spaces, leaving <pre> content untouched.
        indent_preserving_pre(root)

        # 4. Put the <pre><code> start tags on the same line.
        join_pre_code(root)

        # 5. Apply the custom %%c:class%% markers.
        process_markup(root)

        # 6. Serialize; indentation is already in the tree.
        formatted_root = etree.tostring(root, encoding='unicode', method='html')

        # 7. Re-attach the prefix and DOCTYPE; without a DOCTYPE no leading
        #    blank line is emitted.
        if doctype:
            return before_doctype + doctype + '\n' + formatted_root
        return formatted_root

    except Exception as e:
        logging.warning(f"完整文档解析失败,尝试片段模式: {e}")
        # Fallback: process fragment by fragment so no content is lost.
        try:
            fragments = html2.fragments_fromstring(html_str)
            pretty_parts = []
            for frag in fragments:
                if isinstance(frag, str):
                    pretty_parts.append(frag)
                    continue
                # Indentation is best-effort here: a failure is logged and
                # the fragment is still emitted unindented.
                try:
                    indent_preserving_pre(frag)
                except Exception as indent_err:
                    logging.debug(f"Fragment indentation skipped: {indent_err}")
                join_pre_code(frag)
                process_markup(frag)
                pretty_parts.append(
                    etree.tostring(frag, encoding='unicode', method='html')
                )
            return ''.join(pretty_parts)
        except Exception as e2:
            logging.error(f"格式化失败,保留原始内容: {e2}")
            return html_str
158220

159221
pretty_html = pretty_print_html(raw_html)
160222
with open(output_path, "w", encoding="utf-8") as f:
161223
f.write(pretty_html)
162224
console.print(pretty_html)
225+
163226
logging.info(f'Converted "{path}" to "{output_path}". OK!')
164227
logging.info("Finished main process.")
165228
return 0

0 commit comments

Comments
 (0)