1111import os
1212from lxml import html as html2
1313from lxml import etree
14+ import re
15+ from io import StringIO
1416
1517__version__ : str = "Beta 0.1"
1618
@@ -102,9 +104,11 @@ def main() -> int:
102104 title = title [0 ].text_content () if title else "Untitled"
103105 template_content = template_content .replace ("%%title%%" , title )
104106 html = template_content .replace ("%%content%%" , html )
107+ # 先写入原始HTML
105108 with open (output_path , "w" , encoding = "utf-8" ) as f :
106109 f .write (html )
107- # pretty print
110+
111+ # 询问是否格式化(此时文件已关闭,可安全读取)
108112 pretty_input : str = console .input (
109113 "Is it necessary to format the output HTML file?(Y/N): "
110114 )
@@ -113,53 +117,112 @@ def main() -> int:
113117 raw_html = f .read ()
114118
def pretty_print_html(html_str: str) -> str:
    """Pretty-print an HTML document and apply ``%%c:class%%`` markers.

    The text up to and including the first ``<!DOCTYPE ...>`` declaration is
    kept verbatim.  The remainder is parsed with lxml as a full document,
    indented with four spaces per level, and serialized back to a string.
    ``<pre><code>`` pairs are kept on one line, and whitespace inside
    ``<pre>`` is left exactly as it was parsed.

    A ``%%c:name%%`` marker found in an element's text (or in the tail text
    following a child) is removed and ``name`` is appended to the ``class``
    attribute of the element that contains that text.  Only the first marker
    of each text/tail string is handled.  Text inside ``<pre>``/``<code>``
    subtrees is never touched.

    Args:
        html_str: The HTML source to format.

    Returns:
        The formatted HTML.  If parsing as a document fails, the input is
        re-parsed as a list of fragments; if that fails too, ``html_str`` is
        returned unchanged.  Blank input is returned unchanged.
    """
    marker_re = re.compile(r'%%c:([^%]+)%%')
    verbatim_tags = ('pre', 'code')
    indent_unit = "    "

    def process_text(text, owner):
        """Strip the first marker from *text* and add its class to *owner*."""
        if owner is not None and owner.tag in verbatim_tags:
            return text
        match = marker_re.search(text)
        if match is None:
            return text
        class_name = match.group(1).strip()
        # Only the first marker is consumed; later ones stay in the text.
        new_text = marker_re.sub('', text, count=1)
        if owner is not None:
            existing = owner.get('class', '')
            if existing:
                owner.set('class', f"{existing} {class_name}")
            else:
                owner.set('class', class_name)
        return new_text

    def process_markup(element, skip=False):
        """Walk *element* recursively, handling markers in text and tail.

        *skip* is true when an ancestor is ``<pre>``/``<code>``; the tail of
        *element* lives inside that ancestor, so it is skipped as well.
        """
        # Comments and processing instructions expose a non-string ``tag``
        # and cannot carry attributes, so their own text is never an owner.
        is_real_element = isinstance(element.tag, str)
        if is_real_element and not skip:
            if element.text and '%%' in element.text:
                element.text = process_text(element.text, element)
        child_skip = skip or element.tag in verbatim_tags
        for child in element:
            process_markup(child, skip=child_skip)
        if not skip and element.tail and '%%' in element.tail:
            parent = element.getparent()
            if parent is not None:
                element.tail = process_text(element.tail, parent)

    def snapshot_pre_whitespace(root):
        """Record text/tail of everything inside each ``<pre>`` under *root*."""
        saved = []
        for pre in root.iter('pre'):
            # The tail of <pre> itself lies outside it, so only keep its text.
            saved.append((pre, pre.text, None, False))
            for node in pre.iterdescendants():
                saved.append((node, node.text, node.tail, True))
        return saved

    def format_tree(root, tolerate_indent_errors=False):
        """Indent *root* in place, protect ``<pre>`` content, apply markers."""
        saved = snapshot_pre_whitespace(root)
        try:
            etree.indent(root, space=indent_unit)
        except Exception as indent_error:
            if not tolerate_indent_errors:
                raise
            # Best effort in fragment mode: keep the fragment unindented.
            logging.warning(f"etree.indent failed, fragment left as-is: {indent_error}")

        # etree.indent rewrites whitespace-only text/tails everywhere, which
        # would alter preformatted content; put the parsed values back.
        for node, text, tail, has_tail in saved:
            if node.text != text:
                node.text = text
            if has_tail and node.tail != tail:
                node.tail = tail

        # Keep "<pre><code>" on one line and close "</pre>" on its own line,
        # without discarding real text that surrounds the <code> element.
        for pre in root.iter('pre'):
            if len(pre) > 0 and pre[0].tag == 'code':
                if not (pre.text and pre.text.strip()):
                    pre.text = None
                if not (pre[0].tail and pre[0].tail.strip()):
                    pre[0].tail = '\n'

        process_markup(root)

    # Nothing to format; avoids two failing parses and spurious log noise.
    if not html_str or not html_str.strip():
        return html_str

    # Split off everything up to and including the DOCTYPE (leading comments
    # included) so it can be re-attached verbatim after serialization.
    doctype_match = re.search(r'(<!DOCTYPE[^>]*>)', html_str, re.IGNORECASE)
    if doctype_match:
        prefix = html_str[:doctype_match.end()] + '\n'
        after_doctype = html_str[doctype_match.end():]
    else:
        # No DOCTYPE: emit no prefix at all (in particular no leading newline).
        prefix = ''
        after_doctype = html_str

    try:
        # document_fromstring completes missing <html>/<body> wrappers.
        root = html2.document_fromstring(after_doctype)
        format_tree(root)
        # Indentation was inserted manually, so pretty_print is not needed.
        formatted_root = etree.tostring(root, encoding='unicode', method='html')
        return prefix + formatted_root
    except Exception as e:
        logging.warning(f"完整文档解析失败,尝试片段模式: {e}")
        # Fallback: parse as fragments so that no content is lost.
        try:
            fragments = html2.fragments_fromstring(html_str)
            pretty_parts = []
            for frag in fragments:
                if isinstance(frag, str):
                    # Leading bare text is returned by lxml as a plain string.
                    pretty_parts.append(frag)
                else:
                    format_tree(frag, tolerate_indent_errors=True)
                    pretty_parts.append(
                        etree.tostring(frag, encoding='unicode', method='html')
                    )
            return ''.join(pretty_parts)
        except Exception as e2:
            logging.error(f"格式化失败,保留原始内容: {e2}")
            return html_str
158220
159221 pretty_html = pretty_print_html (raw_html )
160222 with open (output_path , "w" , encoding = "utf-8" ) as f :
161223 f .write (pretty_html )
162224 console .print (pretty_html )
225+
163226 logging .info (f'Converted "{ path } " to "{ output_path } ". OK!' )
164227 logging .info ("Finished main process." )
165228 return 0
0 commit comments