@@ -149,6 +149,7 @@ def test_utf8_kanji(self):
149149 sample = load_sample ("utf-8-kanji.sample.html" )
150150 doc = Document (sample )
151151 res = doc .summary ()
152+ assert 0 < len (res ) < 10000
152153
153154 def test_author_present (self ):
154155 sample = load_sample ("the-hurricane-rubin-carter-denzel-washington.html" )
@@ -180,3 +181,53 @@ def test_keep_images_absent_by_defautl(self):
180181 doc = Document (sample )
181182
182183 assert "<img" not in doc .summary ()
184+
def test_cjk_summary(self):
    """Check that CJK paragraphs survive summary extraction.

    Builds a small page whose ``article-content`` div holds Chinese,
    Japanese, Korean and English paragraphs, placed between two unrelated
    divs, and inspects the string returned by ``Document.summary()``.
    """
    html = """
    <html>
    <head>
        <title>这是标题</title>
    </head>
    <body>
        <div>一些无关紧要的内容</div>
        <div class="article-content">
            <h1>主要文章标题</h1>
            <p>这是主要内容的第一段。</p>
            <p>これはコンテンツの第2段落です。</p>
            <p>이것은 콘텐츠의 세 번째 단락입니다.</p>
            <p>This is the fourth paragraph.</p>
        </div>
        <div>More irrelevant stuff</div>
    </body>
    </html>
    """
    doc = Document(html)
    summary = doc.summary()

    # The expected fragments omit each sentence's final punctuation mark.
    # assertIn/assertNotIn are used instead of assertTrue(x in y) so that a
    # failure prints both the fragment and the summary it was searched in.
    self.assertIn("这是主要内容的第一段", summary)
    self.assertIn("これはコンテンツの第2段落です", summary)
    self.assertIn("이것은 콘텐츠의 세 번째 단락입니다", summary)

    # Only the leading Chinese filler div is checked for removal; the
    # trailing "More irrelevant stuff" div is not asserted on.
    self.assertNotIn("一些无关紧要的内容", summary)
213+
def test_shorten_title_delimiter_bug(self):
    """Check short_title() on a title split by the " | " delimiter.

    The <title> is "Short Part | これは長いです" and the test expects the
    segment after the delimiter to be returned.

    Regression guard: per the original author's note, a misspelled
    variable ('p1' where 'pl' was meant) could make this call raise
    NameError.
    """
    page = """
    <html>
    <head>
        <title>Short Part | これは長いです</title>
    </head>
    <body>
        <div>Content</div>
    </body>
    </html>
    """
    # Calling short_title() is itself part of the check: with the
    # misspelled variable it is expected to fail with NameError before the
    # comparison below is reached.
    shortened = Document(page).short_title()
    self.assertEqual(shortened, "これは長いです")
0 commit comments