Merge pull request #190 from botlabsDev/add_option_to_keep_all_images

buriy · web-flow · commit 87f97436c78f · 2025-01-13T01:14:20.000+07:00
Fix issue #89: introduce a `keep_all_images` flag to keep images in the summary.
diff --git a/readability/readability.py b/readability/readability.py
@@ -210,12 +210,13 @@ def get_clean_html(self):
         """
         return clean_attributes(tounicode(self.html, method="html"))
 
-    def summary(self, html_partial=False):
+    def summary(self, html_partial=False, keep_all_images=False):
         """
         Given a HTML file, extracts the text of the article.
 
         :param html_partial: return only the div of the document, don't wrap
                              in html and body tags.
+        :param keep_all_images: if True, sanitize() skips its removal checks for divs that contain an image.
 
         Warning: It mutates internal DOM representation of the HTML document,
         so it is better to call other API methods before this one.
@@ -257,7 +258,7 @@ def summary(self, html_partial=False):
                         article = self.html.find("body")
                         if article is None:
                             article = self.html
-                cleaned_article = self.sanitize(article, candidates)
+                cleaned_article = self.sanitize(article, candidates, keep_all_images)
 
                 article_length = len(cleaned_article or "")
                 retry_length = self.retry_length
@@ -502,7 +503,7 @@ def reverse_tags(self, node, *tag_names):
         for tag_name in tag_names:
             yield from reversed(node.findall(".//%s" % tag_name))
 
-    def sanitize(self, node, candidates):
+    def sanitize(self, node, candidates, keep_all_images=False):
         MIN_LEN = self.min_text_length
         for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
             if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33:
@@ -563,8 +564,8 @@ def sanitize(self, node, candidates):
                 to_remove = False
                 reason = ""
 
-                # if el.tag == 'div' and counts["img"] >= 1:
-                #    continue
+                if keep_all_images and el.tag == 'div' and counts["img"] >= 1:
+                    continue
                 if counts["p"] and counts["img"] > 1 + counts["p"] * 1.3:
                     reason = "too many images (%s)" % counts["img"]
                     to_remove = True
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -1,4 +1,6 @@
 lxml
+lxml_html_clean
+pytest
 chardet
 nose
 pep8
diff --git a/tests/samples/summary-keep-all-images.sample.html b/tests/samples/summary-keep-all-images.sample.html
@@ -0,0 +1,29 @@
+<!DOCTYPE html>
+<html lang="en">
+<head></head>
+<body>
+<h2>
+    <span>
+        H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline H2 Headline
+    </span>
+</h2>
+<p>
+    <spa>
+        Text Text Text Text Text Text Text Text Text Text
+    </spa>
+</p>
+<div>
+    <span>
+        <a>
+            <img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAIAAAACUFjqAAABhGlDQ1BJQ0MgcHJvZmlsZQAAKJF9kT1Iw1AUhU9TpSLVDnYQcchQnSyIFXHUKhShQqgVWnUweekfNDEkKS6OgmvBwZ/FqoOLs64OroIg+APiLjgpukiJ9yWFFjFeeLyP8+45vHcfIDSqTLO6xgFNt81MKinm8iti6BUBhBFBPxIys4xZSUrDt77uqZvqLs6z/Pv+rD61YDEgIBLPMMO0ideJpzZtg/M+cZSVZZX4nHjMpAsSP3Jd8fiNc8llgWdGzWxmjjhKLJY6WOlgVjY14knimKrplC/kPFY5b3HWqjXWuid/YbigLy9xndYwUljAIiSIUFBDBVXYiNOuk2IhQ+dJH/+Q65fIpZCrAkaOeWxAg+z6wf/g92ytYmLCSwonge4Xx/kYAUK7QLPuON/HjtM8AYLPwJXe9m80gOlP0uttLXYERLaBi+u2puwBlzvA4JMhm7IrBWkJxSLwfkbflAcGboHeVW9urXOcPgBZmlX6Bjg4BEZLlL3m8+6ezrn929Oa3w9e03KfJqsuOAAAAAlwSFlzAAAuIwAALiMBeKU/dgAAAAd0SU1FB+kBDA8PKt1W5MYAAAAZdEVYdENvbW1lbnQAQ3JlYXRlZCB3aXRoIEdJTVBXgQ4XAAAAFUlEQVQY02P8x+rFgBswMeAFI1UaAJ65AWFYB2G5AAAAAElFTkSuQmCC"
+            />
+         </a>
+    </span>
+</div>
+<p>
+    <spa>
+        Text Text Text Text Text Text Text Text Text Text
+    </spa>
+</p>
+</body>
+</html>
diff --git a/tests/test_article_only.py b/tests/test_article_only.py
@@ -133,3 +133,24 @@ def test_author_absent(self):
         sample = load_sample("si-game.sample.html")
         doc = Document(sample)
         assert '[no-author]' == doc.author()
+
+    def test_keep_images_present(self):
+        sample = load_sample("summary-keep-all-images.sample.html")
+
+        doc = Document(sample)
+
+        assert "<img" in doc.summary(keep_all_images=True)
+
+    def test_keep_images_absent(self):
+        sample = load_sample("summary-keep-all-images.sample.html")
+
+        doc = Document(sample)
+
+        assert "<img" not in doc.summary(keep_all_images=False)
+
+    def test_keep_images_absent_by_default(self):
+        sample = load_sample("summary-keep-all-images.sample.html")
+
+        doc = Document(sample)
+
+        assert "<img" not in doc.summary()

-Original file line number
+Diff line change
@@ @@ -1,4 +1,6 @@ @@
 lxml
 +lxml_html_clean
 +pytest
 chardet
 nose
 pep8