@@ -210,12 +210,13 @@ def get_clean_html(self):
210210 """
211211 return clean_attributes (tounicode (self .html , method = "html" ))
212212
213- def summary (self , html_partial = False ):
213+ def summary (self , html_partial = False , keep_all_images = False ):
214214 """
215215 Given a HTML file, extracts the text of the article.
216216
217217 :param html_partial: return only the div of the document, don't wrap
218218 in html and body tags.
219+ :param keep_all_images: if True, div elements that contain at least one image are exempted from the content-based removal checks in sanitize().
219220
220221 Warning: It mutates internal DOM representation of the HTML document,
221222 so it is better to call other API methods before this one.
@@ -257,7 +258,7 @@ def summary(self, html_partial=False):
257258 article = self .html .find ("body" )
258259 if article is None :
259260 article = self .html
260- cleaned_article = self .sanitize (article , candidates )
261+ cleaned_article = self .sanitize (article , candidates , keep_all_images )
261262
262263 article_length = len (cleaned_article or "" )
263264 retry_length = self .retry_length
@@ -502,7 +503,7 @@ def reverse_tags(self, node, *tag_names):
502503 for tag_name in tag_names :
503504 yield from reversed (node .findall (".//%s" % tag_name ))
504505
505- def sanitize (self , node , candidates ):
506+ def sanitize (self , node , candidates , keep_all_images = False ):
506507 MIN_LEN = self .min_text_length
507508 for header in self .tags (node , "h1" , "h2" , "h3" , "h4" , "h5" , "h6" ):
508509 if self .class_weight (header ) < 0 or self .get_link_density (header ) > 0.33 :
@@ -563,8 +564,8 @@ def sanitize(self, node, candidates):
563564 to_remove = False
564565 reason = ""
565566
566- # if el.tag == 'div' and counts["img"] >= 1:
567- # continue
567+ if keep_all_images and el .tag == 'div' and counts ["img" ] >= 1 :
568+ continue
568569 if counts ["p" ] and counts ["img" ] > 1 + counts ["p" ] * 1.3 :
569570 reason = "too many images (%s)" % counts ["img" ]
570571 to_remove = True
0 commit comments