11#!/usr/bin/env python
2- from __future__ import print_function
32import logging
43import re
54import sys
5+ import urllib .request
6+ import urllib .parse
7+ import urllib .error
68
9+ from lxml .etree import tostring
710from lxml .etree import tounicode
811from lxml .etree import _ElementTree
912from lxml .html import document_fromstring
1720from .htmls import get_title
1821from .htmls import get_author
1922from .htmls import shorten_title
20- from .compat import str_ , bytes_ , tostring_ , pattern_type
2123from .debug import describe , text_content
2224
2325
@@ -80,16 +82,16 @@ def text_length(i):
def compile_pattern(elements):
    """Build one compiled regex that matches any of the given literals.

    Args:
        elements: One of
            * a falsy value (``None``, ``""``, ``[]`` ...) -- no pattern;
            * an already compiled ``re.Pattern`` -- returned unchanged;
            * a ``str`` or ``bytes`` holding comma-separated literals
              (``bytes`` are decoded as UTF-8 first);
            * a ``list`` or ``tuple`` of string literals.

    Returns:
        ``None`` for falsy input, otherwise a compiled pattern that is the
        alternation of every literal, each stripped of surrounding
        whitespace and regex-escaped.

    Raises:
        Exception: if ``elements`` is none of the supported types.
    """
    if not elements:
        return None
    if isinstance(elements, re.Pattern):
        return elements

    if isinstance(elements, (str, bytes)):
        if isinstance(elements, bytes):
            elements = str(elements, "utf-8")
        # A plain string is a comma-separated list of literals.
        elements = elements.split(",")

    if isinstance(elements, (list, tuple)):
        # Escape each literal so it is matched verbatim, then OR them together.
        return re.compile("|".join([re.escape(x.strip()) for x in elements]), re.U)

    raise Exception(f"Unknown type for the pattern: {type(elements)}")
9496
9597
@@ -242,19 +244,15 @@ def summary(self, html_partial=False):
242244 log .info ("ruthless removal did not work. " )
243245 ruthless = False
244246 log .debug (
245- (
246247 "ended up stripping too much - "
247248 "going for a safer _parse"
248- )
249249 )
250250 # try again
251251 continue
252252 else :
253253 log .debug (
254- (
255254 "Ruthless and lenient parsing did not work. "
256255 "Returning raw html"
257- )
258256 )
259257 article = self .html .find ("body" )
260258 if article is None :
@@ -272,11 +270,7 @@ def summary(self, html_partial=False):
272270 return cleaned_article
273271 except Exception as e :
274272 log .exception ("error getting summary: " )
275- if sys .version_info [0 ] == 2 :
276- from .compat .two import raise_with_traceback
277- else :
278- from .compat .three import raise_with_traceback
279- raise_with_traceback (Unparseable , sys .exc_info ()[2 ], str_ (e ))
273+ raise Unparseable (str (e )).with_traceback (sys .exc_info ()[2 ])
280274
281275 def get_article (self , candidates , best_candidate , html_partial = False ):
282276 # Now that we have the top candidate, look through its siblings for
@@ -338,7 +332,7 @@ def select_best_candidate(self, candidates):
338332 )
339333 for candidate in sorted_candidates [:5 ]:
340334 elem = candidate ["elem" ]
341- log .debug ("Top 5 : % 6.3f %s" % (candidate ["content_score" ], describe (elem )))
335+ log .debug ("Top 5 : {: 6.3f} {}" . format (candidate ["content_score" ], describe (elem )))
342336
343337 best_candidate = sorted_candidates [0 ]
344338 return best_candidate
@@ -454,7 +448,7 @@ def score_node(self, elem):
454448
455449 def remove_unlikely_candidates (self ):
456450 for elem in self .html .findall (".//*" ):
457- s = "%s %s" % (elem .get ("class" , "" ), elem .get ("id" , "" ))
451+ s = "{} {}" . format (elem .get ("class" , "" ), elem .get ("id" , "" ))
458452 if len (s ) < 2 :
459453 continue
460454 if (
@@ -474,7 +468,8 @@ def transform_misused_divs_into_paragraphs(self):
474468 # This results in incorrect results in case there is an <img>
475469 # buried within an <a> for example
476470 if not REGEXES ["divToPElementsRe" ].search (
477- str_ (b"" .join (map (tostring_ , list (elem ))))
471+ str (b"" .join (tostring (s , encoding = 'utf-8' ) for s in elem ))
472+ # str(b"".join(map(tostring_, list(elem))))
478473 ):
479474 # log.debug("Altering %s to p" % (describe(elem)))
480475 elem .tag = "p"
@@ -501,13 +496,11 @@ def transform_misused_divs_into_paragraphs(self):
501496
def tags(self, node, *tag_names):
    """Yield every descendant of ``node`` whose tag is one of ``tag_names``.

    Matches are produced one tag name at a time, in the order the names
    were passed; within one name they come in the order ``node.findall``
    returns them. ``self`` is accepted but not used.

    Args:
        node: Element-like object exposing ``findall(path)``.
        *tag_names: Tag names to look up beneath ``node``.

    Yields:
        The matching descendant elements.
    """
    for tag_name in tag_names:
        # ".//name" selects matching elements at any depth below ``node``.
        yield from node.findall(".//%s" % tag_name)
506500
def reverse_tags(self, node, *tag_names):
    """Yield matching descendants of ``node`` in reverse order per tag name.

    Tag names are still processed in the order given; only the list of
    matches returned by ``node.findall`` for each individual name is
    reversed. ``self`` is accepted but not used.

    Args:
        node: Element-like object exposing ``findall(path)``.
        *tag_names: Tag names to look up beneath ``node``.

    Yields:
        The matching descendant elements, last match first for each name.
    """
    for tag_name in tag_names:
        yield from reversed(node.findall(".//%s" % tag_name))
511504
512505 def sanitize (self , node , candidates ):
513506 MIN_LEN = self .min_text_length
@@ -594,13 +587,13 @@ def sanitize(self, node, candidates):
594587 )
595588 to_remove = True
596589 elif weight < 25 and link_density > 0.2 :
597- reason = "too many links % .3f for its weight %s" % (
590+ reason = "too many links {: .3f} for its weight {}" . format (
598591 link_density ,
599592 weight ,
600593 )
601594 to_remove = True
602595 elif weight >= 25 and link_density > 0.5 :
603- reason = "too many links % .3f for its weight %s" % (
596+ reason = "too many links {: .3f} for its weight {}" . format (
604597 link_density ,
605598 weight ,
606599 )
@@ -726,18 +719,10 @@ def main():
726719 file = None
727720 if options .url :
728721 headers = {"User-Agent" : "Mozilla/5.0" }
729- if sys .version_info [0 ] == 3 :
730- import urllib .request , urllib .parse , urllib .error
731-
732- request = urllib .request .Request (options .url , None , headers )
733- file = urllib .request .urlopen (request )
734- else :
735- import urllib2
736-
737- request = urllib2 .Request (options .url , None , headers )
738- file = urllib2 .urlopen (request )
722+ request = urllib .request .Request (options .url , None , headers )
723+ file = urllib .request .urlopen (request )
739724 else :
740- file = open (args [0 ], "rt" )
725+ file = open (args [0 ])
741726 try :
742727 doc = Document (
743728 file .read (),
@@ -751,14 +736,8 @@ def main():
751736 result = "<h2>" + doc .short_title () + "</h2><br/>" + doc .summary ()
752737 open_in_browser (result )
753738 else :
754- enc = (
755- sys .__stdout__ .encoding or "utf-8"
756- ) # XXX: this hack could not always work, better to set PYTHONIOENCODING
757739 result = "Title:" + doc .short_title () + "\n " + doc .summary ()
758- if sys .version_info [0 ] == 3 :
759- print (result )
760- else :
761- print (result .encode (enc , "replace" ))
740+ print (result )
762741 finally :
763742 file .close ()
764743
0 commit comments