Skip to content

Commit 1f22397

Browse files
authored
Merge pull request #187 from dbowring/pyproject2
Python Version Support, lxml[html_clean] fix, docs fix
2 parents 9ee6ef3 + acb2f3d commit 1f22397

10 files changed

Lines changed: 54 additions & 154 deletions

File tree

.travis.yml

Lines changed: 0 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -4,47 +4,16 @@ cache: pip
44

55
matrix:
66
include:
7-
- name: "Python 2.7 on Linux"
8-
python: 2.7
9-
env: PIP=pip
10-
- name: "Python 3.5 on Linux"
11-
python: 3.5
12-
- name: "Python 3.6 on Linux"
13-
python: 3.6
14-
- name: "Python 3.7 on Linux"
15-
python: 3.7
167
- name: "Python 3.8 on Linux"
178
dist: xenial
189
python: 3.8
1910
- name: "Python 3.9 Nightly on Linux"
2011
dist: bionic
2112
python: nightly
22-
- name: "Pypy on Linux"
23-
python: pypy
24-
env: PIP=pip
2513
- name: "Pypy 3 on Linux"
2614
python: pypy3
27-
- name: "Python 3.7 on older macOS"
28-
os: osx
29-
osx_image: xcode9.4
30-
language: shell
31-
env: TOXENV=py37
32-
before_install:
33-
- sw_vers
34-
- python3 --version
35-
- pip3 --version
36-
- name: "Python 3.7 on macOS"
37-
os: osx
38-
osx_image: xcode11
39-
language: shell
40-
env: TOXENV=py37
41-
before_install:
42-
- sw_vers
43-
- python3 --version
44-
- pip3 --version
4515
allow_failures:
4616
- python: nightly
47-
- python: pypy
4817
- python: pypy3
4918
- os: osx
5019

doc/source/conf.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
#!/usr/bin/env python3
2-
# -*- coding: utf-8 -*-
32
#
43
# readability documentation build configuration file, created by
54
# sphinx-quickstart on Thu Mar 23 16:29:38 2017.
@@ -38,7 +37,7 @@
3837
"sphinx.ext.doctest",
3938
"sphinx.ext.intersphinx",
4039
"sphinx.ext.todo",
41-
"recommonmark",
40+
"myst_parser",
4241
]
4342

4443
# Add any paths that contain templates here, relative to this directory.
@@ -72,7 +71,7 @@
7271
#
7372
# This is also used if you do content translation via gettext catalogs.
7473
# Usually you set "language" from the command line for these cases.
75-
language = None
74+
language = "en"
7675

7776
# List of patterns, relative to source directory, that match files and
7877
# directories to ignore when looking for source files.

readability/compat/__init__.py

Lines changed: 0 additions & 26 deletions
This file was deleted.

readability/compat/three.py

Lines changed: 0 additions & 6 deletions
This file was deleted.

readability/compat/two.py

Lines changed: 0 additions & 6 deletions
This file was deleted.

readability/encoding.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,10 @@ def get_encoding(page):
3939
for declared_encoding in declared_encodings:
4040
try:
4141
# Python3 only
42-
if sys.version_info[0] == 3:
43-
# declared_encoding will actually be bytes but .decode() only
44-
# accepts `str` type. Decode blindly with ascii because no one should
45-
# ever use non-ascii characters in the name of an encoding.
46-
declared_encoding = declared_encoding.decode("ascii", "replace")
42+
# declared_encoding will actually be bytes but .decode() only
43+
# accepts `str` type. Decode blindly with ascii because no one should
44+
# ever use non-ascii characters in the name of an encoding.
45+
declared_encoding = declared_encoding.decode("ascii", "replace")
4746

4847
encoding = fix_charset(declared_encoding)
4948
# Now let's decode the page

readability/htmls.py

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,12 @@
44

55
from .cleaners import normalize_spaces, clean_attributes
66
from .encoding import get_encoding
7-
from .compat import str_
87

98
utf8_parser = lxml.html.HTMLParser(encoding="utf-8")
109

1110

1211
def build_doc(page):
13-
if isinstance(page, str_):
12+
if isinstance(page, str):
1413
encoding = None
1514
decoded_page = page
1615
else:
@@ -30,14 +29,14 @@ def js_re(src, pattern, flags, repl):
3029

3130
def normalize_entities(cur_title):
3231
entities = {
33-
u"\u2014": "-",
34-
u"\u2013": "-",
35-
u"&mdash;": "-",
36-
u"&ndash;": "-",
37-
u"\u00A0": " ",
38-
u"\u00AB": '"',
39-
u"\u00BB": '"',
40-
u"&quot;": '"',
32+
"\u2014": "-",
33+
"\u2013": "-",
34+
"&mdash;": "-",
35+
"&ndash;": "-",
36+
"\u00A0": " ",
37+
"\u00AB": '"',
38+
"\u00BB": '"',
39+
"&quot;": '"',
4140
}
4241
for c, r in entities.items():
4342
if c in cur_title:

readability/readability.py

Lines changed: 24 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
#!/usr/bin/env python
2-
from __future__ import print_function
32
import logging
43
import re
54
import sys
5+
import urllib.request
6+
import urllib.parse
7+
import urllib.error
68

9+
from lxml.etree import tostring
710
from lxml.etree import tounicode
811
from lxml.etree import _ElementTree
912
from lxml.html import document_fromstring
@@ -17,7 +20,6 @@
1720
from .htmls import get_title
1821
from .htmls import get_author
1922
from .htmls import shorten_title
20-
from .compat import str_, bytes_, tostring_, pattern_type
2123
from .debug import describe, text_content
2224

2325

@@ -80,16 +82,16 @@ def text_length(i):
8082
def compile_pattern(elements):
8183
if not elements:
8284
return None
83-
elif isinstance(elements, pattern_type):
85+
elif isinstance(elements, re.Pattern):
8486
return elements
85-
elif isinstance(elements, (str_, bytes_)):
86-
if isinstance(elements, bytes_):
87-
elements = str_(elements, "utf-8")
88-
elements = elements.split(u",")
87+
elif isinstance(elements, (str, bytes)):
88+
if isinstance(elements, bytes):
89+
elements = str(elements, "utf-8")
90+
elements = elements.split(",")
8991
if isinstance(elements, (list, tuple)):
90-
return re.compile(u"|".join([re.escape(x.strip()) for x in elements]), re.U)
92+
return re.compile("|".join([re.escape(x.strip()) for x in elements]), re.U)
9193
else:
92-
raise Exception("Unknown type for the pattern: {}".format(type(elements)))
94+
raise Exception(f"Unknown type for the pattern: {type(elements)}")
9395
# assume string or string like object
9496

9597

@@ -242,19 +244,15 @@ def summary(self, html_partial=False):
242244
log.info("ruthless removal did not work. ")
243245
ruthless = False
244246
log.debug(
245-
(
246247
"ended up stripping too much - "
247248
"going for a safer _parse"
248-
)
249249
)
250250
# try again
251251
continue
252252
else:
253253
log.debug(
254-
(
255254
"Ruthless and lenient parsing did not work. "
256255
"Returning raw html"
257-
)
258256
)
259257
article = self.html.find("body")
260258
if article is None:
@@ -272,11 +270,7 @@ def summary(self, html_partial=False):
272270
return cleaned_article
273271
except Exception as e:
274272
log.exception("error getting summary: ")
275-
if sys.version_info[0] == 2:
276-
from .compat.two import raise_with_traceback
277-
else:
278-
from .compat.three import raise_with_traceback
279-
raise_with_traceback(Unparseable, sys.exc_info()[2], str_(e))
273+
raise Unparseable(str(e)).with_traceback(sys.exc_info()[2])
280274

281275
def get_article(self, candidates, best_candidate, html_partial=False):
282276
# Now that we have the top candidate, look through its siblings for
@@ -338,7 +332,7 @@ def select_best_candidate(self, candidates):
338332
)
339333
for candidate in sorted_candidates[:5]:
340334
elem = candidate["elem"]
341-
log.debug("Top 5 : %6.3f %s" % (candidate["content_score"], describe(elem)))
335+
log.debug("Top 5 : {:6.3f} {}".format(candidate["content_score"], describe(elem)))
342336

343337
best_candidate = sorted_candidates[0]
344338
return best_candidate
@@ -454,7 +448,7 @@ def score_node(self, elem):
454448

455449
def remove_unlikely_candidates(self):
456450
for elem in self.html.findall(".//*"):
457-
s = "%s %s" % (elem.get("class", ""), elem.get("id", ""))
451+
s = "{} {}".format(elem.get("class", ""), elem.get("id", ""))
458452
if len(s) < 2:
459453
continue
460454
if (
@@ -474,7 +468,8 @@ def transform_misused_divs_into_paragraphs(self):
474468
# This results in incorrect results in case there is an <img>
475469
# buried within an <a> for example
476470
if not REGEXES["divToPElementsRe"].search(
477-
str_(b"".join(map(tostring_, list(elem))))
471+
str(b"".join(tostring(s, encoding='utf-8') for s in elem))
472+
# str(b"".join(map(tostring_, list(elem))))
478473
):
479474
# log.debug("Altering %s to p" % (describe(elem)))
480475
elem.tag = "p"
@@ -501,13 +496,11 @@ def transform_misused_divs_into_paragraphs(self):
501496

502497
def tags(self, node, *tag_names):
503498
for tag_name in tag_names:
504-
for e in node.findall(".//%s" % tag_name):
505-
yield e
499+
yield from node.findall(".//%s" % tag_name)
506500

507501
def reverse_tags(self, node, *tag_names):
508502
for tag_name in tag_names:
509-
for e in reversed(node.findall(".//%s" % tag_name)):
510-
yield e
503+
yield from reversed(node.findall(".//%s" % tag_name))
511504

512505
def sanitize(self, node, candidates):
513506
MIN_LEN = self.min_text_length
@@ -594,13 +587,13 @@ def sanitize(self, node, candidates):
594587
)
595588
to_remove = True
596589
elif weight < 25 and link_density > 0.2:
597-
reason = "too many links %.3f for its weight %s" % (
590+
reason = "too many links {:.3f} for its weight {}".format(
598591
link_density,
599592
weight,
600593
)
601594
to_remove = True
602595
elif weight >= 25 and link_density > 0.5:
603-
reason = "too many links %.3f for its weight %s" % (
596+
reason = "too many links {:.3f} for its weight {}".format(
604597
link_density,
605598
weight,
606599
)
@@ -726,18 +719,10 @@ def main():
726719
file = None
727720
if options.url:
728721
headers = {"User-Agent": "Mozilla/5.0"}
729-
if sys.version_info[0] == 3:
730-
import urllib.request, urllib.parse, urllib.error
731-
732-
request = urllib.request.Request(options.url, None, headers)
733-
file = urllib.request.urlopen(request)
734-
else:
735-
import urllib2
736-
737-
request = urllib2.Request(options.url, None, headers)
738-
file = urllib2.urlopen(request)
722+
request = urllib.request.Request(options.url, None, headers)
723+
file = urllib.request.urlopen(request)
739724
else:
740-
file = open(args[0], "rt")
725+
file = open(args[0])
741726
try:
742727
doc = Document(
743728
file.read(),
@@ -751,14 +736,8 @@ def main():
751736
result = "<h2>" + doc.short_title() + "</h2><br/>" + doc.summary()
752737
open_in_browser(result)
753738
else:
754-
enc = (
755-
sys.__stdout__.encoding or "utf-8"
756-
) # XXX: this hack could not always work, better to set PYTHONIOENCODING
757739
result = "Title:" + doc.short_title() + "\n" + doc.summary()
758-
if sys.version_info[0] == 3:
759-
print(result)
760-
else:
761-
print(result.encode(enc, "replace"))
740+
print(result)
762741
finally:
763742
file.close()
764743

setup.py

Lines changed: 12 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,9 @@
11
#!/usr/bin/env python
22

3-
from __future__ import print_function
43
import codecs
54
import os
65
import re
76
from setuptools import setup
8-
import sys
9-
10-
lxml_requirement = "lxml"
11-
if sys.platform == "darwin":
12-
import platform
13-
14-
mac_ver = platform.mac_ver()[0]
15-
mac_major, mac_minor = mac_ver.split('.')[:2]
16-
if int(mac_major) == 10 and int(mac_minor) < 9:
17-
print("Using lxml<2.4")
18-
lxml_requirement = "lxml<2.4"
197

208
speed_deps = [
219
"cchardet",
@@ -59,8 +47,13 @@ def find_version(*file_paths):
5947
long_description_content_type='text/x-rst',
6048
license="Apache License 2.0",
6149
url="http://github.com/buriy/python-readability",
62-
packages=["readability", "readability.compat"],
63-
install_requires=["chardet", lxml_requirement, "cssselect"],
50+
packages=["readability"],
51+
install_requires=[
52+
"chardet",
53+
"lxml[html_clean]",
54+
"lxml-html-clean; python_version < '3.11'",
55+
"cssselect"
56+
],
6457
tests_require=test_deps,
6558
extras_require=extras,
6659
classifiers=[
@@ -72,12 +65,12 @@ def find_version(*file_paths):
7265
"Topic :: Internet",
7366
"Topic :: Software Development :: Libraries :: Python Modules",
7467
"Programming Language :: Python",
75-
"Programming Language :: Python :: 2",
76-
"Programming Language :: Python :: 2.7",
7768
"Programming Language :: Python :: 3",
78-
"Programming Language :: Python :: 3.5",
79-
"Programming Language :: Python :: 3.6",
80-
"Programming Language :: Python :: 3.7",
8169
"Programming Language :: Python :: 3.8",
70+
"Programming Language :: Python :: 3.9",
71+
"Programming Language :: Python :: 3.10",
72+
"Programming Language :: Python :: 3.11",
73+
"Programming Language :: Python :: 3.12",
74+
"Programming Language :: Python :: Implementation :: PyPy",
8275
],
8376
)

0 commit comments

Comments
 (0)