# htmls.py
# From a repository forked from timbertson/python-readability.
from lxml.html import tostring
import lxml.html
import re
from .cleaners import normalize_spaces, clean_attributes
from .encoding import get_encoding
# Shared lxml HTML parser configured with encoding="utf-8"; build_doc() hands
# it to lxml.html.document_fromstring() together with UTF-8 encoded bytes.
utf8_parser = lxml.html.HTMLParser(encoding="utf-8")
def build_doc(page):
    """Parse ``page`` into an lxml HTML document.

    :param page: markup as ``str``, or as bytes-like data that still has to
        be decoded.
    :return: ``(doc, encoding)``. ``encoding`` is ``None`` when ``page`` was
        already a ``str``; otherwise it is whatever ``get_encoding`` reported,
        or ``"utf-8"`` when that result was falsy.
    """
    encoding = None
    text = page
    if not isinstance(page, str):
        encoding = get_encoding(page) or "utf-8"
        text = page.decode(encoding, "replace")

    # XXX: the decode/encode round trip is done even for utf-8 pages so that
    # bad characters are replaced before the parser sees them.
    utf8_bytes = text.encode("utf-8", "replace")
    parsed = lxml.html.document_fromstring(utf8_bytes, parser=utf8_parser)
    return parsed, encoding
def js_re(src, pattern, flags, repl):
    """Apply a JavaScript-style regex replacement to ``src``.

    Every ``$`` in ``repl`` is turned into a backslash, so JavaScript group
    references such as ``$1`` become the ``\\1`` form that ``re`` expects.

    :param src: the string to search.
    :param pattern: regular expression source.
    :param flags: ``re`` flags passed to ``re.compile``.
    :param repl: replacement template using ``$n`` group references.
    :return: ``src`` with every match of ``pattern`` replaced.
    """
    # Pattern.sub takes (repl, string); the arguments used to be swapped,
    # which substituted *into* the replacement template instead of ``src``.
    return re.compile(pattern, flags).sub(repl.replace("$", "\\"), src)
def normalize_entities(cur_title):
    """Replace typographic characters and a few HTML entities with ASCII.

    Em/en dashes (as characters or as ``&mdash;``/``&ndash;`` entities)
    become ``-``, a no-break space becomes a plain space, and guillemets and
    ``&quot;`` become a double quote.

    :param cur_title: the title text to normalize.
    :return: the title with every listed sequence replaced.
    """
    # The entity keys must stay in their escaped textual form: they match
    # entities that survived in the title as literal text.
    replacements = {
        "\u2014": "-",  # em dash
        "\u2013": "-",  # en dash
        "&mdash;": "-",
        "&ndash;": "-",
        "\u00A0": " ",  # no-break space
        "\u00AB": '"',  # left-pointing guillemet
        "\u00BB": '"',  # right-pointing guillemet
        "&quot;": '"',
    }
    # str.replace is already a no-op when the needle is absent, so no
    # membership pre-check is needed.
    for needle, plain in replacements.items():
        cur_title = cur_title.replace(needle, plain)
    return cur_title
def norm_title(title):
    """Run ``title`` through ``normalize_spaces`` and then ``normalize_entities``."""
    spaced = normalize_spaces(title)
    return normalize_entities(spaced)
def get_title(doc):
    """Return the normalized text of the document's ``<title>`` element.

    :param doc: object whose ``find`` method locates the ``<title>`` element.
    :return: the title passed through ``norm_title``, or ``"[no-title]"``
        when the element is missing or its text is ``None`` or empty.
    """
    title_el = doc.find(".//title")
    if title_el is None:
        return "[no-title]"
    text = title_el.text
    if text is None or len(text) == 0:
        return "[no-title]"
    return norm_title(text)
def get_author(doc):
    """Return the ``content`` of the document's ``<meta name="author">`` tag.

    :param doc: object whose ``find`` method locates the author meta element.
    :return: the ``content`` attribute value, or ``"[no-author]"`` when the
        element is missing, has no ``content`` attribute, or it is empty.
    """
    meta = doc.find(".//meta[@name='author']")
    if meta is None or 'content' not in meta.keys():
        return "[no-author]"
    content = meta.get('content')
    if len(content) == 0:
        return "[no-author]"
    return content
def add_match(collection, text, orig):
    """Add ``text`` to ``collection`` if it is a plausible piece of ``orig``.

    ``text`` is first passed through ``norm_title``. It is kept only when it
    has at least two whitespace-separated words, is at least 15 characters
    long, and (ignoring double quotes) occurs inside ``orig``.

    :param collection: set-like object; mutated in place via ``add``.
    :param text: candidate title text.
    :param orig: the normalized full title to match against.
    """
    text = norm_title(text)
    if len(text.split()) < 2 or len(text) < 15:
        return
    unquoted_text = text.replace('"', "")
    unquoted_orig = orig.replace('"', "")
    if unquoted_text in unquoted_orig:
        collection.add(text)
# CSS selectors (ids first, then classes) that shorten_title() passes to
# doc.cssselect() to find elements whose text may be the article title.
TITLE_CSS_HEURISTICS = [
    "#title",
    "#head",
    "#heading",
    ".pageTitle",
    ".news_title",
    ".title",
    ".head",
    ".heading",
    ".contentheading",
    ".small_header_red",
]
def shorten_title(doc):
    """Return a shortened form of the document's ``<title>`` text.

    Text from ``h1``/``h2``/``h3`` elements and from the
    ``TITLE_CSS_HEURISTICS`` selectors is collected via ``add_match``; if any
    candidate survives, the longest one wins. Otherwise the title is split on
    common delimiters (`` | ``, `` - ``, `` :: ``, `` / ``, then ``: ``) and
    a sufficiently long first or last part is used.

    :param doc: parsed document supporting ``find``, ``iterfind`` and
        ``cssselect``.
    :return: ``""`` when there is no usable ``<title>``; the normalized full
        title when the shortened result falls outside the accepted length
        range; otherwise the shortened title.
    """
    title_el = doc.find(".//title")
    if title_el is None or title_el.text is None or len(title_el.text) == 0:
        return ""

    orig = norm_title(title_el.text)
    title = orig
    candidates = set()

    def collect(elements):
        # Offer both the element's own leading text and its full text content.
        for element in elements:
            if element.text:
                add_match(candidates, element.text, orig)
            content = element.text_content()
            if content:
                add_match(candidates, content, orig)

    for path in [".//h1", ".//h2", ".//h3"]:
        collect(doc.iterfind(path))
    for selector in TITLE_CSS_HEURISTICS:
        collect(doc.cssselect(selector))

    cjk = re.compile('[\u4e00-\u9fff]+')

    def substantial(part):
        # Four or more words, or four or more characters that include CJK
        # (CJK text is not whitespace-separated, so word count is useless).
        return (len(part.split()) >= 4) or (len(part) >= 4 and cjk.search(part))

    if candidates:
        title = sorted(candidates, key=len)[-1]
    else:
        for delimiter in [" | ", " - ", " :: ", " / "]:
            if delimiter not in title:
                continue
            pieces = orig.split(delimiter)
            head, tail = pieces[0], pieces[-1]
            if substantial(head):
                title = head
                break
            if substantial(tail):
                title = tail
                break
        else:
            # No delimiter produced a usable part; fall back to "prefix: title".
            if ": " in title:
                last_piece = orig.split(": ")[-1]
                if substantial(last_piece):
                    title = last_piece
                else:
                    title = orig.split(": ", 1)[1]

    length = len(title)
    if cjk.search(title):
        acceptable = 4 <= length < 100  # Allow length >= 4, cap at 100
    else:
        acceptable = 15 < length < 150
    return title if acceptable else orig
# Is this necessary? The lxml Cleaner is already initialized correctly in cleaners.py.
def get_body(doc):
    """Serialize the document body to an HTML string with attributes cleaned.

    Every ``script``, ``link`` and ``style`` element is dropped from ``doc``
    in place before serialization, so the caller's tree is modified.

    :param doc: parsed document supporting ``xpath`` and exposing ``body``.
    :return: the serialized markup after ``clean_attributes`` has been
        applied to it.
    """
    for elem in doc.xpath(".//script | .//link | .//style"):
        elem.drop_tree()
    # Serialize the whole document instead when ``doc.body`` is falsy.
    # FIXME: isn't it better to use tounicode?
    raw_html = tostring(doc.body or doc)
    if isinstance(raw_html, bytes):
        raw_html = raw_html.decode()
    # The former try/except wrapped only ``return cleaned``, which cannot
    # raise, so the fallback that returned ``raw_html`` was unreachable.
    return clean_attributes(raw_html)