python-readability/readability/htmls.py at c8d8011f3d4c69d7667a52395237e56e66af8ea4 · buriy/python-readability · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
from lxml.html import tostring
import lxml.html
import re

from .cleaners import normalize_spaces, clean_attributes
from .encoding import get_encoding

# Shared lxml HTML parser configured for UTF-8 input; build_doc re-encodes
# every page to UTF-8 bytes before handing it to this parser.
utf8_parser = lxml.html.HTMLParser(encoding="utf-8")


def build_doc(page):
    """Parse ``page`` into an lxml HTML document.

    ``page`` may be a ``str`` or a bytes-like object exposing ``.decode``.
    For non-``str`` input the encoding is taken from ``get_encoding(page)``,
    falling back to ``"utf-8"``, and undecodable bytes are replaced.

    Returns a ``(doc, encoding)`` tuple; ``encoding`` is ``None`` when
    ``page`` was already a ``str``.
    """
    encoding = None
    text = page
    if not isinstance(page, str):
        encoding = get_encoding(page) or "utf-8"
        text = page.decode(encoding, "replace")

    # XXX: the decode/encode round trip is done even for utf-8 pages so that
    # bad characters are replaced before lxml sees the data.
    utf8_bytes = text.encode("utf-8", "replace")
    doc = lxml.html.document_fromstring(utf8_bytes, parser=utf8_parser)
    return doc, encoding


def js_re(src, pattern, flags, repl):
    """Substitute ``pattern`` in ``src`` using a JavaScript-style replacement.

    Args:
        src: The string to search.
        pattern: Regular expression source passed to ``re.compile``.
        flags: ``re`` flags passed to ``re.compile``.
        repl: Replacement template; every ``$`` is rewritten to a backslash
            so that group references such as ``$1`` become ``\\1``.

    Returns:
        ``src`` with every match of ``pattern`` replaced.
    """
    python_repl = repl.replace("$", "\\")
    # Pattern.sub takes (repl, string); the previous code passed them swapped,
    # which used the source text as the replacement template.
    return re.compile(pattern, flags).sub(python_repl, src)


def normalize_entities(cur_title):
    """Replace typographic characters and a few HTML entities with ASCII.

    Em and en dashes (as characters or as ``&mdash;``/``&ndash;``) become
    ``-``, a no-break space becomes a regular space, and guillemets and
    ``&quot;`` become a double quote. Replacements are applied one after
    another in the order listed below.
    """
    replacements = (
        ("\u2014", "-"),
        ("\u2013", "-"),
        ("&mdash;", "-"),
        ("&ndash;", "-"),
        ("\u00A0", " "),
        ("\u00AB", '"'),
        ("\u00BB", '"'),
        ("&quot;", '"'),
    )
    for needle, substitute in replacements:
        # str.replace leaves the string unchanged when the needle is absent.
        cur_title = cur_title.replace(needle, substitute)
    return cur_title


def norm_title(title):
    """Return ``title`` passed through normalize_spaces, then normalize_entities."""
    spaced = normalize_spaces(title)
    return normalize_entities(spaced)


def get_title(doc):
    """Return the normalized text of the first ``<title>`` under ``doc``.

    Returns the placeholder ``"[no-title]"`` when there is no ``<title>``
    element or its text is missing or empty.
    """
    title_el = doc.find(".//title")
    raw_text = None if title_el is None else title_el.text
    if not raw_text:
        return "[no-title]"
    return norm_title(raw_text)


def get_author(doc):
    """Return the ``content`` of the first ``<meta name='author'>`` under ``doc``.

    Returns the placeholder ``"[no-author]"`` when the element is absent,
    has no ``content`` attribute, or the attribute is empty.
    """
    meta = doc.find(".//meta[@name='author']")
    # .get() yields None for a missing attribute, so one falsy check covers
    # the missing-element, missing-attribute and empty-attribute cases.
    content = None if meta is None else meta.get('content')
    if not content:
        return "[no-author]"
    return content


def add_match(collection, text, orig):
    """Add normalized ``text`` to ``collection`` if it is a plausible title.

    The normalized text must have at least two whitespace-separated words,
    be at least 15 characters long, and occur inside ``orig`` when double
    quotes are ignored on both sides. ``collection`` is mutated in place.
    """
    candidate = norm_title(text)
    if len(candidate.split()) < 2 or len(candidate) < 15:
        return
    if candidate.replace('"', "") in orig.replace('"', ""):
        collection.add(candidate)


# CSS id and class selectors whose matching elements shorten_title inspects
# as possible sources of a shorter page title.
TITLE_CSS_HEURISTICS = [
    "#title",
    "#head",
    "#heading",
    ".pageTitle",
    ".news_title",
    ".title",
    ".head",
    ".heading",
    ".contentheading",
    ".small_header_red",
]


def shorten_title(doc):
    """Return a shortened form of the document's ``<title>`` text.

    Candidates are gathered from ``h1``/``h2``/``h3`` elements and from the
    TITLE_CSS_HEURISTICS selectors; add_match keeps only those contained in
    the normalized title. If any candidate exists, the longest one is used.
    Otherwise the title is split on common site-name delimiters and the
    first or last part is kept when it is long enough.

    Returns ``""`` when the document has no usable ``<title>``, and the full
    normalized title when the shortened result falls outside the accepted
    length range.
    """
    title_el = doc.find(".//title")
    if title_el is None or not title_el.text:
        return ""

    orig = norm_title(title_el.text)
    title = orig
    candidates = set()

    def collect(elements):
        # Offer both the element's own leading text and its full text content.
        for element in elements:
            if element.text:
                add_match(candidates, element.text, orig)
            full_text = element.text_content()
            if full_text:
                add_match(candidates, full_text, orig)

    for path in (".//h1", ".//h2", ".//h3"):
        collect(doc.iterfind(path))
    for selector in TITLE_CSS_HEURISTICS:
        collect(doc.cssselect(selector))

    cjk = re.compile('[\u4e00-\u9fff]+')

    def long_enough(fragment):
        # At least four words, or at least four characters with CJK text.
        return len(fragment.split()) >= 4 or (
            len(fragment) >= 4 and cjk.search(fragment)
        )

    if candidates:
        title = sorted(candidates, key=len)[-1]
    else:
        for delimiter in (" | ", " - ", " :: ", " / "):
            if delimiter not in title:
                continue
            pieces = orig.split(delimiter)
            first, last = pieces[0], pieces[-1]
            if long_enough(first):
                title = first
                break
            if long_enough(last):
                title = last
                break
        else:
            # No delimiter produced a usable part; fall back to a colon split.
            if ": " in title:
                tail = orig.split(": ")[-1]
                if long_enough(tail):
                    title = tail
                else:
                    title = orig.split(": ", 1)[1]

    # CJK titles are accepted from 4 up to (but excluding) 100 characters;
    # all others must be strictly between 15 and 150 characters.
    if cjk.search(title):
        acceptable = 4 <= len(title) < 100
    else:
        acceptable = 15 < len(title) < 150

    return title if acceptable else orig


# is it necessary? Cleaner from LXML is initialized correctly in cleaners.py
def get_body(doc):
    """Serialize the document body to an HTML string with cleaned attributes.

    Every ``<script>``, ``<link>`` and ``<style>`` element is removed from
    ``doc`` in place first. The ``<body>`` element is then serialized, or
    ``doc`` itself when ``doc.body`` is ``None``, and the markup is passed
    through clean_attributes.

    Returns:
        The cleaned HTML as a ``str``.
    """
    for elem in doc.xpath(".//script | .//link | .//style"):
        elem.drop_tree()

    # Compare against None explicitly: an lxml element without child elements
    # is falsy, so ``doc.body or doc`` serialized the whole document whenever
    # the body held only text.
    body = doc.body
    root = doc if body is None else body

    # tostring() is called without a text encoding and may return bytes;
    # decode so the attribute cleaner always receives a str.
    raw_html = tostring(root)
    if isinstance(raw_html, bytes):
        raw_html = raw_html.decode()

    # The former try/except wrapped only ``return cleaned``, which cannot
    # raise, so its fallback to raw_html was unreachable and is dropped.
    return clean_attributes(raw_html)