Skip to content

Commit 97402b5

Browse files
beledouxdenis authored and KangOl committed
FIX: Allow multiple inlined image data links in html clean
Add a lazy quantifier to the `_find_image_dataurls` regex so that it matches as few characters as possible and stops at the first occurrence of `;base64,`.

Before (greedy), a single match swallows everything up to the last `;base64,`:
```py
>>> _find_image_dataurls = re.compile(r'data:image/(.+);base64,', re.I).findall
>>> _find_image_dataurls('<div style="background: url(data:image/jpeg;base64,foo); background-image: url(data:image/jpeg;base64,foo);"></div>')
['jpeg;base64,foo); background-image: url(data:image/jpeg']
```

After (lazy), each data URL is matched separately:
```py
>>> _find_image_dataurls = re.compile(r'data:image/(.+?);base64,', re.I).findall
>>> _find_image_dataurls('<div style="background: url(data:image/jpeg;base64,foo); background-image: url(data:image/jpeg;base64,foo);"></div>')
['jpeg', 'jpeg']
```

This allows multiple image data links to appear on the same line, which happens, for instance, in inline styles. Without this change, `_has_javascript_scheme` returns `True` because the count of safe image URLs is lower than the number of possibly malicious schemes. The whole style is then dropped because it is considered malicious.

Co-authored-by: Christophe Simonis <chs@odoo.com>
1 parent 2dfd5ac commit 97402b5

2 files changed

Lines changed: 26 additions & 1 deletion

File tree

lxml_html_clean/clean.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -54,7 +54,7 @@
5454
# All kinds of schemes besides just javascript: that can cause
5555
# execution:
5656
_find_image_dataurls = re.compile(
57-
r'data:image/(.+);base64,', re.I).findall
57+
r'data:image/(.+?);base64,', re.I).findall
5858
_possibly_malicious_schemes = re.compile(
5959
r'(javascript|jscript|livescript|vbscript|data|about|mocha):',
6060
re.I).findall

tests/test_clean.py

Lines changed: 25 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -255,6 +255,31 @@ def test_image_data_links_in_style(self):
255255
cleaned,
256256
"%s -> %s" % (url, cleaned))
257257

258+
def test_image_data_links_in_inline_style(self):
259+
safe_attrs = set(lxml.html.defs.safe_attrs)
260+
safe_attrs.add('style')
261+
262+
cleaner = Cleaner(
263+
safe_attrs_only=True,
264+
safe_attrs=safe_attrs)
265+
266+
data = b'123'
267+
data_b64 = base64.b64encode(data).decode('ASCII')
268+
url = "url(data:image/jpeg;base64,%s)" % data_b64
269+
styles = [
270+
"background: %s" % url,
271+
"background: %s; background-image: %s" % (url, url),
272+
]
273+
for style in styles:
274+
html = '<div style="%s"></div>' % style
275+
s = lxml.html.fragment_fromstring(html)
276+
277+
cleaned = lxml.html.tostring(cleaner.clean_html(s))
278+
self.assertEqual(
279+
html.encode("UTF-8"),
280+
cleaned,
281+
"%s -> %s" % (style, cleaned))
282+
258283
def test_formaction_attribute_in_button_input(self):
259284
# The formaction attribute overrides the form's action and should be
260285
# treated as a malicious link attribute

0 commit comments

Comments
 (0)