Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: update html cleaning rules #444

Merged
merged 5 commits into from
Jul 10, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 43 additions & 1 deletion openedx/core/djangolib/markup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import markupsafe
import bleach
import re
from lxml.html.clean import Cleaner
from mako.filters import decode

Expand All @@ -14,6 +15,47 @@
Text = markupsafe.escape # pylint: disable=invalid-name


class HTMLCleaner(Cleaner):
"""
HTMLCleaner extends lxml.html.clean.Cleaner to sanitize HTML content while preserving valid URLs
and removing unsafe JavaScript links.

Attributes:
-----------
_is_url : Callable[[str], Optional[re.Match]]
A regular expression pattern used to identify valid URLs. This pattern matches strings that
start with 'http', 'https', 'ftp', or 'file' schemes, case-insensitively.
"""
def _remove_javascript_link(self, link: str):
"""
Checks if the given link is a valid URL. If it is, the link is returned unchanged.
ahmed-arb marked this conversation as resolved.
Show resolved Hide resolved
Otherwise, the method delegates to the parent class's method to remove the JavaScript link.

Parameters:
-----------
link : str
The hyperlink (href attribute value) to be checked and potentially sanitized.

Returns:
--------
Optional[str]
The original link if it is a valid URL; otherwise, the result of the parent class's method
to handle the link.

Example:
--------
'https://www.example.com/javascript:something' Valid
'javascript:alert("hello")' Invalid
'http://example.com/path/to/page' Valid
'ftp://ftp.example.com/resource' Valid
'file://localhost/path/to/file' Valid
"""
is_url = re.compile(r"^(?:https?|ftp|file)://", re.I).search(link.strip())
if is_url:
return link
super()._remove_javascript_link(link)
ahmed-arb marked this conversation as resolved.
Show resolved Hide resolved


def HTML(html): # pylint: disable=invalid-name
"""
Mark a string as already HTML, so that it won't be escaped before output.
Expand Down Expand Up @@ -70,6 +112,6 @@ def clean_dangerous_html(html):
"""
if not html:
return html
cleaner = Cleaner(style=True, inline_style=False, safe_attrs_only=False)
cleaner = HTMLCleaner(style=True, inline_style=False, safe_attrs_only=False)
html = cleaner.clean_html(html)
return HTML(html)
49 changes: 48 additions & 1 deletion openedx/core/djangolib/tests/test_markup.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from django.utils.translation import ngettext
from mako.template import Template

from openedx.core.djangolib.markup import HTML, Text, strip_all_tags_but_br
from openedx.core.djangolib.markup import HTML, HTMLCleaner, Text, strip_all_tags_but_br


@ddt.ddt
Expand Down Expand Up @@ -157,3 +157,50 @@ def test_clean_dengers_html_filter(self):
assert not html_soup.find('form')
assert not html_soup.find('blink')
assert not html_soup.find('object')


class TestHTMLCleaner(unittest.TestCase):
"""
Tests that Url links are being cleaned properly and no useful link is removed.
"""

def setUp(self):
self.cleaner = HTMLCleaner(style=True, inline_style=False, safe_attrs_only=False)
ahmed-arb marked this conversation as resolved.
Show resolved Hide resolved

def test_valid_urls(self):
ahmed-arb marked this conversation as resolved.
Show resolved Hide resolved
https_url = "https://example.com"
http_url = "http://example.com/path/to/page"
ftp_url = "ftp://ftp.example.com/resource"
file_url = "file://localhost/path/to/file"

cleaned_url = self.cleaner._remove_javascript_link(https_url)
self.assertEqual(cleaned_url, https_url)

cleaned_url = self.cleaner._remove_javascript_link(http_url)
self.assertEqual(cleaned_url, http_url)

cleaned_url = self.cleaner._remove_javascript_link(ftp_url)
self.assertEqual(cleaned_url, ftp_url)

cleaned_url = self.cleaner._remove_javascript_link(file_url)
self.assertEqual(cleaned_url, file_url)

def test_javascript_link(self):
cleaned_url = self.cleaner._remove_javascript_link("javascript:alert('Hello')")
self.assertIsNone(cleaned_url)

def test_mixed_case_scheme(self):
"""
Javascript can be executed this way so this code should be removed.
"""
url = "javascript:alert('hello') https://example.com"
cleaned_url = self.cleaner._remove_javascript_link(url)
self.assertIsNone(cleaned_url)

def test_sub_scheme_match(self):
"""
Javascript cannot be executed this way so these urls are safe.
"""
url = "https://example.com/data:something"
cleaned_url = self.cleaner._remove_javascript_link(url)
self.assertEqual(cleaned_url, url)
1 change: 0 additions & 1 deletion openedx/features/wikimedia_features/messenger/apps.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ class MessengerConfig(AppConfig):
PluginSettings.CONFIG: {
ProjectType.LMS: {
SettingsType.COMMON: {PluginSettings.RELATIVE_PATH: 'settings.common'},
SettingsType.TEST: {PluginSettings.RELATIVE_PATH: 'settings.test'},
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why we need to remove this?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This module does not exist.

}
}
}
Expand Down