From 8cc9bb84632acdf0942667c7e7de02e5e2125946 Mon Sep 17 00:00:00 2001 From: Ahmed Khalid <106074266+ahmed-arb@users.noreply.github.com> Date: Wed, 3 Jul 2024 17:29:22 +0500 Subject: [PATCH 1/4] fix: update html cleaning rules --- openedx/core/djangolib/markup.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/openedx/core/djangolib/markup.py b/openedx/core/djangolib/markup.py index c214728d89e7..1336509eb629 100644 --- a/openedx/core/djangolib/markup.py +++ b/openedx/core/djangolib/markup.py @@ -5,6 +5,7 @@ import markupsafe import bleach +import re from lxml.html.clean import Cleaner from mako.filters import decode @@ -14,6 +15,14 @@ Text = markupsafe.escape # pylint: disable=invalid-name +class HTMLCleaner(Cleaner): + _is_url = re.compile(r"^(?:https?|ftp|file)://", re.I).search + def _remove_javascript_link(self, link: str): + if self._is_url(link.strip()): + return link + super()._remove_javascript_link(link) + + def HTML(html): # pylint: disable=invalid-name """ Mark a string as already HTML, so that it won't be escaped before output. @@ -70,6 +79,6 @@ def clean_dangerous_html(html): """ if not html: return html - cleaner = Cleaner(style=True, inline_style=False, safe_attrs_only=False) + cleaner = HTMLCleaner(style=True, inline_style=False, safe_attrs_only=False) html = cleaner.clean_html(html) return HTML(html) From 956f27547db077a917b57c0c6f333600914fc3e4 Mon Sep 17 00:00:00 2001 From: Ahmed Khalid <106074266+ahmed-arb@users.noreply.github.com> Date: Thu, 4 Jul 2024 12:47:21 +0500 Subject: [PATCH 2/4] chore: add docstrings and code cleaning --- openedx/core/djangolib/markup.py | 37 ++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/openedx/core/djangolib/markup.py b/openedx/core/djangolib/markup.py index 1336509eb629..fb38f7774314 100644 --- a/openedx/core/djangolib/markup.py +++ b/openedx/core/djangolib/markup.py @@ -16,9 +16,42 @@ class HTMLCleaner(Cleaner): - _is_url = re.compile(r"^(?:https?|ftp|file)://", re.I).search + """ + HTMLCleaner extends lxml.html.clean.Cleaner to sanitize HTML content while preserving valid URLs + and removing unsafe JavaScript links. + + Attributes: + ----------- + _is_url : Callable[[str], Optional[re.Match]] + A regular expression pattern used to identify valid URLs. This pattern matches strings that + start with 'http', 'https', 'ftp', or 'file' schemes, case-insensitively. + """ def _remove_javascript_link(self, link: str): - if self._is_url(link.strip()): + """ + Checks if the given link is a valid URL. If it is, the link is returned unchanged. + Otherwise, the method delegates to the parent class's method to remove the JavaScript link. + + Parameters: + ----------- + link : str + The hyperlink (href attribute value) to be checked and potentially sanitized. + + Returns: + -------- + Optional[str] + The original link if it is a valid URL; otherwise, the result of the parent class's method + to handle the link. + + Example: + -------- + 'https://www.example.com/javascript:something' Valid + 'javascript:alert("hello")' Invalid + 'http://example.com/path/to/page' Valid + 'ftp://ftp.example.com/resource' Valid + 'file://localhost/path/to/file' Valid + """ + is_url = re.compile(r"^(?:https?|ftp|file)://", re.I).search(link.strip()) + if is_url: return link super()._remove_javascript_link(link) From 592c7e3e1e753f33c561ff1acf7bed49430c91e9 Mon Sep 17 00:00:00 2001 From: Ahmed Khalid <106074266+ahmed-arb@users.noreply.github.com> Date: Thu, 4 Jul 2024 15:12:10 +0500 Subject: [PATCH 3/4] chore: fixes test cases and adds few new tests --- openedx/core/djangolib/tests/test_markup.py | 49 ++++++++++++++++++- .../wikimedia_features/messenger/apps.py | 1 - 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/openedx/core/djangolib/tests/test_markup.py b/openedx/core/djangolib/tests/test_markup.py index d3775deae652..d1cce2b89a65 100644 --- a/openedx/core/djangolib/tests/test_markup.py +++ b/openedx/core/djangolib/tests/test_markup.py @@ -11,7 +11,7 @@ from django.utils.translation import ngettext from mako.template import Template -from openedx.core.djangolib.markup import HTML, Text, strip_all_tags_but_br +from openedx.core.djangolib.markup import HTML, HTMLCleaner, Text, strip_all_tags_but_br @ddt.ddt @@ -157,3 +157,50 @@ def test_clean_dengers_html_filter(self): assert not html_soup.find('form') assert not html_soup.find('blink') assert not html_soup.find('object') + + +class TestHTMLCleaner(unittest.TestCase): + """ + Tests that Url links are being cleaned properly and no useful link is removed. + """ + + def setUp(self): + self.cleaner = HTMLCleaner(style=True, inline_style=False, safe_attrs_only=False) + + def test_valid_urls(self): + https_url = "https://example.com" + http_url = "http://example.com/path/to/page" + ftp_url = "ftp://ftp.example.com/resource" + file_url = "file://localhost/path/to/file" + + cleaned_url = self.cleaner._remove_javascript_link(https_url) + self.assertEqual(cleaned_url, https_url) + + cleaned_url = self.cleaner._remove_javascript_link(http_url) + self.assertEqual(cleaned_url, http_url) + + cleaned_url = self.cleaner._remove_javascript_link(ftp_url) + self.assertEqual(cleaned_url, ftp_url) + + cleaned_url = self.cleaner._remove_javascript_link(file_url) + self.assertEqual(cleaned_url, file_url) + + def test_javascript_link(self): + cleaned_url = self.cleaner._remove_javascript_link("javascript:alert('Hello')") + self.assertIsNone(cleaned_url) + + def test_mixed_case_scheme(self): + """ + Javascript can be executed this way so this code should be removed. + """ + url = "javascript:alert('hello') https://example.com" + cleaned_url = self.cleaner._remove_javascript_link(url) + self.assertIsNone(cleaned_url) + + def test_sub_scheme_match(self): + """ + Javascript cannot be executed this way so these urls are safe. + """ + url = "https://example.com/data:something" + cleaned_url = self.cleaner._remove_javascript_link(url) + self.assertEqual(cleaned_url, url) diff --git a/openedx/features/wikimedia_features/messenger/apps.py b/openedx/features/wikimedia_features/messenger/apps.py index 23f870cbd6cf..80cb14a8d375 100644 --- a/openedx/features/wikimedia_features/messenger/apps.py +++ b/openedx/features/wikimedia_features/messenger/apps.py @@ -20,7 +20,6 @@ class MessengerConfig(AppConfig): PluginSettings.CONFIG: { ProjectType.LMS: { SettingsType.COMMON: {PluginSettings.RELATIVE_PATH: 'settings.common'}, - SettingsType.TEST: {PluginSettings.RELATIVE_PATH: 'settings.test'}, } } } From c37c05016e3f58721b9276085e92a9eaed6975af Mon Sep 17 00:00:00 2001 From: Ahmed Khalid <106074266+ahmed-arb@users.noreply.github.com> Date: Fri, 5 Jul 2024 13:46:34 +0500 Subject: [PATCH 4/4] chore: simplify test cases and refactor docstrings --- openedx/core/djangolib/markup.py | 46 ++++++++----------- openedx/core/djangolib/tests/test_markup.py | 50 ++++++++------------- 2 files changed, 37 insertions(+), 59 deletions(-) diff --git a/openedx/core/djangolib/markup.py b/openedx/core/djangolib/markup.py index fb38f7774314..475d961686e3 100644 --- a/openedx/core/djangolib/markup.py +++ b/openedx/core/djangolib/markup.py @@ -17,43 +17,35 @@ class HTMLCleaner(Cleaner): """ - HTMLCleaner extends lxml.html.clean.Cleaner to sanitize HTML content while preserving valid URLs - and removing unsafe JavaScript links. - - Attributes: - ----------- - _is_url : Callable[[str], Optional[re.Match]] - A regular expression pattern used to identify valid URLs. This pattern matches strings that - start with 'http', 'https', 'ftp', or 'file' schemes, case-insensitively. + HTMLCleaner extends lxml.html.clean.Cleaner to sanitize HTML content while preserving + valid URLs and removing unsafe JavaScript links. """ def _remove_javascript_link(self, link: str): """ - Checks if the given link is a valid URL. If it is, the link is returned unchanged. - Otherwise, the method delegates to the parent class's method to remove the JavaScript link. + Overrides the parent class's method to preserve valid URLs. - Parameters: - ----------- - link : str - The hyperlink (href attribute value) to be checked and potentially sanitized. + This method uses a regular expression to identify valid URLs that start with 'http', 'https', + 'ftp', or 'file' schemes. If the link is a valid URL, it is returned unchanged. Otherwise, + the parent class's method is used to remove the JavaScript link. + + Args: + link (str): The hyperlink (href attribute value) to be checked and potentially sanitized. Returns: - -------- - Optional[str] - The original link if it is a valid URL; otherwise, the result of the parent class's method - to handle the link. - - Example: - -------- - 'https://www.example.com/javascript:something' Valid - 'javascript:alert("hello")' Invalid - 'http://example.com/path/to/page' Valid - 'ftp://ftp.example.com/resource' Valid - 'file://localhost/path/to/file' Valid + str: The original link or empty string. + + Examples: + Valid URLs: + 'https://www.example.com/javascript:something' + 'file://localhost/path/to/file' + Invalid URLs: + 'javascript:alert("hello")' + 'javascript:alert("hello") https://www.example.com/page' """ is_url = re.compile(r"^(?:https?|ftp|file)://", re.I).search(link.strip()) if is_url: return link - super()._remove_javascript_link(link) + return super()._remove_javascript_link(link) def HTML(html): # pylint: disable=invalid-name diff --git a/openedx/core/djangolib/tests/test_markup.py b/openedx/core/djangolib/tests/test_markup.py index d1cce2b89a65..acc5beb0890d 100644 --- a/openedx/core/djangolib/tests/test_markup.py +++ b/openedx/core/djangolib/tests/test_markup.py @@ -158,49 +158,35 @@ def test_clean_dengers_html_filter(self): assert not html_soup.find('blink') assert not html_soup.find('object') - +@ddt.ddt class TestHTMLCleaner(unittest.TestCase): """ Tests that Url links are being cleaned properly and no useful link is removed. """ def setUp(self): - self.cleaner = HTMLCleaner(style=True, inline_style=False, safe_attrs_only=False) - - def test_valid_urls(self): - https_url = "https://example.com" - http_url = "http://example.com/path/to/page" - ftp_url = "ftp://ftp.example.com/resource" - file_url = "file://localhost/path/to/file" - - cleaned_url = self.cleaner._remove_javascript_link(https_url) - self.assertEqual(cleaned_url, https_url) - - cleaned_url = self.cleaner._remove_javascript_link(http_url) - self.assertEqual(cleaned_url, http_url) - - cleaned_url = self.cleaner._remove_javascript_link(ftp_url) - self.assertEqual(cleaned_url, ftp_url) - - cleaned_url = self.cleaner._remove_javascript_link(file_url) - self.assertEqual(cleaned_url, file_url) + self.cleaner = HTMLCleaner() - def test_javascript_link(self): - cleaned_url = self.cleaner._remove_javascript_link("javascript:alert('Hello')") - self.assertIsNone(cleaned_url) - - def test_mixed_case_scheme(self): + @ddt.data( + "https://example.com/data:something", # Javascript cannot be executed this way so these urls are safe. + "http://example.com/path/to/page", + "ftp://ftp.example.com/resource", + "file://localhost/path/to/file", + ) + def test_valid_urls(self, url): """ - Javascript can be executed this way so this code should be removed. + Test that valid URLs are preserved. """ - url = "javascript:alert('hello') https://example.com" cleaned_url = self.cleaner._remove_javascript_link(url) - self.assertIsNone(cleaned_url) + self.assertEqual(cleaned_url, url) - def test_sub_scheme_match(self): + @ddt.data( + "javascript:alert('Hello')", + "mocha:some_code https://example.com", # Javascript can be executed this way so this code should be removed. + ) + def test_javascript_is_removed(self, url): """ - Javascript cannot be executed this way so these urls are safe. + Test that malicious Javascript is removed. """ - url = "https://example.com/data:something" cleaned_url = self.cleaner._remove_javascript_link(url) - self.assertEqual(cleaned_url, url) + self.assertEqual(cleaned_url, '')