From 0633e78696b9b00e38f2179ef4be392cf12ff693 Mon Sep 17 00:00:00 2001 From: Ahmed Khalid <106074266+ahmed-arb@users.noreply.github.com> Date: Wed, 10 Jul 2024 17:25:35 +0500 Subject: [PATCH] fix: update html cleaning rules (#444) * fix: update html cleaning rules * chore: add docstrings and code cleaning * chore: fixes test cases and adds few new tests * chore: simplify test cases and refactor docstrings --- openedx/core/djangolib/markup.py | 36 ++++++++++++++++++- openedx/core/djangolib/tests/test_markup.py | 35 +++++++++++++++++- .../wikimedia_features/messenger/apps.py | 1 - 3 files changed, 69 insertions(+), 3 deletions(-) diff --git a/openedx/core/djangolib/markup.py b/openedx/core/djangolib/markup.py index c214728d89e7..475d961686e3 100644 --- a/openedx/core/djangolib/markup.py +++ b/openedx/core/djangolib/markup.py @@ -5,6 +5,7 @@ import markupsafe import bleach +import re from lxml.html.clean import Cleaner from mako.filters import decode @@ -14,6 +15,39 @@ Text = markupsafe.escape # pylint: disable=invalid-name +class HTMLCleaner(Cleaner): + """ + HTMLCleaner extends lxml.html.clean.Cleaner to sanitize HTML content while preserving + valid URLs and removing unsafe JavaScript links. + """ + def _remove_javascript_link(self, link: str): + """ + Overrides the parent class's method to preserve valid URLs. + + This method uses a regular expression to identify valid URLs that start with 'http', 'https', + 'ftp', or 'file' schemes. If the link is a valid URL, it is returned unchanged. Otherwise, + the parent class's method is used to remove the JavaScript link. + + Args: + link (str): The hyperlink (href attribute value) to be checked and potentially sanitized. + + Returns: + str: The original link or empty string. + + Examples: + Valid URLs: + 'https://www.example.com/javascript:something' + 'file://localhost/path/to/file' + Invalid URLs: + 'javascript:alert("hello")' + 'javascript:alert("hello") https://www.example.com/page' + """ + is_url = re.compile(r"^(?:https?|ftp|file)://", re.I).search(link.strip()) + if is_url: + return link + return super()._remove_javascript_link(link) + + def HTML(html): # pylint: disable=invalid-name """ Mark a string as already HTML, so that it won't be escaped before output. @@ -70,6 +104,6 @@ def clean_dangerous_html(html): """ if not html: return html - cleaner = Cleaner(style=True, inline_style=False, safe_attrs_only=False) + cleaner = HTMLCleaner(style=True, inline_style=False, safe_attrs_only=False) html = cleaner.clean_html(html) return HTML(html) diff --git a/openedx/core/djangolib/tests/test_markup.py b/openedx/core/djangolib/tests/test_markup.py index d3775deae652..acc5beb0890d 100644 --- a/openedx/core/djangolib/tests/test_markup.py +++ b/openedx/core/djangolib/tests/test_markup.py @@ -11,7 +11,7 @@ from django.utils.translation import ngettext from mako.template import Template -from openedx.core.djangolib.markup import HTML, Text, strip_all_tags_but_br +from openedx.core.djangolib.markup import HTML, HTMLCleaner, Text, strip_all_tags_but_br @ddt.ddt @@ -157,3 +157,36 @@ def test_clean_dengers_html_filter(self): assert not html_soup.find('form') assert not html_soup.find('blink') assert not html_soup.find('object') + +@ddt.ddt +class TestHTMLCleaner(unittest.TestCase): + """ + Tests that Url links are being cleaned properly and no useful link is removed. + """ + + def setUp(self): + self.cleaner = HTMLCleaner() + + @ddt.data( + "https://example.com/data:something", # Javascript cannot be executed this way so these urls are safe. + "http://example.com/path/to/page", + "ftp://ftp.example.com/resource", + "file://localhost/path/to/file", + ) + def test_valid_urls(self, url): + """ + Test that valid URLs are preserved. + """ + cleaned_url = self.cleaner._remove_javascript_link(url) + self.assertEqual(cleaned_url, url) + + @ddt.data( + "javascript:alert('Hello')", + "mocha:some_code https://example.com", # Javascript can be executed this way so this code should be removed. + ) + def test_javascript_is_removed(self, url): + """ + Test that malicious Javascript is removed. + """ + cleaned_url = self.cleaner._remove_javascript_link(url) + self.assertEqual(cleaned_url, '') diff --git a/openedx/features/wikimedia_features/messenger/apps.py b/openedx/features/wikimedia_features/messenger/apps.py index 23f870cbd6cf..80cb14a8d375 100644 --- a/openedx/features/wikimedia_features/messenger/apps.py +++ b/openedx/features/wikimedia_features/messenger/apps.py @@ -20,7 +20,6 @@ class MessengerConfig(AppConfig): PluginSettings.CONFIG: { ProjectType.LMS: { SettingsType.COMMON: {PluginSettings.RELATIVE_PATH: 'settings.common'}, - SettingsType.TEST: {PluginSettings.RELATIVE_PATH: 'settings.test'}, } } }