wikimedia · ahmed-arb · Jul 10, 2024 · Jul 3, 2024 · Jul 4, 2024 · Jul 4, 2024
diff --git a/openedx/core/djangolib/markup.py b/openedx/core/djangolib/markup.py
@@ -5,6 +5,7 @@
 
 import markupsafe
 import bleach
+import re
 from lxml.html.clean import Cleaner
 from mako.filters import decode
 
@@ -14,6 +15,47 @@
 Text = markupsafe.escape                        # pylint: disable=invalid-name
 
 
+class HTMLCleaner(Cleaner):
+    """
+    HTMLCleaner extends lxml.html.clean.Cleaner to sanitize HTML content while preserving valid URLs
+    and removing unsafe JavaScript links.
+
+    Attributes:
+    -----------
+    _is_url : Callable[[str], Optional[re.Match]]
+        Search function of a compiled pattern that matches strings starting with an 'http',
+        'https', 'ftp', or 'file' scheme followed by '://', case-insensitively.
+    """
+    _is_url = re.compile(r"^(?:https?|ftp|file)://", re.I).search
+
+    def _remove_javascript_link(self, link: str):
+        """
+        Return the link unchanged when it is an absolute URL with an allowed scheme; otherwise
+        defer to the parent class's JavaScript check and return its verdict.
+
+        Parameters:
+        -----------
+        link : str
+            The hyperlink (href attribute value) to be checked and potentially sanitized.
+
+        Returns:
+        --------
+        Optional[str]
+            The original link if it is an allowed URL or the parent check accepts it; None if unsafe.
+
+        Example:
+        --------
+        'https://www.example.com/javascript:something'   Valid
+        'javascript:alert("hello")' Invalid
+        'http://example.com/page', 'ftp://ftp.example.com/resource', 'file://localhost/file'   Valid
+        """
+        if self._is_url(link.strip()):
+            return link
+        # The parent's verdict must be returned: dropping it turned every other link into None.
+        # lxml reports an unsafe link as ''; map that to None so the attribute is still removed.
+        return super()._remove_javascript_link(link) or None
+
+
 def HTML(html):                                 # pylint: disable=invalid-name
     """
     Mark a string as already HTML, so that it won't be escaped before output.
@@ -70,6 +112,6 @@ def clean_dangerous_html(html):
     """
     if not html:
         return html
-    cleaner = Cleaner(style=True, inline_style=False, safe_attrs_only=False)
+    cleaner = HTMLCleaner(style=True, inline_style=False, safe_attrs_only=False)
     html = cleaner.clean_html(html)
     return HTML(html)
diff --git a/openedx/core/djangolib/tests/test_markup.py b/openedx/core/djangolib/tests/test_markup.py
@@ -11,7 +11,7 @@
 from django.utils.translation import ngettext
 from mako.template import Template
 
-from openedx.core.djangolib.markup import HTML, Text, strip_all_tags_but_br
+from openedx.core.djangolib.markup import HTML, HTMLCleaner, Text, strip_all_tags_but_br
 
 
 @ddt.ddt
@@ -157,3 +157,50 @@ def test_clean_dengers_html_filter(self):
         assert not html_soup.find('form')
         assert not html_soup.find('blink')
         assert not html_soup.find('object')
+
+
+class TestHTMLCleaner(unittest.TestCase):
+    """
+    Exercises HTMLCleaner's link filtering: allowed URLs must come back untouched and
+    javascript links must be dropped.
+    """
+
+    def setUp(self):
+        # Same keyword arguments that clean_dangerous_html passes to the cleaner.
+        self.cleaner = HTMLCleaner(style=True, inline_style=False, safe_attrs_only=False)
+
+    def _clean(self, link):
+        """Run a single link through the cleaner's javascript-link hook."""
+        # The private hook is the unit under test, so it is called directly.
+        return self.cleaner._remove_javascript_link(link)
+
+    def test_valid_urls(self):
+        """Each allowed scheme is returned exactly as given."""
+        allowed_links = (
+            "https://example.com",
+            "http://example.com/path/to/page",
+            "ftp://ftp.example.com/resource",
+            "file://localhost/path/to/file",
+        )
+        for candidate in allowed_links:
+            with self.subTest(link=candidate):
+                self.assertEqual(self._clean(candidate), candidate)
+
+    def test_javascript_link(self):
+        """A plain javascript: link yields None."""
+        self.assertIsNone(self._clean("javascript:alert('Hello')"))
+
+    def test_mixed_case_scheme(self):
+        """
+        Javascript can be executed this way so this code should be removed.
+        The link starts with a javascript: scheme and only embeds an https URL afterwards.
+        """
+        unsafe_link = "javascript:alert('hello') https://example.com"
+        self.assertIsNone(self._clean(unsafe_link))
+
+    def test_sub_scheme_match(self):
+        """
+        Javascript cannot be executed this way so these urls are safe.
+        """
+        safe_link = "https://example.com/data:something"
+        self.assertEqual(self._clean(safe_link), safe_link)
diff --git a/openedx/features/wikimedia_features/messenger/apps.py b/openedx/features/wikimedia_features/messenger/apps.py
@@ -20,7 +20,6 @@ class MessengerConfig(AppConfig):
         PluginSettings.CONFIG: {
             ProjectType.LMS: {
                 SettingsType.COMMON: {PluginSettings.RELATIVE_PATH: 'settings.common'},
-                SettingsType.TEST: {PluginSettings.RELATIVE_PATH: 'settings.test'},
             }
         }
     }