From 8cc9bb84632acdf0942667c7e7de02e5e2125946 Mon Sep 17 00:00:00 2001
From: Ahmed Khalid <106074266+ahmed-arb@users.noreply.github.com>
Date: Wed, 3 Jul 2024 17:29:22 +0500
Subject: [PATCH 1/4] fix: update html cleaning rules

---
 openedx/core/djangolib/markup.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/openedx/core/djangolib/markup.py b/openedx/core/djangolib/markup.py
index c214728d89e7..1336509eb629 100644
--- a/openedx/core/djangolib/markup.py
+++ b/openedx/core/djangolib/markup.py
@@ -5,6 +5,7 @@
 
 import markupsafe
 import bleach
+import re
 from lxml.html.clean import Cleaner
 from mako.filters import decode
 
@@ -14,6 +15,14 @@
 Text = markupsafe.escape                        # pylint: disable=invalid-name
 
 
+class HTMLCleaner(Cleaner):
+    _is_url = re.compile(r"^(?:https?|ftp|file)://", re.I).search
+    def _remove_javascript_link(self, link: str):
+        if self._is_url(link.strip()):
+            return link
+        super()._remove_javascript_link(link)
+        
+
 def HTML(html):                                 # pylint: disable=invalid-name
     """
     Mark a string as already HTML, so that it won't be escaped before output.
@@ -70,6 +79,6 @@ def clean_dangerous_html(html):
     """
     if not html:
         return html
-    cleaner = Cleaner(style=True, inline_style=False, safe_attrs_only=False)
+    cleaner = HTMLCleaner(style=True, inline_style=False, safe_attrs_only=False)
     html = cleaner.clean_html(html)
     return HTML(html)

From 956f27547db077a917b57c0c6f333600914fc3e4 Mon Sep 17 00:00:00 2001
From: Ahmed Khalid <106074266+ahmed-arb@users.noreply.github.com>
Date: Thu, 4 Jul 2024 12:47:21 +0500
Subject: [PATCH 2/4] chore: add docstrings and code cleaning

---
 openedx/core/djangolib/markup.py | 37 ++++++++++++++++++++++++++++++--
 1 file changed, 35 insertions(+), 2 deletions(-)

diff --git a/openedx/core/djangolib/markup.py b/openedx/core/djangolib/markup.py
index 1336509eb629..fb38f7774314 100644
--- a/openedx/core/djangolib/markup.py
+++ b/openedx/core/djangolib/markup.py
@@ -16,9 +16,42 @@
 
 
 class HTMLCleaner(Cleaner):
-    _is_url = re.compile(r"^(?:https?|ftp|file)://", re.I).search
+    """
+    HTMLCleaner extends lxml.html.clean.Cleaner to sanitize HTML content while preserving valid URLs
+    and removing unsafe JavaScript links.
+
+    Attributes:
+    -----------
+    _is_url : Callable[[str], Optional[re.Match]]
+        A regular expression pattern used to identify valid URLs. This pattern matches strings that
+        start with 'http', 'https', 'ftp', or 'file' schemes, case-insensitively.
+    """
     def _remove_javascript_link(self, link: str):
-        if self._is_url(link.strip()):
+        """
+        Checks if the given link is a valid URL. If it is, the link is returned unchanged.
+        Otherwise, the method delegates to the parent class's method to remove the JavaScript link.
+
+        Parameters:
+        -----------
+        link : str
+            The hyperlink (href attribute value) to be checked and potentially sanitized.
+
+        Returns:
+        --------
+        Optional[str]
+            The original link if it is a valid URL; otherwise, the result of the parent class's method
+            to handle the link.
+
+        Example:
+        --------
+        'https://www.example.com/javascript:something'   Valid
+        'javascript:alert("hello")' Invalid
+        'http://example.com/path/to/page'   Valid
+        'ftp://ftp.example.com/resource'   Valid
+        'file://localhost/path/to/file'   Valid
+        """
+        is_url = re.compile(r"^(?:https?|ftp|file)://", re.I).search(link.strip())
+        if is_url:
             return link
         super()._remove_javascript_link(link)
         

From 592c7e3e1e753f33c561ff1acf7bed49430c91e9 Mon Sep 17 00:00:00 2001
From: Ahmed Khalid <106074266+ahmed-arb@users.noreply.github.com>
Date: Thu, 4 Jul 2024 15:12:10 +0500
Subject: [PATCH 3/4] chore: fixes test cases and adds few new tests

---
 openedx/core/djangolib/tests/test_markup.py   | 49 ++++++++++++++++++-
 .../wikimedia_features/messenger/apps.py      |  1 -
 2 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/openedx/core/djangolib/tests/test_markup.py b/openedx/core/djangolib/tests/test_markup.py
index d3775deae652..d1cce2b89a65 100644
--- a/openedx/core/djangolib/tests/test_markup.py
+++ b/openedx/core/djangolib/tests/test_markup.py
@@ -11,7 +11,7 @@
 from django.utils.translation import ngettext
 from mako.template import Template
 
-from openedx.core.djangolib.markup import HTML, Text, strip_all_tags_but_br
+from openedx.core.djangolib.markup import HTML, HTMLCleaner, Text, strip_all_tags_but_br
 
 
 @ddt.ddt
@@ -157,3 +157,50 @@ def test_clean_dengers_html_filter(self):
         assert not html_soup.find('form')
         assert not html_soup.find('blink')
         assert not html_soup.find('object')
+
+
+class TestHTMLCleaner(unittest.TestCase):
+    """
+    Tests that Url links are being cleaned properly and no useful link is removed.
+    """
+
+    def setUp(self):
+        self.cleaner = HTMLCleaner(style=True, inline_style=False, safe_attrs_only=False)
+
+    def test_valid_urls(self):
+        https_url = "https://example.com"
+        http_url = "http://example.com/path/to/page"
+        ftp_url = "ftp://ftp.example.com/resource"
+        file_url = "file://localhost/path/to/file"
+
+        cleaned_url = self.cleaner._remove_javascript_link(https_url)
+        self.assertEqual(cleaned_url, https_url)
+
+        cleaned_url = self.cleaner._remove_javascript_link(http_url)
+        self.assertEqual(cleaned_url, http_url)
+
+        cleaned_url = self.cleaner._remove_javascript_link(ftp_url)
+        self.assertEqual(cleaned_url, ftp_url)
+
+        cleaned_url = self.cleaner._remove_javascript_link(file_url)
+        self.assertEqual(cleaned_url, file_url)
+
+    def test_javascript_link(self):
+        cleaned_url = self.cleaner._remove_javascript_link("javascript:alert('Hello')")
+        self.assertIsNone(cleaned_url)
+
+    def test_mixed_case_scheme(self):
+        """
+        Javascript can be executed this way so this code should be removed.
+        """
+        url = "javascript:alert('hello') https://example.com"
+        cleaned_url = self.cleaner._remove_javascript_link(url)
+        self.assertIsNone(cleaned_url)
+
+    def test_sub_scheme_match(self):
+        """
+        Javascript cannot be executed this way so these urls are safe.
+        """
+        url = "https://example.com/data:something"
+        cleaned_url = self.cleaner._remove_javascript_link(url)
+        self.assertEqual(cleaned_url, url)
diff --git a/openedx/features/wikimedia_features/messenger/apps.py b/openedx/features/wikimedia_features/messenger/apps.py
index 23f870cbd6cf..80cb14a8d375 100644
--- a/openedx/features/wikimedia_features/messenger/apps.py
+++ b/openedx/features/wikimedia_features/messenger/apps.py
@@ -20,7 +20,6 @@ class MessengerConfig(AppConfig):
         PluginSettings.CONFIG: {
             ProjectType.LMS: {
                 SettingsType.COMMON: {PluginSettings.RELATIVE_PATH: 'settings.common'},
-                SettingsType.TEST: {PluginSettings.RELATIVE_PATH: 'settings.test'},
             }
         }
     }

From c37c05016e3f58721b9276085e92a9eaed6975af Mon Sep 17 00:00:00 2001
From: Ahmed Khalid <106074266+ahmed-arb@users.noreply.github.com>
Date: Fri, 5 Jul 2024 13:46:34 +0500
Subject: [PATCH 4/4] chore: simplify test cases and refactor docstrings

---
 openedx/core/djangolib/markup.py            | 46 ++++++++-----------
 openedx/core/djangolib/tests/test_markup.py | 50 ++++++++-------------
 2 files changed, 37 insertions(+), 59 deletions(-)

diff --git a/openedx/core/djangolib/markup.py b/openedx/core/djangolib/markup.py
index fb38f7774314..475d961686e3 100644
--- a/openedx/core/djangolib/markup.py
+++ b/openedx/core/djangolib/markup.py
@@ -17,43 +17,35 @@
 
 class HTMLCleaner(Cleaner):
     """
-    HTMLCleaner extends lxml.html.clean.Cleaner to sanitize HTML content while preserving valid URLs
-    and removing unsafe JavaScript links.
-
-    Attributes:
-    -----------
-    _is_url : Callable[[str], Optional[re.Match]]
-        A regular expression pattern used to identify valid URLs. This pattern matches strings that
-        start with 'http', 'https', 'ftp', or 'file' schemes, case-insensitively.
+    HTMLCleaner extends lxml.html.clean.Cleaner to sanitize HTML content while preserving
+    valid URLs and removing unsafe JavaScript links.
     """
     def _remove_javascript_link(self, link: str):
         """
-        Checks if the given link is a valid URL. If it is, the link is returned unchanged.
-        Otherwise, the method delegates to the parent class's method to remove the JavaScript link.
+        Overrides the parent class's method to preserve valid URLs.
 
-        Parameters:
-        -----------
-        link : str
-            The hyperlink (href attribute value) to be checked and potentially sanitized.
+        This method uses a regular expression to identify valid URLs that start with 'http', 'https',
+        'ftp', or 'file' schemes. If the link is a valid URL, it is returned unchanged. Otherwise,
+        the parent class's method is used to remove the JavaScript link.
+
+        Args:
+            link (str): The hyperlink (href attribute value) to be checked and potentially sanitized.
 
         Returns:
-        --------
-        Optional[str]
-            The original link if it is a valid URL; otherwise, the result of the parent class's method
-            to handle the link.
-
-        Example:
-        --------
-        'https://www.example.com/javascript:something'   Valid
-        'javascript:alert("hello")' Invalid
-        'http://example.com/path/to/page'   Valid
-        'ftp://ftp.example.com/resource'   Valid
-        'file://localhost/path/to/file'   Valid
+            str: The original link or empty string.
+
+        Examples:
+            Valid URLs:
+                'https://www.example.com/javascript:something'
+                'file://localhost/path/to/file'
+            Invalid URLs:
+                'javascript:alert("hello")'
+                'javascript:alert("hello") https://www.example.com/page'
         """
         is_url = re.compile(r"^(?:https?|ftp|file)://", re.I).search(link.strip())
         if is_url:
             return link
-        super()._remove_javascript_link(link)
+        return super()._remove_javascript_link(link)
         
 
 def HTML(html):                                 # pylint: disable=invalid-name
diff --git a/openedx/core/djangolib/tests/test_markup.py b/openedx/core/djangolib/tests/test_markup.py
index d1cce2b89a65..acc5beb0890d 100644
--- a/openedx/core/djangolib/tests/test_markup.py
+++ b/openedx/core/djangolib/tests/test_markup.py
@@ -158,49 +158,35 @@ def test_clean_dengers_html_filter(self):
         assert not html_soup.find('blink')
         assert not html_soup.find('object')
 
-
+@ddt.ddt
 class TestHTMLCleaner(unittest.TestCase):
     """
     Tests that Url links are being cleaned properly and no useful link is removed.
     """
 
     def setUp(self):
-        self.cleaner = HTMLCleaner(style=True, inline_style=False, safe_attrs_only=False)
-
-    def test_valid_urls(self):
-        https_url = "https://example.com"
-        http_url = "http://example.com/path/to/page"
-        ftp_url = "ftp://ftp.example.com/resource"
-        file_url = "file://localhost/path/to/file"
-
-        cleaned_url = self.cleaner._remove_javascript_link(https_url)
-        self.assertEqual(cleaned_url, https_url)
-
-        cleaned_url = self.cleaner._remove_javascript_link(http_url)
-        self.assertEqual(cleaned_url, http_url)
-
-        cleaned_url = self.cleaner._remove_javascript_link(ftp_url)
-        self.assertEqual(cleaned_url, ftp_url)
-
-        cleaned_url = self.cleaner._remove_javascript_link(file_url)
-        self.assertEqual(cleaned_url, file_url)
+        self.cleaner = HTMLCleaner()
 
-    def test_javascript_link(self):
-        cleaned_url = self.cleaner._remove_javascript_link("javascript:alert('Hello')")
-        self.assertIsNone(cleaned_url)
-
-    def test_mixed_case_scheme(self):
+    @ddt.data(
+        "https://example.com/data:something", # Javascript cannot be executed this way so these urls are safe.
+        "http://example.com/path/to/page",
+        "ftp://ftp.example.com/resource",
+        "file://localhost/path/to/file",
+    )
+    def test_valid_urls(self, url):
         """
-        Javascript can be executed this way so this code should be removed.
+        Test that valid URLs are preserved.
         """
-        url = "javascript:alert('hello') https://example.com"
         cleaned_url = self.cleaner._remove_javascript_link(url)
-        self.assertIsNone(cleaned_url)
+        self.assertEqual(cleaned_url, url)
 
-    def test_sub_scheme_match(self):
+    @ddt.data(
+        "javascript:alert('Hello')",
+        "mocha:some_code https://example.com", # Javascript can be executed this way so this code should be removed.
+    )
+    def test_javascript_is_removed(self, url):
         """
-        Javascript cannot be executed this way so these urls are safe.
+        Test that malicious Javascript is removed.
         """
-        url = "https://example.com/data:something"
         cleaned_url = self.cleaner._remove_javascript_link(url)
-        self.assertEqual(cleaned_url, url)
+        self.assertEqual(cleaned_url, '')