Skip to content

Commit

Permalink
chore: simplify test cases and refactor docstrings
Browse files Browse the repository at this point in the history
  • Loading branch information
ahmed-arb committed Jul 5, 2024
1 parent 592c7e3 commit c37c050
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 59 deletions.
46 changes: 19 additions & 27 deletions openedx/core/djangolib/markup.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,43 +17,35 @@

class HTMLCleaner(Cleaner):
"""
HTMLCleaner extends lxml.html.clean.Cleaner to sanitize HTML content while preserving valid URLs
and removing unsafe JavaScript links.
Attributes:
-----------
_is_url : Callable[[str], Optional[re.Match]]
A regular expression pattern used to identify valid URLs. This pattern matches strings that
start with 'http', 'https', 'ftp', or 'file' schemes, case-insensitively.
HTMLCleaner extends lxml.html.clean.Cleaner to sanitize HTML content while preserving
valid URLs and removing unsafe JavaScript links.
"""
def _remove_javascript_link(self, link: str):
"""
Checks if the given link is a valid URL. If it is, the link is returned unchanged.
Otherwise, the method delegates to the parent class's method to remove the JavaScript link.
Overrides the parent class's method to preserve valid URLs.
Parameters:
-----------
link : str
The hyperlink (href attribute value) to be checked and potentially sanitized.
This method uses a regular expression to identify valid URLs that start with 'http', 'https',
'ftp', or 'file' schemes. If the link is a valid URL, it is returned unchanged. Otherwise,
the parent class's method is used to remove the JavaScript link.
Args:
link (str): The hyperlink (href attribute value) to be checked and potentially sanitized.
Returns:
--------
Optional[str]
The original link if it is a valid URL; otherwise, the result of the parent class's method
to handle the link.
Example:
--------
'https://www.example.com/javascript:something' Valid
'javascript:alert("hello")' Invalid
'http://example.com/path/to/page' Valid
'ftp://ftp.example.com/resource' Valid
'file://localhost/path/to/file' Valid
str: The original link or empty string.
Examples:
Valid URLs:
'https://www.example.com/javascript:something'
'file://localhost/path/to/file'
Invalid URLs:
'javascript:alert("hello")'
'javascript:alert("hello") https://www.example.com/page'
"""
is_url = re.compile(r"^(?:https?|ftp|file)://", re.I).search(link.strip())
if is_url:
return link
super()._remove_javascript_link(link)
return super()._remove_javascript_link(link)


def HTML(html): # pylint: disable=invalid-name
Expand Down
50 changes: 18 additions & 32 deletions openedx/core/djangolib/tests/test_markup.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,49 +158,35 @@ def test_clean_dengers_html_filter(self):
assert not html_soup.find('blink')
assert not html_soup.find('object')


@ddt.ddt
class TestHTMLCleaner(unittest.TestCase):
"""
Tests that Url links are being cleaned properly and no useful link is removed.
"""

def setUp(self):
self.cleaner = HTMLCleaner(style=True, inline_style=False, safe_attrs_only=False)

def test_valid_urls(self):
https_url = "https://example.com"
http_url = "http://example.com/path/to/page"
ftp_url = "ftp://ftp.example.com/resource"
file_url = "file://localhost/path/to/file"

cleaned_url = self.cleaner._remove_javascript_link(https_url)
self.assertEqual(cleaned_url, https_url)

cleaned_url = self.cleaner._remove_javascript_link(http_url)
self.assertEqual(cleaned_url, http_url)

cleaned_url = self.cleaner._remove_javascript_link(ftp_url)
self.assertEqual(cleaned_url, ftp_url)

cleaned_url = self.cleaner._remove_javascript_link(file_url)
self.assertEqual(cleaned_url, file_url)
self.cleaner = HTMLCleaner()

def test_javascript_link(self):
cleaned_url = self.cleaner._remove_javascript_link("javascript:alert('Hello')")
self.assertIsNone(cleaned_url)

def test_mixed_case_scheme(self):
@ddt.data(
"https://example.com/data:something", # Javascript cannot be executed this way so these urls are safe.
"http://example.com/path/to/page",
"ftp://ftp.example.com/resource",
"file://localhost/path/to/file",
)
def test_valid_urls(self, url):
"""
Javascript can be executed this way so this code should be removed.
Test that valid URLs are preserved.
"""
url = "javascript:alert('hello') https://example.com"
cleaned_url = self.cleaner._remove_javascript_link(url)
self.assertIsNone(cleaned_url)
self.assertEqual(cleaned_url, url)

def test_sub_scheme_match(self):
@ddt.data(
"javascript:alert('Hello')",
"mocha:some_code https://example.com", # Javascript can be executed this way so this code should be removed.
)
def test_javascript_is_removed(self, url):
"""
Javascript cannot be executed this way so these urls are safe.
Test that malicious Javascript is removed.
"""
url = "https://example.com/data:something"
cleaned_url = self.cleaner._remove_javascript_link(url)
self.assertEqual(cleaned_url, url)
self.assertEqual(cleaned_url, '')

0 comments on commit c37c050

Please sign in to comment.