Markdown link fix (#8328)

* Improve cleaning of markdown content
* Update unit test with new check
2025-12-31 08:17:58 +00:00 · 2024-10-22 13:06:43 +11:00
parent ddea9fa4b9
commit cb0248d159
2 changed files with 44 additions and 13 deletions
--- a/src/backend/InvenTree/InvenTree/helpers.py
+++ b/src/backend/InvenTree/InvenTree/helpers.py
@@ -21,6 +21,7 @@ from django.http import StreamingHttpResponse
 from django.utils import timezone
 from django.utils.translation import gettext_lazy as _
+import bleach
 import pytz
 import regex
 from bleach import clean
@@ -829,7 +830,6 @@ def clean_markdown(value: str):
    This function will remove javascript and other potentially harmful content from the markdown string.
    """
    import markdown
-    from markdownify.templatetags.markdownify import markdownify
    try:
        markdownify_settings = settings.MARKDOWNIFY['default']
@@ -848,8 +848,34 @@ def clean_markdown(value: str):
        output_format='html',
    )
-    # Clean the HTML content (for comparison). Ideally, this should be the same as the original content
+    # Bleach settings
-    clean_html = markdownify(value)
+    whitelist_tags = markdownify_settings.get(
+        'WHITELIST_TAGS', bleach.sanitizer.ALLOWED_TAGS
+    )
+    whitelist_attrs = markdownify_settings.get(
+        'WHITELIST_ATTRS', bleach.sanitizer.ALLOWED_ATTRIBUTES
+    )
+    whitelist_styles = markdownify_settings.get(
+        'WHITELIST_STYLES', bleach.css_sanitizer.ALLOWED_CSS_PROPERTIES
+    )
+    whitelist_protocols = markdownify_settings.get(
+        'WHITELIST_PROTOCOLS', bleach.sanitizer.ALLOWED_PROTOCOLS
+    )
+    strip = markdownify_settings.get('STRIP', True)
+    css_sanitizer = bleach.css_sanitizer.CSSSanitizer(
+        allowed_css_properties=whitelist_styles
+    )
+    cleaner = bleach.Cleaner(
+        tags=whitelist_tags,
+        attributes=whitelist_attrs,
+        css_sanitizer=css_sanitizer,
+        protocols=whitelist_protocols,
+        strip=strip,
+    )
+    # Clean the HTML content (for comparison). This must be the same as the original content
+    clean_html = cleaner.clean(html)
    if html != clean_html:
        raise ValidationError(_('Data contains prohibited markdown content'))
--- a/src/backend/InvenTree/company/test_api.py
+++ b/src/backend/InvenTree/company/test_api.py
@@ -157,6 +157,7 @@ class CompanyTest(InvenTreeAPITestCase):
    def test_company_notes(self):
        """Test the markdown 'notes' field for the Company model."""
        pk = Company.objects.first().pk
+        url = reverse('api-company-detail', kwargs={'pk': pk})
        # Attempt to inject malicious markdown into the "notes" field
        xss = [
@@ -166,16 +167,23 @@ class CompanyTest(InvenTreeAPITestCase):
        ]
        for note in xss:
-            response = self.patch(
+            response = self.patch(url, {'notes': note}, expected_code=400)
-                reverse('api-company-detail', kwargs={'pk': pk}),
-                {'notes': note},
-                expected_code=400,
-            )
            self.assertIn(
                'Data contains prohibited markdown content', str(response.data)
            )
+        # Tests with disallowed tags
+        invalid_tags = [
+            '<iframe src="javascript:alert(123)"></iframe>',
+            '<canvas>A disallowed tag!</canvas>',
+        ]
+        for note in invalid_tags:
+            response = self.patch(url, {'notes': note}, expected_code=400)
+            self.assertIn('Remove HTML tags from this value', str(response.data))
        # The following markdown is safe, and should be accepted
        good = [
            'This is a **bold** statement',
@@ -184,14 +192,11 @@ class CompanyTest(InvenTreeAPITestCase):
            'This is an ![image](https://www.google.com/test.jpg)',
            'This is a `code` block',
            'This text has ~~strikethrough~~ formatting',
+            'This text has a raw link - https://www.google.com - and should still pass the test',
        ]
        for note in good:
-            response = self.patch(
+            response = self.patch(url, {'notes': note}, expected_code=200)
-                reverse('api-company-detail', kwargs={'pk': pk}),
-                {'notes': note},
-                expected_code=200,
-            )
            self.assertEqual(response.data['notes'], note)