From fab846e3cc29fa31e19502b9e2b6a5881a52cb46 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Tue, 22 Oct 2024 13:17:04 +1100
Subject: [PATCH] Markdown link fix (#8328) (#8329)

* Improve cleaning of markdown content

* Update unit test with new check

(cherry picked from commit cb0248d15944544d66b5b047b905275d76954a00)

Co-authored-by: Oliver <oliver.henry.walters@gmail.com>
---
 src/backend/InvenTree/InvenTree/helpers.py | 32 ++++++++++++++++++++--
 src/backend/InvenTree/company/test_api.py  | 25 ++++++++++-------
 2 files changed, 44 insertions(+), 13 deletions(-)
diff --git a/src/backend/InvenTree/InvenTree/helpers.py b/src/backend/InvenTree/InvenTree/helpers.py
index 61b16d8145..2cdaf82913 100644
--- a/src/backend/InvenTree/InvenTree/helpers.py
+++ b/src/backend/InvenTree/InvenTree/helpers.py
@@ -21,6 +21,7 @@ from django.core.files.storage import Storage, default_storage
 from django.http import StreamingHttpResponse
 from django.utils.translation import gettext_lazy as _
 
+import bleach
 import pytz
 import regex
 from bleach import clean
@@ -816,7 +817,6 @@ def clean_markdown(value: str):
     This function will remove javascript and other potentially harmful content from the markdown string.
     """
     import markdown
-    from markdownify.templatetags.markdownify import markdownify
 
     try:
         markdownify_settings = settings.MARKDOWNIFY['default']
@@ -835,8 +835,34 @@ def clean_markdown(value: str):
         output_format='html',
     )
 
-    # Clean the HTML content (for comparison). Ideally, this should be the same as the original content
-    clean_html = markdownify(value)
+    # Bleach settings
+    whitelist_tags = markdownify_settings.get(
+        'WHITELIST_TAGS', bleach.sanitizer.ALLOWED_TAGS
+    )
+    whitelist_attrs = markdownify_settings.get(
+        'WHITELIST_ATTRS', bleach.sanitizer.ALLOWED_ATTRIBUTES
+    )
+    whitelist_styles = markdownify_settings.get(
+        'WHITELIST_STYLES', bleach.css_sanitizer.ALLOWED_CSS_PROPERTIES
+    )
+    whitelist_protocols = markdownify_settings.get(
+        'WHITELIST_PROTOCOLS', bleach.sanitizer.ALLOWED_PROTOCOLS
+    )
+    strip = markdownify_settings.get('STRIP', True)
+
+    css_sanitizer = bleach.css_sanitizer.CSSSanitizer(
+        allowed_css_properties=whitelist_styles
+    )
+    cleaner = bleach.Cleaner(
+        tags=whitelist_tags,
+        attributes=whitelist_attrs,
+        css_sanitizer=css_sanitizer,
+        protocols=whitelist_protocols,
+        strip=strip,
+    )
+
+    # Clean the HTML content (for comparison). This must be the same as the original content
+    clean_html = cleaner.clean(html)
 
     if html != clean_html:
         raise ValidationError(_('Data contains prohibited markdown content'))
diff --git a/src/backend/InvenTree/company/test_api.py b/src/backend/InvenTree/company/test_api.py
index ec06ddc40f..e6013f0f8f 100644
--- a/src/backend/InvenTree/company/test_api.py
+++ b/src/backend/InvenTree/company/test_api.py
@@ -159,6 +159,7 @@ class CompanyTest(InvenTreeAPITestCase):
     def test_company_notes(self):
         """Test the markdown 'notes' field for the Company model."""
         pk = Company.objects.first().pk
+        url = reverse('api-company-detail', kwargs={'pk': pk})
 
         # Attempt to inject malicious markdown into the "notes" field
         xss = [
@@ -168,16 +169,23 @@ class CompanyTest(InvenTreeAPITestCase):
         ]
 
         for note in xss:
-            response = self.patch(
-                reverse('api-company-detail', kwargs={'pk': pk}),
-                {'notes': note},
-                expected_code=400,
-            )
+            response = self.patch(url, {'notes': note}, expected_code=400)
 
             self.assertIn(
                 'Data contains prohibited markdown content', str(response.data)
             )
 
+        # Tests with disallowed tags
+        invalid_tags = [
+            '<iframe src="javascript:alert(123)"></iframe>',
+            '<canvas>A disallowed tag!</canvas>',
+        ]
+
+        for note in invalid_tags:
+            response = self.patch(url, {'notes': note}, expected_code=400)
+
+            self.assertIn('Remove HTML tags from this value', str(response.data))
+
         # The following markdown is safe, and should be accepted
         good = [
             'This is a **bold** statement',
@@ -186,14 +194,11 @@ class CompanyTest(InvenTreeAPITestCase):
             'This is an ![image](https://www.google.com/test.jpg)',
             'This is a `code` block',
             'This text has ~~strikethrough~~ formatting',
+            'This text has a raw link - https://www.google.com - and should still pass the test',
         ]
 
         for note in good:
-            response = self.patch(
-                reverse('api-company-detail', kwargs={'pk': pk}),
-                {'notes': note},
-                expected_code=200,
-            )
+            response = self.patch(url, {'notes': note}, expected_code=200)
 
             self.assertEqual(response.data['notes'], note)