From 6e37f0cd8ba5fc527412f18f66cd6a37015fa690 Mon Sep 17 00:00:00 2001 From: Oliver Date: Mon, 7 Oct 2024 20:03:39 +1100 Subject: [PATCH] Markdown xss backport (#8244) * Update helpers.py * Update mixins.py * format * format * Allow horizontal rule in markdown * More instructive error msg * Specify output_format to markdown.markdown Ref: https://python-markdown.github.io/reference/markdown/serializers/ * Cleanup * Adjust allowable markdown tags * Add unit test for malicious markdown XSS * Allow
 tag

---------

Co-authored-by: Matthias Mair 
---
 src/backend/InvenTree/InvenTree/helpers.py  | 34 +++++++++++++++++
 src/backend/InvenTree/InvenTree/mixins.py   | 15 ++++++--
 src/backend/InvenTree/InvenTree/settings.py |  8 +++-
 src/backend/InvenTree/company/test_api.py   | 41 +++++++++++++++++++++
 4 files changed, 94 insertions(+), 4 deletions(-)

diff --git a/src/backend/InvenTree/InvenTree/helpers.py b/src/backend/InvenTree/InvenTree/helpers.py
index 4558594436..61b16d8145 100644
--- a/src/backend/InvenTree/InvenTree/helpers.py
+++ b/src/backend/InvenTree/InvenTree/helpers.py
@@ -810,6 +810,40 @@ def remove_non_printable_characters(
     return cleaned
 
 
+def clean_markdown(value: str):
+    """Clean a markdown string.
+
+    This function will remove javascript and other potentially harmful content from the markdown string.
+    """
+    import markdown
+    from markdownify.templatetags.markdownify import markdownify
+
+    try:
+        markdownify_settings = settings.MARKDOWNIFY['default']
+    except (AttributeError, KeyError):
+        markdownify_settings = {}
+
+    extensions = markdownify_settings.get('MARKDOWN_EXTENSIONS', [])
+    extension_configs = markdownify_settings.get('MARKDOWN_EXTENSION_CONFIGS', {})
+
+    # Generate raw HTML from provided markdown (without sanitizing)
+    # Note: The 'html' output_format is required to generate self closing tags, e.g.  instead of 
+    html = markdown.markdown(
+        value or '',
+        extensions=extensions,
+        extension_configs=extension_configs,
+        output_format='html',
+    )
+
+    # Clean the HTML content (for comparison). Ideally, this should be the same as the original content
+    clean_html = markdownify(value)
+
+    if html != clean_html:
+        raise ValidationError(_('Data contains prohibited markdown content'))
+
+    return value
+
+
 def hash_barcode(barcode_data):
     """Calculate a 'unique' hash for a barcode string.
 
diff --git a/src/backend/InvenTree/InvenTree/mixins.py b/src/backend/InvenTree/InvenTree/mixins.py
index fd63726d96..d02a96dfac 100644
--- a/src/backend/InvenTree/InvenTree/mixins.py
+++ b/src/backend/InvenTree/InvenTree/mixins.py
@@ -6,7 +6,11 @@ from rest_framework import generics, mixins, status
 from rest_framework.response import Response
 
 from InvenTree.fields import InvenTreeNotesField
-from InvenTree.helpers import remove_non_printable_characters, strip_html_tags
+from InvenTree.helpers import (
+    clean_markdown,
+    remove_non_printable_characters,
+    strip_html_tags,
+)
 
 
 class CleanMixin:
@@ -57,6 +61,7 @@ class CleanMixin:
 
         # By default, newline characters are removed
         remove_newline = True
+        is_markdown = False
 
         try:
             if hasattr(self, 'serializer_class'):
@@ -64,11 +69,12 @@ class CleanMixin:
                 field = model._meta.get_field(field)
 
                 # The following field types allow newline characters
-                allow_newline = [InvenTreeNotesField]
+                allow_newline = [(InvenTreeNotesField, True)]
 
                 for field_type in allow_newline:
-                    if issubclass(type(field), field_type):
+                    if issubclass(type(field), field_type[0]):
                         remove_newline = False
+                        is_markdown = field_type[1]
                         break
 
         except AttributeError:
@@ -80,6 +86,9 @@ class CleanMixin:
             cleaned, remove_newline=remove_newline
         )
 
+        if is_markdown:
+            cleaned = clean_markdown(cleaned)
+
         return cleaned
 
     def clean_data(self, data: dict) -> dict:
diff --git a/src/backend/InvenTree/InvenTree/settings.py b/src/backend/InvenTree/InvenTree/settings.py
index 6cccc3bd49..7d38b7cff9 100644
--- a/src/backend/InvenTree/InvenTree/settings.py
+++ b/src/backend/InvenTree/InvenTree/settings.py
@@ -1231,23 +1231,29 @@ MARKDOWNIFY = {
             'abbr',
             'b',
             'blockquote',
+            'code',
             'em',
             'h1',
             'h2',
             'h3',
+            'h4',
+            'h5',
+            'hr',
             'i',
             'img',
             'li',
             'ol',
             'p',
+            'pre',
+            's',
             'strong',
-            'ul',
             'table',
             'thead',
             'tbody',
             'th',
             'tr',
             'td',
+            'ul',
         ],
     }
 }
diff --git a/src/backend/InvenTree/company/test_api.py b/src/backend/InvenTree/company/test_api.py
index 78064a3bc5..ec06ddc40f 100644
--- a/src/backend/InvenTree/company/test_api.py
+++ b/src/backend/InvenTree/company/test_api.py
@@ -156,6 +156,47 @@ class CompanyTest(InvenTreeAPITestCase):
             len(self.get(url, data={'active': False}, expected_code=200).data), 1
         )
 
+    def test_company_notes(self):
+        """Test the markdown 'notes' field for the Company model."""
+        pk = Company.objects.first().pk
+
+        # Attempt to inject malicious markdown into the "notes" field
+        xss = [
+            '[Click me](javascript:alert(123))',
+            '![x](javascript:alert(123))',
+            '![Uh oh...]("onerror="alert(\'XSS\'))',
+        ]
+
+        for note in xss:
+            response = self.patch(
+                reverse('api-company-detail', kwargs={'pk': pk}),
+                {'notes': note},
+                expected_code=400,
+            )
+
+            self.assertIn(
+                'Data contains prohibited markdown content', str(response.data)
+            )
+
+        # The following markdown is safe, and should be accepted
+        good = [
+            'This is a **bold** statement',
+            'This is a *italic* statement',
+            'This is a [link](https://www.google.com)',
+            'This is an ![image](https://www.google.com/test.jpg)',
+            'This is a `code` block',
+            'This text has ~~strikethrough~~ formatting',
+        ]
+
+        for note in good:
+            response = self.patch(
+                reverse('api-company-detail', kwargs={'pk': pk}),
+                {'notes': note},
+                expected_code=200,
+            )
+
+            self.assertEqual(response.data['notes'], note)
+
 
 class ContactTest(InvenTreeAPITestCase):
     """Tests for the Contact models."""