From fab846e3cc29fa31e19502b9e2b6a5881a52cb46 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 22 Oct 2024 13:17:04 +1100 Subject: [PATCH] Markdown link fix (#8328) (#8329) * Improve cleaning of markdown content * Update unit test with new check (cherry picked from commit cb0248d15944544d66b5b047b905275d76954a00) Co-authored-by: Oliver --- src/backend/InvenTree/InvenTree/helpers.py | 32 ++++++++++++++++++++-- src/backend/InvenTree/company/test_api.py | 25 ++++++++++------- 2 files changed, 44 insertions(+), 13 deletions(-) diff --git a/src/backend/InvenTree/InvenTree/helpers.py b/src/backend/InvenTree/InvenTree/helpers.py index 61b16d8145..2cdaf82913 100644 --- a/src/backend/InvenTree/InvenTree/helpers.py +++ b/src/backend/InvenTree/InvenTree/helpers.py @@ -21,6 +21,7 @@ from django.core.files.storage import Storage, default_storage from django.http import StreamingHttpResponse from django.utils.translation import gettext_lazy as _ +import bleach import pytz import regex from bleach import clean @@ -816,7 +817,6 @@ def clean_markdown(value: str): This function will remove javascript and other potentially harmful content from the markdown string. """ import markdown - from markdownify.templatetags.markdownify import markdownify try: markdownify_settings = settings.MARKDOWNIFY['default'] @@ -835,8 +835,34 @@ def clean_markdown(value: str): output_format='html', ) - # Clean the HTML content (for comparison). Ideally, this should be the same as the original content - clean_html = markdownify(value) + # Bleach settings + whitelist_tags = markdownify_settings.get( + 'WHITELIST_TAGS', bleach.sanitizer.ALLOWED_TAGS + ) + whitelist_attrs = markdownify_settings.get( + 'WHITELIST_ATTRS', bleach.sanitizer.ALLOWED_ATTRIBUTES + ) + whitelist_styles = markdownify_settings.get( + 'WHITELIST_STYLES', bleach.css_sanitizer.ALLOWED_CSS_PROPERTIES + ) + whitelist_protocols = markdownify_settings.get( + 'WHITELIST_PROTOCOLS', bleach.sanitizer.ALLOWED_PROTOCOLS + ) + strip = markdownify_settings.get('STRIP', True) + + css_sanitizer = bleach.css_sanitizer.CSSSanitizer( + allowed_css_properties=whitelist_styles + ) + cleaner = bleach.Cleaner( + tags=whitelist_tags, + attributes=whitelist_attrs, + css_sanitizer=css_sanitizer, + protocols=whitelist_protocols, + strip=strip, + ) + + # Clean the HTML content (for comparison). This must be the same as the original content + clean_html = cleaner.clean(html) if html != clean_html: raise ValidationError(_('Data contains prohibited markdown content')) diff --git a/src/backend/InvenTree/company/test_api.py b/src/backend/InvenTree/company/test_api.py index ec06ddc40f..e6013f0f8f 100644 --- a/src/backend/InvenTree/company/test_api.py +++ b/src/backend/InvenTree/company/test_api.py @@ -159,6 +159,7 @@ class CompanyTest(InvenTreeAPITestCase): def test_company_notes(self): """Test the markdown 'notes' field for the Company model.""" pk = Company.objects.first().pk + url = reverse('api-company-detail', kwargs={'pk': pk}) # Attempt to inject malicious markdown into the "notes" field xss = [ @@ -168,16 +169,23 @@ class CompanyTest(InvenTreeAPITestCase): ] for note in xss: - response = self.patch( - reverse('api-company-detail', kwargs={'pk': pk}), - {'notes': note}, - expected_code=400, - ) + response = self.patch(url, {'notes': note}, expected_code=400) self.assertIn( 'Data contains prohibited markdown content', str(response.data) ) + # Tests with disallowed tags + invalid_tags = [ + '', + 'A disallowed tag!', + ] + + for note in invalid_tags: + response = self.patch(url, {'notes': note}, expected_code=400) + + self.assertIn('Remove HTML tags from this value', str(response.data)) + # The following markdown is safe, and should be accepted good = [ 'This is a **bold** statement', @@ -186,14 +194,11 @@ class CompanyTest(InvenTreeAPITestCase): 'This is an ![image](https://www.google.com/test.jpg)', 'This is a `code` block', 'This text has ~~strikethrough~~ formatting', + 'This text has a raw link - https://www.google.com - and should still pass the test', ] for note in good: - response = self.patch( - reverse('api-company-detail', kwargs={'pk': pk}), - {'notes': note}, - expected_code=200, - ) + response = self.patch(url, {'notes': note}, expected_code=200) self.assertEqual(response.data['notes'], note)