From 9676714dcf6370eb19cf323234014fd5ddec3bc0 Mon Sep 17 00:00:00 2001
From: Hugo van Kemenade
Date: Wed, 12 Jan 2022 13:04:27 +0200
Subject: [PATCH] Strip invalid XML characters from response

---
 src/pylast/__init__.py | 35 +++++++++++++++++++++++++++++++----
 tests/unicode_test.py  | 44 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 75 insertions(+), 4 deletions(-)

diff --git a/src/pylast/__init__.py b/src/pylast/__init__.py
index 54d4d40..e32e849 100644
--- a/src/pylast/__init__.py
+++ b/src/pylast/__init__.py
@@ -24,6 +24,7 @@ import hashlib
 import html.entities
 import logging
 import os
+import re
 import shelve
 import ssl
 import tempfile
@@ -969,7 +970,7 @@ class _Request:
         conn.close()
         return response_text
 
-    def execute(self, cacheable=False):
+    def execute(self, cacheable: bool = False) -> xml.dom.minidom.Document:
         """Returns the XML DOM response of the POST Request from the server"""
 
         if self.network.is_caching_enabled() and cacheable:
@@ -977,13 +978,12 @@ class _Request:
         else:
             response = self._download_response()
 
-        return minidom.parseString(_string(response).replace("opensearch:", ""))
+        return _parse_response(response)
 
     def _check_response_for_errors(self, response):
         """Checks the response for errors and raises one if any exists."""
-
         try:
-            doc = minidom.parseString(_string(response).replace("opensearch:", ""))
+            doc = _parse_response(response)
         except Exception as e:
             raise MalformedResponseError(self.network, e) from e
 
@@ -2950,4 +2950,31 @@ def _unescape_htmlentity(string):
     return string
 
 
+def _parse_response(response: str) -> xml.dom.minidom.Document:
+    """Parse a raw API response into an XML DOM document.
+
+    The "opensearch:" namespace prefix is stripped first. If the parser then
+    rejects the text, characters that XML 1.0 forbids are removed and the
+    parse is retried once; a failure of the retry propagates to the caller.
+    """
+    response = _string(response).replace("opensearch:", "")
+    try:
+        doc = minidom.parseString(response)
+    except xml.parsers.expat.ExpatError:
+        # Try again. For performance, we only remove when needed in rare cases.
+        doc = minidom.parseString(_remove_invalid_xml_chars(response))
+    return doc
+
+
+def _remove_invalid_xml_chars(string: str) -> str:
+    """Return *string* without characters outside the XML 1.0 Char range."""
+    # Code points above U+FFFF need the eight-digit \UXXXXXXXX escape; a
+    # four-digit \u escape followed by more digits does not form one.
+    return re.sub(
+        r"[^\u0009\u000A\u000D\u0020-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]+",
+        "",
+        string,
+    )
+
+
 # End of file
diff --git a/tests/unicode_test.py b/tests/unicode_test.py
index 7b3c271..350256c 100644
--- a/tests/unicode_test.py
+++ b/tests/unicode_test.py
@@ -27,3 +27,47 @@ def test_get_cache_key(artist):
 def test_cast_and_hash(obj):
     assert type(str(obj)) is str
     assert isinstance(hash(obj), int)
+
+
+@pytest.mark.parametrize(
+    "test_input, expected",
+    [
+        (
+            # Plain text
+            'test album name',
+            'test album name',
+        ),
+        (
+            # Contains Unicode ENQ Enquiry control character
+            'test album \u0005name',
+            'test album name',
+        ),
+        (
+            # Keeps valid characters outside the Basic Multilingual Plane
+            'test album \U0001F3B5name',
+            'test album \U0001F3B5name',
+        ),
+    ],
+)
+def test__remove_invalid_xml_chars(test_input: str, expected: str) -> None:
+    assert pylast._remove_invalid_xml_chars(test_input) == expected
+
+
+@pytest.mark.parametrize(
+    "test_input, expected",
+    [
+        (
+            # Plain text
+            '<lfm><album>test album name</album></lfm>',
+            '<?xml version="1.0" ?><lfm><album>test album name</album></lfm>',
+        ),
+        (
+            # Contains Unicode ENQ Enquiry control character
+            '<lfm><album>test album \u0005name</album></lfm>',
+            '<?xml version="1.0" ?><lfm><album>test album name</album></lfm>',
+        ),
+    ],
+)
+def test__parse_response(test_input: str, expected: str) -> None:
+    doc = pylast._parse_response(test_input)
+    assert doc.toxml() == expected