Strip invalid XML characters from response

This commit is contained in:
Hugo van Kemenade 2022-01-12 13:04:27 +02:00
parent 2469a6ea47
commit 9676714dcf
2 changed files with 59 additions and 4 deletions

View file

@ -24,6 +24,7 @@ import hashlib
import html.entities
import logging
import os
import re
import shelve
import ssl
import tempfile
@ -969,7 +970,7 @@ class _Request:
conn.close()
return response_text
def execute(self, cacheable=False):
def execute(self, cacheable: bool = False) -> xml.dom.minidom.Document:
"""Returns the XML DOM response of the POST Request from the server"""
if self.network.is_caching_enabled() and cacheable:
@ -977,13 +978,12 @@ class _Request:
else:
response = self._download_response()
return minidom.parseString(_string(response).replace("opensearch:", ""))
return _parse_response(response)
def _check_response_for_errors(self, response):
"""Checks the response for errors and raises one if any exists."""
try:
doc = minidom.parseString(_string(response).replace("opensearch:", ""))
doc = _parse_response(response)
except Exception as e:
raise MalformedResponseError(self.network, e) from e
@ -2950,4 +2950,20 @@ def _unescape_htmlentity(string):
return string
def _parse_response(response: str) -> xml.dom.minidom.Document:
    """Parse a server response into an XML DOM document.

    The response is passed through ``_string`` and every occurrence of the
    text ``"opensearch:"`` is removed before parsing. If expat rejects the
    text, it is parsed a second time after ``_remove_invalid_xml_chars`` has
    been applied to it.

    Raises:
        xml.parsers.expat.ExpatError: if the cleaned text still cannot be
            parsed on the second attempt.
    """
    xml_text = _string(response).replace("opensearch:", "")
    try:
        return minidom.parseString(xml_text)
    except xml.parsers.expat.ExpatError:
        # Invalid characters are rare, so the stripping pass is only paid for
        # when the first parse has already failed.
        return minidom.parseString(_remove_invalid_xml_chars(xml_text))
def _remove_invalid_xml_chars(string: str) -> str:
return re.sub(
r"[^\u0009\u000A\u000D\u0020-\uD7FF\uE000-\uFFFD\u10000-\u10FFF]+", "", string
)
# End of file

View file

@ -27,3 +27,42 @@ def test_get_cache_key(artist):
def test_cast_and_hash(obj):
assert type(str(obj)) is str
assert isinstance(hash(obj), int)
@pytest.mark.parametrize(
    "test_input, expected",
    [
        (
            # Markup with nothing to strip comes back unchanged
            '<album mbid="">test album name</album>',
            '<album mbid="">test album name</album>',
        ),
        (
            # U+0005 (ENQ control character) must be dropped
            '<album mbid="">test album \u0005name</album>',
            '<album mbid="">test album name</album>',
        ),
    ],
)
def test__remove_invalid_xml_chars(test_input: str, expected: str) -> None:
    """Check that the helper returns the input minus any invalid XML characters."""
    cleaned = pylast._remove_invalid_xml_chars(test_input)
    assert cleaned == expected
@pytest.mark.parametrize(
    "test_input, expected",
    [
        (
            # Markup that parses on the first attempt
            '<album mbid="">test album name</album>',
            '<?xml version="1.0" ?><album mbid="">test album name</album>',
        ),
        (
            # U+0005 (ENQ control character) is absent from the parsed document
            '<album mbid="">test album \u0005name</album>',
            '<?xml version="1.0" ?><album mbid="">test album name</album>',
        ),
    ],
)
def test__parse_response(test_input: str, expected: str) -> None:
    """Check the serialized form of the document built from each response."""
    serialized = pylast._parse_response(test_input).toxml()
    assert serialized == expected