Any clue on how to parse xml in python that has: encoding='Windows-1255' in it? At least the lxml.etree parser won't even look at the string when there's an "encoding" tag in the XML header which isn't "utf-8" or "ASCII".
Running the following code fails with:
ValueError: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.
from lxml import etree
parser = etree.XMLParser(encoding='utf-8')
def convert_xml_to_utf8(xml_str):
tree = etree.fromstring(xml_str, parser=parser)
if tree.docinfo.encoding == 'utf-8':
# already in correct encoding, abort
return xml_str
decoded_str = xml_str.decode(tree.docinfo.encoding)
utf8_encoded_str = decoded_str.encode('utf-8')
tree = etree.fromstring(utf8_encoded_str)
tree.docinfo.encoding = 'utf-8'
return etree.tostring(tree, pretty_print = True, xml_declaration = True, encoding='UTF-8', standalone="yes")
data = '''<?xml version='1.0' encoding='Windows-1255'?><rss version="2.0"><channel ><title ><![CDATA[ynet - חדשות]]></title></channel></rss>'''
print(convert_xml_to_utf8(data))