Created
January 31, 2016 20:28
-
-
Save tantale/c217176eed7c6c5dad76 to your computer and use it in GitHub Desktop.
Detect the character encoding of the XML file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
# Verbose regular expression locating the ``encoding`` pseudo-attribute of an
# XML declaration.  It is a text (not bytes) pattern and must be searched
# against text.  Compared with a plain ``.+?`` scan it
#   * never looks past the ``?>`` that closes the declaration, so an
#     ``encoding="..."`` attribute of a later element is not picked up,
#   * accepts line breaks inside the declaration (compiled with DOTALL),
#   * accepts whitespace around ``=``, which XML permits,
#   * requires the closing quote to be the same character as the opening one.
# The encoding name is exposed as the named group ``encstr``.
XML_DECL_REGEX = r"""
    ^<\?xml             # w/o BOM, XML declaration starts with <?xml at the first byte
    (?:(?!\?>).)+?      # some chars (version info), matched minimal, not crossing ?>
    encoding            # encoding attribute begins
    \s*=\s*             # equals sign, optionally surrounded by whitespace
    (?P<quote>["'])     # attribute start delimiter
    (?P<encstr>         # what's matched in the brackets will be named encstr
        [^"']+          # every character not delimiter (not overly exact!)
    )                   # closes the brackets pair for the named group
    (?P=quote)          # attribute end delimiter, same character as the start
    (?:(?!\?>).)*?      # some chars optionally (standalone decl or whitespace)
    \?>                 # XML declaration end
"""

# Bound ``search`` method of the compiled pattern; returns a match object
# (use ``.group("encstr")``) or None.
search_xml_decl = re.compile(XML_DECL_REGEX, re.VERBOSE | re.DOTALL).search
def detect_xml_encoding(fp):
    """
    Attempt to detect the character encoding of the XML file given by
    the file object ``fp``.

    ``fp`` must be opened in binary mode and must not be a codec wrapped
    file object.  The current file position is saved before reading and
    restored afterwards, even if reading fails.

    The return value can be:

    - if detection of the BOM succeeds, the codec name of the
      corresponding unicode charset is returned
    - if BOM detection fails, the XML declaration is searched for
      the encoding attribute and its value returned. the "<"
      character has to be the very first in the file then (it's XML
      standard after all).
    - if BOM and XML declaration fail (this includes an empty file),
      None is returned. According to XML 1.0 it should be utf_8 then,
      but it wasn't detected by the means offered here.

    :param fp: Opened, seekable binary file or file-like object
    :rtype: str or unicode
    :return: Encoding name, or None if nothing could be detected
    :raises TypeError: if ``fp.read`` returns text instead of bytes
    """
    # -- the BOMs we know, longest first: the UTF-32-LE mark begins with
    # -- the UTF-16-LE mark, so the 4-byte patterns must be tried before
    # -- the 2-byte ones.
    known_boms = (
        (b"\x00\x00\xfe\xff", "utf_32_be"),
        (b"\xff\xfe\x00\x00", "utf_32_le"),
        (b"\xef\xbb\xbf", "utf_8"),
        (b"\xfe\xff", "utf_16_be"),
        (b"\xff\xfe", "utf_16_le"),
    )

    # -- read the start of the file once; assume the XML declaration fits
    # -- into the first 2 KB.  Always put the file position back.
    old_fp = fp.tell()
    try:
        fp.seek(0)
        head = fp.read(2048)
    finally:
        fp.seek(old_fp)

    if not isinstance(head, (bytes, bytearray)):
        raise TypeError(
            "detect_xml_encoding() needs a file object opened in binary mode"
        )

    # -- nothing to inspect (empty file)
    if not head:
        return None

    # == detection using BOM; prefix matching also copes with files that
    # == are shorter than four bytes
    for bom, codec_name in known_boms:
        if head.startswith(bom):
            return codec_name

    # == search XML declaration for encoding attribute
    # -- no BOM: assume a one byte character encoding behaving like ASCII.
    # -- latin-1 maps every byte to exactly one character, so decoding
    # -- cannot fail and the text pattern can be applied to the result.
    match = search_xml_decl(head.decode("latin-1"))
    if match:
        return match.group("encstr")
    return None
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment