Created
January 30, 2015 09:50
-
-
Save komeda-shinji/508f35d1282b3b5d239f to your computer and use it in GitHub Desktop.
twill: HTTP のレスポンスヘッダーでエンコーディングが示されていないとき、エンコーディングを強制する拡張モジュール
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Extension functions for manipulating the browser charset encoding.

Commands:

    getbrowserencoding -- report the page encoding; when the HTTP header
        has no charset, take it from the HTML meta tag and set it on the
        browser.
    setbrowserencoding <encoding> -- set the browser encoding to <encoding>.
""" | |
import re | |
import twill.commands | |
def getbrowserencoding():
    """
    >> getbrowserencoding

    Get current page encoding from HTTP header or HTML meta tag.

    If the Content-Type header carries a charset parameter, the browser's
    current encoding is reported and returned, and nothing is changed.
    Otherwise the page is searched for a <meta charset="..."> tag or a
    <meta http-equiv="Content-Type" content="...; charset=..."> tag; a
    charset found there is set as the browser encoding via
    setbrowserencoding.

    Returns the encoding name, or None when no document is being viewed
    or no charset could be determined.
    """
    browser = twill.get_browser()
    if browser.result is None:
        print("not viewing any document")
        return None

    mech = browser._browser
    split_header_words = twill.commands.mechanize._headersutil.split_header_words

    # Does any Content-Type header declare a charset parameter?
    header_has_charset = False
    for content_type in mech._response.info().getheaders("content-type"):
        # Walk every parsed group instead of indexing [0]: an empty header
        # value may produce no groups at all, and [0] would then raise
        # IndexError.
        for group in split_header_words([content_type]):
            for key, _value in group:
                if key == "charset":
                    header_has_charset = True

    if header_has_charset:
        encoding = mech.encoding()
        if encoding:
            print('encoding: %s (HTTP)' % encoding)
            return encoding

    # No usable charset in the HTTP header: fall back to the document itself.
    encoding = None
    page = browser.get_html()
    if page:
        # Decode leniently; only the ASCII-compatible markup of the meta
        # tag needs to survive for the searches below.
        page = page.decode(mech.encoding(), 'ignore')
        # \s* before /?> also accepts the common '<meta ... />' spelling.
        match = re.search(r'<meta\s+charset="([^"]+)"\s*/?>',
                          page, re.IGNORECASE)
        if not match:
            # The subject string must be passed here; omitting it makes
            # re.search treat the flags value as the text to search.
            match = re.search(
                r'<meta\s+http-equiv="Content-Type"\s+'
                r'content="\S+;\s*charset=(\S+)"\s*/?>',
                page, re.IGNORECASE)
        if match:
            encoding = match.group(1)

    if encoding:
        print('encoding: %s (PAGE)' % encoding)
        setbrowserencoding(encoding)
    return encoding
def setbrowserencoding(encoding):
    """
    >> setbrowserencoding <encoding>

    Set browser encoding as <encoding>.

    This is useful for fixing a mismatch between the encoding the browser
    recognized and the charset the HTML page actually uses.

    The value is stored as-is on the `encoding` attribute of the wrapped
    browser's `_factory`; it is not validated here.
    """
    browser = twill.get_browser()
    browser._browser._factory.encoding = encoding
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment