Skip to content

Instantly share code, notes, and snippets.

@PyYoshi
Created June 20, 2012 08:57
Show Gist options
  • Select an option

  • Save PyYoshi/2958909 to your computer and use it in GitHub Desktop.

Select an option

Save PyYoshi/2958909 to your computer and use it in GitHub Desktop.
bs4.1.0にcchardetを適用
--- E:/bs4.orig/dammit.py Wed Jun 13 13:52:56 2012
+++ E:/bs4/dammit.py Wed Jun 20 17:30:47 2012
@@ -13,15 +13,11 @@
import warnings
# Autodetects character encodings. Very useful.
-# Download from http://chardet.feedparser.org/
-# or 'apt-get install python-chardet'
-# or 'easy_install chardet'
+# Download from https://github.com/PyYoshi/cChardet
try:
- import chardet
- #import chardet.constants
- #chardet.constants._debug = 1
+ import cchardet
except ImportError:
- chardet = None
+ cchardet = None
# Available from http://cjkpython.i18n.org/.
try:
@@ -207,8 +203,8 @@
break
# If no luck and we have auto-detection library, try that:
- if not u and chardet and not isinstance(self.markup, unicode):
- u = self._convert_from(chardet.detect(self.markup)['encoding'])
+ if not u and cchardet and not isinstance(self.markup, unicode):
+ u = self._convert_from(cchardet.detect(self.markup))
# As a last resort, try utf-8 and windows-1252:
if not u:
# coding:utf8
# nosetests --nocapture --with-profile --profile-stats-file nose.prof tests.py
# hotshot2dot nose.prof| dot -Tpng -o profile.png
import time
from bs4 import BeautifulSoup
class TestBS4(object):
    """Rough timing check for BeautifulSoup parsing of a local HTML file."""

    def test_speed(self):
        """Parse ``testdata1.htm`` once and print the elapsed seconds.

        Nothing is asserted; the printed wall-clock duration is the only
        output (run nose with ``--nocapture`` to see it). The fixture is
        looked up relative to the current working directory.
        """
        # Read raw bytes and close the handle deterministically. Binary mode
        # avoids newline translation on Windows and hands BeautifulSoup
        # undecoded input, so its own encoding handling is part of the run.
        with open(r"testdata1.htm", "rb") as f:
            html = f.read()

        start = time.time()
        # Keep a reference so the parsed tree is not released before the
        # second timestamp is taken.
        soup = BeautifulSoup(html)
        end = time.time()

        # A parenthesised single-argument print gives the same output on
        # Python 2 and Python 3.
        print(end - start)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment