Created
June 20, 2012 08:57
-
-
Save PyYoshi/2958909 to your computer and use it in GitHub Desktop.
bs4.1.0にcchardetを適用
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| --- E:/bs4.orig/dammit.py Wed Jun 13 13:52:56 2012 | |
| +++ E:/bs4/dammit.py Wed Jun 20 17:30:47 2012 | |
| @@ -13,15 +13,11 @@ | |
| import warnings | |
| # Autodetects character encodings. Very useful. | |
| -# Download from http://chardet.feedparser.org/ | |
| -# or 'apt-get install python-chardet' | |
| -# or 'easy_install chardet' | |
| +# Download from https://github.com/PyYoshi/cChardet | |
| try: | |
| - import chardet | |
| - #import chardet.constants | |
| - #chardet.constants._debug = 1 | |
| + import cchardet | |
| except ImportError: | |
| - chardet = None | |
| + cchardet = None | |
| # Available from http://cjkpython.i18n.org/. | |
| try: | |
| @@ -207,8 +203,8 @@ | |
| break | |
| # If no luck and we have auto-detection library, try that: | |
| - if not u and chardet and not isinstance(self.markup, unicode): | |
| - u = self._convert_from(chardet.detect(self.markup)['encoding']) | |
| + if not u and cchardet and not isinstance(self.markup, unicode): | |
| + u = self._convert_from(cchardet.detect(self.markup)) | |
| # As a last resort, try utf-8 and windows-1252: | |
| if not u: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # coding:utf8 | |
| # nosetests --nocapture --with-profile --profile-stats-file nose.prof tests.py | |
| # hotshot2dot nose.prof| dot -Tpng -o profile.png | |
| import time | |
| from bs4 import BeautifulSoup | |
class TestBS4():
    """Rough speed check for BeautifulSoup parsing of a local HTML fixture.

    Intended to be run under nose with ``--nocapture`` so the printed
    elapsed time is visible (and optionally with ``--with-profile``).
    """

    def test_speed(self):
        """Parse ``testdata1.htm`` once and print the elapsed wall time.

        The fixture is read before the timer starts, so only the
        ``BeautifulSoup(...)`` construction is measured.
        """
        # Read raw bytes so the parser receives the file unmodified and has
        # to work out the encoding itself; the context manager guarantees
        # the handle is closed even if reading fails.
        with open(r"testdata1.htm", "rb") as fixture:
            html = fixture.read()

        start = time.time()
        # Keep a reference so tearing down the parse tree is not included
        # in the measured interval.
        soup = BeautifulSoup(html)
        end = time.time()

        # Single-argument print() behaves the same on Python 2 and 3.
        print(end - start)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment