Created
February 16, 2013 19:05
-
-
Save Kwpolska/4968252 to your computer and use it in GitHub Desktop.
LXML Breakage
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "lxml breakage" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "import lxml.html, requests", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 10 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "r = requests.get('https://aur.archlinux.org')\n", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "r.text[:38]", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 8, | |
"text": "'<?xml version=\"1.0\" encoding=\"UTF-8\"?>'" | |
} | |
], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "lxml.html.fromstring(r.text) # u'string'\n", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"ename": "ValueError", | |
"evalue": "Unicode strings with encoding declaration are not supported.", | |
"output_type": "pyerr", | |
"traceback": [ | |
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", | |
"\u001b[1;32m<ipython-input-11-2b8a760199e9>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mlxml\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhtml\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfromstring\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mr\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtext\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", | |
"\u001b[1;32m/usr/lib/python3.3/site-packages/lxml/html/__init__.py\u001b[0m in \u001b[0;36mfromstring\u001b[1;34m(html, base_url, parser, **kw)\u001b[0m\n\u001b[0;32m 663\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mdocument_fromstring\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mhtml\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparser\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mparser\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbase_url\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mbase_url\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 664\u001b[0m \u001b[1;31m# otherwise, lets parse it out...\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 665\u001b[1;33m \u001b[0mdoc\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdocument_fromstring\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mhtml\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparser\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mparser\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbase_url\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mbase_url\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 666\u001b[0m \u001b[0mbodies\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdoc\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfindall\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'body'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 667\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mbodies\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", | |
"\u001b[1;32m/usr/lib/python3.3/site-packages/lxml/html/__init__.py\u001b[0m in \u001b[0;36mdocument_fromstring\u001b[1;34m(html, parser, **kw)\u001b[0m\n\u001b[0;32m 561\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mparser\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 562\u001b[0m \u001b[0mparser\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mhtml_parser\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 563\u001b[1;33m \u001b[0mvalue\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0metree\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfromstring\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mhtml\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparser\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 564\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mvalue\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 565\u001b[0m raise etree.ParserError(\n", | |
"\u001b[1;32m/usr/lib/python3.3/site-packages/lxml/etree.cpython-33m.so\u001b[0m in \u001b[0;36mlxml.etree.fromstring (src/lxml/lxml.etree.c:61729)\u001b[1;34m()\u001b[0m\n", | |
"\u001b[1;32m/usr/lib/python3.3/site-packages/lxml/etree.cpython-33m.so\u001b[0m in \u001b[0;36mlxml.etree._parseMemoryDocument (src/lxml/lxml.etree.c:91131)\u001b[1;34m()\u001b[0m\n", | |
"\u001b[1;31mValueError\u001b[0m: Unicode strings with encoding declaration are not supported." | |
] | |
} | |
], | |
"prompt_number": 11 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "lxml.html.fromstring(r.content) # b'string'", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"ename": "TypeError", | |
"evalue": "startswith first arg must be bytes or a tuple of bytes, not str", | |
"output_type": "pyerr", | |
"traceback": [ | |
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", | |
"\u001b[1;32m<ipython-input-12-059d51be153d>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mlxml\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhtml\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfromstring\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mr\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcontent\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;31m# b'string'\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", | |
"\u001b[1;32m/usr/lib/python3.3/site-packages/lxml/html/__init__.py\u001b[0m in \u001b[0;36mfromstring\u001b[1;34m(html, base_url, parser, **kw)\u001b[0m\n\u001b[0;32m 659\u001b[0m \u001b[0mparser\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mhtml_parser\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 660\u001b[0m \u001b[0mstart\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mhtml\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;36m10\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlstrip\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 661\u001b[1;33m \u001b[1;32mif\u001b[0m \u001b[0mstart\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstartswith\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'<html'\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mor\u001b[0m \u001b[0mstart\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstartswith\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'<!doctype'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 662\u001b[0m \u001b[1;31m# Looks like a full HTML document\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 663\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mdocument_fromstring\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mhtml\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparser\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mparser\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbase_url\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mbase_url\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", | |
"\u001b[1;31mTypeError\u001b[0m: startswith first arg must be bytes or a tuple of bytes, not str" | |
] | |
} | |
], | |
"prompt_number": 12 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "lxml.html.fromstring(r.text[38:])", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 13, | |
"text": "<Element html at 0x7f7e600c0e30>" | |
} | |
], | |
"prompt_number": 13 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "lxml.html.fromstring(r.content[38:])", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"ename": "TypeError", | |
"evalue": "startswith first arg must be bytes or a tuple of bytes, not str", | |
"output_type": "pyerr", | |
"traceback": [ | |
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", | |
"\u001b[1;32m<ipython-input-14-f1fb572c1313>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mlxml\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhtml\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfromstring\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mr\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcontent\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m38\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", | |
"\u001b[1;32m/usr/lib/python3.3/site-packages/lxml/html/__init__.py\u001b[0m in \u001b[0;36mfromstring\u001b[1;34m(html, base_url, parser, **kw)\u001b[0m\n\u001b[0;32m 659\u001b[0m \u001b[0mparser\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mhtml_parser\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 660\u001b[0m \u001b[0mstart\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mhtml\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;36m10\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlstrip\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlower\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 661\u001b[1;33m \u001b[1;32mif\u001b[0m \u001b[0mstart\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstartswith\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'<html'\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mor\u001b[0m \u001b[0mstart\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstartswith\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'<!doctype'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 662\u001b[0m \u001b[1;31m# Looks like a full HTML document\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 663\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mdocument_fromstring\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mhtml\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparser\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mparser\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbase_url\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mbase_url\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", | |
"\u001b[1;31mTypeError\u001b[0m: startswith first arg must be bytes or a tuple of bytes, not str" | |
] | |
} | |
], | |
"prompt_number": 14 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment