Created June 22, 2010 21:51
-
-
Save proger/449154 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| io proger ~ 0 % py | |
| In [3]: import urllib2 | |
| In [5]: urllib2.urlopen('http://www.abitura.com/handbook/') | |
| Out[5]: <addinfourl at 4314565160 whose fp = <socket._fileobject object at 0x1016f4ad0>> | |
| In [6]: url = _ | |
| In [7]: url.readlines() | |
| Out[7]: ... | |
| In [8]: hb = _ | |
| In [18]: from lxml import etree | |
| In [19]: etree.fromstr | |
| etree.fromstring etree.fromstringlist | |
| In [21]: etree.fromstringlist(hb, etree.HT | |
| etree.HTML etree.HTMLParser | |
| In [22]: etree.fromstringlist(hb, etree.HTMLParser()) | |
| Out[22]: <Element html at 102c42628> | |
| In [23]: ht = _ | |
| In [25]: list(ht) | |
| Out[25]: [<Element head at 102c424c8>, <Element body at 102c42578>] | |
| In [26]: list(ht)[1] | |
| Out[26]: <Element body at 102c42578> | |
| In [27]: htb = _ | |
| In [28]: htb. | |
| htb.__class__ htb.__init__ htb.__sizeof__ htb.find htb.insert htb.nsmap | |
| htb.__contains__ htb.__iter__ htb.__str__ htb.findall htb.items htb.prefix | |
| htb.__copy__ htb.__len__ htb.__subclasshook__ htb.findtext htb.iter htb.remove | |
| htb.__deepcopy__ htb.__new__ htb._init htb.get htb.iterancestors htb.replace | |
| htb.__delattr__ htb.__nonzero__ htb.addnext htb.getchildren htb.iterchildren htb.set | |
| htb.__delitem__ htb.__reduce__ htb.addprevious htb.getiterator htb.iterdescendants htb.sourceline | |
| htb.__doc__ htb.__reduce_ex__ htb.append htb.getnext htb.iterfind htb.tag | |
| htb.__format__ htb.__repr__ htb.attrib htb.getparent htb.itersiblings htb.tail | |
| htb.__getattribute__ htb.__reversed__ htb.base htb.getprevious htb.itertext htb.text | |
| htb.__getitem__ htb.__setattr__ htb.clear htb.getroottree htb.keys htb.values | |
| htb.__hash__ htb.__setitem__ htb.extend htb.index htb.makeelement htb.xpath | |
| In [53]: tbl.iterdescendants(tag='a') | |
| Out[53]: <lxml.etree.ElementDepthFirstIterator object at 0x102c8d1f0> | |
| In [60]: list(list(list(tbl)[0])[0]) | |
| Out[60]: | |
| [<Element p at 102c42418>, | |
| <Element p at 102cb0050>, | |
| <Element p at 102cb0158>, | |
| <Element p at 102cb01b0>, | |
| <Element center at 102cb0208>] | |
| In [61]: for el in list(list(list(tbl)[0])[0]): | |
| KeyboardInterrupt | |
| In [61]: xp = '/html/body/table[2]/tbody/tr/td[2]/p/a' | |
| In [63]: etree.fromstringlist(hb, etree.HTMLParser()) | |
| Out[63]: <Element html at 102cb03c0> | |
| In [64]: root = _ | |
| In [65]: root.xpath? | |
| Type: builtin_function_or_method | |
| Base Class: <type 'builtin_function_or_method'> | |
| String Form: <built-in method xpath of lxml.etree._Element object at 0x102cb03c0> | |
| Namespace: Interactive | |
| Docstring: | |
| xpath(self, _path, namespaces=None, extensions=None, smart_strings=True, **_variables) | |
| Evaluate an xpath expression using the element as context node. | |
| In [67]: root.xpath(xp) | |
| Out[67]: [] | |
| In [68]: xp | |
| Out[68]: '/html/body/table[2]/tbody/tr/td[2]/p/a' | |
| In [69]: root.xpath(xpath) | |
| KeyboardInterrupt | |
| In [69]: root.xpath('/html/body/table[2]/tbody/tr') | |
| Out[69]: [] | |
| In [81]: anchors = root.xpath('/html/body/table[2]//a') | |
| In [83]: a = anchors[0] | |
| In [84]: a.attrib | |
| Out[84]: {'href': '../reseption.html'} | |
| In [86]: for a in anchors: print a.attrib.get('href', '') | |
| ....: | |
| ../reseption.html | |
| ../entering_task.html | |
| ../open_lessons/ | |
| ../textbooks.html | |
| ../problems/ | |
| ../questions/ | |
| ../abstracts/ | |
| ../tournaments/ | |
| ../sunday_school.html | |
| ../modern_physics/ | |
| ../happy_physics/ | |
| ../collection/ | |
| ../not_only/ | |
| ../links/ | |
| ../repetitor.html | |
| ../man | |
| ../mathematics/index.html | |
| ../chemi/index.html | |
| ../renessans/index.html | |
| ../open_lessons/index.html | |
| mailto:[email protected] | |
| kinematika1.html | |
| kinematika2.html | |
| kinematika3.html | |
| dinamika1.html | |
| dinamika2.html | |
| dinamika3.html | |
| dinamika4.html | |
| dinamika5.html | |
| dinamika6.html | |
| statika1.html | |
| statika2.html | |
| statika3.html | |
| molecular_physics1.html | |
| molecular_physics2.html | |
| molecular_physics3.html | |
| molecular_physics4.html | |
| molecular_physics5.html | |
| molecular_physics6.html | |
| molecular_physics6.html | |
| molecular_physics7.html | |
| molecular_physics8.html | |
| molecular_physics9.html | |
| molecular_physics10.html | |
| electricity1.html | |
| electricity2.html | |
| electricity3.html | |
| electricity4.html | |
| electricity5.html | |
| electricity6.html | |
| electricity7.html | |
| electricity8.html | |
| electricity9.html | |
| electricity10.html | |
| electricity11.html | |
| electricity12.html | |
| electricity13.html | |
| electricity14.html | |
| magnetic0.html | |
| magnetic1.html | |
| magnetic2.html | |
| magnetic3.html | |
| magnetic4.html | |
| magnetic5.html | |
| magnetic6.html | |
| magnetic6.html | |
| oscillation1.html | |
| oscillation2.html | |
| oscillation3.html | |
| oscillation4.html | |
| oscillation5.html | |
| oscillation6.html | |
| oscillation7.html | |
| oscillation8.html | |
| oscillation9.html | |
| oscillation10.html | |
| oscillation10.html | |
| oscillation12.html | |
| optic1.html | |
| optic2.html | |
| optic3.html | |
| optic4.html | |
| optic5.html | |
| optic6.html | |
| optic7.html | |
| optic8.html | |
| optic9.html | |
| optic10.html | |
| atom1.html | |
| atom2.html | |
| atom3.html | |
| atom4.html | |
| atom5.html | |
| atom6.html | |
| atom7.html | |
| atom8.html | |
| atom9.html | |
| http://ibol.ru | |
| http://hh.ru | |
| http://www.mc.ru/page.asp/metalloprokat/bronza | |
| In [87]: import re | |
| In [88]: for a in anchors: print a.attrib.get('href', '') | |
| KeyboardInterrupt | |
| In [88]: hrefs = [a.attrib.get('href', '') for a in anchors if re.match('[a-z_]+\d+\.html', a.attrib.get('href', ''))] | |
| In [89]: hrefs | |
| Out[89]: | |
| ['kinematika1.html', | |
| 'kinematika2.html', | |
| 'kinematika3.html', | |
| 'dinamika1.html', | |
| 'dinamika2.html', | |
| 'dinamika3.html', | |
| 'dinamika4.html', | |
| 'dinamika5.html', | |
| 'dinamika6.html', | |
| 'statika1.html', | |
| 'statika2.html', | |
| 'statika3.html', | |
| 'molecular_physics1.html', | |
| 'molecular_physics2.html', | |
| 'molecular_physics3.html', | |
| 'molecular_physics4.html', | |
| 'molecular_physics5.html', | |
| 'molecular_physics6.html', | |
| 'molecular_physics6.html', | |
| 'molecular_physics7.html', | |
| 'molecular_physics8.html', | |
| 'molecular_physics9.html', | |
| 'molecular_physics10.html', | |
| 'electricity1.html', | |
| 'electricity2.html', | |
| 'electricity3.html', | |
| 'electricity4.html', | |
| 'electricity5.html', | |
| 'electricity6.html', | |
| 'electricity7.html', | |
| 'electricity8.html', | |
| 'electricity9.html', | |
| 'electricity10.html', | |
| 'electricity11.html', | |
| 'electricity12.html', | |
| 'electricity13.html', | |
| 'electricity14.html', | |
| 'magnetic0.html', | |
| 'magnetic1.html', | |
| 'magnetic2.html', | |
| 'magnetic3.html', | |
| 'magnetic4.html', | |
| 'magnetic5.html', | |
| 'magnetic6.html', | |
| 'magnetic6.html', | |
| 'oscillation1.html', | |
| 'oscillation2.html', | |
| 'oscillation3.html', | |
| 'oscillation4.html', | |
| 'oscillation5.html', | |
| 'oscillation6.html', | |
| 'oscillation7.html', | |
| 'oscillation8.html', | |
| 'oscillation9.html', | |
| 'oscillation10.html', | |
| 'oscillation10.html', | |
| 'oscillation12.html', | |
| 'optic1.html', | |
| 'optic2.html', | |
| 'optic3.html', | |
| 'optic4.html', | |
| 'optic5.html', | |
| 'optic6.html', | |
| 'optic7.html', | |
| 'optic8.html', | |
| 'optic9.html', | |
| 'optic10.html', | |
| 'atom1.html', | |
| 'atom2.html', | |
| 'atom3.html', | |
| 'atom4.html', | |
| 'atom5.html', | |
| 'atom6.html', | |
| 'atom7.html', | |
| 'atom8.html', | |
| 'atom9.html'] | |
| In [91]: url.geturl() | |
| Out[91]: 'http://www.abitura.com/handbook/' | |
| In [92]: src = _ | |
| In [93]: imgs = [] | |
| In [94]: for href in hrefs: imgs.append(etree.fromstringlist(urllib2.urlopen(src + href).readlines(), parser=etree.HTMLParser())) | |
| ....: | |
| In [95]: imgs | |
| Out[95]: | |
| [<Element html at 102e92158>, | |
| <Element html at 102e92310>, | |
| <Element html at 102e92470>, | |
| <Element html at 102e925d0>, | |
| <Element html at 102e92788>, | |
| <Element html at 102e928e8>, | |
| <Element html at 102e92a48>, | |
| <Element html at 102e92ba8>, | |
| <Element html at 102e92d08>, | |
| <Element html at 102e92e10>, | |
| <Element html at 102e92f70>, | |
| <Element html at 102e9e100>, | |
| <Element html at 102e9e260>, | |
| <Element html at 102e9e368>, | |
| <Element html at 102e9e680>, | |
| <Element html at 102e9e838>, | |
| <Element html at 102e9e8e8>, | |
| <Element html at 102e9e418>, | |
| <Element html at 102e9e520>, | |
| <Element html at 102e9e6d8>, | |
| <Element html at 102e9e788>, | |
| <Element html at 102e9e9f0>, | |
| <Element html at 102e92af8>, | |
| <Element html at 102e92680>, | |
| <Element html at 102e92260>, | |
| <Element html at 102e924c8>, | |
| <Element html at 102e926d8>, | |
| <Element html at 102e9e3c0>, | |
| <Element html at 102e9e158>, | |
| <Element html at 102e9e578>, | |
| <Element html at 102e9eaf8>, | |
| <Element html at 102e9eba8>, | |
| <Element html at 102e9ecb0>, | |
| <Element html at 102e9edb8>, | |
| <Element html at 102e9ef18>, | |
| <Element html at 102e9efc8>, | |
| <Element html at 102eb1100>, | |
| <Element html at 102eb1208>, | |
| <Element html at 102eb1310>, | |
| <Element html at 102eb1470>, | |
| <Element html at 102eb15d0>, | |
| <Element html at 102eb1730>, | |
| <Element html at 102eb1890>, | |
| <Element html at 102eb19f0>, | |
| <Element html at 102eb1af8>, | |
| <Element html at 102eb1c00>, | |
| <Element html at 102eb1d08>, | |
| <Element html at 102eb1e10>, | |
| <Element html at 102eb1f18>, | |
| <Element html at 102ebe050>, | |
| <Element html at 102ebe208>, | |
| <Element html at 102ebe260>, | |
| <Element html at 102eb1940>, | |
| <Element html at 102eb1520>, | |
| <Element html at 102eb1260>, | |
| <Element html at 102e9ec00>, | |
| <Element html at 102e92208>, | |
| <Element html at 102e92998>, | |
| <Element html at 102e92c00>, | |
| <Element html at 102cb0998>, | |
| <Element html at 102e92d60>, | |
| <Element html at 102e920a8>, | |
| <Element html at 102ebe100>, | |
| <Element html at 102ebe3c0>, | |
| <Element html at 102ebe520>, | |
| <Element html at 102ebe680>, | |
| <Element html at 102ebe7e0>, | |
| <Element html at 102ebe940>, | |
| <Element html at 102ebeaf8>, | |
| <Element html at 102ebec00>, | |
| <Element html at 102ebed60>, | |
| <Element html at 102ebeec0>, | |
| <Element html at 102ec8050>, | |
| <Element html at 102ec81b0>, | |
| <Element html at 102ec8310>, | |
| <Element html at 102ec8470>] | |
| In [117]: xpi = '/html/body//img' | |
| In [125]: imgurls = [] | |
| In [126]: for i in imgs: | |
| for x in i.xpath(xpi): | |
| if re.match('.*\d\.jpg', x.attrib.get('src', '')): | |
| imgurls.append(x.attrib['src']) | |
| .....: | |
| .....: | |
| In [130]: imgurls | |
| Out[130]: | |
| ['images/kinem1.jpg', | |
| 'images/kinem2.jpg', | |
| 'images/kinem3.jpg', | |
| 'images/dinamika1.jpg', | |
| 'images/sila_uprugosti1.jpg', | |
| 'images/sila_tjagotenijaa1.jpg', | |
| 'images/sila_trenija1.jpg', | |
| 'images/energija1.jpg', | |
| 'images/statika1.jpg', | |
| 'images/statika2.jpg', | |
| 'images/statika3.jpg', | |
| 'images/molecular_physics/molecular_p1.jpg', | |
| 'images/molecular_physics/molecular_p2.jpg', | |
| 'molecular3.jpg', | |
| 'molecular4.jpg', | |
| 'molecular4.jpg', | |
| 'molecular5.jpg', | |
| 'molecular7.jpg', | |
| 'molecular7.jpg', | |
| 'molecular8.jpg', | |
| 'electricity/07.jpg', | |
| 'electricity/01.jpg', | |
| 'electricity/05.jpg', | |
| 'electricity/07.jpg', | |
| 'electricity/09.jpg', | |
| 'electricity/10.jpg', | |
| 'electricity/12.jpg', | |
| 'electricity/13.jpg', | |
| '14.jpg', | |
| 'electricity/16.jpg', | |
| 'electricity/16.jpg', | |
| 'electricity/17.jpg', | |
| 'electricity/18.jpg', | |
| 'electricity/19.jpg', | |
| 'magnetic0.jpg', | |
| 'magnetic1.jpg', | |
| 'magnetic2.jpg', | |
| 'magnetic3.jpg', | |
| 'magnetic4.jpg', | |
| 'magnetic6.jpg', | |
| 'magnetic7.jpg', | |
| 'magnetic7.jpg', | |
| 'oscillation1.jpg', | |
| 'oscillation2.jpg', | |
| 'oscillation3.jpg', | |
| 'oscillation4.jpg', | |
| 'oscillation5.jpg', | |
| 'oscillation6.jpg', | |
| 'oscillation7.jpg', | |
| 'oscillation8.jpg', | |
| 'oscillation9.jpg', | |
| 'oscillation10.jpg', | |
| 'oscillation10.jpg', | |
| 'oscillation12.jpg', | |
| 'optic/optic0.jpg', | |
| 'optic/optic3.jpg', | |
| 'optic/optic3.jpg', | |
| 'optic/optic4.jpg', | |
| 'optic/optic5.jpg', | |
| 'optic/optic6.jpg', | |
| 'optic/optic7.jpg', | |
| 'atom/atom1.jpg', | |
| 'optic/optic10.jpg', | |
| 'optic/optic10.jpg', | |
| 'optic/optic11.jpg', | |
| 'atom/atom1.jpg', | |
| 'atom/atom2.jpg', | |
| 'atom/atom3.jpg', | |
| 'atom/atom4.jpg', | |
| 'atom/atom5.jpg', | |
| 'atom/atom6.jpg', | |
| 'atom/atom7.jpg', | |
| 'atom8.jpg', | |
| 'atom/atom9.jpg'] | |
| In [131]: for i in imgurls: | |
| .....: f = src + i | |
| .....: u = urllib2.urlopen(f) | |
| .....: open(u.url.split('/')[-1], 'w').write(u.read()) | |
| .....: print u.url.split('/')[-1] | |
| .....: | |
| .....: | |
| kinem1.jpg | |
| kinem2.jpg | |
| kinem3.jpg | |
| dinamika1.jpg | |
| sila_uprugosti1.jpg | |
| sila_tjagotenijaa1.jpg | |
| sila_trenija1.jpg | |
| energija1.jpg | |
| statika1.jpg | |
| statika2.jpg | |
| statika3.jpg | |
| molecular_p1.jpg | |
| molecular_p2.jpg | |
| molecular3.jpg | |
| molecular4.jpg | |
| molecular4.jpg | |
| molecular5.jpg | |
| molecular7.jpg | |
| molecular7.jpg | |
| molecular8.jpg | |
| 07.jpg | |
| 01.jpg | |
| 05.jpg | |
| 07.jpg | |
| 09.jpg | |
| 10.jpg | |
| 12.jpg | |
| 13.jpg | |
| errorpage | |
| 16.jpg | |
| 16.jpg | |
| 17.jpg | |
| 18.jpg | |
| 19.jpg | |
| magnetic0.jpg | |
| magnetic1.jpg | |
| magnetic2.jpg | |
| magnetic3.jpg | |
| magnetic4.jpg | |
| magnetic6.jpg | |
| magnetic7.jpg | |
| magnetic7.jpg | |
| oscillation1.jpg | |
| oscillation2.jpg | |
| oscillation3.jpg | |
| oscillation4.jpg | |
| oscillation5.jpg | |
| oscillation6.jpg | |
| oscillation7.jpg | |
| oscillation8.jpg | |
| oscillation9.jpg | |
| oscillation10.jpg | |
| oscillation10.jpg | |
| oscillation12.jpg | |
| optic0.jpg | |
| optic3.jpg | |
| optic3.jpg | |
| optic4.jpg | |
| optic5.jpg | |
| optic6.jpg | |
| optic7.jpg | |
| atom1.jpg | |
| optic10.jpg | |
| optic10.jpg | |
| optic11.jpg | |
| atom1.jpg | |
| atom2.jpg | |
| atom3.jpg | |
| atom4.jpg | |
| atom5.jpg | |
| atom6.jpg | |
| atom7.jpg | |
| errorpage | |
| atom9.jpg | |
| In [132]: |
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.