proger · June 22, 2010 21:51
diff --git a/gistfile2.txt b/gistfile2.txt
diff --git a/physics.py b/physics.py
 io proger ~ 0 % py
 In [3]: import urllib2

 In [5]: urllib2.urlopen('http://www.abitura.com/handbook/')
 Out[5]: <addinfourl at 4314565160 whose fp = <socket._fileobject object at 0x1016f4ad0>>

 In [6]: url = _

 In [7]: url.readlines()
 Out[7]: ...

 In [8]: hb = _

 In [18]: from lxml import etree

 In [19]: etree.fromstr
 etree.fromstring      etree.fromstringlist  

 In [21]: etree.fromstringlist(hb, etree.HT
 etree.HTML        etree.HTMLParser  

 In [22]: etree.fromstringlist(hb, etree.HTMLParser())
 Out[22]: <Element html at 102c42628>

 In [23]: ht = _

 In [25]: list(ht)
 Out[25]: [<Element head at 102c424c8>, <Element body at 102c42578>]

 In [26]: list(ht)[1]
 Out[26]: <Element body at 102c42578>

 In [27]: htb = _

 In [28]: htb.
 htb.__class__         htb.__init__          htb.__sizeof__        htb.find              htb.insert            htb.nsmap
 htb.__contains__      htb.__iter__          htb.__str__           htb.findall           htb.items             htb.prefix
 htb.__copy__          htb.__len__           htb.__subclasshook__  htb.findtext          htb.iter              htb.remove
 htb.__deepcopy__      htb.__new__           htb._init             htb.get               htb.iterancestors     htb.replace
 htb.__delattr__       htb.__nonzero__       htb.addnext           htb.getchildren       htb.iterchildren      htb.set
 htb.__delitem__       htb.__reduce__        htb.addprevious       htb.getiterator       htb.iterdescendants   htb.sourceline
 htb.__doc__           htb.__reduce_ex__     htb.append            htb.getnext           htb.iterfind          htb.tag
 htb.__format__        htb.__repr__          htb.attrib            htb.getparent         htb.itersiblings      htb.tail
 htb.__getattribute__  htb.__reversed__      htb.base              htb.getprevious       htb.itertext          htb.text
 htb.__getitem__       htb.__setattr__       htb.clear             htb.getroottree       htb.keys              htb.values
 htb.__hash__          htb.__setitem__       htb.extend            htb.index             htb.makeelement       htb.xpath

 In [53]: tbl.iterdescendants(tag='a')
 Out[53]: <lxml.etree.ElementDepthFirstIterator object at 0x102c8d1f0>

 In [60]: list(list(list(tbl)[0])[0])
 Out[60]: 
 [<Element p at 102c42418>,
 <Element p at 102cb0050>,
 <Element p at 102cb0158>,
 <Element p at 102cb01b0>,
 <Element center at 102cb0208>]

 In [61]: for el in list(list(list(tbl)[0])[0]): 
 KeyboardInterrupt

 In [61]: xp = '/html/body/table[2]/tbody/tr/td[2]/p/a'

 In [63]: etree.fromstringlist(hb, etree.HTMLParser())
 Out[63]: <Element html at 102cb03c0>

 In [64]: root = _

 In [65]: root.xpath?
 Type:		builtin_function_or_method
 Base Class:	<type 'builtin_function_or_method'>
 String Form:	<built-in method xpath of lxml.etree._Element object at 0x102cb03c0>
 Namespace:	Interactive
 Docstring:
    xpath(self, _path, namespaces=None, extensions=None, smart_strings=True, **_variables)
    
    Evaluate an xpath expression using the element as context node.


 In [67]: root.xpath(xp)
 Out[67]: []

 In [68]: xp
 Out[68]: '/html/body/table[2]/tbody/tr/td[2]/p/a'

 In [69]: root.xpath(xpath)
 KeyboardInterrupt

 In [69]: root.xpath('/html/body/table[2]/tbody/tr')
 Out[69]: []

 In [81]: anchors = root.xpath('/html/body/table[2]//a')

 In [83]: a = anchors[0]

 In [84]: a.attrib
 Out[84]: {'href': '../reseption.html'}

 In [86]: for a in anchors: print a.attrib.get('href', '')

   ....: 
 ../reseption.html
 ../entering_task.html
 ../open_lessons/
 ../textbooks.html
 ../problems/

 ../questions/
 ../abstracts/
 ../tournaments/
 ../sunday_school.html
 ../modern_physics/
 ../happy_physics/
 ../collection/
 ../not_only/
 ../links/
 ../repetitor.html
 ../man
 ../mathematics/index.html
 ../chemi/index.html
 ../renessans/index.html
 ../open_lessons/index.html
 mailto:info@abitura.com
 kinematika1.html
 kinematika2.html
 kinematika3.html
 dinamika1.html
 dinamika2.html
 dinamika3.html
 dinamika4.html
 dinamika5.html
 dinamika6.html
 statika1.html
 statika2.html
 statika3.html

 molecular_physics1.html
 molecular_physics2.html
 molecular_physics3.html
 molecular_physics4.html
 molecular_physics5.html
 molecular_physics6.html
 molecular_physics6.html
 molecular_physics7.html
 molecular_physics8.html
 molecular_physics9.html
 molecular_physics10.html
 electricity1.html
 electricity2.html
 electricity3.html
 electricity4.html
 electricity5.html
 electricity6.html
 electricity7.html
 electricity8.html
 electricity9.html
 electricity10.html
 electricity11.html
 electricity12.html
 electricity13.html
 electricity14.html
 magnetic0.html
 magnetic1.html
 magnetic2.html
 magnetic3.html
 magnetic4.html
 magnetic5.html
 magnetic6.html
 magnetic6.html
 oscillation1.html
 oscillation2.html
 oscillation3.html
 oscillation4.html
 oscillation5.html
 oscillation6.html
 oscillation7.html
 oscillation8.html
 oscillation9.html
 oscillation10.html
 oscillation10.html
 oscillation12.html
 optic1.html
 optic2.html
 optic3.html
 optic4.html
 optic5.html
 optic6.html
 optic7.html
 optic8.html
 optic9.html
 optic10.html
 atom1.html
 atom2.html
 atom3.html
 atom4.html
 atom5.html
 atom6.html
 atom7.html
 atom8.html
 atom9.html
 http://ibol.ru
 http://hh.ru
 http://www.mc.ru/page.asp/metalloprokat/bronza

 In [87]: import re

 In [88]: for a in anchors: print a.attrib.get('href', '')
 KeyboardInterrupt

 In [88]: hrefs = [a.attrib.get('href', '') for a in anchors if re.match('[a-z_]+\d+\.html', a.attrib.get('href', ''))]

 In [89]: hrefs
 Out[89]: 
 ['kinematika1.html',
 'kinematika2.html',
 'kinematika3.html',
 'dinamika1.html',
 'dinamika2.html',
 'dinamika3.html',
 'dinamika4.html',
 'dinamika5.html',
 'dinamika6.html',
 'statika1.html',
 'statika2.html',
 'statika3.html',
 'molecular_physics1.html',
 'molecular_physics2.html',
 'molecular_physics3.html',
 'molecular_physics4.html',
 'molecular_physics5.html',
 'molecular_physics6.html',
 'molecular_physics6.html',
 'molecular_physics7.html',
 'molecular_physics8.html',
 'molecular_physics9.html',
 'molecular_physics10.html',
 'electricity1.html',
 'electricity2.html',
 'electricity3.html',
 'electricity4.html',
 'electricity5.html',
 'electricity6.html',
 'electricity7.html',
 'electricity8.html',
 'electricity9.html',
 'electricity10.html',
 'electricity11.html',
 'electricity12.html',
 'electricity13.html',
 'electricity14.html',
 'magnetic0.html',
 'magnetic1.html',
 'magnetic2.html',
 'magnetic3.html',
 'magnetic4.html',
 'magnetic5.html',
 'magnetic6.html',
 'magnetic6.html',
 'oscillation1.html',
 'oscillation2.html',
 'oscillation3.html',
 'oscillation4.html',
 'oscillation5.html',
 'oscillation6.html',
 'oscillation7.html',
 'oscillation8.html',
 'oscillation9.html',
 'oscillation10.html',
 'oscillation10.html',
 'oscillation12.html',
 'optic1.html',
 'optic2.html',
 'optic3.html',
 'optic4.html',
 'optic5.html',
 'optic6.html',
 'optic7.html',
 'optic8.html',
 'optic9.html',
 'optic10.html',
 'atom1.html',
 'atom2.html',
 'atom3.html',
 'atom4.html',
 'atom5.html',
 'atom6.html',
 'atom7.html',
 'atom8.html',
 'atom9.html']

 In [91]: url.geturl()
 Out[91]: 'http://www.abitura.com/handbook/'

 In [92]: src = _

 In [93]: imgs = []

 In [94]: for href in hrefs: imgs.append(etree.fromstringlist(urllib2.urlopen(src + href).readlines(), parser=etree.HTMLParser()))
   ....: 

 In [95]: imgs
 Out[95]: 
 [<Element html at 102e92158>,
 <Element html at 102e92310>,
 <Element html at 102e92470>,
 <Element html at 102e925d0>,
 <Element html at 102e92788>,
 <Element html at 102e928e8>,
 <Element html at 102e92a48>,
 <Element html at 102e92ba8>,
 <Element html at 102e92d08>,
 <Element html at 102e92e10>,
 <Element html at 102e92f70>,
 <Element html at 102e9e100>,
 <Element html at 102e9e260>,
 <Element html at 102e9e368>,
 <Element html at 102e9e680>,
 <Element html at 102e9e838>,
 <Element html at 102e9e8e8>,
 <Element html at 102e9e418>,
 <Element html at 102e9e520>,
 <Element html at 102e9e6d8>,
 <Element html at 102e9e788>,
 <Element html at 102e9e9f0>,
 <Element html at 102e92af8>,
 <Element html at 102e92680>,
 <Element html at 102e92260>,
 <Element html at 102e924c8>,
 <Element html at 102e926d8>,
 <Element html at 102e9e3c0>,
 <Element html at 102e9e158>,
 <Element html at 102e9e578>,
 <Element html at 102e9eaf8>,
 <Element html at 102e9eba8>,
 <Element html at 102e9ecb0>,
 <Element html at 102e9edb8>,
 <Element html at 102e9ef18>,
 <Element html at 102e9efc8>,
 <Element html at 102eb1100>,
 <Element html at 102eb1208>,
 <Element html at 102eb1310>,
 <Element html at 102eb1470>,
 <Element html at 102eb15d0>,
 <Element html at 102eb1730>,
 <Element html at 102eb1890>,
 <Element html at 102eb19f0>,
 <Element html at 102eb1af8>,
 <Element html at 102eb1c00>,
 <Element html at 102eb1d08>,
 <Element html at 102eb1e10>,
 <Element html at 102eb1f18>,
 <Element html at 102ebe050>,
 <Element html at 102ebe208>,
 <Element html at 102ebe260>,
 <Element html at 102eb1940>,
 <Element html at 102eb1520>,
 <Element html at 102eb1260>,
 <Element html at 102e9ec00>,
 <Element html at 102e92208>,
 <Element html at 102e92998>,
 <Element html at 102e92c00>,
 <Element html at 102cb0998>,
 <Element html at 102e92d60>,
 <Element html at 102e920a8>,
 <Element html at 102ebe100>,
 <Element html at 102ebe3c0>,
 <Element html at 102ebe520>,
 <Element html at 102ebe680>,
 <Element html at 102ebe7e0>,
 <Element html at 102ebe940>,
 <Element html at 102ebeaf8>,
 <Element html at 102ebec00>,
 <Element html at 102ebed60>,
 <Element html at 102ebeec0>,
 <Element html at 102ec8050>,
 <Element html at 102ec81b0>,
 <Element html at 102ec8310>,
 <Element html at 102ec8470>]

 In [117]: xpi = '/html/body//img'

 In [125]: imgurls = []

 In [126]: for i in imgs:
    for x in i.xpath(xpi):
        if re.match('.*\d\.jpg', x.attrib.get('src', '')):
            imgurls.append(x.attrib['src'])
   .....:             
   .....:             

 In [130]: imgurls
 Out[130]: 
 ['images/kinem1.jpg',
 'images/kinem2.jpg',
 'images/kinem3.jpg',
 'images/dinamika1.jpg',
 'images/sila_uprugosti1.jpg',
 'images/sila_tjagotenijaa1.jpg',
 'images/sila_trenija1.jpg',
 'images/energija1.jpg',
 'images/statika1.jpg',
 'images/statika2.jpg',
 'images/statika3.jpg',
 'images/molecular_physics/molecular_p1.jpg',
 'images/molecular_physics/molecular_p2.jpg',
 'molecular3.jpg',
 'molecular4.jpg',
 'molecular4.jpg',
 'molecular5.jpg',
 'molecular7.jpg',
 'molecular7.jpg',
 'molecular8.jpg',
 'electricity/07.jpg',
 'electricity/01.jpg',
 'electricity/05.jpg',
 'electricity/07.jpg',
 'electricity/09.jpg',
 'electricity/10.jpg',
 'electricity/12.jpg',
 'electricity/13.jpg',
 '14.jpg',
 'electricity/16.jpg',
 'electricity/16.jpg',
 'electricity/17.jpg',
 'electricity/18.jpg',
 'electricity/19.jpg',
 'magnetic0.jpg',
 'magnetic1.jpg',
 'magnetic2.jpg',
 'magnetic3.jpg',
 'magnetic4.jpg',
 'magnetic6.jpg',
 'magnetic7.jpg',
 'magnetic7.jpg',
 'oscillation1.jpg',
 'oscillation2.jpg',
 'oscillation3.jpg',
 'oscillation4.jpg',
 'oscillation5.jpg',
 'oscillation6.jpg',
 'oscillation7.jpg',
 'oscillation8.jpg',
 'oscillation9.jpg',
 'oscillation10.jpg',
 'oscillation10.jpg',
 'oscillation12.jpg',
 'optic/optic0.jpg',
 'optic/optic3.jpg',
 'optic/optic3.jpg',
 'optic/optic4.jpg',
 'optic/optic5.jpg',
 'optic/optic6.jpg',
 'optic/optic7.jpg',
 'atom/atom1.jpg',
 'optic/optic10.jpg',
 'optic/optic10.jpg',
 'optic/optic11.jpg',
 'atom/atom1.jpg',
 'atom/atom2.jpg',
 'atom/atom3.jpg',
 'atom/atom4.jpg',
 'atom/atom5.jpg',
 'atom/atom6.jpg',
 'atom/atom7.jpg',
 'atom8.jpg',
 'atom/atom9.jpg']

 In [131]: for i in imgurls:
   .....:     f = src + i
   .....:     u = urllib2.urlopen(f)
   .....:     open(u.url.split('/')[-1], 'w').write(u.read())
   .....:     print u.url.split('/')[-1]
   .....:     
   .....:     
 kinem1.jpg
 kinem2.jpg
 kinem3.jpg
 dinamika1.jpg
 sila_uprugosti1.jpg
 sila_tjagotenijaa1.jpg
 sila_trenija1.jpg
 energija1.jpg
 statika1.jpg
 statika2.jpg
 statika3.jpg
 molecular_p1.jpg
 molecular_p2.jpg
 molecular3.jpg
 molecular4.jpg
 molecular4.jpg
 molecular5.jpg
 molecular7.jpg
 molecular7.jpg
 molecular8.jpg
 07.jpg
 01.jpg
 05.jpg
 07.jpg
 09.jpg
 10.jpg
 12.jpg
 13.jpg
 errorpage
 16.jpg
 16.jpg
 17.jpg
 18.jpg
 19.jpg
 magnetic0.jpg
 magnetic1.jpg
 magnetic2.jpg
 magnetic3.jpg
 magnetic4.jpg
 magnetic6.jpg
 magnetic7.jpg
 magnetic7.jpg
 oscillation1.jpg
 oscillation2.jpg
 oscillation3.jpg
 oscillation4.jpg
 oscillation5.jpg
 oscillation6.jpg
 oscillation7.jpg
 oscillation8.jpg
 oscillation9.jpg
 oscillation10.jpg
 oscillation10.jpg
 oscillation12.jpg
 optic0.jpg
 optic3.jpg
 optic3.jpg
 optic4.jpg
 optic5.jpg
 optic6.jpg
 optic7.jpg
 atom1.jpg
 optic10.jpg
 optic10.jpg
 optic11.jpg
 atom1.jpg
 atom2.jpg
 atom3.jpg
 atom4.jpg
 atom5.jpg
 atom6.jpg
 atom7.jpg
 errorpage
 atom9.jpg

 In [132]:
	io proger ~ 0 % py
	In [3]: import urllib2

	In [5]: urllib2.urlopen('http://www.abitura.com/handbook/')
	Out[5]: <addinfourl at 4314565160 whose fp = <socket._fileobject object at 0x1016f4ad0>>

	In [6]: url = _

	In [7]: url.readlines()
	Out[7]: ...

	In [8]: hb = _

	In [18]: from lxml import etree

	In [19]: etree.fromstr
	etree.fromstring etree.fromstringlist

	In [21]: etree.fromstringlist(hb, etree.HT
	etree.HTML etree.HTMLParser

	In [22]: etree.fromstringlist(hb, etree.HTMLParser())
	Out[22]: <Element html at 102c42628>

	In [23]: ht = _

	In [25]: list(ht)
	Out[25]: [<Element head at 102c424c8>, <Element body at 102c42578>]

	In [26]: list(ht)[1]
	Out[26]: <Element body at 102c42578>

	In [27]: htb = _

	In [28]: htb.
	htb.__class__ htb.__init__ htb.__sizeof__ htb.find htb.insert htb.nsmap
	htb.__contains__ htb.__iter__ htb.__str__ htb.findall htb.items htb.prefix
	htb.__copy__ htb.__len__ htb.__subclasshook__ htb.findtext htb.iter htb.remove
	htb.__deepcopy__ htb.__new__ htb._init htb.get htb.iterancestors htb.replace
	htb.__delattr__ htb.__nonzero__ htb.addnext htb.getchildren htb.iterchildren htb.set
	htb.__delitem__ htb.__reduce__ htb.addprevious htb.getiterator htb.iterdescendants htb.sourceline
	htb.__doc__ htb.__reduce_ex__ htb.append htb.getnext htb.iterfind htb.tag
	htb.__format__ htb.__repr__ htb.attrib htb.getparent htb.itersiblings htb.tail
	htb.__getattribute__ htb.__reversed__ htb.base htb.getprevious htb.itertext htb.text
	htb.__getitem__ htb.__setattr__ htb.clear htb.getroottree htb.keys htb.values
	htb.__hash__ htb.__setitem__ htb.extend htb.index htb.makeelement htb.xpath

	In [53]: tbl.iterdescendants(tag='a')
	Out[53]: <lxml.etree.ElementDepthFirstIterator object at 0x102c8d1f0>

	In [60]: list(list(list(tbl)[0])[0])
	Out[60]:
	[<Element p at 102c42418>,
	<Element p at 102cb0050>,
	<Element p at 102cb0158>,
	<Element p at 102cb01b0>,
	<Element center at 102cb0208>]

	In [61]: for el in list(list(list(tbl)[0])[0]):
	KeyboardInterrupt

	In [61]: xp = '/html/body/table[2]/tbody/tr/td[2]/p/a'

	In [63]: etree.fromstringlist(hb, etree.HTMLParser())
	Out[63]: <Element html at 102cb03c0>

	In [64]: root = _

	In [65]: root.xpath?
	Type: builtin_function_or_method
	Base Class: <type 'builtin_function_or_method'>
	String Form: <built-in method xpath of lxml.etree._Element object at 0x102cb03c0>
	Namespace: Interactive
	Docstring:
	xpath(self, _path, namespaces=None, extensions=None, smart_strings=True, **_variables)

	Evaluate an xpath expression using the element as context node.


	In [67]: root.xpath(xp)
	Out[67]: []

	In [68]: xp
	Out[68]: '/html/body/table[2]/tbody/tr/td[2]/p/a'

	In [69]: root.xpath(xpath)
	KeyboardInterrupt

	In [69]: root.xpath('/html/body/table[2]/tbody/tr')
	Out[69]: []

	In [81]: anchors = root.xpath('/html/body/table[2]//a')

	In [83]: a = anchors[0]

	In [84]: a.attrib
	Out[84]: {'href': '../reseption.html'}

	In [86]: for a in anchors: print a.attrib.get('href', '')

	....:
	../reseption.html
	../entering_task.html
	../open_lessons/
	../textbooks.html
	../problems/

	../questions/
	../abstracts/
	../tournaments/
	../sunday_school.html
	../modern_physics/
	../happy_physics/
	../collection/
	../not_only/
	../links/
	../repetitor.html
	../man
	../mathematics/index.html
	../chemi/index.html
	../renessans/index.html
	../open_lessons/index.html
	mailto:info@abitura.com
	kinematika1.html
	kinematika2.html
	kinematika3.html
	dinamika1.html
	dinamika2.html
	dinamika3.html
	dinamika4.html
	dinamika5.html
	dinamika6.html
	statika1.html
	statika2.html
	statika3.html

	molecular_physics1.html
	molecular_physics2.html
	molecular_physics3.html
	molecular_physics4.html
	molecular_physics5.html
	molecular_physics6.html
	molecular_physics6.html
	molecular_physics7.html
	molecular_physics8.html
	molecular_physics9.html
	molecular_physics10.html
	electricity1.html
	electricity2.html
	electricity3.html
	electricity4.html
	electricity5.html
	electricity6.html
	electricity7.html
	electricity8.html
	electricity9.html
	electricity10.html
	electricity11.html
	electricity12.html
	electricity13.html
	electricity14.html
	magnetic0.html
	magnetic1.html
	magnetic2.html
	magnetic3.html
	magnetic4.html
	magnetic5.html
	magnetic6.html
	magnetic6.html
	oscillation1.html
	oscillation2.html
	oscillation3.html
	oscillation4.html
	oscillation5.html
	oscillation6.html
	oscillation7.html
	oscillation8.html
	oscillation9.html
	oscillation10.html
	oscillation10.html
	oscillation12.html
	optic1.html
	optic2.html
	optic3.html
	optic4.html
	optic5.html
	optic6.html
	optic7.html
	optic8.html
	optic9.html
	optic10.html
	atom1.html
	atom2.html
	atom3.html
	atom4.html
	atom5.html
	atom6.html
	atom7.html
	atom8.html
	atom9.html
	http://ibol.ru
	http://hh.ru
	http://www.mc.ru/page.asp/metalloprokat/bronza

	In [87]: import re

	In [88]: for a in anchors: print a.attrib.get('href', '')
	KeyboardInterrupt

	In [88]: hrefs = [a.attrib.get('href', '') for a in anchors if re.match('[a-z_]+\d+\.html', a.attrib.get('href', ''))]

	In [89]: hrefs
	Out[89]:
	['kinematika1.html',
	'kinematika2.html',
	'kinematika3.html',
	'dinamika1.html',
	'dinamika2.html',
	'dinamika3.html',
	'dinamika4.html',
	'dinamika5.html',
	'dinamika6.html',
	'statika1.html',
	'statika2.html',
	'statika3.html',
	'molecular_physics1.html',
	'molecular_physics2.html',
	'molecular_physics3.html',
	'molecular_physics4.html',
	'molecular_physics5.html',
	'molecular_physics6.html',
	'molecular_physics6.html',
	'molecular_physics7.html',
	'molecular_physics8.html',
	'molecular_physics9.html',
	'molecular_physics10.html',
	'electricity1.html',
	'electricity2.html',
	'electricity3.html',
	'electricity4.html',
	'electricity5.html',
	'electricity6.html',
	'electricity7.html',
	'electricity8.html',
	'electricity9.html',
	'electricity10.html',
	'electricity11.html',
	'electricity12.html',
	'electricity13.html',
	'electricity14.html',
	'magnetic0.html',
	'magnetic1.html',
	'magnetic2.html',
	'magnetic3.html',
	'magnetic4.html',
	'magnetic5.html',
	'magnetic6.html',
	'magnetic6.html',
	'oscillation1.html',
	'oscillation2.html',
	'oscillation3.html',
	'oscillation4.html',
	'oscillation5.html',
	'oscillation6.html',
	'oscillation7.html',
	'oscillation8.html',
	'oscillation9.html',
	'oscillation10.html',
	'oscillation10.html',
	'oscillation12.html',
	'optic1.html',
	'optic2.html',
	'optic3.html',
	'optic4.html',
	'optic5.html',
	'optic6.html',
	'optic7.html',
	'optic8.html',
	'optic9.html',
	'optic10.html',
	'atom1.html',
	'atom2.html',
	'atom3.html',
	'atom4.html',
	'atom5.html',
	'atom6.html',
	'atom7.html',
	'atom8.html',
	'atom9.html']

	In [91]: url.geturl()
	Out[91]: 'http://www.abitura.com/handbook/'

	In [92]: src = _

	In [93]: imgs = []

	In [94]: for href in hrefs: imgs.append(etree.fromstringlist(urllib2.urlopen(src + href).readlines(), parser=etree.HTMLParser()))
	....:

	In [95]: imgs
	Out[95]:
	[<Element html at 102e92158>,
	<Element html at 102e92310>,
	<Element html at 102e92470>,
	<Element html at 102e925d0>,
	<Element html at 102e92788>,
	<Element html at 102e928e8>,
	<Element html at 102e92a48>,
	<Element html at 102e92ba8>,
	<Element html at 102e92d08>,
	<Element html at 102e92e10>,
	<Element html at 102e92f70>,
	<Element html at 102e9e100>,
	<Element html at 102e9e260>,
	<Element html at 102e9e368>,
	<Element html at 102e9e680>,
	<Element html at 102e9e838>,
	<Element html at 102e9e8e8>,
	<Element html at 102e9e418>,
	<Element html at 102e9e520>,
	<Element html at 102e9e6d8>,
	<Element html at 102e9e788>,
	<Element html at 102e9e9f0>,
	<Element html at 102e92af8>,
	<Element html at 102e92680>,
	<Element html at 102e92260>,
	<Element html at 102e924c8>,
	<Element html at 102e926d8>,
	<Element html at 102e9e3c0>,
	<Element html at 102e9e158>,
	<Element html at 102e9e578>,
	<Element html at 102e9eaf8>,
	<Element html at 102e9eba8>,
	<Element html at 102e9ecb0>,
	<Element html at 102e9edb8>,
	<Element html at 102e9ef18>,
	<Element html at 102e9efc8>,
	<Element html at 102eb1100>,
	<Element html at 102eb1208>,
	<Element html at 102eb1310>,
	<Element html at 102eb1470>,
	<Element html at 102eb15d0>,
	<Element html at 102eb1730>,
	<Element html at 102eb1890>,
	<Element html at 102eb19f0>,
	<Element html at 102eb1af8>,
	<Element html at 102eb1c00>,
	<Element html at 102eb1d08>,
	<Element html at 102eb1e10>,
	<Element html at 102eb1f18>,
	<Element html at 102ebe050>,
	<Element html at 102ebe208>,
	<Element html at 102ebe260>,
	<Element html at 102eb1940>,
	<Element html at 102eb1520>,
	<Element html at 102eb1260>,
	<Element html at 102e9ec00>,
	<Element html at 102e92208>,
	<Element html at 102e92998>,
	<Element html at 102e92c00>,
	<Element html at 102cb0998>,
	<Element html at 102e92d60>,
	<Element html at 102e920a8>,
	<Element html at 102ebe100>,
	<Element html at 102ebe3c0>,
	<Element html at 102ebe520>,
	<Element html at 102ebe680>,
	<Element html at 102ebe7e0>,
	<Element html at 102ebe940>,
	<Element html at 102ebeaf8>,
	<Element html at 102ebec00>,
	<Element html at 102ebed60>,
	<Element html at 102ebeec0>,
	<Element html at 102ec8050>,
	<Element html at 102ec81b0>,
	<Element html at 102ec8310>,
	<Element html at 102ec8470>]

	In [117]: xpi = '/html/body//img'

	In [125]: imgurls = []

	In [126]: for i in imgs:
	for x in i.xpath(xpi):
	if re.match('.*\d\.jpg', x.attrib.get('src', '')):
	imgurls.append(x.attrib['src'])
	.....:
	.....:

	In [130]: imgurls
	Out[130]:
	['images/kinem1.jpg',
	'images/kinem2.jpg',
	'images/kinem3.jpg',
	'images/dinamika1.jpg',
	'images/sila_uprugosti1.jpg',
	'images/sila_tjagotenijaa1.jpg',
	'images/sila_trenija1.jpg',
	'images/energija1.jpg',
	'images/statika1.jpg',
	'images/statika2.jpg',
	'images/statika3.jpg',
	'images/molecular_physics/molecular_p1.jpg',
	'images/molecular_physics/molecular_p2.jpg',
	'molecular3.jpg',
	'molecular4.jpg',
	'molecular4.jpg',
	'molecular5.jpg',
	'molecular7.jpg',
	'molecular7.jpg',
	'molecular8.jpg',
	'electricity/07.jpg',
	'electricity/01.jpg',
	'electricity/05.jpg',
	'electricity/07.jpg',
	'electricity/09.jpg',
	'electricity/10.jpg',
	'electricity/12.jpg',
	'electricity/13.jpg',
	'14.jpg',
	'electricity/16.jpg',
	'electricity/16.jpg',
	'electricity/17.jpg',
	'electricity/18.jpg',
	'electricity/19.jpg',
	'magnetic0.jpg',
	'magnetic1.jpg',
	'magnetic2.jpg',
	'magnetic3.jpg',
	'magnetic4.jpg',
	'magnetic6.jpg',
	'magnetic7.jpg',
	'magnetic7.jpg',
	'oscillation1.jpg',
	'oscillation2.jpg',
	'oscillation3.jpg',
	'oscillation4.jpg',
	'oscillation5.jpg',
	'oscillation6.jpg',
	'oscillation7.jpg',
	'oscillation8.jpg',
	'oscillation9.jpg',
	'oscillation10.jpg',
	'oscillation10.jpg',
	'oscillation12.jpg',
	'optic/optic0.jpg',
	'optic/optic3.jpg',
	'optic/optic3.jpg',
	'optic/optic4.jpg',
	'optic/optic5.jpg',
	'optic/optic6.jpg',
	'optic/optic7.jpg',
	'atom/atom1.jpg',
	'optic/optic10.jpg',
	'optic/optic10.jpg',
	'optic/optic11.jpg',
	'atom/atom1.jpg',
	'atom/atom2.jpg',
	'atom/atom3.jpg',
	'atom/atom4.jpg',
	'atom/atom5.jpg',
	'atom/atom6.jpg',
	'atom/atom7.jpg',
	'atom8.jpg',
	'atom/atom9.jpg']

	In [131]: for i in imgurls:
	.....: f = src + i
	.....: u = urllib2.urlopen(f)
	.....: open(u.url.split('/')[-1], 'w').write(u.read())
	.....: print u.url.split('/')[-1]
	.....:
	.....:
	kinem1.jpg
	kinem2.jpg
	kinem3.jpg
	dinamika1.jpg
	sila_uprugosti1.jpg
	sila_tjagotenijaa1.jpg
	sila_trenija1.jpg
	energija1.jpg
	statika1.jpg
	statika2.jpg
	statika3.jpg
	molecular_p1.jpg
	molecular_p2.jpg
	molecular3.jpg
	molecular4.jpg
	molecular4.jpg
	molecular5.jpg
	molecular7.jpg
	molecular7.jpg
	molecular8.jpg
	07.jpg
	01.jpg
	05.jpg
	07.jpg
	09.jpg
	10.jpg
	12.jpg
	13.jpg
	errorpage
	16.jpg
	16.jpg
	17.jpg
	18.jpg
	19.jpg
	magnetic0.jpg
	magnetic1.jpg
	magnetic2.jpg
	magnetic3.jpg
	magnetic4.jpg
	magnetic6.jpg
	magnetic7.jpg
	magnetic7.jpg
	oscillation1.jpg
	oscillation2.jpg
	oscillation3.jpg
	oscillation4.jpg
	oscillation5.jpg
	oscillation6.jpg
	oscillation7.jpg
	oscillation8.jpg
	oscillation9.jpg
	oscillation10.jpg
	oscillation10.jpg
	oscillation12.jpg
	optic0.jpg
	optic3.jpg
	optic3.jpg
	optic4.jpg
	optic5.jpg
	optic6.jpg
	optic7.jpg
	atom1.jpg
	optic10.jpg
	optic10.jpg
	optic11.jpg
	atom1.jpg
	atom2.jpg
	atom3.jpg
	atom4.jpg
	atom5.jpg
	atom6.jpg
	atom7.jpg
	errorpage
	atom9.jpg

	In [132]:
No results found