Skip to content

Instantly share code, notes, and snippets.

@redaready
Created January 22, 2010 17:01
Show Gist options
  • Select an option

  • Save redaready/283921 to your computer and use it in GitHub Desktop.

Select an option

Save redaready/283921 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
from urllib2 import urlopen
from lxml import etree
import os,codecs
# Fetch the ddmap.com bus-line overview page, collect every link whose href
# begins with '/map/21/bus-key-', and write the link texts to shanghai.txt
# (UTF-8). The first line of the output file is the number of links found.

# EXSLT regular-expressions namespace, bound to the 're' prefix in the XPath.
regexpNS = "http://exslt.org/regular-expressions"

url = urlopen('http://www.ddmap.com/map/21/busline-overview.htm')
try:
    # The page is decoded as GB2312 rather than relying on auto-detection.
    parser = etree.HTMLParser(encoding="gb2312")
    tree = etree.parse(url, parser)
finally:
    # Release the HTTP response even if parsing fails.
    url.close()

# Select each <a> whose href matches the bus-line prefix.
xpath = etree.XPath("//a[re:test(attribute::href, '^/map/21/bus-key-')]",
                    namespaces={'re': regexpNS})
matches = xpath(tree)

f = codecs.open('shanghai.txt', 'w', encoding='utf-8')
try:
    # Single-argument print with parentheses behaves the same on Python 2 and 3.
    print(len(matches))
    f.write(str(len(matches)) + os.linesep)
    for m in matches:
        # m.text is None when the anchor has no leading text (for example it
        # only wraps a child element); write an empty line instead of raising
        # TypeError so the line count still matches the header.
        f.write((m.text or u'') + os.linesep)
finally:
    # Always flush and close the output file, even if a write fails.
    f.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment