hlian · May 26, 2011 16:00
diff --git a/philosophy.py b/philosophy.py
 #!/usr/bin/env python

 import gzip
 import urllib2
 from cStringIO import StringIO
 from lxml import etree, html
 from lxml.cssselect import CSSSelector

 # Shared URL opener used for every request; it sends a browser-like
 # User-agent header instead of urllib2's default one.
 opener = urllib2.build_opener()
 opener.addheaders = [('User-agent', 'Mozilla/5.0')]

 # CSS selectors compiled once at import time and reused for every page:
 #   ALL_LINKS    - every <a> element below the node it is applied to
 #   ITALIC_LINKS - <a> elements that are descendants of an <i> element
 #   PARAGRAPHS   - <p> elements that are direct children of #bodyContent
 ALL_LINKS = CSSSelector('a')
 ITALIC_LINKS = CSSSelector('i a')
 PARAGRAPHS = CSSSelector('#bodyContent > p')

 def to_unicode(element):
    """Return the text of ``element`` as a unicode string.

    Stand-in for lxml's :meth:`Element.text_content`, which does not
    return unicode (trick credited to Geoffrey Sneddon). The element is
    serialised with the ``text`` method and its tail text is left out.
    """
    serialize_options = {
        'encoding': unicode,
        'method': 'text',
        'with_tail': False,
    }
    return etree.tostring(element, **serialize_options)

 def unparens(p):
    """Return a new element built from ``p`` with parenthesised text removed.

    ``p`` is serialised to markup, every character that sits inside
    parentheses (nested groups included, and the parentheses themselves)
    is dropped, and the remaining markup is parsed back into an element.

    Parentheses that appear between double quotes are not treated as
    grouping characters -- presumably so that attribute values such as
    ``href="/wiki/Foo_(bar)"`` survive intact.

    :param p: an lxml element.
    :returns: the element parsed from the filtered markup.
    """
    text = etree.tostring(p)
    depth = 0
    inquote = False
    # Collect kept characters and join once, rather than growing a string
    # one character at a time.
    kept = []

    for c in text:
        if not inquote:
            if c == '(':
                depth += 1
                continue
            if c == ')' and depth > 0:
                depth -= 1
                continue
            # A ')' with no matching '(' falls through and is kept as
            # literal text. Letting depth go negative here would make the
            # ``depth == 0`` test below fail for the rest of the markup.
        if c == '"':
            inquote = not inquote
        if depth == 0:
            kept.append(c)

    return html.fragment_fromstring(''.join(kept))

 def iterate(url):
    """Fetch ``url``, print the resolved URL and return the next link to follow.

    The page is downloaded through the module-level ``opener`` (gzip
    responses are decompressed), then each paragraph matched by
    ``PARAGRAPHS`` is stripped of parenthesised text and scanned, in
    document order, for the first anchor that

    * is not inside an ``<i>`` element,
    * does not have text starting with ``[``,
    * has an ``href`` attribute, and
    * does not contain ``File:`` in that ``href``.

    :param url: URL of the page to fetch.
    :returns: the ``href`` of the first qualifying link, prefixed with
        ``http://en.wikipedia.org`` when it starts with ``/``; ``None``
        when no paragraph contains a qualifying link.
    """
    f = opener.open(url, timeout=2)
    try:
        print(f.url)
        body = f.read()
        if f.info().get('content-encoding') == 'gzip':
            body = gzip.GzipFile(fileobj=StringIO(body)).read()
    finally:
        # Always release the connection, even when reading fails.
        f.close()
    root = html.document_fromstring(body)

    for p in PARAGRAPHS(root):
        p = unparens(p)
        italics = set(ITALIC_LINKS(p))
        for link in ALL_LINKS(p):
            if link in italics:
                continue
            if to_unicode(link).startswith('['):
                continue
            # Anchors without an href cannot be followed; File: links are
            # skipped individually so a later link in the same paragraph
            # can still be chosen.
            href = link.get('href')
            if not href or 'File:' in href:
                continue
            if href.startswith('/'):
                href = 'http://en.wikipedia.org' + href
            return href
    return None

 if __name__ == '__main__':
    # Start from a random article and keep following the link that
    # iterate() returns for each page.
    href = 'http://en.wikipedia.org/wiki/Special:Random'
    # Alternative start page, noted by the author as a gzip-encoded response:
    #href = 'http://en.wikipedia.org/wiki/United_States' (GZIP)!
    seen = set()
    # Stop when a page yields no link (iterate() returned None) or when a
    # URL comes up a second time; otherwise the loop would crash on None
    # or re-fetch the same cycle of pages forever.
    while href is not None and href not in seen:
        seen.add(href)
        href = iterate(href)
	#!/usr/bin/env python

	import gzip
	import urllib2
	from cStringIO import StringIO
	from lxml import etree, html
	from lxml.cssselect import CSSSelector

	# Shared URL opener used for every request; it sends a browser-like
	# User-agent header instead of urllib2's default one.
	opener = urllib2.build_opener()
	opener.addheaders = [('User-agent', 'Mozilla/5.0')]

	# CSS selectors compiled once at import time and reused for every page:
	#   ALL_LINKS    - every <a> element below the node it is applied to
	#   ITALIC_LINKS - <a> elements that are descendants of an <i> element
	#   PARAGRAPHS   - <p> elements that are direct children of #bodyContent
	ALL_LINKS = CSSSelector('a')
	ITALIC_LINKS = CSSSelector('i a')
	PARAGRAPHS = CSSSelector('#bodyContent > p')

	def to_unicode(element):
	    """Return the text of ``element`` as a unicode string.

	    A replacement for lxml's :meth:`Element.text_content`, which does
	    not return unicode. This trick via Geoffrey Sneddon. The element's
	    tail text is not included.
	    """
	    return etree.tostring(element,
	                          encoding=unicode,
	                          method='text',
	                          with_tail=False)

	def unparens(p):
	    """Return a new element built from ``p`` with parenthesised text removed.

	    ``p`` is serialised to markup, every character that sits inside
	    parentheses (nested groups included, and the parentheses themselves)
	    is dropped, and the remaining markup is parsed back into an element.

	    Parentheses that appear between double quotes are not treated as
	    grouping characters -- presumably so that attribute values such as
	    ``href="/wiki/Foo_(bar)"`` survive intact.

	    :param p: an lxml element.
	    :returns: the element parsed from the filtered markup.
	    """
	    text = etree.tostring(p)
	    depth = 0
	    inquote = False
	    # Collect kept characters and join once, rather than growing a
	    # string one character at a time.
	    kept = []

	    for c in text:
	        if not inquote:
	            if c == '(':
	                depth += 1
	                continue
	            if c == ')' and depth > 0:
	                depth -= 1
	                continue
	            # A ')' with no matching '(' falls through and is kept as
	            # literal text. Letting depth go negative here would make
	            # the ``depth == 0`` test below fail for the rest of the
	            # markup.
	        if c == '"':
	            inquote = not inquote
	        if depth == 0:
	            kept.append(c)

	    return html.fragment_fromstring(''.join(kept))

	def iterate(url):
	    """Fetch ``url``, print the resolved URL and return the next link to follow.

	    The page is downloaded through the module-level ``opener`` (gzip
	    responses are decompressed), then each paragraph matched by
	    ``PARAGRAPHS`` is stripped of parenthesised text and scanned, in
	    document order, for the first anchor that

	    * is not inside an ``<i>`` element,
	    * does not have text starting with ``[``,
	    * has an ``href`` attribute, and
	    * does not contain ``File:`` in that ``href``.

	    :param url: URL of the page to fetch.
	    :returns: the ``href`` of the first qualifying link, prefixed with
	        ``http://en.wikipedia.org`` when it starts with ``/``; ``None``
	        when no paragraph contains a qualifying link.
	    """
	    f = opener.open(url, timeout=2)
	    try:
	        print(f.url)
	        body = f.read()
	        if f.info().get('content-encoding') == 'gzip':
	            body = gzip.GzipFile(fileobj=StringIO(body)).read()
	    finally:
	        # Always release the connection, even when reading fails.
	        f.close()
	    root = html.document_fromstring(body)

	    for p in PARAGRAPHS(root):
	        p = unparens(p)
	        italics = set(ITALIC_LINKS(p))
	        for link in ALL_LINKS(p):
	            if link in italics:
	                continue
	            if to_unicode(link).startswith('['):
	                continue
	            # Anchors without an href cannot be followed; File: links
	            # are skipped individually so a later link in the same
	            # paragraph can still be chosen.
	            href = link.get('href')
	            if not href or 'File:' in href:
	                continue
	            if href.startswith('/'):
	                href = 'http://en.wikipedia.org' + href
	            return href
	    return None

	if __name__ == '__main__':
	    # Start from a random article and keep following the link that
	    # iterate() returns for each page.
	    href = 'http://en.wikipedia.org/wiki/Special:Random'
	    # Alternative start page, noted by the author as a gzip-encoded response:
	    #href = 'http://en.wikipedia.org/wiki/United_States' (GZIP)!
	    seen = set()
	    # Stop when a page yields no link (iterate() returned None) or when
	    # a URL comes up a second time; otherwise the loop would crash on
	    # None or re-fetch the same cycle of pages forever.
	    while href is not None and href not in seen:
	        seen.add(href)
	        href = iterate(href)