Skip to content

Instantly share code, notes, and snippets.

@gerardpaapu
Created December 6, 2011 20:25
Show Gist options
  • Save gerardpaapu/1439818 to your computer and use it in GitHub Desktop.
Save gerardpaapu/1439818 to your computer and use it in GitHub Desktop.
A filter to extract elements from an HTML file or stream.
#!/usr/bin/python
import sys
from optparse import OptionParser
from lxml import html
from lxml.cssselect import CSSSelector
from lxml.etree import XPath
def _serialize(item):
    """Return the bytes to write for a single selector result.

    Elements are rendered with ``html.tostring`` and stripped of surrounding
    whitespace.  Results that ``html.tostring`` rejects with ``TypeError``
    (e.g. XPath text/attribute strings, numbers, booleans) are emitted as
    UTF-8 encoded text instead.
    """
    try:
        markup = html.tostring(item)
    except TypeError:
        # Not an element.  Already-encoded byte strings pass through
        # untouched; everything else is converted to text and encoded.
        if isinstance(item, bytes):
            return item
        if not hasattr(item, "encode"):
            item = str(item)
        return item.encode("utf-8")
    return markup.strip()


def main():
    """Command-line entry point: print the elements matching a selector.

    Options (read from ``sys.argv``):
      -s/--selector  a CSS selector (takes precedence over --xpath)
      -x/--xpath     an XPath expression
      -i/--infile    HTML input path; defaults to stdin
      -o/--outfile   output path; defaults to stdout

    Each match is written on its own line.  When neither a selector nor an
    XPath expression is supplied, the help text is printed and no file is
    opened.
    """
    parser = OptionParser(
        description="Select elements from an html file with css selectors",
        version="0.1",
        prog="elements")
    parser.add_option("-s", "--selector", dest="selector", default=False,
                      help="a css selector", metavar="select")
    parser.add_option("-x", "--xpath", dest="xpath_exp", default=False,
                      help="an xpath selector", metavar="xpath")
    parser.add_option("-o", "--outfile", dest="out_file", default=False,
                      metavar="OUT", help="defaults to stdout")
    parser.add_option("-i", "--infile", dest="in_file", default=False,
                      metavar="IN", help="defaults to stdin")
    options, _args = parser.parse_args()

    # Guard clause: without a query there is nothing to do, and we must not
    # create or truncate the output file just to show the help text.
    if not (options.selector or options.xpath_exp):
        parser.print_help()
        return

    # Compile the query before touching any file so that an invalid
    # expression fails without side effects.
    if options.selector:
        sel = CSSSelector(options.selector)
    else:
        sel = XPath(options.xpath_exp)

    # Work on byte streams: the parser can then detect the document encoding
    # itself, and the serialized output is bytes on both Python 2 and 3.
    if options.in_file:
        source = open(options.in_file, "rb")
    else:
        source = getattr(sys.stdin, "buffer", sys.stdin)
    try:
        tree = html.parse(source)
    finally:
        if options.in_file:
            source.close()

    matches = sel(tree)
    # An XPath expression may evaluate to a bare string, number or boolean
    # instead of a list; treat such a value as a single result rather than
    # iterating over it.
    if not isinstance(matches, list):
        matches = [matches]

    if options.out_file:
        sink = open(options.out_file, "wb")
    else:
        sink = getattr(sys.stdout, "buffer", sys.stdout)
    try:
        for item in matches:
            sink.write(_serialize(item))
            sink.write(b"\n")
    finally:
        # Only close what we opened; leave the process's stdout alone.
        if options.out_file:
            sink.close()
# Run the command-line interface only when executed as a script, so that
# importing this module has no side effects.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment