Created
June 21, 2012 04:45
-
-
Save mdengler/2963878 to your computer and use it in GitHub Desktop.
html2csv.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/env python | |
# -*- coding: utf-8 -*- | |
""" | |
Examples: | |
%(progname)s http://en.wikipedia.org/wiki/List_of_Olympic_records_in_athletics | |
This is essentially this logic, done up | |
for table in lxml.html.from_string(some_html): | |
for row in table.cssselect("tr"): | |
print ",".join([td.text_content() for td in row.cssselect("td")]) | |
There are a few ways one can narrow down which tables are shown: | |
1) --id="css or html id" | |
2) --widest | |
3) --tallest | |
4) --most-cells | |
5) --starting-closest-to-text="text you know comes right before the main table" | |
All of these selectors will cause a maximum of one table to be displayed. | |
Author: Martin Dengler | |
License: GPL v3+ | |
""" | |
import csv | |
import lxml | |
import lxml.html | |
import optparse | |
import os | |
import sys | |
import urllib | |
def tables_to_tuples(tables):
    """Flatten tables into a list of rows, each row a list of cell texts.

    Args:
        tables: iterable of lxml HTML elements, e.g. the output of
            ``tree.cssselect("table")``.

    Returns:
        A list with one entry per ``tr`` matched inside the tables, in the
        order they are found.  Each entry is the list of stripped text
        contents of the ``td`` cells matched inside that row; a row without
        ``td`` cells yields an empty list.

    Side effects:
        ``span`` elements whose ``style`` attribute contains both
        ``display:`` and ``none`` are dropped from the cells (and therefore
        from the parsed tree) so that their hidden text is not emitted.
    """
    returned_rows = []
    for table in tables:
        for row in table.cssselect("tr"):
            cells = row.cssselect("td")
            for cell in cells:
                for span in cell.cssselect("span[style]"):
                    # Read the attribute from the element itself; lxml.html
                    # offers no get_text() helper for this.
                    style_text = span.get("style", "")
                    if "display:" in style_text and "none" in style_text:
                        span.drop_tree()
            returned_rows.append([cell.text_content().strip() for cell in cells])
    return returned_rows
def gettree(args, starting_closest_to_text=None):
    """Parse HTML from a URL or from stdin into an lxml document tree.

    Args:
        args: positional command-line arguments.  When empty, the HTML is
            read from standard input; otherwise ``args[0]`` is opened as a
            URL.
        starting_closest_to_text: optional marker text.  When given, every
            input line before the first line containing it is discarded
            before parsing.

    Returns:
        The tree returned by ``lxml.html.fromstring`` for the (possibly
        truncated) input.

    Raises:
        ValueError: if ``starting_closest_to_text`` is given but no input
            line contains it.
    """
    if len(args) == 0:
        lines = sys.stdin.readlines()
    else:
        from contextlib import closing

        # closing() guarantees the response is released even if reading fails.
        with closing(urllib.urlopen(args[0])) as response:
            lines = response.readlines()

    if starting_closest_to_text is not None:
        for line_number, line in enumerate(lines):
            if starting_closest_to_text in line:
                lines = lines[line_number:]
                break
        else:
            raise ValueError("Could not find text [%s] in input" % starting_closest_to_text)

    # readlines() keeps each line's terminator, so a plain concatenation
    # restores the document without inserting extra line breaks.
    html = "".join(lines)
    return lxml.html.fromstring(html)
def compare_by_rows(table_a, table_b):
    """Compare two table elements by how many rows they contain.

    Rows are counted as ``tr`` descendants, so rows wrapped in ``thead`` /
    ``tbody`` / ``tfoot`` are counted too (counting only the table's direct
    children would see a single ``tbody`` as one row).

    Args:
        table_a, table_b: table elements exposing the ElementTree ``iter``
            method (lxml elements do).

    Returns:
        -1, 0 or 1 when ``table_a`` has fewer, as many, or more rows than
        ``table_b``.
    """
    rows_a = sum(1 for _ in table_a.iter("tr"))
    rows_b = sum(1 for _ in table_b.iter("tr"))
    # Spelled out instead of cmp() so it does not depend on that builtin.
    return (rows_a > rows_b) - (rows_a < rows_b)
def compare_by_columns(table_a, table_b):
    """Compare two table elements by their width in cells.

    The width of a table is taken to be the largest number of ``td`` / ``th``
    elements found directly inside any one of its ``tr`` rows.  ``colspan``
    attributes are not taken into account.

    Args:
        table_a, table_b: table elements exposing the ElementTree API
            (``iter`` and child iteration), as lxml elements do.

    Returns:
        -1, 0 or 1 when ``table_a`` is narrower than, as wide as, or wider
        than ``table_b``.
    """
    def width(table):
        cells_per_row = [
            sum(1 for cell in row if cell.tag in ("td", "th"))
            for row in table.iter("tr")
        ]
        # A table without rows has width 0 rather than raising on max([]).
        return max(cells_per_row) if cells_per_row else 0

    width_a = width(table_a)
    width_b = width(table_b)
    return (width_a > width_b) - (width_a < width_b)
def compare_by_cells(table_a, table_b):
    """Compare two table elements by their total number of cells.

    A cell is any ``td`` or ``th`` descendant of the table.  The descendants
    are counted by walking the iterator, since an iterator has no ``len()``.

    Args:
        table_a, table_b: table elements exposing the ElementTree ``iter``
            method (lxml elements do).

    Returns:
        -1, 0 or 1 when ``table_a`` has fewer, as many, or more cells than
        ``table_b``.
    """
    def cell_count(table):
        return sum(1 for element in table.iter() if element.tag in ("td", "th"))

    cells_a = cell_count(table_a)
    cells_b = cell_count(table_b)
    return (cells_a > cells_b) - (cells_a < cells_b)
def longest(tables):
    """Return the table from `tables` that ranks highest under compare_by_rows."""
    tallest_table = most(compare_by_rows, tables)
    return tallest_table
def widest(tables):
    """Return the table from `tables` that ranks highest under compare_by_columns."""
    widest_table = most(compare_by_columns, tables)
    return widest_table
def biggest(tables):
    """Return the table from `tables` that ranks highest under compare_by_cells."""
    biggest_table = most(compare_by_cells, tables)
    return biggest_table
def most(comparator, tables):
    """Return the item of `tables` that ranks highest under `comparator`.

    Args:
        comparator: two-argument function returning a negative number, zero
            or a positive number when its first argument ranks below, equal
            to, or above its second argument.
        tables: iterable of items to choose from.

    Returns:
        The highest-ranking item.  Among items that rank equal, the one that
        appears last in `tables` is returned (the same item a stable sort
        followed by ``[-1]`` would pick).

    Raises:
        IndexError: if `tables` is empty.
    """
    candidates = iter(tables)
    try:
        best = next(candidates)
    except StopIteration:
        raise IndexError("cannot pick a table from an empty sequence")
    # One pass is enough to find the maximum; no need to sort everything.
    for candidate in candidates:
        if comparator(candidate, best) >= 0:
            best = candidate
    return best
def gettables(tree, args, biggest=False, longest=False, widest=False, only_first=False):
    """Select the tables of `tree` that should be written out.

    Args:
        tree: parsed document exposing ``cssselect`` (an lxml HTML tree).
        args: positional command-line arguments; when ``args[1]`` is present
            it is the number of the only table to consider, counting from 1.
        biggest: keep only the table ranking highest under compare_by_cells.
        longest: keep only the table ranking highest under compare_by_rows.
        widest: keep only the table ranking highest under compare_by_columns.
        only_first: keep only the first remaining table.

    The flags are checked in the order biggest, longest, widest, only_first;
    the first one that is set decides the result.

    Returns:
        A list of table elements (possibly empty), so callers can always
        iterate over the result.

    Raises:
        ValueError: if ``args[1]`` is present but is not an integer.
    """
    tables = list(tree.cssselect("table"))
    if len(args) > 1:
        n = int(args[1]) - 1
        tables = tables[n:n + 1]
    if not tables:
        # Nothing to rank or slice; avoids failing on an empty selection.
        return tables

    # The boolean parameters shadow the module-level helpers of the same
    # names, so the ranking goes through most() and the comparators directly.
    rankings = (
        (biggest, compare_by_cells),
        (longest, compare_by_rows),
        (widest, compare_by_columns),
    )
    for wanted, comparator in rankings:
        if wanted:
            return [most(comparator, tables)]

    if only_first:
        tables = tables[:1]
    return tables
def writetables(tables, outputfh=None):
    """Write the rows of `tables` as CSV.

    Args:
        tables: tables to convert, passed as-is to tables_to_tuples().
        outputfh: file-like object the CSV is written to; ``sys.stdout``
            when omitted.

    Every cell is reduced to ASCII before it is written: characters outside
    the ASCII range are dropped.
    """
    if outputfh is None:
        outputfh = sys.stdout
    csvwriter = csv.writer(outputfh)

    def clean(s):
        # Encoding with "ignore" drops every non-ASCII character in one
        # pass, giving the same bytes as filtering the UTF-8 encoding.
        return unicode(s).encode("ascii", "ignore")

    for row in tables_to_tuples(tables):
        csvwriter.writerow([clean(c) for c in row])
def main(args):
    """Command-line entry point: fetch HTML, select tables, print them as CSV.

    Args:
        args: command-line arguments without the program name.  Options are
            parsed with optparse; the remaining positional arguments are
            handed to gettree() and gettables().
    """
    import codecs
    import locale

    # fix for the below fix: make sure a UTF-8 locale is reported when LANG
    # is unset or "C"
    if "LANG" not in os.environ or os.environ["LANG"] == "C":
        os.environ["LANG"] = "en_US.utf8"
    # from http://wjd.nu/notes/2009#unicodeencodeerror-python-redirect-pipe
    # getdefaultlocale() may report no encoding at all; fall back to UTF-8
    # instead of handing None to codecs.getwriter().
    encoding = locale.getdefaultlocale()[1] or "utf-8"
    sys.stdout = codecs.getwriter(encoding)(sys.stdout, 'replace')

    option_parser = optparse.OptionParser()
    option_parser.add_option("--starting-closest-to-text", type="string",
                             help="ignore all input before the first line containing this text "
                                  "and output only the first table after it")
    option_parser.add_option("--biggest", action="store_true",
                             help="output only the table with the most cells")
    option_parser.add_option("--widest", action="store_true",
                             help="output only the table with the most columns")
    option_parser.add_option("--longest", action="store_true",
                             help="output only the table with the most rows")
    options, remaining_args = option_parser.parse_args(args)

    tree = gettree(remaining_args,
                   starting_closest_to_text=options.starting_closest_to_text)
    # store_true options are None unless the flag was given.
    tables = gettables(tree, remaining_args,
                       biggest=options.biggest is not None,
                       widest=options.widest is not None,
                       longest=options.longest is not None,
                       only_first=options.starting_closest_to_text is not None)
    writetables(tables)
# When run as a script, hand the command-line arguments (without the
# program name) to main().
if __name__ == "__main__":
    main(sys.argv[1:])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment