Skip to content

Instantly share code, notes, and snippets.

@e9t
Last active October 9, 2015 17:58
Show Gist options
  • Save e9t/3552505 to your computer and use it in GitHub Desktop.
Save e9t/3552505 to your computer and use it in GitHub Desktop.
smi2html

smi2html

Objective

Extract the text from .smi files and convert it to .html files.

Development notes

  • The .smi files should be in the ./smi folder.
  • The extracted text is written to .html files located in the `./html` folder.
#! /usr/bin/python2.7
# -*- coding: utf-8 -*-
import html5lib
import os
from glob import glob
def get_filenames(directory):
    """Return the paths of all entries found directly inside *directory*.

    The pattern ``*`` is used, so every entry it matches is returned, not
    only ``.smi`` files.  glob() yields its matches in arbitrary order, so
    the result is sorted to make the processing order reproducible.

    :param directory: folder to list.
    :returns: sorted list of paths, each prefixed with *directory*.
    """
    return sorted(glob(os.path.join(directory, '*')))
def get_xpaths(filename, xpaths="//body//text()"):
    """Parse *filename* with html5lib and evaluate an XPath expression on it.

    The document is built with html5lib's "lxml" tree builder, without
    namespaced HTML elements, so plain element names can be used in the
    expression.

    :param filename: path of the file to parse.
    :param xpaths: XPath expression to evaluate.  It used to be read from a
        module-level variable that only exists when the file is run as a
        script; it is now an explicit argument whose default matches the
        value the script uses.
    :returns: whatever ``xpath()`` returns for the expression.
    """
    parser = html5lib.HTMLParser(
        tree=html5lib.treebuilders.getTreeBuilder("lxml"),
        namespaceHTMLElements=False)
    with open(filename, 'r') as f:
        page = parser.parse(f)
    return page.xpath(xpaths)


def print_txt(filename, data):
    """Write the strings in *data* to *filename* as a minimal HTML page.

    Every item is followed by ``<br>``.  The file is written as UTF-8
    bytes, matching the ``<meta charset=utf-8>`` declaration in the head.

    :param filename: path of the HTML file to create (overwritten if present).
    :param data: iterable of text items; byte strings are taken to be UTF-8.
    """
    # Imported locally so the block does not depend on a new file-level import.
    from xml.sax.saxutils import escape

    with open(filename, 'wb') as f:
        f.write(b"<html>\n<head>\n")
        f.write(b"<meta charset=utf-8>\n")
        f.write(b"</head>\n<body>")
        for d in data:
            if isinstance(d, bytes):
                d = d.decode('utf-8')
            # Text nodes may contain '<', '>' or '&'; written raw they would
            # be read back as markup, so they are escaped first.
            f.write(escape(d).encode('utf-8'))
            f.write(b"<br>")
        # The original stopped at </body> and left <html> unclosed.
        f.write(b"</body>\n</html>\n")


def main(directory, xpaths):
    """Convert every file in *directory* to an HTML page under ``html/``.

    For each path returned by get_filenames(), the nodes selected by *xpaths*
    are extracted and written to ``html/<name>.html``, where ``<name>`` is the
    source file name without its last extension.

    :param directory: folder containing the input files.
    :param xpaths: XPath expression handed to get_xpaths().
    """
    out_dir = 'html'
    # The output folder is not guaranteed to exist; create it on demand.
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    for src in get_filenames(directory):
        print('processing ' + src)
        texts = get_xpaths(src, xpaths)
        # Derive the name from the path itself rather than fixed slice
        # offsets, which only worked for the exact './smi/' prefix and a
        # four-character extension.
        stem = os.path.splitext(os.path.basename(src))[0]
        print_txt(os.path.join(out_dir, stem + '.html'), texts)
    print('done')
if __name__ == '__main__':
    # Script entry point: read the subtitle files from ./smi/ and extract
    # every text node found below <body>.  Both values are kept as
    # module-level names because other code in this file may read them.
    directory = './smi/'
    xpaths = "//body//text()"
    main(directory, xpaths)
#!/bin/bash
# Convert every .smi file in the current directory from CP949 to UTF-8,
# writing the result next to the original as <name>.utf-8.smi.
FENC="cp949"
TENC="utf-8"
# Iterate over the glob itself instead of parsing `ls` output, so file names
# containing spaces are not split into several words.
for oldfile in *.smi;
do
    # An unmatched glob is left as the literal pattern; skip it.
    [ -e "$oldfile" ] || continue
    # Skip the output of an earlier run: converting it again would make iconv
    # read from and write to the same file, truncating it.
    case "$oldfile" in
        *.utf-8.smi) continue ;;
    esac
    # Strip only the trailing .smi.  "%%.*" cut at the FIRST dot, so names
    # such as a.b.smi and a.c.smi were both mapped to a.utf-8.smi.
    newfile="${oldfile%.smi}.utf-8.smi"
    echo "converting $oldfile to $newfile"
    iconv -f "$FENC" -t "$TENC" "$oldfile" > "$newfile"
done
#!/bin/bash
# Rename the "Desperate Ho*.smi" files in the current directory by putting
# "Desperate.Housewives.s01e" in front of the old name and ".smi" after it.
# The glob is iterated directly: these names contain a space, so looping over
# `ls` output split each of them into separate words that matched no file.
for oldfile in Desperate\ Ho*.smi;
do
    # An unmatched glob is left as the literal pattern; skip it.
    [ -e "$oldfile" ] || continue
    # NOTE(review): the complete old name, including its own .smi suffix, is
    # embedded in the new name exactly as before -- confirm this is intended.
    newfile="Desperate.Housewives.s01e$oldfile.smi"
    echo "converting $oldfile to $newfile"
    # Quoted so the space in the name stays part of a single argument.
    mv -- "$oldfile" "$newfile"
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment