Extract text from .smi files and convert it to .html files.
- The .smi files should be in the `./smi` folder.
- The extracted text is written to .html files located in the `./html` folder.
#! /usr/bin/python2.7
# -*- coding: utf-8 -*-
import os
from glob import glob
from xml.sax.saxutils import escape

import html5lib
def get_filenames(directory):
    """Return the paths of the regular files directly inside *directory*.

    Args:
        directory: folder to scan (not recursive).

    Returns:
        A sorted list of paths, each prefixed with *directory*.  Names
        starting with a dot are not matched by the ``*`` pattern, and
        sub-directories are left out because the caller opens every
        returned path as a file.
    """
    candidates = glob(os.path.join(directory, '*'))
    # glob() gives no ordering guarantee; sort so runs are reproducible.
    return sorted(path for path in candidates if os.path.isfile(path))
def get_xpaths(filename, query=None):
    """Parse *filename* with html5lib and evaluate an XPath on the result.

    Args:
        filename: path of the markup file to parse.
        query: XPath expression to evaluate.  When omitted, the
            module-level name ``xpaths`` is used, so existing one-argument
            calls keep working (and still raise NameError if that name
            has not been defined).

    Returns:
        The result of calling ``xpath()`` on the tree built by the lxml
        tree builder.
    """
    if query is None:
        # Backward-compatible fallback to the global the script sets up.
        query = xpaths
    parser = html5lib.HTMLParser(
        tree=html5lib.treebuilders.getTreeBuilder("lxml"),
        namespaceHTMLElements=False)
    # Binary mode hands the parser the raw bytes instead of text that was
    # already decoded (or newline-translated) by the platform default.
    with open(filename, 'rb') as f:
        page = parser.parse(f)
    return page.xpath(query)
def print_txt(filename, data):
    """Write *data* as a minimal UTF-8 HTML page, one item per line break.

    Args:
        filename: path of the HTML file to create or overwrite.
        data: iterable of text items; byte strings are taken to be UTF-8.

    Each item is HTML-escaped (``&``, ``<``, ``>``) before it is written,
    so text containing markup characters cannot break the page, and the
    document is closed with ``</body>`` and ``</html>``.
    """
    # Binary mode plus explicit encoding keeps the output UTF-8 no matter
    # what the platform default encoding is.
    with open(filename, 'wb') as out:
        out.write(b"<html>\n<head>\n")
        out.write(b"<meta charset=utf-8>\n")
        out.write(b"</head>\n<body>")
        for item in data:
            if isinstance(item, bytes):
                item = item.decode('utf-8')
            out.write(escape(item).encode('utf-8'))
            out.write(b"<br>")
        out.write(b"</body>\n</html>\n")
def main(directory, xpaths):
    """Convert every file in *directory* into an HTML page under ``html/``.

    Args:
        directory: folder holding the input files.
        xpaths: XPath expression selecting the text to extract.

    For each input ``<directory>/<name>.<ext>`` the extracted text is
    written to ``html/<name>.html`` relative to the current working
    directory; the ``html`` folder is created when it does not exist.
    """
    # get_xpaths() is called without an expression and resolves the
    # module-level name `xpaths`; publish the argument there so the value
    # passed to main() is the one that is actually used.
    globals()['xpaths'] = xpaths

    out_dir = 'html'
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    for src in get_filenames(directory):
        print('processing ' + src)
        texts = get_xpaths(src)
        # Derive the output name from the file's base name rather than
        # fixed slice offsets, which only fit './smi' and a 3-letter suffix.
        stem = os.path.splitext(os.path.basename(src))[0]
        print_txt(os.path.join(out_dir, stem + '.html'), texts)
    print('done')
if __name__ == '__main__':
    # Script entry point.  Both values are bound as module-level names
    # before main() runs.
    xpaths = "//body//text()"
    directory = './smi/'
    main(directory, xpaths)
#!/bin/bash
# Re-encode every .smi file in the current directory from CP949 to UTF-8,
# writing the result next to the original as <name>.utf-8.smi.
FENC="cp949"
TENC="utf-8"
# Iterate over the glob itself instead of parsing `ls`, so file names that
# contain spaces are kept intact.
for oldfile in *.smi; do
    # With no match the pattern stays the literal "*.smi"; skip it.
    [ -e "$oldfile" ] || continue
    # Skip the output of an earlier run so it is not converted twice.
    case "$oldfile" in
        *.utf-8.smi) continue ;;
    esac
    # Strip only the trailing ".smi".  Cutting at the first dot ("%%.*")
    # mapped dotted names such as "a.b.smi" and "a.c.smi" to the same target.
    newfile="${oldfile%.smi}.utf-8.smi"
    echo "converting $oldfile to $newfile"
    if ! iconv -f "$FENC" -t "$TENC" "$oldfile" > "$newfile"; then
        echo "failed to convert $oldfile" >&2
    fi
done
#!/bin/bash
# Rename every "Desperate Ho*.smi" file in the current directory to
# "Desperate.Housewives.s01e<original name>.smi".
# NOTE(review): the target embeds the complete original file name, spaces
# and ".smi" suffix included. Presumably only the episode number was meant;
# confirm the intended naming scheme before running this.
# Iterate over the glob itself instead of parsing `ls`: the pattern requires
# a space in the name, so word-splitting `ls` output broke every match.
for oldfile in Desperate\ Ho*.smi; do
    # With no match the pattern stays a literal string; skip it.
    [ -e "$oldfile" ] || continue
    newfile="Desperate.Housewives.s01e$oldfile.smi"
    echo "converting $oldfile to $newfile"
    mv "$oldfile" "$newfile"
done