Extract text in .smi files and convert to .html files
- The `.smi` files should be in the `./smi` folder.
- Extracted text will be contained in `.html` files and located in a `./html` folder.
| #! /usr/bin/python2.7 | |
| # -*- coding: utf-8 -*- | |
| import html5lib | |
| import os | |
| from glob import glob | |
def get_filenames(directory, pattern='*'):
    """Return the paths in *directory* whose names match *pattern*.

    Args:
        directory: Folder to search (not recursive).
        pattern: Shell-style glob applied to entry names. The default
            ``'*'`` matches every non-hidden entry.

    Returns:
        A list of paths, each prefixed with *directory*, sorted so the
        processing order is the same on every run (``glob`` itself makes
        no ordering guarantee).
    """
    return sorted(glob(os.path.join(directory, pattern)))
def get_xpaths(filename, xpaths="//body//text()"):
    """Parse *filename* as HTML and return the nodes matched by *xpaths*.

    Args:
        filename: Path of the file to parse.
        xpaths: XPath expression evaluated against the parsed tree.
            Defaults to every text node below ``<body>``.

    Returns:
        The result of evaluating *xpaths* on the lxml tree built by
        html5lib (a list of text strings for the default expression).
    """
    parser = html5lib.HTMLParser(
        tree=html5lib.treebuilders.getTreeBuilder("lxml"),
        namespaceHTMLElements=False)
    # Binary mode hands html5lib the raw bytes, so html5lib rather than the
    # platform's default text codec decides how to decode them.
    with open(filename, 'rb') as f:
        page = parser.parse(f)
    return page.xpath(xpaths)


def _escape_html(text):
    """Escape the characters that are markup-significant in HTML text."""
    return (text.replace(u"&", u"&amp;")
                .replace(u"<", u"&lt;")
                .replace(u">", u"&gt;"))


def print_txt(filename, data):
    """Write *data* to *filename* as a minimal UTF-8 HTML page.

    Each item of *data* is written as escaped text followed by ``<br>``.

    Args:
        filename: Path of the HTML file to create (overwritten if present).
        data: Iterable of text strings; byte strings are decoded as UTF-8.
    """
    import io  # local import: only this function needs it

    # io.open encodes on write identically on Python 2 and 3, so the body
    # works purely with text and never writes pre-encoded bytes.
    with io.open(filename, 'w', encoding='utf-8') as f:
        f.write(u"<html>\n<head>\n")
        f.write(u"<meta charset=utf-8>\n")
        f.write(u"</head>\n<body>")
        for d in data:
            if isinstance(d, bytes):
                d = d.decode('utf-8')
            # Text nodes may contain '<' or '&'; emit them as entities so
            # they are rendered as text instead of being parsed as markup.
            f.write(_escape_html(d))
            f.write(u"<br>")
        f.write(u"</body>\n</html>\n")


def main(directory, xpaths):
    """Convert every file in *directory* to an HTML file under ``html/``.

    For each path returned by ``get_filenames(directory)`` the nodes
    selected by *xpaths* are extracted and written to
    ``html/<source name without extension>.html``.

    Args:
        directory: Folder containing the source files.
        xpaths: XPath expression selecting what to extract from each file.
    """
    out_dir = 'html'
    # The output folder is part of the script's contract; create it rather
    # than failing on the first write.
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    for src in get_filenames(directory):
        print('processing ' + src)
        xp = get_xpaths(src, xpaths)
        # Derive the output name from the path components instead of fixed
        # slice offsets, so any directory name or extension length works.
        stem = os.path.splitext(os.path.basename(src))[0]
        print_txt(os.path.join(out_dir, stem + '.html'), xp)
    print('done')
if __name__ == '__main__':
    # Defaults used when the file is run as a script: read from ./smi/ and
    # extract every text node found below <body>.
    directory = './smi/'
    xpaths = '//body//text()'
    main(directory, xpaths)
#!/bin/bash
# Convert every .smi file in the current directory from CP949 to UTF-8.
# The converted copy is written next to the original as <name>.utf-8.smi;
# the original file is left untouched.
FENC="cp949"
TENC="utf-8"
# Iterate the glob directly: parsing `ls` output splits names on whitespace.
for oldfile in *.smi; do
    # With no match the pattern stays literal; skip it instead of running
    # iconv on a file called "*.smi".
    [ -e "$oldfile" ] || continue
    # Skip the output of a previous run so it is not converted a second time.
    case "$oldfile" in
        *.utf-8.smi) continue ;;
    esac
    # %.* removes only the final extension. %%.* would cut at the FIRST dot,
    # mapping e.g. A.B.s01e01.smi and A.B.s01e02.smi to the same A.utf-8.smi.
    newfile="${oldfile%.*}.utf-8.smi"
    echo "converting $oldfile to $newfile"
    # Feed the file on stdin so a name starting with '-' is never read as an
    # option.
    iconv -f "$FENC" -t "$TENC" < "$oldfile" > "$newfile"
done
#!/bin/bash
# Rename every .smi file in the current directory whose name starts with
# "Desperate Ho" to the dotted "Desperate.Housewives.s01e..." scheme.
# The glob is quoted and iterated directly: these names contain a space, so
# looping over `ls` output would split each name into separate words.
for oldfile in "Desperate Ho"*.smi; do
    # With no match the pattern stays literal; skip it.
    [ -e "$oldfile" ] || continue
    # NOTE(review): this embeds the entire old filename (spaces and ".smi"
    # included) after "s01e". Presumably only the episode number was meant;
    # confirm against the real file names before relying on the result.
    newfile="Desperate.Housewives.s01e$oldfile.smi"
    echo "converting $oldfile to $newfile"
    mv -- "$oldfile" "$newfile"
done