Skip to content

Instantly share code, notes, and snippets.

@madaan
Created August 18, 2014 11:22
Show Gist options
  • Save madaan/26cfcf025838a37fdd64 to your computer and use it in GitHub Desktop.
Save madaan/26cfcf025838a37fdd64 to your computer and use it in GitHub Desktop.
#sg
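#This script iterates over the documents in an input directory, strips the HTML tags from each one, and writes the resulting text to a file of the same name in an output directory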
#Thanks to http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
from HTMLParser import HTMLParser
class MLStripper(HTMLParser):
    """HTML parser that keeps only the character data of a document.

    Every run of text reported to ``handle_data`` is appended to
    ``self.fed``.  Tags are never recorded, so joining the collected
    pieces gives the document text without its markup.
    """

    def __init__(self):
        # Run the base-class initialiser instead of calling reset() alone:
        # it performs the reset itself and, on Python 3, also sets the
        # attributes (such as ``convert_charrefs``) that feed() reads.
        # The explicit HTMLParser.__init__ form is used rather than super()
        # so it also works where HTMLParser is an old-style class.
        HTMLParser.__init__(self)
        # Text fragments in the order the parser reported them.
        self.fed = []

    def handle_data(self, d):
        """Record one run of text *d* found between tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all text recorded so far, concatenated into one string."""
        return ''.join(self.fed)
def strip_tags(html):
    """Return the text content of the string *html* with its tags removed.

    The string is fed through a fresh ``MLStripper`` and the text that
    parser collected is returned as a single string.
    """
    stripper = MLStripper()
    stripper.feed(html)
    text = stripper.get_data()
    return text
#iterate over the files in the directory
def process(ipdir, opdir):
    """Strip the tags from every file in *ipdir* and write the text to *opdir*.

    Each regular file found directly inside ``ipdir`` is read, has its
    newline characters removed, is passed through ``strip_tags``, and is
    then written to a file of the same name inside ``opdir``.

    Args:
        ipdir: Path of the directory holding the input documents.
        opdir: Path of the directory that receives the stripped documents.
    """
    for doc in os.listdir(ipdir):
        # Join with os.path.join so the directories work with or without a
        # trailing separator (plain string concatenation required one).
        src = os.path.join(ipdir, doc)
        dst = os.path.join(opdir, doc)
        # os.listdir also returns sub-directories, which cannot be opened
        # as files; skip anything that is not a regular file.
        if not os.path.isfile(src):
            continue
        print('processing ' + doc)
        with open(src, "r") as infile:
            # NOTE(review): dropping newlines joins the last word of one
            # line to the first word of the next -- confirm this is intended.
            data = infile.read().replace('\n', '')
        data = strip_tags(data)
        with open(dst, "w") as outfile:
            outfile.write(data)
import os,sys
if __name__ == '__main__':
    # Command-line entry point: <input_dir> <output_dir>.
    # Both arguments are required; exit with a usage message instead of an
    # IndexError traceback when either one is missing.
    if len(sys.argv) < 3:
        sys.exit('usage: %s <input_dir> <output_dir>' % sys.argv[0])
    ipdir = sys.argv[1]
    opdir = sys.argv[2]
    # Echo the two directories separated by a single space.
    print('%s %s' % (ipdir, opdir))
    process(ipdir, opdir)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment