Last active
February 8, 2019 21:12
-
-
Save paceaux/4743009c84ecdab9625d1b23662d1223 to your computer and use it in GitHub Desktop.
RTF to JSON parser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import fnmatch
import getopt
import io
import json
import os
import shutil
import sys
from os.path import join
from HTMLParser import HTMLParser

from pyth.plugins.rtf15.reader import Rtf15Reader
from pyth.plugins.xhtml.writer import XHTMLWriter

from bs4 import BeautifulSoup
# constants

# Data model for one converted document; getDictFromHtml copies this before
# filling it in.  NOTE(review): dict(DEFAULT_DICTIONARY) is only a shallow
# copy, so the nested 'metadata' dict and its list values are shared between
# copies — confirm callers never mutate them in place.
DEFAULT_DICTIONARY = {
    'chronoNumber': '',
    'chronoContent': ' ',
    'metadata' : {
        'restrictedChrono': False,
        'office': '',
        'primaryAccountingManual': '',
        'alternateAccountingManuals': [],
        'industries': [],
        'assignedTo': '',
        'dateCleared': '',
        'clearingPartner': '',
        'individual': '',
        'priorConsultations': []
    }
}

# BeautifulSoup-compliant CSS selector for the element holding the metadata
# label/value pairs (the second <p> inside the wrapping <div>)
POSITION_OF_DATA = 'div > p:nth-of-type(1) + p'

# default name of the generated JSON file
OUTPUT_NAME = 'output.json'

# when True, intermediate .html files are written next to each input file
DEBUG = False
class FileRules:
    """Used for easily passing rules around for how to read and write files.

    Attributes:
        path: string, directory to search for files (defaults to the cwd)
        filePattern: glob pattern used to match file names (stored lowercase)
    """

    def __init__(self, filePattern="*.rtf"):
        """Inits FileRules with a filePattern."""
        self.path = os.getcwd()
        self.filePattern = filePattern

    @property
    def filePattern(self):
        return self._filePattern.lower()

    @filePattern.setter
    def filePattern(self, new_filePattern):
        # isinstance (not `type == str`) so str subclasses are accepted too
        if isinstance(new_filePattern, str):
            self._filePattern = new_filePattern.lower()
        else:
            # ValueError is more specific than the original bare Exception and
            # is still caught by any existing `except Exception` handler
            raise ValueError("invalid value for filePattern")

    def PathAndType(self):
        """Return the search path joined with the glob pattern."""
        return join("/", self.path, self.filePattern)
def fileList(fileRules):
    """Returns the files in fileRules.path matching fileRules.filePattern.

    Matching is fully case-insensitive: the original tried the pattern only in
    all-lower and all-upper case, which missed mixed-case names like "Foo.Rtf".

    Args:
        fileRules: a FileRules object (anything with .path and .filePattern)

    Returns:
        list of matching file names (names only, not full paths)
    """
    pattern = fileRules.filePattern.lower()
    return [
        entry
        for entry in os.listdir(fileRules.path)
        # fnmatchcase avoids OS-dependent case normalization; we lower both
        # sides ourselves so behavior is identical on every platform
        if fnmatch.fnmatchcase(entry.lower(), pattern)
    ]
def getDecodedText(file):
    """Reads a file and returns its content decoded as UTF-8.

    Args:
        file: string of the path and filename to a file

    Returns:
        string of utf-8 text; lines are joined with a single space, each line
        keeping its trailing newline (matches the original join behavior)
    """
    # io.open decodes while reading, and the with-block closes the handle —
    # the original open()/readlines() version leaked the file descriptor
    with io.open(file, 'r', encoding='utf-8') as rtfFile:
        return ' '.join(rtfFile.readlines())
def getHtmlFromRtf(text):
    """Converts RTF content into XHTML.

    Args:
        text: utf8 encoded RTF content (a file-like source for Rtf15Reader)

    Returns:
        string containing valid XHTML produced by pyth's writer
    """
    document = Rtf15Reader.read(text)
    return XHTMLWriter.write(document, pretty=True).read()
def getCleanedHtml(html):
    """Removes undesirable markup from html.

    Args:
        html: a string of HTML markup to clean

    Returns:
        string of the cleaned markup
    """
    soup = BeautifulSoup(html, 'html.parser')

    # promote all-caps underlined lead-ins (p > u) to h3 headings
    titles = soup.select('p > u:nth-of-type(1)')
    for title in titles:
        text = title.string
        # .string is None when the tag holds mixed/nested content; the
        # original crashed with AttributeError there, so skip instead
        if text is not None and text == text.upper():
            h3 = soup.new_tag('h3')
            h3.string = text
            # replace_with is the current bs4 name for deprecated replaceWith
            title.find_parent('p').replace_with(h3)

    # drop the <u> wrapper inside strong > em runs, keeping its content
    underlines = soup.select('strong em u')
    for underline in underlines:
        underline.unwrap()

    # remove paragraphs with no text at all
    for p in soup.find_all('p'):
        if len(p.text) == 0:
            p.decompose()

    return str(soup)
def getTextFile(fileName, text=''):
    """Creates (or overwrites) a file containing the given text.

    Args:
        fileName: Name of the file (e.g. foo.html)
        text: Text that should be in the file

    Returns:
        the (closed) file object — same return value as before
    """
    # the with-block guarantees the handle is flushed and closed even when
    # write() raises; the original only closed on the happy path
    with open(fileName, 'w') as outputFile:
        outputFile.write(text)
    return outputFile
def extractMetadata(soup, dataPosition):
    """Pulls the labeled metadata fields out of the parsed document.

    Args:
        soup: a BeautifulSoup object for the whole document
        dataPosition: CSS selector locating the element whose children hold
            the label/value pairs (a label node is followed by its value node)

    Returns:
        dict shaped like DEFAULT_DICTIONARY['metadata']
    """
    metadataWrapper = soup.select(dataPosition)
    metadataContents = metadataWrapper[0].contents

    # Build a fresh dict with fresh list values.  dict(...) alone is a shallow
    # copy, so the original appended into DEFAULT_DICTIONARY's shared lists and
    # leaked values across calls (every parsed file accumulated the previous
    # files' alternateAccountingManuals / industries entries).
    metadata = {}
    for key, default in DEFAULT_DICTIONARY['metadata'].items():
        metadata[key] = list(default) if isinstance(default, list) else default

    for idx, metadataTag in enumerate(metadataContents):
        label = metadataTag.string
        if label is None:
            continue
        # the value for each recognized label is the next sibling node
        if "OFFICE" in label:
            metadata['office'] = str(metadataContents[idx + 1]).strip()
        elif "PRIMARY ACCMAN" in label:
            metadata['primaryAccountingManual'] = str(metadataContents[idx + 1]).strip()
        elif "ALTERNATE ACCMAN" in label:
            metadata['alternateAccountingManuals'].append(str(metadataContents[idx + 1]).strip())
        elif "INDUSTRY IDENTIFIER" in label:
            metadata['industries'].append(str(metadataContents[idx + 1]).strip())
        elif "ASSIGNED TO" in label:
            metadata['assignedTo'] = str(metadataContents[idx + 1]).strip()
        elif "DATE CLEARED" in label:
            metadata['dateCleared'] = str(metadataContents[idx + 1]).strip()
        elif "CLEARING PARTNER" in label:
            metadata['clearingPartner'] = str(metadataContents[idx + 1]).strip()
        elif "INDIVIDUAL" in label:
            metadata['individual'] = str(metadataContents[idx + 1]).strip()
        elif "PRIOR CONSULTATIONS" in label:
            # NOTE(review): unlike the other list fields this overwrites the
            # default list with a string instead of appending — kept as-is;
            # confirm whether .append was intended
            metadata['priorConsultations'] = str(metadataContents[idx + 1]).strip()
    return metadata
def extractContent(soup):
    """Extracts only the content part from the soup document.

    The first paragraph (the metadata block) is removed and the wrapping div
    is unwrapped, leaving just the body content.

    Args:
        soup: A Beautiful soup object (mutated in place)

    Returns:
        a prettified string of HTML
    """
    soup.find('p').decompose()   # drop the metadata paragraph
    soup.find('div').unwrap()    # lift the content out of its wrapper
    return str(soup.prettify(formatter='html'))
def getDictFromHtml(file, defaultDictionary=DEFAULT_DICTIONARY, dataPosition=POSITION_OF_DATA):
    """Produces a Dictionary describing one converted document.

    Args:
        file: string. The HTML markup to be parsed
        defaultDictionary: Dictionary. presents the data model this becomes
        dataPosition: string. CSS selector that's BeautifulSoup compliant for where to find data
    """
    # parse twice because extractContent mutates its soup destructively
    soup = BeautifulSoup(file, 'html.parser')
    soupClone = BeautifulSoup(file, 'html.parser')

    dictionary = dict(defaultDictionary)
    dictionary['metadata'] = extractMetadata(soup, dataPosition)
    dictionary['chronoContent'] = extractContent(soupClone)
    dictionary['chronoNumber'] = soup.select(dataPosition)[0].contents[1].strip()
    return dictionary
def getAsJson(dictionary):
    """Serializes a dictionary (or list) to a JSON string."""
    serialized = json.dumps(dictionary)
    return serialized
def getRtfAsJson(file, debug=False):
    """Converts a file from origin to a new version, also cleans it.

    Args:
        file: string, name of the RTF file
        debug: bool, when True also writes the intermediate .html files

    Returns:
        dict (the structure produced by getDictFromHtml)
    """
    decodedText = getDecodedText(file)

    # pyth's reader wants a file-like source, so stage the decoded text in a
    # scratch file; the with-block closes the handle (the original never did)
    with open('preHtml', 'w+') as preHtmlFile:
        preHtmlFile.write(decodedText)
        # rewind so the reader starts from the content just written instead of
        # from the end-of-file position left by write()
        preHtmlFile.seek(0)
        html = getHtmlFromRtf(preHtmlFile)

    cleanedHtml = getCleanedHtml(html)

    if debug:
        # keep the intermediate markup on disk for inspection
        getTextFile(file + '.html', str(html))
        getTextFile(file + '.cleaned.html', str(cleanedHtml))

    # TODO: the scratch 'preHtml' file is left behind; consider os.remove
    return getDictFromHtml(cleanedHtml)
def outputFile(outputName, listOfFiles, debug = DEBUG): | |
""" Outputs a converted file | |
Args: | |
outputName. String. Name of the file to be created | |
listOfFiles: List. The files that need to be converted | |
""" | |
if (len(listOfFiles)) > 1: | |
jsonArray = [] | |
for file in listOfFiles: | |
convertedFile = getRtfAsJson(file, debug) | |
jsonArray.append(convertedFile) | |
outputFile = getTextFile(outputName, getAsJson(jsonArray)) | |
else: | |
convertedFile = getRtfAsJson(listOfFiles[0], debug) | |
outputFile = getTextFile(outputName, getAsJson(convertedFile)) | |
def main(argv):
    """Command-line entry point: parses flags, finds files, writes output.

    Flags:
        -p / --path: directory to search for input files
        -f / --file: glob pattern for input files
        -o / --output: name of the output file
        -d / --debug: keep intermediate html files for inspection
    """
    fileRules = FileRules()
    outputName = OUTPUT_NAME
    debug = DEBUG
    try:
        # "debug" was missing from the long-options list, so --debug always
        # raised GetoptError even though -d worked; it takes no argument,
        # hence no trailing "="
        opts, args = getopt.getopt(argv, "p:f:o:d", ["path=", "file=", "output=", "debug"])
    except getopt.GetoptError:
        print("error with the arguments")
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-p", "--path"):
            fileRules.path = arg
        elif opt in ("-f", "--file"):
            fileRules.filePattern = arg
        elif opt in ("-o", "--output"):
            outputName = arg
        elif opt in ("-d", "--debug"):
            debug = True
    listOfFiles = fileList(fileRules)
    try:
        outputFile(outputName, listOfFiles, debug)
    except Exception as e:
        print("Error With outputting the file")
        print(e)


if __name__ == "__main__":
    main(sys.argv[1:])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment