Last active
November 15, 2015 13:03
-
-
Save DrLulz/24ebe13a8ab658f6a0ff to your computer and use it in GitHub Desktop.
Script should be run from terminal with argument. If the .py and the .csv are in the same directory the argument can be the filename itself (eg $ python raymond-4.0.py filename.csv), else if the .py and .csv are in different directories the argument must be the absolute path to the .csv (eg $ python raymond-4.0.py "/Users/raymond/Desktop/filenam…
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/python
# -*- coding: utf-8 -*-
import csv
import glob
import linecache
import os
import re
import string
import sys
from cStringIO import StringIO
from Tkinter import Tk
from tkFileDialog import askdirectory

from pyth.plugins.rtf15.reader import Rtf15Reader
#CSV_FILE = '/Users/NAME/Desktop/file.csv'
# [ERROR HANDLING]
#import logging as log
#log.basicConfig(filename=os.path.expanduser('~/Desktop/log'), level=log.DEBUG)
class bcolors:
    '''ANSI terminal escape sequences used to colorize console output.

    Wrap a message as ``bcolors.<STYLE> + text + bcolors.ENDC`` so the
    styling is switched off again after the message.
    '''
    # foreground colours
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    # reset sequence: ends whatever style is currently active
    ENDC = '\033[0m'
    # text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
def PrintException():
    '''Print a colorized report of the exception being handled, then exit.

    Intended to be called from inside an ``except`` block. Reports the file,
    line number and source line of the first traceback entry returned by
    ``sys.exc_info()``, followed by the exception itself, and terminates the
    process with exit status 1 so callers (shells, scripts) can detect the
    failure.

    Raises:
        SystemExit: always, with status 1.
    '''
    _, exc_obj, tb = sys.exc_info()
    if tb is None:
        # Called outside an ``except`` block: there is no traceback to
        # inspect, so report that instead of failing on ``tb.tb_frame``.
        print(bcolors.WARNING + 'ERROR: {}'.format(exc_obj) + bcolors.ENDC)
        sys.exit(1)

    frame = tb.tb_frame
    lineno = tb.tb_lineno
    filename = frame.f_code.co_filename

    # Drop stale cache entries so the source line is read from the current file.
    linecache.checkcache(filename)
    line = linecache.getline(filename, lineno, frame.f_globals)

    # Single-argument parenthesised print behaves the same as the Python 2
    # print statement used elsewhere in this script.
    print(bcolors.HEADER + 'EXCEPTION IN: {}'.format(filename) + bcolors.ENDC)
    print(bcolors.HEADER + 'LINE: {}'.format(lineno) + bcolors.ENDC)
    print(bcolors.HEADER + 'CODE: {}'.format(line.strip()) + bcolors.ENDC)
    print(bcolors.WARNING + 'ERROR: {}'.format(exc_obj) + bcolors.ENDC)

    # Non-zero status: this path is only reached after a failure.
    sys.exit(1)
def decode_cell(cell):
    '''Convert the RTF markup in ``cell`` to plain text.

    Args:
        cell: byte string holding the RTF content (Python 2 ``str``).

    Returns:
        The extracted text with underscores removed, the final character
        dropped, and runs of whitespace collapsed to single spaces.
    '''
    # pyth checks the RTF syntax before processing; per the original author,
    # decoding with 'unicode_escape' keeps it from complaining about the
    # backslashes in the cell.
    unescaped = cell.decode('unicode_escape')
    # keep only characters listed in string.printable
    printable_only = filter(lambda ch: ch in string.printable, unescaped)
    document = Rtf15Reader.read(StringIO(printable_only))

    # Walk the pyth document two levels down and gather every text fragment
    # (document.content -> element.content -> child.content).
    fragments = [
        fragment
        for element in document.content
        for child in element.content
        for fragment in child.content
    ]
    joined = ' '.join(fragments)

    # The non-ascii characters in the source file were followed by '_', so
    # underscores are removed for cleanliness. Comment this line out to keep
    # them.
    joined = joined.replace('_', '')

    # Drop the last character (original note: "remove L at end of string"),
    # then collapse extra whitespace.
    return ' '.join(joined[:-1].split())
def find_rtf(row):
    '''Return a copy of ``row`` with any RTF content converted to plain text.

    Cells are copied through (reduced to ASCII) until one is found that
    starts with ``{`` followed by two backslashes and ``rtf``. That cell and
    every cell after it are joined with spaces, converted by ``decode_cell``
    and appended as a single final cell.

    Args:
        row: list of cell values as produced by ``csv.reader``.

    Returns:
        A new list of ASCII-encoded cell strings.
    '''
    cleaned = []
    for index, cell in enumerate(row):
        # Normalise to unicode; undecodable bytes in the input are dropped.
        if isinstance(cell, str):
            text = unicode(cell, "utf-8", errors="ignore")
        else:
            text = unicode(cell)

        if re.match(r'^{\\\\rtf', text):
            # The RTF payload is taken to span the rest of the row
            # (presumably commas inside the RTF made the csv reader split
            # it -- TODO confirm). The raw cells are joined, not the unicode
            # copies, because decode_cell decodes the bytes itself.
            merged = ' '.join(row[index:])
            cleaned.append(decode_cell(merged).encode('ascii', 'ignore'))
            # Everything after this cell has been consumed; stop here.
            break

        # No RTF in this cell: pass it through unchanged apart from the
        # ASCII reduction.
        cleaned.append(text.encode('ascii', 'ignore'))
    return cleaned
def add_suffix(f_ori):
    '''Return ``f_ori`` with ``-processed`` inserted before the extension.

    Args:
        f_ori: path (or bare file name) of the original file.

    Returns:
        The same path with ``-processed`` appended to the file's base name,
        e.g. ``/tmp/data.csv`` -> ``/tmp/data-processed.csv``. The directory
        part and the extension are left untouched.
    '''
    suffix = '-processed'
    # split the full path into directory, base name and extension
    directory, filename = os.path.split(f_ori)
    base, ext = os.path.splitext(filename)
    return os.path.join(directory, '%s%s%s' % (base, suffix, ext))
def open_csv(arg, abspath):
    '''Read the original CSV, convert its RTF cells, and write a new CSV.

    The output is written next to the input under the name produced by
    ``add_suffix``. Any error while reading or writing is reported through
    ``PrintException``.

    Args:
        arg: the CSV file name or path given on the command line.
        abspath: truthy when ``arg`` already contains a path and is used as
            given; falsy when ``arg`` is a bare file name, which is resolved
            against the current working directory.
    '''
    if abspath:
        f_ori = arg
    else:
        # A bare file name was supplied: look for it in the working directory.
        f_ori = os.path.normpath('{}/{}'.format(os.getcwd(), arg))
    f_new = add_suffix(f_ori)

    try:
        # 'rU' / 'wb' are the modes the Python 2 csv module expects
        # (universal newlines for reading, binary for writing).
        with open(f_ori, 'rU') as file1, open(f_new, 'wb') as file2:
            writer = csv.writer(file2)
            for row in csv.reader(file1):
                writer.writerow(find_rtf(row))
    except Exception:
        # Only real errors are reported; KeyboardInterrupt and SystemExit
        # propagate normally instead of being shown as processing failures.
        PrintException()
def main():
    '''Initiate the script with the CSV path given as the first argument.

    Prints a usage message and exits with status 1 when no argument is
    supplied. Files whose name does not end in ``.csv`` (any letter case)
    are rejected with a message and left untouched.
    '''
    if len(sys.argv) < 2:
        # Without this guard a missing argument surfaces as a bare IndexError.
        script = os.path.basename(sys.argv[0])
        print(bcolors.FAIL + 'USAGE: python {} FILE.csv'.format(script) + bcolors.ENDC)
        sys.exit(1)

    query = sys.argv[1]
    #query = CSV_FILE
    if query.lower().endswith('.csv'):
        # A '/' in the argument means a path was supplied; otherwise it is a
        # bare file name to be looked up in the working directory.
        open_csv(query, '/' in query)
        print(bcolors.OKGREEN + 'CSV PROCESSED' + bcolors.ENDC)
    else:
        print(bcolors.OKBLUE + 'FILE IS NOT CSV' + bcolors.ENDC)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment