Created
April 21, 2015 19:16
-
-
Save DrLulz/09d7ad67c238b6a83a68 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import re | |
import os | |
import csv | |
import glob | |
import string | |
from Tkinter import Tk | |
import unicodedata as udata | |
from cStringIO import StringIO | |
from tkFileDialog import askdirectory | |
from pyth.plugins.rtf15.reader import Rtf15Reader | |
def decode_cell(cell):
    '''Convert a cell holding escaped RTF markup into plain ASCII text.

    Args:
        cell: String containing RTF whose backslashes are escaped; it is
            decoded with ``unicode_escape`` before being handed to pyth.

    Returns:
        The text pyth extracted, with underscores removed, the final
        character dropped, and runs of whitespace collapsed to one space.
    '''
    # pyth validates the RTF syntax before processing, so the escaped
    # backslashes are decoded first; the result is then reduced to
    # printable ASCII.
    unescaped = cell.decode('unicode_escape')
    ascii_bytes = udata.normalize('NFKD', unescaped).encode('ascii', 'ignore')
    printable = ''.join(ch for ch in ascii_bytes if ch in string.printable)
    document = Rtf15Reader.read(StringIO(printable))

    # Walk the two nested ``content`` levels of the parsed document and
    # collect every text fragment stored at the innermost level.
    fragments = []
    for outer in document.content:
        for inner in outer.content:
            fragments.extend(inner.content)

    # Per the original author's note, non-ASCII characters in the source
    # data were followed by "_", so underscores are stripped for cleanliness.
    text = re.sub('_', '', ' '.join(fragments))

    # Drop the last character (the original note says a stray "L" ends the
    # string) and collapse extra whitespace.
    return ' '.join(text[:-1].split())
def find_rtf(row):
    '''Return a new row in which embedded RTF has been converted to text.

    Cells are copied through, reduced to ASCII, until one starts with the
    escaped RTF header. That cell and every cell after it are joined with
    spaces, converted by ``decode_cell`` and appended as a single final
    cell; nothing after it is processed.

    Args:
        row: Sequence of cells as produced by ``csv.reader``.

    Returns:
        A list of ASCII-encoded cells.
    '''
    converted = []
    for index, cell in enumerate(row):
        # The input may hold bytes that are not valid UTF-8; undecodable
        # bytes are dropped instead of aborting the whole file.
        if isinstance(cell, str):
            text = unicode(cell, "utf-8", errors="ignore")
        else:
            text = unicode(cell)

        # Ordinary cell: copy it through unchanged apart from the ASCII
        # reduction. The pattern looks for "{" followed by two literal
        # backslashes and "rtf" at the start of the cell.
        if not re.match(r'^{\\\\rtf', text):
            converted.append(text.encode('ascii', 'ignore'))
            continue

        # The RTF cell is merged with all cells that follow it in the row,
        # so the remainder of the row is consumed in one conversion.
        merged = ' '.join(row[index:])
        converted.append(decode_cell(merged).encode('ascii', 'ignore'))
        break
    return converted
def open_csv(f_ori, f_new):
    '''Read the CSV at ``f_ori`` and write its converted rows to ``f_new``.

    Args:
        f_ori: Path of the CSV file to read.
        f_new: Path of the CSV file to write.
    '''
    # 'rU' reads with universal newlines so line breaks inside a cell are
    # respected; 'wb' writes in binary mode. Both files are closed
    # automatically when the ``with`` blocks exit.
    with open(f_ori, 'rU') as source:
        with open(f_new, 'wb') as target:
            writer = csv.writer(target)
            # Each input row is run through find_rtf before being written.
            writer.writerows(find_rtf(record) for record in csv.reader(source))
def add_suffix(f_ori):
    '''Return ``f_ori`` with "-processed" inserted before its extension.

    Args:
        f_ori: Path of the original file.

    Returns:
        The same path with the suffix placed between the base name and
        the extension; the directory part is left as it was.
    '''
    # Split the full path into directory, base name and extension.
    directory, filename = os.path.split(f_ori)
    stem, extension = os.path.splitext(filename)
    # Reassemble with the suffix between the base name and the extension.
    return os.path.join(directory, '%s%s%s' % (stem, '-processed', extension))
def iterate_dir(path):
    '''Process every ``.csv`` file located directly inside ``path``.

    Each match is passed to ``open_csv`` together with the output name
    produced by ``add_suffix``.

    Args:
        path: Directory to scan. An empty value is ignored.
    '''
    # An empty path (for example from a cancelled directory dialog) would
    # otherwise yield the pattern '/*.csv' and scan the filesystem root.
    if not path:
        return

    pattern = os.path.join(path, '*.csv')
    for csv_path in glob.iglob(pattern):
        # Derive the output name from the input name and convert the file.
        open_csv(csv_path, add_suffix(csv_path))
def main():
    '''Ask the user for a directory and process the CSV files inside it.

    Does nothing when the directory dialog is cancelled.
    '''
    dialog_options = dict(
        # Folder the dialog opens in. Per the original author's note,
        # omitting initialdir entirely makes the dialog remember the last
        # directory used; another root such as '/Users' can be given instead.
        initialdir=os.path.expanduser('~/Desktop'),
        title='Select Directory',
    )
    # Create the Tk root and hide it so only the dialog is displayed.
    Tk().withdraw()
    ask_path = askdirectory(**dialog_options)

    # A cancelled dialog yields an empty value; passing that on would make
    # the directory scan run against an unintended location.
    if not ask_path:
        return

    iterate_dir(ask_path)


# Run the converter only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment