Created
April 12, 2015 19:09
-
-
Save DrLulz/75e6654c45f2f4722e79 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import re | |
import os | |
import csv | |
import glob | |
import unicodedata as udata | |
from Tkinter import Tk | |
from cStringIO import StringIO | |
from tkFileDialog import askdirectory | |
from pyth.plugins.rtf15.reader import Rtf15Reader | |
def decode_cell(cell):
    '''Convert one RTF cell into whitespace-normalised plain text.

    cell -- byte string of RTF markup whose backslashes are still escaped
            (the form find_rtf matches before calling this function).

    Returns the text pieces pyth extracted, joined by single spaces.
    '''
    # 'unicode_escape' collapses the escaped backslashes so pyth receives
    # real RTF control words and doesn't complain about the syntax
    unescaped = cell.decode('unicode_escape')
    # probably unnecessary, but NFKD decomposes composed characters
    # (eg \u00C7 becomes two separate entities)
    normalised = udata.normalize('NFKD', unescaped)
    document = Rtf15Reader.read(StringIO(normalised))
    # descend three levels of .content and flatten the innermost pieces
    # NOTE(review): presumably document -> paragraphs -> text runs -> strings;
    # confirm against the pyth document model
    pieces = [
        piece
        for block in document.content
        for run in block.content
        for piece in run.content
    ]
    # join into one string, then split/re-join to squeeze extra whitespace
    return ' '.join(' '.join(pieces).split())
def find_rtf(row):
    '''Return a new row in which every RTF cell is replaced by plain text.

    row -- iterable of cells from one csv row.

    Every returned cell is an ASCII byte string; characters that cannot be
    represented in ASCII are silently dropped.
    '''
    cleaned = []
    for cell in row:
        # coerce everything to unicode first, ignoring undecodable bytes
        if type(cell) is str:
            text = unicode(cell, "utf-8", errors="ignore")
        else:
            text = unicode(cell)
        # default result: the cell itself, stripped down to ascii
        ascii_cell = text.encode('ascii', 'ignore')
        # cells starting with {\\rtf carry escaped rtf markup and need converting
        if re.match(r'^{\\\\rtf', text):
            ascii_cell = decode_cell(ascii_cell).encode('ascii', 'ignore')
        cleaned.append(ascii_cell)
    return cleaned
def open_csv(f_ori, f_new):
    '''Copy the csv at f_ori to f_new, passing each row through find_rtf.

    f_ori -- path of the csv file to read.
    f_new -- path of the csv file to create (or overwrite).
    '''
    # 'rU' reads with universal newlines so line breaks inside a cell are
    # respected; 'wb' writes in binary mode; both files are closed on exit
    with open(f_ori, 'rU') as source, open(f_new, 'wb') as target:
        rows_in = csv.reader(source)
        rows_out = csv.writer(target)
        # rows are converted lazily, one at a time, as they are written
        rows_out.writerows(find_rtf(row) for row in rows_in)
def add_suffix(f_ori):
    '''Return f_ori with '-processed' inserted just before its extension.

    f_ori -- path of the original file; the directory part is kept as is.
    '''
    marker = '-processed'
    # take the path apart: directory, bare name, extension
    directory, filename = os.path.split(f_ori)
    stem, extension = os.path.splitext(filename)
    # put it back together with the marker between name and extension
    return os.path.join(directory, '%s%s%s' % (stem, marker, extension))
def iterate_dir(path):
    '''Process every .csv file found directly inside path.

    path -- directory to scan. A false value (empty string, None, ...)
            makes the call a no-op.

    Each matching file is handed to open_csv together with the output
    name produced by add_suffix.
    '''
    # without this guard an empty path turns the pattern into '/*.csv'
    # and the filesystem root would be scanned instead
    if not path:
        return
    # os.path.join avoids a doubled separator when path ends in one
    pattern = os.path.join(path, '*.csv')
    # NOTE(review): output files also end in .csv and land in the same
    # directory, so running twice on one folder re-processes earlier output
    for csv_file in glob.iglob(pattern):
        # create unique name for the new file and send both to the opener
        open_csv(csv_file, add_suffix(csv_file))
def main():
    '''Ask the user for a directory and process the csv files inside it.

    Does nothing when the dialog is dismissed without a selection.
    '''
    open_options = dict(
        # folder the dialog starts in; comment initialdir out entirely
        # to let the dialog remember the last directory
        initialdir=os.path.expanduser('~/Desktop'),
        title='Select Directory',
    )
    # hidden root window so that only the directory dialog is shown
    root = Tk()
    root.withdraw()
    try:
        ask_path = askdirectory(**open_options)
    finally:
        # release the Tk root instead of leaving it alive for the whole run
        root.destroy()
    # a cancelled dialog yields an empty result; passing it on would make
    # the glob pattern '/*.csv' and process files in the filesystem root
    if not ask_path:
        return
    # move to fx
    iterate_dir(ask_path)
# start the directory picker only when run as a script, not when imported
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment