# Gist DrLulz/09d7ad67c238b6a83a68 by @DrLulz, created April 21, 2015 (GitHub page chrome removed).
# -*- coding: utf-8 -*-
import re
import os
import csv
import glob
import string
from Tkinter import Tk
import unicodedata as udata
from cStringIO import StringIO
from tkFileDialog import askdirectory
from pyth.plugins.rtf15.reader import Rtf15Reader
def decode_cell(cell):
    '''Convert the RTF markup held in ``cell`` to plain text.

    ``cell`` is a byte string whose backslashes are escaped (doubled). It is
    unescaped, reduced to printable ASCII, parsed with pyth's Rtf15Reader,
    and the extracted text is returned with underscores removed, its final
    character dropped and runs of whitespace collapsed to single spaces.
    '''
    # pyth checks for RTF syntax before processing, so the escaped
    # backslashes are collapsed with 'unicode_escape' first.
    unescaped = cell.decode('unicode_escape')
    ascii_bytes = udata.normalize('NFKD', unescaped).encode('ascii', 'ignore')
    printable = ''.join(ch for ch in ascii_bytes if ch in string.printable)
    document = Rtf15Reader.read(StringIO(printable))
    # Flatten the three nested ``.content`` levels of the parsed document
    # into one flat list of text fragments.
    fragments = [
        piece
        for element in document.content
        for child in element.content
        for piece in child.content
    ]
    text = ' '.join(fragments)
    # Non-ASCII characters in the source data were followed by '_';
    # strip the underscores for cleanliness (skip this line to keep them).
    text = text.replace('_', '')
    # Per the original note the text ends with a stray 'L'; note that the
    # slice drops the last character whatever it is. Then squeeze whitespace.
    return ' '.join(text[:-1].split())
def find_rtf(row):
    '''Return a new row in which embedded RTF has been converted to text.

    Cells are copied through, reduced to ASCII, until one starts with the
    escaped RTF header. That cell and every cell after it are joined with
    single spaces, converted by ``decode_cell`` and appended as one final
    cell; the remaining cells are not processed separately.

    row -- list of cell values as produced by ``csv.reader``.
    Returns a list of ASCII byte strings.
    '''
    converted = []
    for index, cell in enumerate(row):
        # The input may contain bytes that are not valid UTF-8, so byte
        # strings are decoded leniently; anything else is coerced to unicode.
        if isinstance(cell, str):
            cell = unicode(cell, "utf-8", errors="ignore")
        else:
            cell = unicode(cell)
        # The pattern matches '{' + two literal backslashes + 'rtf' at the
        # start of the cell, i.e. an RTF header with escaped backslashes.
        if re.match(r'^{\\\\rtf', cell):
            # Rejoin the matched cell with everything after it; the
            # original (undecoded) cells are used, as decode_cell expects.
            merged = ' '.join(row[index:])
            plain = decode_cell(merged)
            converted.append(plain.encode('ascii', 'ignore'))
            # The rest of the row is now part of the merged cell, so stop
            # here and let the caller move on to the next row.
            break
        # No RTF in this cell: copy it through unchanged (ASCII only).
        converted.append(cell.encode('ascii', 'ignore'))
    return converted
def open_csv(f_ori, f_new):
    '''Read the CSV at ``f_ori``, convert every row and write it to ``f_new``.'''
    # 'rU' opens for reading with universal newlines so that newlines inside
    # a cell are respected; 'wb' opens the output for binary writing.
    # Both files are closed automatically when the with-blocks exit.
    with open(f_ori, 'rU') as source:
        with open(f_new, 'wb') as target:
            write_row = csv.writer(target).writerow
            for record in csv.reader(source):
                # convert any RTF in the row, then write it straight out
                write_row(find_rtf(record))
def add_suffix(f_ori):
    '''Return ``f_ori`` with '-processed' inserted before the extension.

    The directory part and the extension are preserved, so
    'dir/data.csv' becomes 'dir/data-processed.csv'. Only the last
    extension counts: 'a.tar.gz' becomes 'a.tar-processed.gz'.
    '''
    suffix = '-processed'
    # split the full path into directory, base name and extension
    directory, filename = os.path.split(f_ori)
    stem, ext = os.path.splitext(filename)
    return os.path.join(directory, stem + suffix + ext)
def iterate_dir(path):
    '''Process every .csv file directly inside the directory ``path``.

    Each match is passed to ``open_csv`` together with an output name built
    by ``add_suffix``, so results are written next to the originals.
    An empty ``path`` is ignored.
    '''
    if not path:
        # An empty path (for example from a cancelled directory dialog)
        # would otherwise yield the pattern '/*.csv' and scan the
        # filesystem root.
        return
    pattern = os.path.join(path, '*.csv')
    # Take the full list of matches up front: output files are written into
    # the same directory and must not be picked up during the loop.
    for csv_path in glob.glob(pattern):
        open_csv(csv_path, add_suffix(csv_path))
def main():
    '''Ask the user for a directory and process the CSV files inside it.

    Nothing is processed when the dialog is dismissed without a selection.
    '''
    ini_path = os.path.expanduser('~/Desktop')
    open_options = dict(
        # Folder the dialog opens in. Per the original note, leaving
        # initialdir out makes the dialog remember the last directory.
        initialdir=ini_path,
        title='Select Directory',
    )
    # Create the Tk root explicitly and hide it so only the dialog shows.
    root = Tk()
    root.withdraw()
    try:
        ask_path = askdirectory(**open_options)
    finally:
        # release the hidden root window once the dialog has closed
        root.destroy()
    if not ask_path:
        # dialog cancelled: there is no directory to process
        return
    iterate_dir(ask_path)
# Start the directory picker only when this file is run as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
# [GitHub page footer, not part of the script] Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment