DrLulz · April 12, 2015 19:09
diff --git a/raymond-2.0.py b/raymond-2.0.py
 # -*- coding: utf-8 -*-

 import re
 import os
 import csv
 import glob
 import unicodedata as udata
 from Tkinter import Tk
 from cStringIO import StringIO
 from tkFileDialog import askdirectory
 from pyth.plugins.rtf15.reader import Rtf15Reader



 def decode_cell(cell):
    '''Turn the rtf markup held in one cell into a single line of plain text'''

    # 'unicode_escape' interprets the backslash escapes in the raw cell, and
    # NFKD then splits composed characters (eg \u00C7) into base + combining mark
    unescaped = cell.decode('unicode_escape')
    normalized = udata.normalize('NFKD', unescaped)

    # let pyth parse the rtf from an in-memory file object
    document = Rtf15Reader.read(StringIO(normalized))

    # walk document -> block -> run and gather every text fragment in order
    fragments = [
        piece
        for block in document.content
        for run in block.content
        for piece in run.content
    ]

    # join the fragments, then squeeze every run of whitespace to one space
    merged = ' '.join(fragments)
    return ' '.join(merged.split())



 def find_rtf(row):
    '''Return a copy of row in which every rtf cell is replaced by plain text

    row is an iterable of cells, as handed over by csv.reader in open_csv.
    Every cell is coerced to unicode, cells that begin with an rtf header
    are converted by decode_cell, and each cell is finally reduced to an
    ascii byte string (characters outside ascii are dropped).
    The result is a new list; row itself is not modified.
    '''

    # variable that will return the row to writer
    temp_row = []

    # loop through each cell
    for cell in row:

        # isinstance also covers str subclasses; with an exact type check
        # they fell through to unicode(cell), which decodes as ascii and
        # raises on any high byte
        if isinstance(cell, str):
            cell = unicode(cell, "utf-8", errors="ignore")
        else:
            cell = unicode(cell)

        # the pattern is a literal brace, two literal backslashes, then rtf
        if re.match(r'^{\\\\rtf', cell):
            # strip non-ascii characters before the cell goes to the converter
            cell = decode_cell(cell.encode('ascii', 'ignore'))

        # converted or not, the cell goes back into the row as ascii
        temp_row.append(cell.encode('ascii', 'ignore'))

    return temp_row



 def open_csv(f_ori, f_new):
    '''Copy the csv at f_ori to f_new, passing every row through find_rtf'''

    # 'rU' reads with universal newlines, 'wb' writes in binary mode;
    # the with statement closes both files, also when a row fails
    with open(f_ori, 'rU') as source, open(f_new, 'wb') as target:
        rows = csv.reader(source)
        out = csv.writer(target)

        # rows are read, converted and written one at a time
        out.writerows(find_rtf(row) for row in rows)



 def add_suffix(f_ori):
    '''Build the output path: the original name with -processed before the extension'''

    suffix = '-processed'

    # separate the directory, the bare file name and the extension
    folder, filename = os.path.split(f_ori)
    stem, ext = os.path.splitext(filename)

    # put the pieces back together with the suffix after the stem
    return os.path.join(folder, stem + suffix + ext)



 def iterate_dir(path):
    '''Process every .csv file found directly inside path

    Each match is handed to open_csv together with the output name built
    by add_suffix. An empty path is ignored and nothing is processed.
    '''

    # an empty path would turn into the pattern '/*.csv', which globs the
    # filesystem root instead of a directory somebody picked
    if not path:
        return

    # join instead of string concatenation, so a trailing separator on
    # path does not end up doubled in the pattern
    select = os.path.join(path, '*.csv')

    for i in glob.iglob(select):
        # derive the output name and send both paths to the opener
        open_csv(i, add_suffix(i))



 def main():
    '''Ask for a directory and process the csv files inside it

    Nothing is processed when the dialog is cancelled.
    '''

    ini_path = os.path.expanduser('~/Desktop')

    open_options = dict(
        # folder the dialog starts in; leave initialdir out to let the
        # dialog choose its own starting folder
        initialdir=ini_path,
        title='Select Directory'
    )

    # hide the empty root window, the dialog is the only ui needed
    root = Tk()
    root.withdraw()
    try:
        ask_path = askdirectory(**open_options)
    finally:
        # the hidden root is not needed once the dialog has closed
        root.destroy()

    # a cancelled dialog gives back an empty value; passing that on would
    # make iterate_dir build a pattern that is not a chosen directory
    if not ask_path:
        return

    # move to fx
    iterate_dir(ask_path)


 # start the directory picker only when run as a script, not on import
 if __name__ == "__main__":
    main()
	# -*- coding: utf-8 -*-

	import re
	import os
	import csv
	import glob
	import unicodedata as udata
	from Tkinter import Tk
	from cStringIO import StringIO
	from tkFileDialog import askdirectory
	from pyth.plugins.rtf15.reader import Rtf15Reader



	def decode_cell(cell):
		'''Turn the rtf markup held in one cell into a single line of plain text'''

		# probably unnecessary to normalize, but with so many encoding issues...
		# 'unicode_escape' interprets the backslash escapes in the raw cell, and
		# NFKD then splits composed characters (eg \u00C7) into base + combining mark
		cell_encode = udata.normalize('NFKD', cell.decode('unicode_escape'))

		# let pyth parse the rtf from an in-memory file object
		cell_rtf = Rtf15Reader.read(StringIO(cell_encode))

		# walk document -> block -> run and gather every text fragment in order
		combined = [
			piece
			for block in cell_rtf.content
			for run in block.content
			for piece in run.content
		]

		# combine and join the extracted text into one string (for one cell)
		new_cell = ' '.join(combined)

		# remove extra whitespace and return the converted cell
		return ' '.join(new_cell.split())



	def find_rtf(row):
		'''Return a copy of row in which every rtf cell is replaced by plain text

		row is an iterable of cells, as handed over by csv.reader in open_csv.
		Every cell is coerced to unicode, cells that begin with an rtf header
		are converted by decode_cell, and each cell is finally reduced to an
		ascii byte string (characters outside ascii are dropped).
		The result is a new list; row itself is not modified.
		'''

		# variable that will return the row to writer
		temp_row = []

		# loop through each cell
		for cell in row:

			# isinstance also covers str subclasses; with an exact type check
			# they fall through to unicode(cell), which decodes as ascii and
			# raises on any high byte
			if isinstance(cell, str):
				cell = unicode(cell, "utf-8", errors="ignore")
			else:
				cell = unicode(cell)

			# the pattern is a literal brace, two literal backslashes, then rtf
			if re.match(r'^{\\\\rtf', cell):
				# strip non-ascii characters before the cell goes to the converter
				cell = decode_cell(cell.encode('ascii', 'ignore'))

			# converted or not, the cell goes back into the row as ascii
			temp_row.append(cell.encode('ascii', 'ignore'))

		return temp_row



	def open_csv(f_ori, f_new):
		'''Copy the csv at f_ori to f_new, passing every row through find_rtf'''

		# 'rU' reads with universal newlines, 'wb' writes in binary mode;
		# the with statement closes both files, also when a row fails
		with open(f_ori, 'rU') as file1, open(f_new, 'wb') as file2:

			reader = csv.reader(file1)
			writer = csv.writer(file2)

			# loop through rows in the opened csv
			for row in reader:

				# send to fx to look for rtf syntax
				new_row = find_rtf(row)

				# write the row to new file
				writer.writerow(new_row)



	def add_suffix(f_ori):
		'''Build the output path: the original name with -processed before the extension'''

		suffix = '-processed'

		# explode full path into path, name, ext
		path, name = os.path.split(f_ori)
		name, ext = os.path.splitext(name)

		# put the pieces back together with the suffix after the name
		return os.path.join(path, name + suffix + ext)



	def iterate_dir(path):
		'''Process every .csv file found directly inside path

		Each match is handed to open_csv together with the output name built
		by add_suffix. An empty path is ignored and nothing is processed.
		'''

		# an empty path would turn into the pattern '/*.csv', which globs the
		# filesystem root instead of a directory somebody picked
		if not path:
			return

		# join instead of string concatenation, so a trailing separator on
		# path does not end up doubled in the pattern
		select = os.path.join(path, '*.csv')

		for i in glob.iglob(select):
			# derive the output name and send both paths to the opener
			open_csv(i, add_suffix(i))



	def main():
		'''Ask for a directory and process the csv files inside it

		Nothing is processed when the dialog is cancelled.
		'''

		ini_path = os.path.expanduser('~/Desktop')

		open_options = dict(
			# folder the dialog starts in; leave initialdir out to let the
			# dialog choose its own starting folder
			initialdir=ini_path,
			title='Select Directory'
		)

		# hide the empty root window, the dialog is the only ui needed
		root = Tk()
		root.withdraw()
		try:
			ask_path = askdirectory(**open_options)
		finally:
			# the hidden root is not needed once the dialog has closed
			root.destroy()

		# a cancelled dialog gives back an empty value; passing that on would
		# make iterate_dir build a pattern that is not a chosen directory
		if not ask_path:
			return

		# move to fx
		iterate_dir(ask_path)



	# start the directory picker only when run as a script, not on import
	if __name__ == "__main__":
		main()