JGVerdugo · November 8, 2012 03:31
diff --git a/punct.py b/punct.py
 #!/usr/bin/python

 import codecs
 import os
 import re
 import unicodedata

 # Loads a UTF-8 text file into memory as a character string
 def readDoc(filename):
 	"""Load a UTF-8 text file into memory and return it as one string.

 	filename -- path of the file to read.

 	Returns the decoded contents of the whole file. Errors from
 	codecs.open() or read() (missing file, invalid UTF-8) are not caught
 	here and propagate to the caller.
 	"""
 	# The with-block closes the handle even when read() raises; it also
 	# avoids the local name "file", which shadows the Python 2 builtin.
 	with codecs.open(filename, "r", "UTF-8") as handle:
 		return handle.read()

 # Writes a character string (doc) as a UTF-8 text file
 def writeDoc(doc, filename="newfile.txt"):
 	"""Write the character string doc to disk as a UTF-8 text file.

 	doc      -- the text to write.
 	filename -- destination path; defaults to "newfile.txt" so existing
 	            one-argument calls keep writing to the same place.

 	An existing file at that path is overwritten (it is opened in "w" mode).
 	"""
 	# The with-block guarantees the handle is closed even if write() raises.
 	with codecs.open(filename, "w", "UTF-8") as handle:
 		handle.write(doc)
 	
 # Gets the character's Unicode category and its name. Returns a tuple containing these two values.
 def getCharName(char):
 	"""Return a (category, name) tuple describing a single character.

 	char -- a one-character Unicode string.

 	The category comes from unicodedata.category() and the name from
 	unicodedata.name(); the latter raises ValueError when the character
 	has no name assigned.
 	"""
 	return (unicodedata.category(char), unicodedata.name(char))

 # Replaces every punctuation or symbol character (Unicode categories P* and S*) in the document
 # with a tag containing the character's Unicode name, working at string level
 def handleText(doc):
 	"""Replace punctuation and symbol characters with <UNICODE NAME> tags.

 	doc -- the text to process, as a Unicode character string.

 	Returns a new string in which every character whose Unicode general
 	category starts with "P" (punctuation) or "S" (symbol) is replaced by
 	its Unicode name wrapped in angle brackets, e.g. "," becomes "<COMMA>".
 	All other characters are copied through unchanged.
 	"""
 	pieces = []
 	for char in doc:
 		# unicodedata.category() returns a two-letter code; its first
 		# letter is the major class, so checking it is equivalent to the
 		# earlier "[PS][cdefimkos]*" regex match without the regex call.
 		if unicodedata.category(char).startswith(("P", "S")):
 			pieces.append("<" + unicodedata.name(char) + ">")
 		else:
 			pieces.append(char)
 	# Join once at the end instead of growing a string with += per character.
 	return "".join(pieces)
 		
 # Script driver: read text.txt, tag its characters, and hand the result to writeDoc.
 doc = handleText(readDoc("text.txt"))
 writeDoc(doc)
	#!/usr/bin/python

	import codecs
	import os
	import re
	import unicodedata

	# Loads a UTF-8 text file into memory as a character string
	def readDoc(filename):
		"""Load a UTF-8 text file into memory and return it as one string.

		filename -- path of the file to read.

		Returns the decoded contents of the whole file. Errors from
		codecs.open() or read() (missing file, invalid UTF-8) are not caught
		here and propagate to the caller.
		"""
		# The body indentation was lost in this copy and is restored here.
		# The with-block closes the handle even when read() raises.
		with codecs.open(filename, "r", "UTF-8") as handle:
			return handle.read()

	# Writes a character string (doc) as a UTF-8 text file
	def writeDoc(doc, filename="newfile.txt"):
		"""Write the character string doc to disk as a UTF-8 text file.

		doc      -- the text to write.
		filename -- destination path; defaults to "newfile.txt" so existing
		            one-argument calls keep writing to the same place.

		An existing file at that path is overwritten (it is opened in "w" mode).
		"""
		# The body indentation was lost in this copy and is restored here.
		# The with-block guarantees the handle is closed even if write() raises.
		with codecs.open(filename, "w", "UTF-8") as handle:
			handle.write(doc)

	# Gets the character's Unicode category and its name. Returns a tuple containing these two values.
	def getCharName(char):
		"""Return a (category, name) tuple describing a single character.

		char -- a one-character Unicode string.

		The category comes from unicodedata.category() and the name from
		unicodedata.name(); the latter raises ValueError when the character
		has no name assigned.
		"""
		# The body indentation was lost in this copy and is restored here.
		category = unicodedata.category(char)
		name = unicodedata.name(char)
		return (category, name)

	# Replaces every punctuation or symbol character (Unicode categories P* and S*) in the document
	# with a tag containing the character's Unicode name, working at string level
	def handleText(doc):
		"""Replace punctuation and symbol characters with <UNICODE NAME> tags.

		doc -- the text to process, as a Unicode character string.

		Returns a new string in which every character whose Unicode general
		category starts with "P" (punctuation) or "S" (symbol) is replaced by
		its Unicode name wrapped in angle brackets, e.g. "," becomes "<COMMA>".
		All other characters are copied through unchanged.
		"""
		# The nesting (loop body, conditional body) was lost in this copy and
		# is restored here: only the tag substitution is conditional, while
		# every character (tagged or not) is appended to the output.
		pieces = []
		for char in doc:
			# unicodedata.category() returns a two-letter code; its first
			# letter is the major class, so checking it is equivalent to the
			# earlier "[PS][cdefimkos]*" regex match without the regex call.
			if unicodedata.category(char).startswith(("P", "S")):
				pieces.append("<" + unicodedata.name(char) + ">")
			else:
				pieces.append(char)
		# Join once at the end instead of growing a string with += per character.
		return "".join(pieces)

	# Script driver: read text.txt, tag its characters, and hand the result to writeDoc.
	doc = handleText(readDoc("text.txt"))
	writeDoc(doc)
No results found