brianly · October 9, 2013 03:43
diff --git a/yammerwords.py b/yammerwords.py
 import sys
 import csv
 import argparse
 import re
 import nltk

 from operator import itemgetter
 from string import punctuation

 def replace_topic(matched_topic):
     """Return the bare topic name for one matched ``[Tag:<id>:<name>]`` marker.

     Intended as the replacement callback handed to ``re.sub``.

     Args:
         matched_topic: Regex match object whose ``group(0)`` is the full
             tag marker, e.g. ``[Tag:123:python]``.

     Returns:
         The matched text with the ``[Tag:<digits>:`` prefix and one
         trailing ``]`` removed, e.g. ``python``.
     """
     tag = re.sub(r'\[Tag:\d{1,9}:', '', matched_topic.group(0))

     # Slice off the closing bracket instead of using str.replace():
     # replace() would delete every occurrence of that character, and
     # indexing the last character fails when the text is empty.
     if tag.endswith(']'):
         tag = tag[:-1]
     return tag

 def convert_message(raw_message):
     """Rewrite ``[Tag:<id>:<name>]`` markers in a message as plain text.

     Every marker matched by the tag pattern is handed to ``replace_topic``
     and substituted with whatever that callback returns.  A message that
     contains no marker is returned unchanged.
     """
     tag_marker = re.compile(r'\[Tag:\d{1,9}:([a-zA-Z])*\]')

     # Guard clause: nothing to rewrite when no marker is present.
     if tag_marker.search(raw_message) is None:
         return raw_message

     return tag_marker.sub(replace_topic, raw_message)

 def tokenize(message):
     """Break a message into tokens using NLTK's word tokenizer.

     The result is exactly what ``nltk.word_tokenize`` returns for
     ``message``; no lower-casing or stripping is applied here.

     NOTE(review): NLTK appears to emit punctuation as separate tokens
     rather than dropping it, so callers still have to clean the tokens --
     confirm against the NLTK documentation.
     """
     tokens = nltk.word_tokenize(message)
     return tokens

 def _strip_affixes(word):
     """Trim surrounding punctuation and ``&quot`` entities from one token."""
     trailing = '.,:?;!\')"'
     leading = '"\'()'
     entities = ('&quot;', '&quot')

     # Repeat until stable so mixed orders such as ``word.)`` or
     # ``word.&quot;`` are cleaned completely.
     previous = None
     while word != previous:
         previous = word
         word = word.rstrip(trailing).lstrip(leading)
         for entity in entities:
             # str.rstrip/lstrip treat their argument as a *set of
             # characters*, so rstrip('&quot') would also eat the tail of
             # ordinary words such as "about".  Match the entity as a whole.
             if word.endswith(entity):
                 word = word[:-len(entity)]
             if word.startswith(entity):
                 word = word[len(entity):]
     return word


 def _count_words(rows):
     """Tally cleaned tokens from the ``body`` column of each CSV row."""
     from collections import Counter  # local: not imported at file level

     counts = Counter()
     for row in rows:
         for token in tokenize(convert_message(row['body'])):
             word = _strip_affixes(token)
             if word:  # pure-punctuation tokens clean down to ''
                 counts[word] += 1
     return counts


 def main(args=None):
     """Run the sub-command selected on the command line.

     For ``frequency``, reads the CSV export at ``args.export``, converts
     and tokenizes each row's ``body`` column, trims surrounding
     punctuation from every token and prints ``word<TAB>count`` lines in
     ascending order of count.  Tokens that are empty after trimming are
     not counted.

     Args:
         args: Parsed argument namespace providing ``command`` and, for
             ``frequency``, ``export``.  ``None`` or an unrecognised
             command is treated as "nothing to do".

     Returns:
         0 in all cases; the script entry point passes it to ``sys.exit``.

     NOTE(review): the ``--file`` option declared by the argument parser is
     not read here, so output always goes to stdout.
     """
     if getattr(args, 'command', None) == 'frequency':
         # Binary mode is what the Python 2 csv module expects.
         # NOTE(review): Python 3 would need text mode with newline=''.
         with open(args.export, 'rb') as fi:
             word_counts = _count_words(csv.DictReader(fi))

         for word, count in sorted(word_counts.items(), key=itemgetter(1)):
             # Single parenthesised argument: valid on Python 2 and 3.
             print('%s\t%s' % (word, count))

     return 0



 if __name__ == "__main__":
     # Top-level parser; each action is exposed as its own sub-command.
     parser = argparse.ArgumentParser(
         description='Process words in a Yammer message export.')
     commands = parser.add_subparsers()

     # "frequency" sub-command: records its own name in ``command`` so
     # main() can tell which action was requested.
     frequency_cmd = commands.add_parser('frequency')
     frequency_cmd.set_defaults(command='frequency')
     frequency_cmd.add_argument('--file', type=str, help='Optional output file')
     frequency_cmd.add_argument('export', type=str, help='Message export file')

     # The process exit status is whatever main() returns.
     sys.exit(main(parser.parse_args()))
 Yammer
	import sys
	import csv
	import argparse
	import re
	import nltk

	from operator import itemgetter
	from string import punctuation

	def replace_topic(matched_topic):
	    """Return the bare topic name for one matched ``[Tag:<id>:<name>]`` marker.

	    Intended as the replacement callback handed to ``re.sub``.

	    Args:
	        matched_topic: Regex match object whose ``group(0)`` is the full
	            tag marker, e.g. ``[Tag:123:python]``.

	    Returns:
	        The matched text with the ``[Tag:<digits>:`` prefix and one
	        trailing ``]`` removed, e.g. ``python``.
	    """
	    tag = re.sub(r'\[Tag:\d{1,9}:', '', matched_topic.group(0))

	    # Slice off the closing bracket instead of using str.replace():
	    # replace() would delete every occurrence of that character, and
	    # indexing the last character fails when the text is empty.
	    if tag.endswith(']'):
	        tag = tag[:-1]
	    return tag

	def convert_message(raw_message):
	    """Rewrite ``[Tag:<id>:<name>]`` markers in a message as plain text.

	    Every marker matched by the tag pattern is handed to ``replace_topic``
	    and substituted with whatever that callback returns.  A message that
	    contains no marker is returned unchanged.

	    Args:
	        raw_message: Message body text, possibly containing tag markers.

	    Returns:
	        The message with each marker replaced, or ``raw_message`` itself
	        when no marker is found.
	    """
	    # Raw string keeps the regex escapes away from Python's own
	    # string-escape processing.
	    tag_marker = re.compile(r'\[Tag:\d{1,9}:([a-zA-Z])*\]')

	    if tag_marker.search(raw_message) is None:
	        return raw_message

	    return tag_marker.sub(replace_topic, raw_message)

	def tokenize(message):
	    """Break a message into tokens using NLTK's word tokenizer.

	    Args:
	        message: Text to split.

	    Returns:
	        Exactly what ``nltk.word_tokenize`` returns for ``message``; no
	        lower-casing or stripping is applied here.

	    NOTE(review): NLTK appears to emit punctuation as separate tokens
	    rather than dropping it, so callers still have to clean the tokens --
	    confirm against the NLTK documentation.
	    """
	    return nltk.word_tokenize(message)

	def _strip_affixes(word):
	    """Trim surrounding punctuation and ``&quot`` entities from one token."""
	    trailing = '.,:?;!\')"'
	    leading = '"\'()'
	    entities = ('&quot;', '&quot')

	    # Repeat until stable so mixed orders such as ``word.)`` or
	    # ``word.&quot;`` are cleaned completely.
	    previous = None
	    while word != previous:
	        previous = word
	        word = word.rstrip(trailing).lstrip(leading)
	        for entity in entities:
	            # str.rstrip/lstrip treat their argument as a *set of
	            # characters*, so rstrip('&quot') would also eat the tail of
	            # ordinary words such as "about".  Match the entity as a whole.
	            if word.endswith(entity):
	                word = word[:-len(entity)]
	            if word.startswith(entity):
	                word = word[len(entity):]
	    return word


	def _count_words(rows):
	    """Tally cleaned tokens from the ``body`` column of each CSV row."""
	    from collections import Counter  # local: not imported at file level

	    counts = Counter()
	    for row in rows:
	        for token in tokenize(convert_message(row['body'])):
	            word = _strip_affixes(token)
	            if word:  # pure-punctuation tokens clean down to ''
	                counts[word] += 1
	    return counts


	def main(args=None):
	    """Run the sub-command selected on the command line.

	    For ``frequency``, reads the CSV export at ``args.export``, converts
	    and tokenizes each row's ``body`` column, trims surrounding
	    punctuation from every token and prints ``word<TAB>count`` lines in
	    ascending order of count.  Tokens that are empty after trimming are
	    not counted.

	    Args:
	        args: Parsed argument namespace providing ``command`` and, for
	            ``frequency``, ``export``.  ``None`` or an unrecognised
	            command is treated as "nothing to do".

	    Returns:
	        0 in all cases; the script entry point passes it to ``sys.exit``.

	    NOTE(review): the ``--file`` option declared by the argument parser is
	    not read here, so output always goes to stdout.
	    """
	    if getattr(args, 'command', None) == 'frequency':
	        # Binary mode is what the Python 2 csv module expects.
	        # NOTE(review): Python 3 would need text mode with newline=''.
	        with open(args.export, 'rb') as fi:
	            word_counts = _count_words(csv.DictReader(fi))

	        for word, count in sorted(word_counts.items(), key=itemgetter(1)):
	            # Single parenthesised argument: valid on Python 2 and 3.
	            print('%s\t%s' % (word, count))

	    return 0



	if __name__ == "__main__":
	    # Top-level parser; each action is exposed as its own sub-command.
	    parser = argparse.ArgumentParser(
	        description='Process words in a Yammer message export.')
	    subparsers = parser.add_subparsers()

	    # "frequency" sub-command: records its own name in ``command`` so
	    # main() can tell which action was requested.
	    parser_install = subparsers.add_parser('frequency')
	    parser_install.add_argument('--file', type=str, help='Optional output file')
	    parser_install.add_argument('export', type=str, help='Message export file')
	    parser_install.set_defaults(command='frequency')

	    args = parser.parse_args()

	    # The process exit status is whatever main() returns.
	    sys.exit(main(args))
	Yammer