dumpmycode · March 23, 2016 04:36
diff --git a/wordcount.py b/wordcount.py
 #!/usr/bin/python -tt
 # Copyright 2010 Google Inc.
 # Licensed under the Apache License, Version 2.0
 # http://www.apache.org/licenses/LICENSE-2.0

 # Google's Python Class
 # http://code.google.com/edu/languages/google-python-class/

 """Wordcount exercise
 Google's Python class

 The main() below is already defined and complete. It calls print_words()
 and print_top() functions which you write.

 1. For the --count flag, implement a print_words(filename) function that counts
 how often each word appears in the text and prints:
 word1 count1
 word2 count2
 ...

 Print the above list in order sorted by word (python will sort punctuation to
 come before letters -- that's fine). Store all the words as lowercase,
 so 'The' and 'the' count as the same word.

 2. For the --topcount flag, implement a print_top(filename) which is similar
 to print_words() but which prints just the top 20 most common words sorted
 so the most common word is first, then the next most common, and so on.

 Use str.split() (no arguments) to split on all whitespace.

 Workflow: don't build the whole program at once. Get it to an intermediate
 milestone and print your data structure and sys.exit(0).
 When that's working, try for the next milestone.

 Optional: define a helper function to avoid code duplication inside
 print_words() and print_top().

 """

 import sys

 # +++your code here+++
 # Define print_words(filename) and print_top(filename) functions.
 # You could write a helper utility function that reads a file
 # and builds and returns a word/count dict for it.
 # Then print_words() and print_top() can just call the utility function.

    '''
    This code is approx. 20x slower than Google's wordcount solution.
    Based on a profile run, the list.count call is the bottleneck, and
    there are a few other places that could be optimized as well,
    such as avoiding string concatenation, using xrange instead of range, and
    building the dict directly instead of going through two intermediate lists.
    Things to learn and improve in the future.
    
    example:
    
    #! /usr/bin/env python
    
    import profile

    f = '10k.txt'
    def func(f):
    # calling str.lower this way is way faster
    # only 8 function calls.
    with open(f) as fo:
        s = fo.read().lower().split()
    return s

    def func(f):
    # calling str.lower this way is very slow
    # (18k function calls)
        s=''
        with open(f) as fo:
            for line in fo:
                s += line.lower()
        s = s.split()
        return s
        
    print(func(f))
    profile.run('func(f)')
    '''

 def churn(fn):
     """Read a text file and return a word/count dict for it.

     The file contents are lowercased and split on whitespace with
     str.split() (no arguments), so 'The' and 'the' count as the same word
     and punctuation stays attached to the word it touches.

     Args:
         fn: path of the text file to read.

     Returns:
         A dict mapping each lowercase word to the number of times it occurs
         in the file. An empty (or whitespace-only) file yields an empty dict.
     """
     # Read and lowercase the whole file in one go; split() already treats
     # newlines as whitespace, so no per-line concatenation is needed.
     with open(fn) as fo:
         words = fo.read().lower().split()

     # Count in a single pass. Calling list.count() once per unique word
     # rescans the whole word list every time, which is quadratic.
     counts = {}
     for word in words:
         counts[word] = counts.get(word, 0) + 1
     return counts


 def print_words(filename):
     """Print every word of the file with its count, ordered by word.

     Output is one 'word count' pair per line, in ascending word order.
     """
     counts = churn(filename)
     # Words are unique dict keys, so sorting the (word, count) pairs
     # orders them purely by word.
     for word, count in sorted(counts.items()):
         print('{} {}'.format(word, count))

 def print_top(filename):
     """Print up to 20 of the most common words in the file, most common first.

     Output is one 'word count' pair per line, in descending count order.
     """
     ranked = list(churn(filename).items())
     # Order the (word, count) pairs by count, largest first.
     ranked.sort(key=lambda pair: pair[1], reverse=True)
     for word, count in ranked[:20]:
         print('{} {}'.format(word, count))

 ###

 # This basic command line argument parsing code is provided and
 # calls the print_words() and print_top() functions which you must define.
 def main():
     """Parse the command line and dispatch to print_words() or print_top().

     Expects exactly two arguments after the script name: an option
     ('--count' or '--topcount') followed by a filename. On a wrong number
     of arguments or an unrecognized option, prints a message and exits
     with status 1.
     """
     if len(sys.argv) != 3:
         # print(...) with a single parenthesized argument emits the same text
         # on Python 2 and Python 3, unlike the print statement.
         print('usage: ./wordcount.py {--count | --topcount} file')
         sys.exit(1)

     option = sys.argv[1]
     filename = sys.argv[2]
     if option == '--count':
         print_words(filename)
     elif option == '--topcount':
         print_top(filename)
     else:
         print('unknown option: ' + option)
         sys.exit(1)

 # Run the command-line entry point only when this file is executed as a
 # script, not when it is imported as a module.
 if __name__ == '__main__':
  main()
	#!/usr/bin/python -tt
	# Copyright 2010 Google Inc.
	# Licensed under the Apache License, Version 2.0
	# http://www.apache.org/licenses/LICENSE-2.0

	# Google's Python Class
	# http://code.google.com/edu/languages/google-python-class/

	"""Wordcount exercise
	Google's Python class

	The main() below is already defined and complete. It calls print_words()
	and print_top() functions which you write.

	1. For the --count flag, implement a print_words(filename) function that counts
	how often each word appears in the text and prints:
	word1 count1
	word2 count2
	...

	Print the above list in order sorted by word (python will sort punctuation to
	come before letters -- that's fine). Store all the words as lowercase,
	so 'The' and 'the' count as the same word.

	2. For the --topcount flag, implement a print_top(filename) which is similar
	to print_words() but which prints just the top 20 most common words sorted
	so the most common word is first, then the next most common, and so on.

	Use str.split() (no arguments) to split on all whitespace.

	Workflow: don't build the whole program at once. Get it to an intermediate
	milestone and print your data structure and sys.exit(0).
	When that's working, try for the next milestone.

	Optional: define a helper function to avoid code duplication inside
	print_words() and print_top().

	"""

	import sys

	# +++your code here+++
	# Define print_words(filename) and print_top(filename) functions.
	# You could write a helper utility function that reads a file
	# and builds and returns a word/count dict for it.
	# Then print_words() and print_top() can just call the utility function.

	'''
	This code is approx. 20x slower than Google's wordcount solution.
	Based on a profile run, the list.count call is the bottleneck, and
	there are a few other places that could be optimized as well,
	such as avoiding string concatenation, using xrange instead of range, and
	building the dict directly instead of going through two intermediate lists.
	Things to learn and improve in the future.

	example:

	#! /usr/bin/env python

	import profile

	f = '10k.txt'
	def func(f):
	# calling str.lower this way is way faster
	# only 8 function calls.
	with open(f) as fo:
	s = fo.read().lower().split()
	return s

	def func(f):
	# calling str.lower this way is very slow
	# (18k function calls)
	s=''
	with open(f) as fo:
	for line in fo:
	s += line.lower()
	s = s.split()
	return s

	print(func(f))
	profile.run('func(f)')
	'''

	def churn(fn):
	    """Read a text file and return a word/count dict for it.

	    The file contents are lowercased and split on whitespace with
	    str.split() (no arguments), so 'The' and 'the' count as the same word
	    and punctuation stays attached to the word it touches.

	    Args:
	        fn: path of the text file to read.

	    Returns:
	        A dict mapping each lowercase word to the number of times it occurs
	        in the file. An empty (or whitespace-only) file yields an empty dict.
	    """
	    # Read and lowercase the whole file in one go; split() already treats
	    # newlines as whitespace, so no per-line concatenation is needed.
	    with open(fn) as fo:
	        words = fo.read().lower().split()

	    # Count in a single pass. Calling list.count() once per unique word
	    # rescans the whole word list every time, which is quadratic.
	    counts = {}
	    for word in words:
	        counts[word] = counts.get(word, 0) + 1
	    return counts


	def print_words(filename):
	    """Print every word of the file with its count, ordered by word.

	    Output is one 'word count' pair per line, in ascending word order.
	    """
	    counts = churn(filename)
	    # Words are unique dict keys, so sorting the (word, count) pairs
	    # orders them purely by word.
	    for word, count in sorted(counts.items()):
	        print('{} {}'.format(word, count))

	def print_top(filename):
	    """Print up to 20 of the most common words in the file, most common first.

	    Output is one 'word count' pair per line, in descending count order.
	    """
	    ranked = list(churn(filename).items())
	    # Order the (word, count) pairs by count, largest first.
	    ranked.sort(key=lambda pair: pair[1], reverse=True)
	    for word, count in ranked[:20]:
	        print('{} {}'.format(word, count))

	###

	# This basic command line argument parsing code is provided and
	# calls the print_words() and print_top() functions which you must define.
	def main():
	    """Parse the command line and dispatch to print_words() or print_top().

	    Expects exactly two arguments after the script name: an option
	    ('--count' or '--topcount') followed by a filename. On a wrong number
	    of arguments or an unrecognized option, prints a message and exits
	    with status 1.
	    """
	    if len(sys.argv) != 3:
	        # print(...) with a single parenthesized argument emits the same text
	        # on Python 2 and Python 3, unlike the print statement. The usage
	        # text uses a plain '|' between the two options (no backslash).
	        print('usage: ./wordcount.py {--count | --topcount} file')
	        sys.exit(1)

	    option = sys.argv[1]
	    filename = sys.argv[2]
	    if option == '--count':
	        print_words(filename)
	    elif option == '--topcount':
	        print_top(filename)
	    else:
	        print('unknown option: ' + option)
	        sys.exit(1)

	# Run the command-line entry point only when this file is executed as a
	# script, not when it is imported as a module.
	if __name__ == '__main__':
	    main()
No results found