evansde77 · February 10, 2021 12:14 · nottings · Jun 1, 2016 · evansde77 · Jun 1, 2016
diff --git a/word_count_histogram.py b/word_count_histogram.py
 #!/usr/bin/env python
 """
 Example wordcount and ascii histogram script

 - Writes a data file in a temporary dir
 - defines function to parse file into a word iterable
 - Histogram class that does the word count and draws it as
  ascii strings with various sorting options

 Example usage:

 data_file = write_data_file()
 histogram = Histogram()
 histogram.populate(words(data_file))
 print histogram.draw()

 [    ] [    ] [====] [    ] [====] [    ] [    ] [====]
 [    ] [    ] [====] [    ] [====] [    ] [    ] [====]
 [    ] [    ] [====] [====] [====] [====] [====] [====]
 [====] [====] [====] [====] [====] [====] [====] [====]
 derp   wibble foo    baz    bork   womp   whizz  bar


 """
 import os
 import re
 import string
 import tempfile
 import collections

 PUNCTUATION = re.compile('[{}]'.format(re.escape(string.punctuation)))

 DATA = \
 """
 foo bar baz womp
 whizz bar foo

 womp. derp, bork!
 bork bork bork!!
 foo foo bar bar
 baz whizz wibble

 """


 def write_data_file():
    """
    Write the DATA fixture to a file called ``data.txt`` inside a
    freshly created temporary directory.

    :returns: path of the file that was written

    """
    target = os.path.join(tempfile.mkdtemp(), "data.txt")
    with open(target, 'w') as out:
        out.write(DATA)
    return target


 def words(filename):
    """
    Lazily yield the cleaned-up words found in a text file.

    Every line has the characters matched by PUNCTUATION removed and
    is then split on whitespace; each non-empty token is yielded in
    the order it appears in the file.

    :param filename: path of the text file to read

    :returns: generator of word strings

    """
    with open(filename, 'r') as handle:
        for raw_line in handle:
            cleaned = PUNCTUATION.sub('', raw_line)
            for token in cleaned.split():
                stripped = token.strip()
                if stripped:
                    yield stripped


 class Column(list):
    """
    Column

    A list holding the cells of one histogram column. Two extra
    attributes, column_name and value, travel with the list so a
    column can be ordered by either of them.
    """
    def __init__(self, *elements):
        # arguments are handed straight to list(), so at most one
        # iterable of initial cells may be given
        super(Column, self).__init__(*elements)
        self.value = 0
        self.column_name = None


 class Histogram(dict):
    """
    Histogram

    dictionary based helper object: maps each word to the number of
    times it has been seen and draws those counts as an ascii
    histogram with one column per word

    Attributes filled in by populate():

    - width: number of distinct words
    - height: largest count of any single word (0 when empty)
    - column_width: character width of every column
    - data_entry / blank_entry: text of a filled / empty cell
    - columns: dict of padded column name -> Column instance

    """
    # narrowest column in which a filled cell '[=]' can still be told
    # apart from a blank cell '[ ]'
    MIN_COLUMN_WIDTH = 3

    def __init__(self):
        super(Histogram, self).__init__()
        self.height = None
        self.width = None
        self.column_width = None
        self.data_entry = None
        self.blank_entry = None
        self.columns = {}

    def populate(self, iterable):
        """
        populate

        Populate self by consuming the iterable provided, counting
        every word seen, then recompute the drawing geometry and
        rebuild the columns. May be called more than once: counts
        accumulate and the geometry always covers every word stored
        so far.

        :param iterable: iterable word list generator

        """
        for word in iterable:
            self[word] = self.get(word, 0) + 1

        self.width = len(self)
        # values() exists on both Python 2 and 3 dicts; an empty
        # histogram gets height 0 instead of letting max() raise
        self.height = max(self.values()) if self else 0
        # measure every stored word, not just this batch, so an earlier
        # longer word still fits after a later populate() call
        longest = max(len(word) for word in self) if self else 0
        self.column_width = max(longest, self.MIN_COLUMN_WIDTH)
        # two characters of every cell are taken by the brackets
        padding = self.column_width - 2
        self.data_entry = '[{}]'.format('=' * padding)
        self.blank_entry = '[{}]'.format(' ' * padding)
        self._build_columns()

    def _make_column(self, count, name):
        """
        build a column instance containing the appropriate filled
        and blank entries, set the name and value fields to aid
        sorting

        The name is stored first, then the filled cells, then the
        blank cells, so reading the column backwards walks it from
        the top of the drawing down to the name.

        :param count: number of entries in the column
        :param name: name of the column

        :returns: Column instance

        """
        cells = [name]
        cells.extend([self.data_entry] * count)
        cells.extend([self.blank_entry] * (self.height - count))
        col = Column(cells)
        col.value = count
        col.column_name = name
        return col

    def _build_columns(self):
        """
        build the internal column data structure so that we can consume
        it to draw the histogram
        """
        self.columns = {}
        for word, count in self.items():
            # pad the name so every column has the same width
            column_name = word.ljust(self.column_width, ' ')
            self.columns[column_name] = self._make_column(count, column_name)

    def _format(self, columns):
        """
        given an array of columns, draw the formatted histogram string

        Rows are emitted top-down: the cells furthest from the name
        first, the row of column names last.

        :param columns: list of column instances

        :returns: the histogram as one string; it starts with a
            newline and every row ends with one

        """
        # drop empty columns once; also lets any iterable be passed in
        columns = [col for col in columns if col]
        # height is None until populate() has run; draw that as empty
        height = self.height or 0
        rows = [
            ' '.join(col[-i] for col in columns)
            for i in range(1, height + 2)
        ]
        return "\n" + ''.join(row + '\n' for row in rows)

    def draw(self):
        """
        draw - create an unsorted histogram string

        """
        return self._format(list(self.columns.values()))

    def draw_sorted_size(self, descending=False):
        """
        draw sorted size

        create a histogram sorted by bin population, defaults to
        ascending order, this can be flipped using the descending=True flag

        """
        ordered = sorted(
            self.columns.values(),
            key=lambda col: col.value,
            reverse=descending
        )
        return self._format(ordered)

    def draw_sorted_title(self, descending=False):
        """
        draw sorted title - create a representation of the histogram
        sorted by title (alphabetically), reversed when descending=True

        """
        ordered = sorted(
            self.columns.values(),
            key=lambda col: col.column_name,
            reverse=descending
        )
        return self._format(ordered)


 if __name__ == '__main__':
    #
    # main test program
    #
    # create test data file
    data_file = write_data_file()
    try:
        # create and populate the histogram instance
        histogram = Histogram()
        histogram.populate(words(data_file))

        # draw some histograms using various sorting approaches;
        # print(x) with a single argument behaves the same on
        # Python 2 and Python 3
        print(histogram.draw())
        print(histogram.draw_sorted_size())
        print(histogram.draw_sorted_size(descending=True))
        print(histogram.draw_sorted_title())
        print(histogram.draw_sorted_title(descending=True))
    finally:
        # clean up the data file even if drawing failed, then the
        # temporary directory write_data_file() created to hold it
        os.remove(data_file)
        os.rmdir(os.path.dirname(data_file))
	#!/usr/bin/env python
	"""
	Example wordcount and ascii histogram script

	- Writes a data file in a temporary dir
	- defines function to parse file into a word iterable
	- Histogram class that does the word count and draws it as
	ascii strings with various sorting options

	Example usage:

	data_file = write_data_file()
	histogram = Histogram()
	histogram.populate(words(data_file))
	print histogram.draw()

	[ ] [ ] [====] [ ] [====] [ ] [ ] [====]
	[ ] [ ] [====] [ ] [====] [ ] [ ] [====]
	[ ] [ ] [====] [====] [====] [====] [====] [====]
	[====] [====] [====] [====] [====] [====] [====] [====]
	derp wibble foo baz bork womp whizz bar


	"""
	import os
	import re
	import string
	import tempfile
	import collections

	PUNCTUATION = re.compile('[{}]'.format(re.escape(string.punctuation)))

	DATA = \
	"""
	foo bar baz womp
	whizz bar foo

	womp. derp, bork!
	bork bork bork!!
	foo foo bar bar
	baz whizz wibble

	"""


	def write_data_file():
	    """
	    Write the DATA fixture to a file called ``data.txt`` inside a
	    freshly created temporary directory.

	    :returns: path of the file that was written

	    """
	    tempdir = tempfile.mkdtemp()
	    datafile = os.path.join(tempdir, "data.txt")
	    with open(datafile, 'w') as handle:
	        handle.write(DATA)
	    return datafile


	def words(filename):
	    """
	    Lazily yield the cleaned-up words found in a text file.

	    Every line has the characters matched by PUNCTUATION removed and
	    is then split on whitespace; each non-empty token is yielded in
	    the order it appears in the file.

	    :param filename: path of the text file to read

	    :returns: generator of word strings

	    """
	    with open(filename, 'r') as handle:
	        for line in handle:
	            line = PUNCTUATION.sub('', line)
	            linewords = (word.strip() for word in line.split() if word.strip())
	            for word in linewords:
	                yield word


	class Column(list):
	    """
	    Column

	    A list holding the cells of one histogram column. Two extra
	    attributes, column_name and value, travel with the list so a
	    column can be ordered by either of them.
	    """
	    def __init__(self, *elements):
	        # arguments are handed straight to list(), so at most one
	        # iterable of initial cells may be given
	        super(Column, self).__init__(*elements)
	        self.column_name = None
	        self.value = 0


	class Histogram(dict):
	    """
	    Histogram

	    dictionary based helper object: maps each word to the number of
	    times it has been seen and draws those counts as an ascii
	    histogram with one column per word

	    Attributes filled in by populate():

	    - width: number of distinct words
	    - height: largest count of any single word (0 when empty)
	    - column_width: character width of every column
	    - data_entry / blank_entry: text of a filled / empty cell
	    - columns: dict of padded column name -> Column instance

	    """
	    # narrowest column in which a filled cell '[=]' can still be told
	    # apart from a blank cell '[ ]'
	    MIN_COLUMN_WIDTH = 3

	    def __init__(self):
	        super(Histogram, self).__init__()
	        self.height = None
	        self.width = None
	        self.column_width = None
	        self.data_entry = None
	        self.blank_entry = None
	        self.columns = {}

	    def populate(self, iterable):
	        """
	        populate

	        Populate self by consuming the iterable provided, counting
	        every word seen, then recompute the drawing geometry and
	        rebuild the columns. May be called more than once: counts
	        accumulate and the geometry always covers every word stored
	        so far.

	        :param iterable: iterable word list generator

	        """
	        for word in iterable:
	            self[word] = self.get(word, 0) + 1

	        self.width = len(self)
	        # values() exists on both Python 2 and 3 dicts; an empty
	        # histogram gets height 0 instead of letting max() raise
	        self.height = max(self.values()) if self else 0
	        # measure every stored word, not just this batch, so an earlier
	        # longer word still fits after a later populate() call
	        longest = max(len(word) for word in self) if self else 0
	        self.column_width = max(longest, self.MIN_COLUMN_WIDTH)
	        # two characters of every cell are taken by the brackets
	        padding = self.column_width - 2
	        self.data_entry = '[{}]'.format('=' * padding)
	        self.blank_entry = '[{}]'.format(' ' * padding)
	        self._build_columns()

	    def _make_column(self, count, name):
	        """
	        build a column instance containing the appropriate filled
	        and blank entries, set the name and value fields to aid
	        sorting

	        The name is stored first, then the filled cells, then the
	        blank cells, so reading the column backwards walks it from
	        the top of the drawing down to the name.

	        :param count: number of entries in the column
	        :param name: name of the column

	        :returns: Column instance

	        """
	        cells = [name]
	        cells.extend([self.data_entry] * count)
	        cells.extend([self.blank_entry] * (self.height - count))
	        col = Column(cells)
	        col.value = count
	        col.column_name = name
	        return col

	    def _build_columns(self):
	        """
	        build the internal column data structure so that we can consume
	        it to draw the histogram
	        """
	        self.columns = {}
	        for word, count in self.items():
	            # pad the name so every column has the same width
	            column_name = word.ljust(self.column_width, ' ')
	            self.columns[column_name] = self._make_column(count, column_name)

	    def _format(self, columns):
	        """
	        given an array of columns, draw the formatted histogram string

	        Rows are emitted top-down: the cells furthest from the name
	        first, the row of column names last.

	        :param columns: list of column instances

	        :returns: the histogram as one string; it starts with a
	            newline and every row ends with one

	        """
	        # drop empty columns once; also lets any iterable be passed in
	        columns = [col for col in columns if col]
	        # height is None until populate() has run; draw that as empty
	        height = self.height or 0
	        rows = [
	            ' '.join(col[-i] for col in columns)
	            for i in range(1, height + 2)
	        ]
	        return "\n" + ''.join(row + '\n' for row in rows)

	    def draw(self):
	        """
	        draw - create an unsorted histogram string

	        """
	        return self._format(list(self.columns.values()))

	    def draw_sorted_size(self, descending=False):
	        """
	        draw sorted size

	        create a histogram sorted by bin population, defaults to
	        ascending order, this can be flipped using the descending=True flag

	        """
	        ordered = sorted(
	            self.columns.values(),
	            key=lambda col: col.value,
	            reverse=descending
	        )
	        return self._format(ordered)

	    def draw_sorted_title(self, descending=False):
	        """
	        draw sorted title - create a representation of the histogram
	        sorted by title (alphabetically), reversed when descending=True

	        """
	        ordered = sorted(
	            self.columns.values(),
	            key=lambda col: col.column_name,
	            reverse=descending
	        )
	        return self._format(ordered)


	if __name__ == '__main__':
	    #
	    # main test program
	    #
	    # create test data file
	    data_file = write_data_file()
	    try:
	        # create and populate the histogram instance
	        histogram = Histogram()
	        histogram.populate(words(data_file))

	        # draw some histograms using various sorting approaches;
	        # print(x) with a single argument behaves the same on
	        # Python 2 and Python 3
	        print(histogram.draw())
	        print(histogram.draw_sorted_size())
	        print(histogram.draw_sorted_size(descending=True))
	        print(histogram.draw_sorted_title())
	        print(histogram.draw_sorted_title(descending=True))
	    finally:
	        # clean up the data file even if drawing failed, then the
	        # temporary directory write_data_file() created to hold it
	        os.remove(data_file)
	        os.rmdir(os.path.dirname(data_file))