Last active
February 10, 2021 12:14
-
-
Save evansde77/910ea58bd59d43dc9b0b3e6995f7b424 to your computer and use it in GitHub Desktop.
Playing around with an ascii histogram example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
Example wordcount and ascii histogram script | |
- Writes a data file in a temporary dir | |
- defines function to parse file into a word iterable | |
- Histogram class that does the word count and draws it as | |
ascii strings with various sorting options | |
Example usage: | |
data_file = write_data_file() | |
histogram = Histogram() | |
histogram.populate(words(data_file)) | |
print histogram.draw() | |
[ ] [ ] [====] [ ] [====] [ ] [ ] [====] | |
[ ] [ ] [====] [ ] [====] [ ] [ ] [====] | |
[ ] [ ] [====] [====] [====] [====] [====] [====] | |
[====] [====] [====] [====] [====] [====] [====] [====] | |
derp wibble foo baz bork womp whizz bar | |
""" | |
import os | |
import re | |
import string | |
import tempfile | |
import collections | |
# Regex matching any single character from string.punctuation; words()
# uses it to delete punctuation from each line before splitting.
PUNCTUATION = re.compile(
    '[{}]'.format(re.escape(string.punctuation))
)

# Sample text fixture that write_data_file() writes to disk.
DATA = """
foo bar baz womp
whizz bar foo
womp. derp, bork!
bork bork bork!!
foo foo bar bar
baz whizz wibble
"""
def write_data_file():
    """
    Write the DATA fixture to a file named ``data.txt`` inside a newly
    created temporary directory.

    Nothing is deleted here; the file and its directory are left on
    disk for the caller to clean up.

    :returns: full path of the file that was written
    """
    target = os.path.join(tempfile.mkdtemp(), "data.txt")
    with open(target, 'w') as out:
        out.write(DATA)
    return target
def words(filename):
    """
    Lazily yield the cleaned-up words contained in a text file.

    Each line has every character matched by PUNCTUATION removed and
    is then split on whitespace; the resulting words are yielded one
    at a time, in file order.

    :param filename: path of the file to read
    :returns: generator of word strings
    """
    with open(filename, 'r') as source:
        for raw_line in source:
            cleaned = PUNCTUATION.sub('', raw_line)
            # split() without arguments already drops surrounding
            # whitespace and never produces empty strings
            for token in cleaned.split():
                yield token
class Column(list):
    """
    Column

    List subclass holding the cells of a single histogram column,
    carrying two extra attributes that the drawing code sorts on:

    - ``column_name``: label of the column (None until assigned)
    - ``value``: numeric value of the column (0 until assigned)
    """

    def __init__(self, *elements):
        # positional arguments are handed to list() untouched, so the
        # usual call is Column(iterable) or Column()
        super(Column, self).__init__(*elements)
        self.value = 0
        self.column_name = None
class Histogram(dict):
    """
    Histogram

    Dictionary based helper object mapping each word to the number of
    times it has been seen, able to draw itself as an ascii histogram
    with several sorting options.

    Only dict methods available on both Python 2 and Python 3
    (``values``/``items``) are used, so the class runs on either.

    Attributes set by :meth:`populate`:

    - ``width``: number of distinct words (columns)
    - ``height``: largest count of any word (rows of cells)
    - ``column_width``: character width of every column
    - ``data_entry`` / ``blank_entry``: filled and empty cell strings
    - ``columns``: padded column name -> Column instance
    """

    def __init__(self):
        super(Histogram, self).__init__()
        self.height = None
        self.width = None
        self.column_width = None
        self.data_entry = None
        self.blank_entry = None
        self.columns = {}

    def populate(self, iterable):
        """
        populate

        Consume the iterable provided, counting every word seen, then
        recompute the histogram dimensions and rebuild the columns.
        Counts accumulate across calls; an empty iterable is accepted.

        :param iterable: iterable of word strings
        """
        for word in iterable:
            self[word] = self.get(word, 0) + 1

        self.width = len(self)
        # max() raises ValueError on an empty sequence, so guard it
        self.height = max(self.values()) if self else 0
        # measure every known word, not just the ones in this call,
        # so a later call with shorter words cannot shrink the columns
        longest = max(len(word) for word in self) if self else 0
        # a cell is at least '[]', so never go narrower than 2 chars
        self.column_width = max(longest, 2)

        padding = self.column_width - 2
        self.data_entry = '[{}]'.format('=' * padding)
        self.blank_entry = '[{}]'.format(' ' * padding)
        self._build_columns()

    def _make_column(self, count, name):
        """
        build a column instance containing the appropriate filled
        and blank entries, set the name and value fields to aid
        sorting

        The column is stored bottom-up: the name first, then the
        filled cells, then blank cells up to the histogram height.

        :param count: number of filled entries in the column
        :param name: name of the column
        :returns: Column instance
        """
        cells = [name]
        cells.extend([self.data_entry] * count)
        cells.extend([self.blank_entry] * (self.height - count))
        col = Column(cells)
        col.value = count
        col.column_name = name
        return col

    def _build_columns(self):
        """
        build the internal column data structure so that we can consume
        it to draw the histogram
        """
        self.columns = {}
        for word, count in self.items():
            # pad the name so every column has the same width
            column_name = word.ljust(self.column_width, ' ')
            self.columns[column_name] = self._make_column(count, column_name)

    def _format(self, columns):
        """
        given a list of columns, draw the formatted histogram string

        Rows are emitted top-down by indexing each column from its end,
        finishing with the row of column names. Empty columns are
        skipped. An unpopulated histogram is treated as height 0.

        :param columns: list of column instances
        :returns: histogram string, starting with a newline and with
            every row terminated by a newline
        """
        height = self.height or 0
        rows = [
            ' '.join([col[-i] for col in columns if col])
            for i in range(1, height + 2)
        ]
        return '\n' + ''.join(row + '\n' for row in rows)

    def draw(self):
        """
        draw - create an unsorted histogram string

        :returns: histogram string with columns in dict order
        """
        return self._format(list(self.columns.values()))

    def draw_sorted_size(self, descending=False):
        """
        draw sorted size

        create a histogram sorted by bin population, defaults to
        ascending order, this can be flipped using the descending=True flag

        :param descending: sort largest-first when True
        :returns: histogram string
        """
        ordered = sorted(
            self.columns.values(),
            key=lambda col: col.value,
            reverse=descending
        )
        return self._format(ordered)

    def draw_sorted_title(self, descending=False):
        """
        draw sorted title - create a representation of the histogram
        sorted by title (alphabetically)

        :param descending: sort in reverse alphabetical order when True
        :returns: histogram string
        """
        ordered = sorted(
            self.columns.values(),
            key=lambda col: col.column_name,
            reverse=descending
        )
        return self._format(ordered)
if __name__ == '__main__':
    #
    # main test program
    #
    # create test data file
    data_file = write_data_file()
    try:
        # create and populate the histogram instance
        histogram = Histogram()
        histogram.populate(words(data_file))

        # draw some histograms using various sorting approaches.
        # print(x) with a single argument gives the same output under
        # the Python 2 print statement and the Python 3 print function.
        print(histogram.draw())
        print(histogram.draw_sorted_size())
        print(histogram.draw_sorted_size(descending=True))
        print(histogram.draw_sorted_title())
        print(histogram.draw_sorted_title(descending=True))
    finally:
        # clean up the data file even if drawing failed, then the
        # temporary directory write_data_file() created for it;
        # rmdir only succeeds on an empty directory, so nothing else
        # can be deleted by accident
        os.remove(data_file)
        os.rmdir(os.path.dirname(data_file))
@nottings: the update should take care of punctuation; non-ASCII characters and Unicode are left as an exercise for the reader :)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Very nice. Don't forget some texts contain punctuation. Example: "The quick brown fox jumps over the lazy dog. The dog barks!"
"dog." and "dog" should both be treated as "dog"