Created
June 23, 2014 01:28
-
-
Save kelleyk/6f5bffe11fb70d694488 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import absolute_import, print_function, division, unicode_literals | |
import re | |
from collections import defaultdict, Counter | |
# Default symbols that may begin a fragment; used by count_fragments() when
# the caller does not supply its own.  Each character is one symbol.
INITIAL_SYMBOLS = b'abc'
def count_fragments(lines, initial_symbols=None):
    """Split each line into fragments and count them per initial symbol.

    A fragment is one initial symbol followed by the longest possible run of
    characters that are not initial symbols.  Every line must be a plain
    concatenation of such fragments.

    Args:
        lines: Iterable of lines (text or byte strings).  Trailing CR/LF
            characters are ignored, and a line that is empty once they are
            removed contributes nothing.  On Python 3 the lines must be of
            the same string type as ``initial_symbols``.
        initial_symbols: String whose individual characters may start a
            fragment.  Falls back to the module-level ``INITIAL_SYMBOLS``
            when omitted or empty.

    Returns:
        A ``defaultdict(Counter)`` mapping each initial symbol (as a
        length-one string) to a ``Counter`` of the fragments starting with it.

    Raises:
        AssertionError: If some part of a line does not start with an initial
            symbol; the message carries the 1-based line number.
    """
    initial_symbols = initial_symbols or INITIAL_SYMBOLS
    escaped = re.escape(initial_symbols)

    # Assemble the pattern from literals of the same string type as the
    # symbols: text and bytes cannot be concatenated on Python 3.
    if isinstance(escaped, bytes):
        pattern = re.compile(b'[' + escaped + b'][^' + escaped + b']*')
        line_endings = b'\r\n'
    else:
        pattern = re.compile('[' + escaped + '][^' + escaped + ']*')
        line_endings = '\r\n'

    fragments = defaultdict(Counter)
    for i, line in enumerate(lines):
        # Drop the terminator so it never becomes part of the last fragment.
        line = line.rstrip(line_endings)
        pos = 0
        end = len(line)
        # Walk the line with a cursor instead of re-slicing the remainder.
        while pos < end:
            m = pattern.match(line, pos)
            if not m:
                raise AssertionError('Line {} does not match regex!'.format(i+1))
            frag = m.group()
            # Slice rather than index: indexing bytes yields an int on Python 3.
            fragments[frag[:1]][frag] += 1
            pos = m.end()
    return fragments
def _as_text(value):
    """Return ``value`` as text, decoding byte strings as UTF-8."""
    if isinstance(value, bytes):
        # Undecodable bytes are replaced rather than aborting the report.
        return value.decode('utf-8', 'replace')
    return value


def format_counts(fragments):
    """Print a report of fragment counts, one section per initial symbol.

    Each section starts with the initial symbol on its own line, followed by
    one line per fragment showing its count (right-aligned, width 5) and the
    fragment.  Within a section, lines are ordered by descending count, ties
    broken by descending fragment.

    Args:
        fragments: Mapping of initial symbol to a mapping of fragment to
            count, e.g. the result of ``count_fragments``.  Byte-string
            symbols and fragments are decoded so they print as plain text
            instead of a ``b'...'`` repr.
    """
    for initial, frag_counts in fragments.items():
        print('{}'.format(_as_text(initial)))
        # Sort on the raw (count, fragment) pairs; decode only for display.
        ranked = sorted(((qty, frag) for frag, qty in frag_counts.items()), reverse=True)
        for qty, frag in ranked:
            print(' {:>5} {}'.format(qty, _as_text(frag)))
def main():
    """Command-line entry point: print fragment counts for one file.

    The first command-line argument is the path of the file to analyse.  The
    file is opened in binary mode and its lines are passed to
    ``count_fragments``; the result is printed with ``format_counts``.
    """
    import sys

    if len(sys.argv) < 2:
        # Exit with a usage message instead of an IndexError traceback.
        sys.exit('usage: {} FILE'.format(sys.argv[0]))

    with open(sys.argv[1], 'rb') as f:
        # A file object already iterates line by line; no wrapper needed.
        format_counts(count_fragments(f))


if __name__ == '__main__':
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import absolute_import, print_function, division, unicode_literals | |
import pytest | |
from fragcount import count_fragments | |
# Expected keys are written as byte strings because the input lines are bytes;
# dict(a=...) would create native-str keys, which differ from bytes on Python 3.
@pytest.mark.parametrize(('lines', 'expected'), [
    ([b'axyzbyazyzz'],
     {b'a': {b'axyz': 1, b'azyzz': 1}, b'b': {b'by': 1}}),
])
def test_count_fragments(lines, expected):
    """Check that lines are split into fragments and tallied per initial symbol."""
    result = count_fragments(lines, initial_symbols=b'abc')
    # Convert the defaultdict-of-Counter into plain dicts for comparison.
    result_pod = {k: dict(v) for k, v in result.items()}
    assert result_pod == expected
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment