Created
July 5, 2017 00:02
-
-
Save mmas/9f11dde0cceb352f97bba54feb2bd28c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""Emit one ``word<TAB>1`` record per word found on standard input.

Only the text between a line starting with ``*** START OF THIS PROJECT``
and a line starting with ``*** END OF THIS PROJECT`` is processed
(presumably Project Gutenberg header/footer markers -- confirm against
the input corpus).  Everything outside those markers is ignored.

The script runs unchanged on Python 2 and Python 3.
"""
import re
import sys

# Marker prefixes that delimit the body of the text to be processed.
START_MARKER = '*** START OF THIS PROJECT'
END_MARKER = '*** END OF THIS PROJECT'

# Punctuation removed from each line before it is split into words.
# Compiled once here instead of being looked up again for every line.
_PUNCTUATION = re.compile(r'["?!.,;:()-]')


def map_words(lines):
    """Yield the lowercased words found between the start and end markers.

    Args:
        lines: Iterable of text lines (for example ``sys.stdin``).

    Yields:
        Each whitespace-separated word, lowercased and with the characters
        ``" ? ! . , ; : ( ) -`` removed, in input order.  Neither marker
        line contributes any words.  Nothing is yielded if the start
        marker never appears.
    """
    started = False
    for line in lines:
        if not started:
            # Still in the preamble: skip lines until the start marker
            # is seen; the marker line itself is not processed.
            started = line.startswith(START_MARKER)
            continue
        if line.startswith(END_MARKER):
            # Everything after the end marker is ignored.
            break
        # Filter out some punctuation marks and set to lowercase.
        for word in _PUNCTUATION.sub('', line).lower().split():
            yield word


def main():
    """Read lines from stdin and print a ``word<TAB>1`` record per word."""
    for word in map_words(sys.stdin):
        # Parenthesised single-argument print is valid on Python 2 and 3.
        print('%s\t1' % word)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment