Created
July 30, 2011 04:48
-
-
Save GaretJax/1115208 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import re | |
def iterwords(fh):
    """Yield ``(line_index, word)`` pairs for every word in *fh*.

    *fh* is any iterable of text lines (typically an open file).  The
    line index is zero-based, as produced by ``enumerate``.  Each line is
    stripped, split on runs of whitespace, and every resulting word is
    lower-cased before being yielded.

    Note that a blank (or whitespace-only) line still yields exactly one
    pair whose word is the empty string, because splitting an empty
    string with ``re.split`` returns ``['']``.
    """
    for lineno, raw_line in enumerate(fh):
        tokens = re.split(r'\s+', raw_line.strip())
        for token in tokens:
            # Extra normalisation (e.g. stripping punctuation) can be
            # added here.  If a regex is used for that, compile it once
            # outside the loop; calling re.sub per word is slow.
            yield lineno, token.lower()
def search(fh, query):
    """Find every occurrence of the phrase *query* in the lines of *fh*.

    The query is lower-cased and split on whitespace into keywords; a
    match is a run of consecutive words (as produced by ``iterwords``)
    equal to those keywords.  A phrase may continue across line breaks.

    Parameters:
        fh: iterable of text lines, passed straight to ``iterwords``.
        query: the phrase to look for; matching is case-insensitive.

    Returns:
        A tuple of ``(line, lines_count)`` pairs, one per occurrence, in
        the order found.  ``line`` is the zero-based index of the line
        holding the first word of the match and ``lines_count`` is the
        number of lines from that one through the line holding the last
        word, inclusive.  An empty or whitespace-only query returns ``()``.
    """
    keywords = query.lower().split()
    if not keywords:
        # An empty phrase has nothing to match; without this guard it
        # would degenerate into "match every blank line".
        return ()

    from collections import deque

    size = len(keywords)
    last_keyword = keywords[-1]
    # Sliding window over the most recent `size` words.  Unlike pulling
    # look-ahead words from the shared iterator, this never discards
    # words after a failed partial match (so overlapping occurrences are
    # found) and never reads past the end of the input.
    window = deque(maxlen=size)
    matches = []
    for lineno, word in iterwords(fh):
        if not word:
            # iterwords reports blank lines as an empty word; they are
            # not part of the text being searched.
            continue
        window.append((lineno, word))
        if word != last_keyword or len(window) < size:
            continue
        if all(seen == wanted
               for (_, seen), wanted in zip(window, keywords)):
            first_line = window[0][0]
            matches.append((first_line, lineno - first_line + 1))
    return tuple(matches)
if __name__ == '__main__':
    # Command-line entry point: <script> QUERY FILE
    # Searches FILE for the phrase QUERY and prints every occurrence
    # together with the lines it covers.
    if len(sys.argv) < 3:
        sys.exit("usage: {0} QUERY FILE".format(sys.argv[0]))

    query = sys.argv[1]
    # The context manager guarantees the file is closed even on error.
    with open(sys.argv[2]) as fh:
        matches = search(fh, query)

        # From here on it's only presentation.
        fh.seek(0)
        # Keep the lines in a list so each match can be looked up by
        # index.  Walking a single shared iterator breaks as soon as two
        # matches start on the same line or overlap.
        lines = fh.readlines()

    for lineno, linecount in matches:
        # Slicing is safe even if the range runs past the end of the file.
        result_lines = lines[lineno:lineno + linecount]
        # Parenthesised single-argument print works on Python 2 and 3.
        print("Match found on line {0} (spanning {1} lines):\n > {2}".format(
            lineno, linecount, ' > '.join(result_lines).strip()))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment