Last active
October 29, 2017 01:01
-
-
Save kra3/11208171 to your computer and use it in GitHub Desktop.
A generic text-processing script, originally written for specific use cases around wiki text.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
__author__ = 'Arun KR (kra3) <[email protected]>' | |
__license__ = 'Simplified BSD' | |
import sys | |
""" | |
Usage: wiki_helper.py rules.txt data.txt > result.txt
rules.txt can be any text file with one rule per line.
A rule has the format `CODE :=> STRING`.
CODE is one of DL or RM.
DL deletes a matching line. Note: matching starts at the beginning of the line, after stripping surrounding whitespace.
RM removes every occurrence of the matching string from a line.
Output goes to standard output, so redirect it to a file or pipe it into another Unix command for further processing.
Utility developed for Malayalam Wikibooks maintainers.
""" | |
def _load_rules(rules):
    """Parse the rules file at path *rules* into two lists.

    Each usable line has the form ``CODE :=> STRING``. Lines that do not
    split into exactly two parts on ``:=>``, lines with an unknown CODE,
    and lines whose STRING is empty after stripping are ignored.

    Returns:
        A ``(delete_prefixes, remove_strings)`` pair: the STRINGs of the
        ``DL`` rules and of the ``RM`` rules, each in file order.
    """
    delete_prefixes = []
    remove_strings = []
    with open(rules) as rules_fh:
        for raw_rule in rules_fh:
            parts = raw_rule.split(':=>')
            if len(parts) != 2:  # safeguard against malformed rules
                continue
            code, expr = (part.strip() for part in parts)
            if not expr:
                # An empty DL pattern is a prefix of every line and would
                # delete the whole input; an empty RM pattern changes nothing.
                continue
            if code == 'DL':
                delete_prefixes.append(expr)
            elif code == 'RM':
                remove_strings.append(expr)
            # Any other code is silently ignored.
    return delete_prefixes, remove_strings


def wiki_helper(data, rules):
    """Filter the text file *data* according to *rules*; write to stdout.

    Args:
        data: Path of the text file to process.
        rules: Path of the rules file (one ``CODE :=> STRING`` per line).

    For every line of *data*:
      * if the line, with surrounding whitespace stripped, starts with the
        STRING of any ``DL`` rule, the line is dropped;
      * otherwise every occurrence of each ``RM`` STRING is removed (rules
        applied in file order) and the result is written to standard output.

    Returns:
        None. Errors from opening or reading either file propagate.
    """
    delete_prefixes, remove_strings = _load_rules(rules)
    # str.startswith accepts a tuple of candidates; an empty tuple never
    # matches, so a rules file without DL rules drops nothing.
    delete_prefixes = tuple(delete_prefixes)

    with open(data) as data_fh:
        for line in data_fh:  # stream instead of loading the whole file
            if line.strip().startswith(delete_prefixes):
                continue
            for expr in remove_strings:
                line = line.replace(expr, '')
            # Lines keep their own line endings, so write them verbatim.
            sys.stdout.write(line)
if __name__ == '__main__':
    # Command-line entry point: expects exactly two arguments, the rules
    # file followed by the data file.
    if len(sys.argv) != 3:
        # Report usage on stderr: stdout carries the processed text and is
        # normally redirected to a result file.
        sys.stderr.write("Incorrect format. Try:\n")
        sys.stderr.write("\twiki_helper.py rules data\n")
        sys.exit(1)
    # wiki_helper takes (data, rules), the reverse of the command-line order.
    wiki_helper(sys.argv[2], sys.argv[1])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
rules.txt
DL :=> {{കേരളത്തിലെ പക്ഷികളുടെ പട്ടിക - തുടക്കം|നിര=
RM :=> {{കേരളത്തിലെ പക്ഷികളുടെ പട്ടിക - ഉള്ളടക്കം|
RM :=> }}