Skip to content

Instantly share code, notes, and snippets.

@junaidpv
Created April 20, 2011 03:17
Show Gist options
  • Save junaidpv/930271 to your computer and use it in GitHub Desktop.
Save junaidpv/930271 to your computer and use it in GitHub Desktop.
import codecs
import re
from datetime import datetime
# from http://www.peterbe.com/plog/uniqifiers-benchmark
def f5(seq, idfun=None):
    """Return the items of *seq* with duplicates removed, preserving order.

    The first occurrence of each item is kept; later occurrences are dropped.

    Args:
        seq: Any iterable of items.
        idfun: Optional callable mapping an item to the marker used to decide
            whether two items are duplicates. Defaults to the identity
            function, i.e. items are compared by their own value. Markers
            must be hashable because they are stored in a set.

    Returns:
        A new list holding the first item seen for each distinct marker, in
        the order those items appeared in *seq*.
    """
    if idfun is None:
        def idfun(x):
            return x

    seen = set()  # markers already emitted
    result = []
    for item in seq:
        marker = idfun(item)
        if marker in seen:
            continue
        seen.add(marker)
        result.append(item)
    return result
# Extract every HTML-comment body (<!-- ... -->) from o_source.txt, drop
# duplicates while keeping first-seen order, and write the result to
# title_list.txt, one title per line.
#
# Every print call below passes a single, fully formatted string so the
# output is the same whether `print` is a statement or a function.
start_time = datetime.now()

# Read the whole UTF-8 encoded source file; the `with` block guarantees the
# handle is closed even if reading or decoding fails.
with codecs.open('o_source.txt', 'r', encoding='utf-8') as input_file:
    print("Read entire content of source file...")
    source_text = input_file.read()
    print("Close input file.")

print("Find all titles in the text...")
# Non-greedy capture of the text between each comment opener and closer.
# `.` does not match newlines here, so only single-line comments are found.
titles = re.findall(r'\<!--(.*?)--\>', source_text)
# Uniquify titles
titles = f5(titles)

print("Join all titles separated by newline...")
dest_text = '\n'.join(titles)

print("Write to output file...")
# The output file is opened only now, so a failure while reading or parsing
# the source does not truncate an existing title_list.txt.
with codecs.open('title_list.txt', 'w', encoding='utf-8') as output_file:
    output_file.write(dest_text)
    print("Close output file...")

end_time = datetime.now()
diff_time = end_time - start_time
print("Done!")
print("Time taken to process:  %s" % diff_time)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment