Created
September 19, 2017 11:15
-
-
Save mdamien/fc5d366534a68c6eb6ddb894e9201b10 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import collections
import math
import os
import string
import sys
import unicodedata
# Per-file search outcome, as filled in by search():
#   file           - name of the file inside the indexed directory
#   matches        - stripped lines that contain the query
#   n_lines        - number of lines scanned in the file
#   n_exact_match  - number of space-separated words containing the query
#   n_exact_length - summed length of the stripped lines
#   title_match    - whether the query occurs in the file name
Result = collections.namedtuple(
    'Result',
    'file matches n_lines n_exact_match n_exact_length title_match',
)
# Characters that are all folded to a plain ASCII space.
_SPACE_LIKE = (
    ' '
    '\N{NO-BREAK SPACE}'
    '\N{EN SPACE}'
    '\N{EM SPACE}'
    '\N{THREE-PER-EM SPACE}'
    '\N{FOUR-PER-EM SPACE}'
    '\N{SIX-PER-EM SPACE}'
    '\N{FIGURE SPACE}'
    '\N{PUNCTUATION SPACE}'
    '\N{THIN SPACE}'
    '\N{HAIR SPACE}'
    '\N{ZERO WIDTH SPACE}'
    '\N{NARROW NO-BREAK SPACE}'
    '\N{MEDIUM MATHEMATICAL SPACE}'
    '\N{IDEOGRAPHIC SPACE}'
    '\N{IDEOGRAPHIC HALF FILL SPACE}'
    '\N{ZERO WIDTH NO-BREAK SPACE}'
    '\N{TAG SPACE}'
)

# Translation table shared by every remove_accents() call; it is built once
# because the function runs for every line of every file on each search.
# The defaultdict answers None for any code point not listed below, which
# makes str.translate delete that character.
_CHAR_TABLE = collections.defaultdict(lambda: None)
_CHAR_TABLE.update({ord(c): ' ' for c in _SPACE_LIKE})
_CHAR_TABLE.update({ord(c): c.lower() for c in string.ascii_letters})
_CHAR_TABLE.update({ord(c): c for c in string.digits})


def remove_accents(s):
    """Return *s* reduced to lowercase ASCII letters, digits and spaces.

    The string is first put in Unicode NFD form so that an accented letter
    becomes its base letter followed by combining marks; the base letter is
    kept (lowercased) and the marks are dropped.  The space-like characters
    in ``_SPACE_LIKE`` become a plain space.  Every other character
    (punctuation, symbols, letters with no ASCII base) is removed.
    """
    # Without the decomposition step an accented letter is a single code
    # point missing from the table and would be deleted outright.
    return unicodedata.normalize('NFD', s).translate(_CHAR_TABLE)
# Directory whose files are indexed, one document per file.
# ('data/poemes/' was the value assigned just before this one in the
# original script; swap it in to index that directory instead.)
DIR = 'data/pcdwiki/'


def _read_lines(path):
    """Return the lines of the file at *path* and close the file."""
    # NOTE(review): the platform default encoding is used, as before;
    # confirm the encoding of the data files before pinning one.
    with open(path) as handle:
        return list(handle)


# In-memory index: a (file name, list of raw lines) pair for every entry of
# DIR, loaded once at import time.
FILES = [(file, _read_lines(DIR + file)) for file in os.listdir(DIR)]
print('index built')
def _score(r):
    """Return the ranking score of a Result; higher scores rank first.

    The score adds three log-scaled terms: the number of matching lines,
    the match density (matching lines / total lines) and the document
    length.  It is multiplied by 1.6 when the query also occurs in the
    file name.  Comments in the original script record that plain density
    and a length-weighted match count were tried as alternatives.
    """
    n_matches = len(r.matches)
    doc_score = math.log(n_matches) * 0.5 \
        + math.log(1 + n_matches / r.n_lines) * 4 \
        + math.log(r.n_lines) * 0.2
    if r.title_match:
        return doc_score * 1.6
    return doc_score


def search(q):
    """Search the in-memory FILES index for *q* and return ranked Results.

    The query and every line are normalized with remove_accents() after
    lowercasing, and a line matches when the normalized query is a
    substring of the normalized line.  Files with at least one matching
    line yield a Result; the results are sorted by descending _score().
    Progress messages and a statistics dump for the five best results are
    printed to stdout.

    A query that normalizes to the empty string (for example one made only
    of punctuation) returns an empty list: an empty needle would otherwise
    be "found" in every line of every file.
    """
    q = remove_accents(q.lower())
    if not q:
        print(0, 'results')
        return []

    results = []
    print('searching..')
    for file, content in FILES:
        title_match = q in remove_accents(file.lower())
        matches = []
        n_lines = 0
        n_exact_match = 0
        n_exact_length = 0
        for line in content:
            line = line.strip()
            clean_line = remove_accents(line.lower())
            # Count the space-separated words that contain the query.
            n_exact_match += sum(q in word for word in clean_line.split(' '))
            if q in clean_line:
                # Keep the original (un-normalized) text for display.
                matches.append(line)
            n_lines += 1
            n_exact_length += len(line)
        if matches:
            results.append(Result(file=file,
                                  matches=matches, n_lines=n_lines,
                                  n_exact_match=n_exact_match,
                                  n_exact_length=n_exact_length,
                                  title_match=title_match))
    print('search finished')

    # sorted() is stable, so equally scored files keep their FILES order.
    sorted_results = sorted(results, key=_score, reverse=True)
    print(len(sorted_results), 'results')
    for r in sorted_results[:5]:
        print()
        print(r.file)
        print('density:', len(r.matches) / r.n_lines)
        print(' - log density:', math.log(1 + len(r.matches) / r.n_lines))
        print(' - composite density', len(r.matches) / r.n_lines * math.log(len(r.matches)))
        print('length:', r.n_lines)
        print(' - log length:', math.log(r.n_lines))
        print('matches:', len(r.matches))
        print(' - log matches:', math.log(len(r.matches)))
        print('n_exact_match:', r.n_exact_match)
        print('n_exact_length:', r.n_exact_length)
        print('score:', _score(r))
    return sorted_results
if __name__ == '__main__':
    # Exit with a usage hint rather than an IndexError traceback when the
    # query argument is missing.
    if len(sys.argv) < 2:
        sys.exit('usage: {} QUERY'.format(sys.argv[0]))
    search(sys.argv[1])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment