Created
September 1, 2010 19:18
-
-
Save jelsas/561196 to your computer and use it in GitHub Desktop.
Functions to build full dependence model queries for Indri http://lemurproject.org/indri
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
Functions to build full dependence model queries for Indri
(http://lemurproject.org/indri).

See "A Markov Random Field Model for Term Dependencies" by Metzler & Croft:
http://ciir.cs.umass.edu/pubfiles/ir-387.pdf
'''
import re | |
# Matches one or more consecutive non-word characters; used as the delimiter
# when splitting raw query text into tokens.
nonword_chars = re.compile(r'\W+')
def powerset(l):
    '''Returns every non-empty subset of the items in l, as a list of lists.

    Items keep their original relative order inside each subset, and the
    subsets are produced in binary-counting order: the i-th item is included
    whenever bit i of the running counter is set.

    For example:
    >>> powerset([])
    []
    >>> powerset('123')
    [['1'], ['2'], ['1', '2'], ['3'], ['1', '3'], ['2', '3'], ['1', '2', '3']]
    >>> powerset([1, 2, 3, 4, 5])
    [[1], [2], [1, 2], [3], [1, 3], [2, 3], [1, 2, 3], [4], [1, 4], [2, 4], [1, 2, 4], [3, 4], [1, 3, 4], [2, 3, 4], [1, 2, 3, 4], [5], [1, 5], [2, 5], [1, 2, 5], [3, 5], [1, 3, 5], [2, 3, 5], [1, 2, 3, 5], [4, 5], [1, 4, 5], [2, 4, 5], [1, 2, 4, 5], [3, 4, 5], [1, 3, 4, 5], [2, 3, 4, 5], [1, 2, 3, 4, 5]]
    '''
    items = list(l)
    count = len(items)
    subsets = []
    # Start at 1 so that the all-zero mask (the empty set) is never emitted.
    for mask in range(1, 1 << count):
        subsets.append(
            [items[pos] for pos in range(count) if (mask >> pos) & 1])
    return subsets
def all_adjacent(l):
    '''Computes the contiguous runs (slices) of the given sequence.

    Runs are grouped by start position and, within a group, ordered by
    increasing length. Each run has the same type as a slice of l (a string
    for string input, a list for list input).

    Note that runs may start at any position except the last one, so the
    final item never appears on its own, and a sequence with fewer than two
    items yields an empty list.

    For example:
    >>> all_adjacent([])
    []
    >>> all_adjacent('123')
    ['1', '12', '123', '2', '23']
    >>> all_adjacent([1,2,3,4,5])
    [[1], [1, 2], [1, 2, 3], [1, 2, 3, 4], [1, 2, 3, 4, 5], [2], [2, 3], [2, 3, 4], [2, 3, 4, 5], [3], [3, 4], [3, 4, 5], [4], [4, 5]]
    '''
    size = len(l)
    runs = []
    # range (rather than xrange) keeps this working on both Python 2 and 3.
    for start in range(size - 1):
        for stop in range(start + 1, size + 1):
            # append instead of rebuilding the list on every iteration.
            runs.append(l[start:stop])
    return runs
def _add_field_to_q_list(l, field): | |
if field: return ['%s.(%s)' % (q, field) for q in l] | |
else: return l | |
def build_indri_query_dm(query, field=None):
    '''Builds a dependence model query from the given text string, optionally
    restricting every query component to the provided field.

    The text is split into tokens on runs of non-word characters. The result
    depends on how many tokens remain:
      * none: None is returned;
      * one: the bare token (with the field suffix, if a field was given);
      * more than five: a flat #combine of the tokens only, with no
        proximity clauses;
      * otherwise: a #weight query mixing the individual tokens (0.8),
        ordered windows over adjacent tokens (0.1) and unordered windows
        over every multi-token subset (0.1).

    For example:
    >>> build_indri_query_dm('information retrieval systems')
    '#weight( 0.8 #combine( information retrieval systems ) 0.1 #combine( #1( information retrieval ) #1( information retrieval systems ) #1( retrieval systems ) ) 0.1 #combine( #uw4( information retrieval ) #uw4( information systems ) #uw4( retrieval systems ) #uw8( information retrieval systems ) ) )'
    >>> build_indri_query_dm('information retrieval systems', 'title')
    '#weight( 0.8 #combine( information.(title) retrieval.(title) systems.(title) ) 0.1 #combine( #1( information retrieval ).(title) #1( information retrieval systems ).(title) #1( retrieval systems ).(title) ) 0.1 #combine( #uw4( information retrieval ).(title) #uw4( information systems ).(title) #uw4( retrieval systems ).(title) #uw8( information retrieval systems ).(title) ) )'
    '''
    terms = [piece for piece in nonword_chars.split(query) if piece]
    n_terms = len(terms)

    if n_terms == 0:
        return None
    if n_terms == 1:
        only = terms[0]
        return '%s.(%s)' % (only, field) if field else only

    # The plain bag-of-terms clause is shared by the long-query fallback and
    # the full dependence model query.
    unigram_clause = '#combine( %s )' % ' '.join(
        _add_field_to_q_list(terms, field))
    if n_terms > 5:
        # Long queries skip the proximity clauses entirely (presumably to
        # keep the number of generated windows manageable).
        return unigram_clause

    # Ordered windows: contiguous runs of two or more terms.
    ordered = []
    for run in all_adjacent(terms):
        if len(run) > 1:
            ordered.append('#1( %s )' % ' '.join(run))

    # Unordered windows: every subset of two or more terms; the window width
    # is 4 for each term beyond the first.
    unordered = []
    for subset in powerset(terms):
        if len(subset) > 1:
            width = (len(subset) - 1) * 4
            unordered.append('#uw%d( %s )' % (width, ' '.join(subset)))

    ordered_clause = '#combine( %s )' % ' '.join(
        _add_field_to_q_list(ordered, field))
    unordered_clause = '#combine( %s )' % ' '.join(
        _add_field_to_q_list(unordered, field))
    return '#weight( 0.8 %s 0.1 %s 0.1 %s )' % (
        unigram_clause, ordered_clause, unordered_clause)
# Script entry point: treat each command-line argument as one query string
# and print the dependence model query built from it.
if __name__ == "__main__":
    import sys
    for q in sys.argv[1:]:
        # Single-argument call form prints identically on Python 2 and 3,
        # whereas the bare print statement is a SyntaxError on Python 3.
        print(build_indri_query_dm(q))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment