Created
July 9, 2013 23:09
-
-
Save shenfeng/5962147 to your computer and use it in GitHub Desktop.
教老婆学习python:了解倒排索引是怎么回事。
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
__author__ = 'feng' | |
articles = [ | |
'Familiarize Yourself with IntelliJ IDEA editor', | |
'While keeping the Ctrl key pressed, rotate the mouse wheel. As you rotate the mouse wheel forward', | |
'font size grows larger; as you rotate the mouse wheel backwards, font size decreases', | |
'In the popup frame, type Reset font size, and click Enter.', | |
'These operations apply to the active editor only. In the other editor tabs, font size is not affected.', | |
'There is no default keyboard shortcut associated with Reset font size action. However, you can create your', | |
'Place the caret in the editor.' | |
] | |
# 给每篇文章编号:id | |
article_map = dict(zip(range(len(articles)), articles)) | |
# 建立倒排索引 | |
def create_index(): | |
# 单词到对应的文章id | |
# {'word': [1, 2, 3], 'the': [7, 9], 'active': [1]} | |
index = {} | |
for id, article in article_map.items(): | |
words = article.split() | |
for word in words: | |
word = word.lower() | |
if word in index: | |
index[word].add(id) | |
else: | |
index[word] = set([id]) | |
return index | |
def search_index(query): | |
# 切词 | |
keywords = query.split() | |
# 先建立倒排索引。实际中,这个应该时预先建立的 | |
index = create_index() | |
if keywords: | |
ids = index.get(keywords[0], set()) | |
for q in keywords[1:]: | |
# 集合的 交 运算 | |
ids = ids & index.get(q, set()) | |
for id in ids: | |
print article_map[id] | |
# 直接搜索,需要扫瞄所有的文章。当文章数很多时,有效率问题。 | |
def search(keyword): | |
for article in articles: | |
if keyword in article: | |
yield article | |
def main(): | |
search_index("rotate mouse") | |
# for result in search('the'): | |
# print re.sub("the", "<b>the</b>", result) | |
# print result | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
article_map = dict(enumerate(articles))
create_index函数中,index = defaultdict(set)
search_index函数中,
ids = set()
for key in keywords:
ids = ids & index.get(key, set())