Created
February 11, 2019 21:31
-
-
Save jrgavilanes/2db2020c40d64c0c0ef7b9e86a7e98ce to your computer and use it in GitHub Desktop.
text index python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Module-level inverted index: maps each lowercased word to the list of
# document ids whose content contains that word.
superindice = {}
def indexa(id, content, index):
    """Add a document to the inverted index.

    The content is lowercased and split on whitespace, and every distinct
    resulting token is mapped to the document id. Punctuation is not
    stripped, so "hola!" and "hola" are indexed as different tokens.

    Args:
        id: Identifier of the document being indexed.
        content: Text of the document.
        index: Dict mapping each word to a list of document ids. It is
            modified in place.
    """
    for word in set(content.lower().split()):
        ids = index.setdefault(word, [])
        # Indexing the same document twice must not duplicate its id.
        if id not in ids:
            ids.append(id)
def desindexa(id, index):
    """Remove a document from the inverted index.

    The id is removed from the list of every word that references it, and
    words left with no documents are dropped from the index.

    Args:
        id: Identifier of the document to remove.
        index: Dict mapping each word to a list of document ids. It is
            modified in place.
    """
    emptied = []
    for word, ids in index.items():
        if id in ids:
            ids.remove(id)
            if not ids:
                emptied.append(word)
    # Deletion is deferred: a dict must not change size while it is iterated.
    for word in emptied:
        del index[word]
def busca(words, index):
    """Return the ids of the documents that contain every word of the query.

    The query is lowercased and split on whitespace, and duplicate words are
    ignored. A document matches only if it is listed under all query words.

    Args:
        words: Query string with one or more whitespace-separated words.
        index: Dict mapping each word to a list of document ids.

    Returns:
        A list of matching document ids in no particular order. The list is
        empty when the query has no words or when any word has no documents.
    """
    terms = set(words.lower().split())
    if not terms:
        return []

    postings = []
    for term in terms:
        ids = index.get(term)
        # A word with no documents makes the whole intersection empty.
        if not ids:
            return []
        postings.append(set(ids))
    return list(set.intersection(*postings))
def main():
    """Run a small demo: index two texts, remove one, and search the index."""
    # Optional: sample text could be generated with Faker instead.
    # from faker import Faker
    # fake = Faker()
    # print(fake.paragraph(10))

    archivo1 = "Hola! En un lugar de Holanda"
    archivo2 = "de cuyo nombre no quiero acordarme como una ola"

    indexa("archivo1", archivo1, superindice)
    indexa("archivo2", archivo2, superindice)

    # After removing archivo1, only archivo2 remains searchable.
    desindexa("archivo1", superindice)

    print(busca("quiero ola", superindice))
# Run the demo only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment