Created
June 28, 2013 18:23
-
-
Save Slater-Victoroff/5886861 to your computer and use it in GitHub Desktop.
Pyenchant spell checking with ElasticSearch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class ElasticEnchant:
    """Build one-word-per-line dictionary files from ElasticSearch text.

    The wrapped ``esDatabase`` object only needs to expose a ``url``
    attribute, which is used as the base of the search URL.
    """

    def __init__(self, esDatabase):
        """Store the ElasticSearch handle.

        Args:
            esDatabase: Object with a ``url`` attribute; it is read by
                ``produce_dictionary`` when building the search request.
        """
        self.es_instance = esDatabase

    def produce_dictionary(self, output_file, **kwargs):
        """Write a dictionary file built from ElasticSearch search hits.

        The ``searchable_text`` field of every returned hit is lower-cased
        and split into runs of the letters ``a-z``. The resulting unique
        words, merged with any words loaded from ``source_file``, are
        written to ``output_file`` one per line in sorted order.

        Args:
            output_file: Path of the dictionary to write (UTF-8). An
                existing file at that path is overwritten.
            **kwargs: Optional settings:
                index: Index name to search; pass several as one
                    comma-delimited string. Defaults to ``"_all"``.
                source_file: Path to an existing one-word-per-line
                    dictionary (UTF-8) whose words are merged into the
                    output.
                max_results: Maximum number of hits requested from the
                    search. Defaults to 50000.

        Raises:
            requests.HTTPError: If the search request returns an error
                status.
        """
        index = kwargs.get("index", "_all")
        max_results = kwargs.get("max_results", 50000)

        words = set()
        source_file = kwargs.get("source_file")
        if source_file:
            words |= self._read_word_list(source_file)

        query = "_search?size=" + str(max_results) + "&q=*.*"
        url = "/".join([self.es_instance.url, index, query])
        response = requests.get(url)
        # Fail with the HTTP error itself rather than a confusing KeyError
        # on 'hits' when the server answers with an error document.
        response.raise_for_status()

        for entry in response.json()['hits']['hits']:
            # Hits from other indices/types may lack the field entirely.
            source = entry.get("_source") or {}
            words |= self._extract_words(source.get("searchable_text"))

        # Text mode so str can be written on Python 3; newline="\n" keeps
        # the untranslated line endings the original binary mode produced.
        with open(output_file, 'w', encoding='utf-8', newline='\n') as dictionary:
            # Sorted so repeated runs over the same data give identical files.
            dictionary.writelines(word + "\n" for word in sorted(words))

    @staticmethod
    def _extract_words(text):
        """Return the set of lower-cased ``a-z`` runs found in ``text``.

        Empty, missing or non-string values produce an empty set.
        """
        if not text or not isinstance(text, str):
            return set()
        return set(re.findall(r'[a-z]+', text.lower()))

    @staticmethod
    def _read_word_list(path):
        """Return the non-blank lines of ``path`` as a set of words.

        Surrounding whitespace (including the line terminator) is stripped
        so loaded words compare equal to freshly extracted ones and are not
        written back with a doubled newline.
        """
        with open(path, encoding='utf-8') as source:
            stripped = (line.strip() for line in source)
            return {word for word in stripped if word}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Super