Some samples with the Elasticsearch DSL in Python
from elasticsearch_dsl.connections import connections
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
from datetime import datetime
from elasticsearch_dsl import DocType, String, Date, Nested, Boolean, analyzer

class MerchantTerm(DocType):
    title = String()
    classified_by = String()
    created_at = Date()
    category = String(fields={'raw': String(index='not_analyzed')})

    class Meta:
        index = 'merchant_terms'

    def save(self, **kwargs):
        self.created_at = datetime.now()
        super().save(**kwargs)

class MerchantSignature(DocType):
    ms_description = String()
    ms_classified_by = String(fields={'raw': String(index='not_analyzed')})
    created_at = Date()
    ms_category = String(fields={'raw': String(index='not_analyzed')})

    class Meta:
        index = 'merchant_signatures'

    def save(self, **kwargs):
        self.created_at = datetime.now()
        super().save(**kwargs)

class MerchantSignatureCleaned(DocType):
    ms_description = String()
    ms_transactionCount = String()
    ms_classified_by = String(fields={'raw': String(index='not_analyzed')})
    created_at = Date()
    ms_category = String(fields={'raw': String(index='not_analyzed')})

    class Meta:
        index = 'merchant_signatures_cleaned'

    def save(self, **kwargs):
        self.created_at = datetime.now()
        super().save(**kwargs)

class TransactionDescription(DocType):
    description_1 = String()
    description_2 = String()
    description_3 = String()
    classified_by = String(fields={'raw': String(index='not_analyzed')})
    created_at = Date()
    category = String(fields={'raw': String(index='not_analyzed')})

    class Meta:
        index = 'transaction_terms'

    def save(self, **kwargs):
        self.created_at = datetime.now()
        super().save(**kwargs)

class DebitCardTransaction(DocType):
    dc_description = String()
    dc_classified_by = String(fields={'raw': String(index='not_analyzed')})  # term used for classification
    created_at = Date()
    dc_category = String(fields={'raw': String(index='not_analyzed')})
    dc_mcc = String(fields={'raw': String(index='not_analyzed')})  # the MCC applied
    dc_gm_classification = String(fields={'raw': String(index='not_analyzed')})  # from the Google Maps API

    class Meta:
        index = 'debit_card_transactions'

    def save(self, **kwargs):
        self.created_at = datetime.now()
        super().save(**kwargs)

class ClassificationTerm(DocType):
    dc_term = String(fields={'raw': String(index='not_analyzed')})  # term used for classification
    dc_term_classification = String(fields={'raw': String(index='not_analyzed')})  # category assigned to the term
    dc_term_weight = String(fields={'raw': String(index='not_analyzed')})  # weight of the term

    class Meta:
        index = 'debit_card_classification_terms'

    def save(self, **kwargs):
        self.created_at = datetime.now()
        super().save(**kwargs)

class GAMerchant(DocType):
    title = String(fields={'raw': String(index='not_analyzed')})  # term used for classification
    ga_category = String()
    ga_matches = String()
    created_at = Date()

    class Meta:
        index = 'ga_merchants'

connections.create_connection(hosts=['es_server'], timeout=20, http_auth='es_admin:mediovipera')

def initDB():
    MerchantTerm.init()

def findByTitle(title):
    client = Elasticsearch()
    s = MerchantTerm.search()
    # the search is already limited to the index and doc_type of our document
    s = s.query('match_phrase', title=title)
    results = s[0:10000].execute()
    # when you execute the search, the results are wrapped in the document class (MerchantTerm)
    for merchantTerm in results:
        print(merchantTerm.meta.id + "\t" + merchantTerm.title + "\t" + merchantTerm.category + "\t" + merchantTerm.classified_by)
    print("Total Hits : " + str(results.hits.total))

def findByCategory(category):
    client = Elasticsearch()
    s = MerchantTerm.search()
    # the search is already limited to the index and doc_type of our document
    s = s.query('match_phrase', category=category)
    results = s[0:10000].execute()
    # when you execute the search, the results are wrapped in the document class (MerchantTerm)
    for merchantTerm in results:
        print(merchantTerm.meta.id + "\t" + merchantTerm.title + "\t" + merchantTerm.category + "\t" + merchantTerm.classified_by)
    print("Total Hits : " + str(results.hits.total))

def findDescriptionByCategory(category):
    client = Elasticsearch()
    s = TransactionDescription.search()
    # the search is already limited to the index and doc_type of our document
    s = s.query('match_phrase', category=category)
    results = s[0:1000].execute()
    # when you execute the search, the results are wrapped in the document class (TransactionDescription)
    for transactionDescription in results:
        print(transactionDescription.meta.id + "\t" + transactionDescription.description_1 + "\t" + transactionDescription.category + "\t" + transactionDescription.classified_by)
    print("Total Hits : " + str(results.hits.total))

def findDescriptionByTerm(term):
    client = Elasticsearch()
    s = TransactionDescription.search()
    # the search is already limited to the index and doc_type of our document
    s = s.query('match_phrase', description_1=term)
    results = s[0:1000].execute()
    # when you execute the search, the results are wrapped in the document class (TransactionDescription)
    for transactionDescription in results:
        print(transactionDescription.meta.id + "\t" + transactionDescription.description_1 + "\t" + transactionDescription.category + "\t" + transactionDescription.classified_by)
        #print(transactionDescription.meta.id + " : " + transactionDescription.description_2 + " : " + transactionDescription.category)
    print("Total Hits : " + str(results.hits.total))

def findAndClassify(term, category):
    client = Elasticsearch()
    s = MerchantTerm.search()
    # the search is already limited to the index and doc_type of our document
    s = s.query('match_phrase', title=term)
    results = s[0:100000].execute()
    # when you execute the search, the results are wrapped in the document class (MerchantTerm)
    for merchantTerm in results:
        print(merchantTerm.meta.id + "\t" + merchantTerm.title + "\t" + merchantTerm.category + "\t" + merchantTerm.classified_by)
        merchantTerm.category = category
        merchantTerm.classified_by = term
        merchantTerm.save()
    print("Total Hits : " + str(results.hits.total))

def addTerm(term, category):
    termList = term.split(',')
    for currentTerm in termList:
        theTerm = ClassificationTerm()
        theTerm.dc_term = currentTerm.strip()
        theTerm.dc_term_classification = category.strip()
        theTerm.dc_term_weight = 0
        theTerm.meta.id = currentTerm.strip()  # use the term itself as the document id
        theTerm.save()

def findDescriptionAndClassify(term, category, match_type):
    # match_type is 'match' for loose and 'match_phrase' for tight matching
    addTerm(term, category)
    termList = term.split(',')
    category = category.strip().lower()
    for theTerm in termList:
        theTerm = theTerm.strip()
        s = DebitCardTransaction.search()
        # the search is already limited to the index and doc_type of our document
        size = 5000
        start = 0
        end = start + size
        remainingHits = size + 1
        totalHits = size
        count = 0
        s = s.query(match_type, dc_description=theTerm)[start:end]
        for aMatch in s.scan():
            count = count + 1
            print(str(count) + ":" + aMatch.meta.id + "\t" + aMatch.dc_description + "\t" + aMatch.dc_category + "\t" + aMatch.dc_classified_by)
            if aMatch.dc_category != category:
                #if aMatch.dc_category == 'unclassified':
                print('reassigning from', aMatch.dc_category, '>', category)
                aMatch.dc_category = category
                aMatch.dc_classified_by = theTerm
                aMatch.save()

def cleanMerchantSignatures():
    s1 = MerchantSignature.search()
    merchantNames = set()
    count = 0
    for aMatch in s1.scan():
        count = count + 1
        print(str(count) + ":" + aMatch.meta.id + "\t" + aMatch.ms_description + "\t" + aMatch.ms_category + "\t" + aMatch.ms_classified_by)
        ms_clean = MerchantSignatureCleaned()
        ms_clean.ms_description = aMatch.ms_description
        ms_clean.ms_category = aMatch.ms_category
        ms_clean.ms_classified_by = aMatch.ms_classified_by
        ms_clean.ms_transactionCount = aMatch.ms_transactionCount
        ms_clean.meta.id = aMatch.ms_description
        ms_clean.save()

def findDebitCardMerchantsByCategory(category):
    outFile = open(category + '_merchants.txt', 'w')
    merchantNames = set()
    s = DebitCardTransaction.search()
    size = 5000
    start = 0
    end = start + size
    remainingHits = size + 1
    totalHits = size
    count = 0
    s = s.query('match_phrase', dc_category=category)[start:end]
    for aMatch in s.scan():
        count = count + 1
        #print(aMatch.dc_description)
        merchantNames.add(aMatch.dc_description)
    for merchantName in sorted(merchantNames):
        print(merchantName)
        outFile.write(merchantName + '\n')
    outFile.close()

def findDebitCardMerchantsTermClassifications():
    classifications = dict()
    s = DebitCardTransaction.search()
    size = 5000
    start = 0
    end = start + size
    remainingHits = size + 1
    totalHits = size
    count = 0
    s = s.query('match_all')[start:end]
    for aMatch in s.scan():
        classifications[aMatch.dc_classified_by] = aMatch.dc_category
        print(aMatch.dc_classified_by + ':' + aMatch.dc_category)
    for k, v in sorted(classifications.items()):
        print(k + " : " + v)

def analyzeTransactionsForMerchantSignatures():
    s = MerchantSignature.search()
    s = s.query('match_all')
    for aMatch in s.scan():
        print(aMatch.ms_description.lower() + ':' + aMatch.ms_category)
        s2 = DebitCardTransaction.search()
        s2 = s2.query("match_phrase", dc_description=aMatch.ms_description)
        results = s2.execute()
        print(str(results.hits.total))
        aMatch.ms_transactionCount = results.hits.total
        aMatch.save()

def exportRules(filename):
    s = ClassificationTerm.search()
    f = open(filename, 'w')
    for aMatch in s.scan():
        outstr = 'find_desc_and_classify -c ' + aMatch.dc_term_classification + ' -t ' + aMatch.dc_term + '\n'
        print(outstr)
        f.write(outstr)
    f.close()

def changeTermsClassification(fromClassification, toClassification):
    s = ClassificationTerm.search()
    s = s.query("match_phrase", dc_term_classification=fromClassification)
    for aMatch in s.scan():
        print('from', aMatch.dc_term_classification, 'to', toClassification)
        aMatch.dc_term_classification = toClassification
        aMatch.save()

def createRulesFromGAMerchants(fromClassification, toClassification):
    f = open('ga_merchants_' + toClassification + '.txt', 'a')
    s = GAMerchant.search()
    s = s.query("match_phrase", ga_category=fromClassification.strip())
    for aMatch in s.scan():
        outstr = 'find_desc_and_classify -c ' + toClassification + ' -t ' + aMatch.title + '\n'
        print(outstr)
        f.write(outstr)
    f.close()

def classifyMerchantSignature(term, category):
    termList = term.split(',')
    for theTerm in termList:
        theTerm = theTerm.strip()
        s = MerchantSignature.search()
        # the search is already limited to the index and doc_type of our document
        size = 5000
        start = 0
        end = start + size
        remainingHits = size + 1
        totalHits = size
        count = 0
        s = s.query('match_phrase', ms_description=theTerm)
        for aMatch in s.scan():
            count = count + 1
            print(str(count) + ":" + aMatch.meta.id + "\t" + aMatch.ms_description + "\t" + aMatch.ms_category + "\t" + aMatch.ms_classified_by)
            aMatch.ms_category = category
            aMatch.ms_classified_by = theTerm
            aMatch.save()

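# A rough usage sketch of how these helpers could be driven, assuming the
# Elasticsearch host and credentials configured above are reachable. The term
# and category values ('starbucks', 'restaurants') and the output filename are
# made-up examples, not values taken from the data.
if __name__ == '__main__':
    initDB()                                                               # create the merchant_terms mapping
    findDescriptionAndClassify('starbucks', 'restaurants', 'match_phrase')  # tight matching on the description
    findDebitCardMerchantsByCategory('restaurants')                        # dump matching merchant descriptions to a file
    exportRules('classification_rules.txt')                                # export all rules as find_desc_and_classify commands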