Skip to content

Instantly share code, notes, and snippets.

@ragingbal
Last active March 17, 2016 15:30
Show Gist options
  • Save ragingbal/fde7340dc539d7409503 to your computer and use it in GitHub Desktop.
Save ragingbal/fde7340dc539d7409503 to your computer and use it in GitHub Desktop.
Some samples using the Elasticsearch DSL library in Python
import os
from datetime import datetime

from elasticsearch import Elasticsearch
from elasticsearch_dsl import DocType, String, Date, Nested, Boolean, analyzer
from elasticsearch_dsl import Search
from elasticsearch_dsl.connections import connections
class MerchantTerm(DocType):
    """Merchant title together with the category assigned to it.

    Documents of this type live in the ``merchant_terms`` index.
    """

    title = String()
    classified_by = String()
    created_at = Date()
    # ``category`` also gets a not_analyzed ``raw`` sub-field.
    category = String(fields={'raw': String(index='not_analyzed')})

    class Meta:
        index = 'merchant_terms'

    def save(self, **kwargs):
        """Stamp ``created_at`` with the current local time and save.

        The stamp is refreshed on every call, not only on first creation.
        Keyword arguments are forwarded to ``DocType.save`` and its result is
        returned (the original override silently dropped it).
        """
        self.created_at = datetime.now()
        return super().save(**kwargs)
class MerchantSignature(DocType):
    """Merchant signature (description) and its classification.

    Documents of this type live in the ``merchant_signatures`` index.
    """

    ms_description = String()
    # ``raw`` is a not_analyzed sub-field of the same value.
    ms_classified_by = String(fields={'raw': String(index='not_analyzed')})
    created_at = Date()
    ms_category = String(fields={'raw': String(index='not_analyzed')})

    class Meta:
        index = 'merchant_signatures'

    def save(self, **kwargs):
        """Stamp ``created_at`` with the current local time and save.

        The stamp is refreshed on every call, not only on first creation.
        Keyword arguments are forwarded to ``DocType.save`` and its result is
        returned (the original override silently dropped it).
        """
        self.created_at = datetime.now()
        return super().save(**kwargs)
class MerchantSignatureCleaned(DocType):
    """Copy of a merchant signature, including its transaction count.

    Documents of this type live in the ``merchant_signatures_cleaned`` index.
    """

    ms_description = String()
    ms_transactionCount = String()
    # ``raw`` is a not_analyzed sub-field of the same value.
    ms_classified_by = String(fields={'raw': String(index='not_analyzed')})
    created_at = Date()
    ms_category = String(fields={'raw': String(index='not_analyzed')})

    class Meta:
        index = 'merchant_signatures_cleaned'

    def save(self, **kwargs):
        """Stamp ``created_at`` with the current local time and save.

        The stamp is refreshed on every call, not only on first creation.
        Keyword arguments are forwarded to ``DocType.save`` and its result is
        returned (the original override silently dropped it).
        """
        self.created_at = datetime.now()
        return super().save(**kwargs)
class TransactionDescription(DocType):
    """Transaction description lines and the category assigned to them.

    Documents of this type live in the ``transaction_terms`` index.
    """

    description_1 = String()
    description_2 = String()
    description_3 = String()
    # ``raw`` is a not_analyzed sub-field of the same value.
    classified_by = String(fields={'raw': String(index='not_analyzed')})
    created_at = Date()
    category = String(fields={'raw': String(index='not_analyzed')})

    class Meta:
        index = 'transaction_terms'

    def save(self, **kwargs):
        """Stamp ``created_at`` with the current local time and save.

        The stamp is refreshed on every call, not only on first creation.
        Keyword arguments are forwarded to ``DocType.save`` and its result is
        returned (the original override silently dropped it).
        """
        self.created_at = datetime.now()
        return super().save(**kwargs)
class DebitCardTransaction(DocType):
    """Debit card transaction description and its classification data.

    Documents of this type live in the ``debit_card_transactions`` index.
    Every field except ``dc_description`` and ``created_at`` also gets a
    not_analyzed ``raw`` sub-field.
    """

    dc_description = String()
    # term used for classification
    dc_classified_by = String(fields={'raw': String(index='not_analyzed')})
    created_at = Date()
    dc_category = String(fields={'raw': String(index='not_analyzed')})
    # the mcc applied
    dc_mcc = String(fields={'raw': String(index='not_analyzed')})
    # from google maps api
    dc_gm_classification = String(fields={'raw': String(index='not_analyzed')})

    class Meta:
        index = 'debit_card_transactions'

    def save(self, **kwargs):
        """Stamp ``created_at`` with the current local time and save.

        The stamp is refreshed on every call, not only on first creation.
        Keyword arguments are forwarded to ``DocType.save`` and its result is
        returned (the original override silently dropped it).
        """
        self.created_at = datetime.now()
        return super().save(**kwargs)
class ClassificationTerm(DocType):
    """A classification rule: a term and the classification it maps to.

    Documents of this type live in the ``debit_card_classification_terms``
    index. Each string field also gets a not_analyzed ``raw`` sub-field.
    """

    # term used for classification
    dc_term = String(fields={'raw': String(index='not_analyzed')})
    # classification assigned to transactions matching the term
    dc_term_classification = String(fields={'raw': String(index='not_analyzed')})
    # weight of the term
    dc_term_weight = String(fields={'raw': String(index='not_analyzed')})
    # save() writes this attribute, so declare it explicitly like the other
    # document classes do instead of relying on dynamic mapping.
    created_at = Date()

    class Meta:
        index = 'debit_card_classification_terms'

    def save(self, **kwargs):
        """Stamp ``created_at`` with the current local time and save.

        The stamp is refreshed on every call, not only on first creation.
        Keyword arguments are forwarded to ``DocType.save`` and its result is
        returned (the original override silently dropped it).
        """
        self.created_at = datetime.now()
        return super().save(**kwargs)
class GAMerchant(DocType):
    """Merchant record stored in the ``ga_merchants`` index.

    No ``save`` override is defined here, so ``created_at`` is only set when
    a caller assigns it.
    """

    # Title, plus a not_analyzed ``raw`` sub-field of the same value.
    title = String(fields=dict(raw=String(index='not_analyzed')))
    ga_category = String()
    ga_matches = String()
    created_at = Date()

    class Meta:
        index = 'ga_merchants'
# Register the elasticsearch_dsl connection at import time.
# Host and credentials can be overridden with the ES_HOST and ES_HTTP_AUTH
# environment variables; when they are unset the values that used to be
# hard-coded here still apply, so existing deployments keep working.
# NOTE(review): credentials should not live in source control -- rotate them
# and drop the fallback once the environment variables are configured.
connections.create_connection(
    hosts=[os.environ.get('ES_HOST', 'es_server')],
    timeout=20,
    http_auth=os.environ.get('ES_HTTP_AUTH', 'es_admin:mediovipera'),
)
def initDB():
    """Run ``MerchantTerm.init()``; no other document type is initialised."""
    MerchantTerm.init()
def findByTitle(title):
    """Print the merchant terms whose title phrase-matches ``title``.

    At most the first 10000 hits are fetched. Each hit is printed as one
    tab-separated line (id, title, category, classified_by), followed by the
    total hit count reported by the search.
    """
    # MerchantTerm.search() is already limited to that document's index, and
    # it uses the module-level connection, so no separate client is needed.
    search = MerchantTerm.search().query('match_phrase', title=title)
    results = search[0:10000].execute()
    # Hits come back wrapped as MerchantTerm instances.
    for merchantTerm in results:
        # str.format() instead of "+": a non-string value (e.g. a field that
        # reads back as None) is printed rather than raising TypeError.
        print("{}\t{}\t{}\t{}".format(
            merchantTerm.meta.id,
            merchantTerm.title,
            merchantTerm.category,
            merchantTerm.classified_by,
        ))
    print("Total Hits : " + str(results.hits.total))
def findByCategory(category):
    """Print the merchant terms whose category phrase-matches ``category``.

    At most the first 10000 hits are fetched. Each hit is printed as one
    tab-separated line (id, title, category, classified_by), followed by the
    total hit count reported by the search.
    """
    # MerchantTerm.search() is already limited to that document's index, and
    # it uses the module-level connection, so no separate client is needed.
    search = MerchantTerm.search().query('match_phrase', category=category)
    results = search[0:10000].execute()
    # Hits come back wrapped as MerchantTerm instances.
    for merchantTerm in results:
        # str.format() instead of "+": a non-string value (e.g. a field that
        # reads back as None) is printed rather than raising TypeError.
        print("{}\t{}\t{}\t{}".format(
            merchantTerm.meta.id,
            merchantTerm.title,
            merchantTerm.category,
            merchantTerm.classified_by,
        ))
    print("Total Hits : " + str(results.hits.total))
def findDescriptionByCategory(category):
    """Print transaction descriptions whose category phrase-matches ``category``.

    At most the first 1000 hits are fetched. Each hit is printed as one
    tab-separated line (id, description_1, category, classified_by), followed
    by the total hit count reported by the search.
    """
    # TransactionDescription.search() is already limited to that document's
    # index and uses the module-level connection; no separate client needed.
    search = TransactionDescription.search().query(
        'match_phrase', category=category)
    results = search[0:1000].execute()
    # Hits come back wrapped as TransactionDescription instances.
    for transactionDescription in results:
        # str.format() instead of "+": a non-string value (e.g. a field that
        # reads back as None) is printed rather than raising TypeError.
        print("{}\t{}\t{}\t{}".format(
            transactionDescription.meta.id,
            transactionDescription.description_1,
            transactionDescription.category,
            transactionDescription.classified_by,
        ))
    print("Total Hits : " + str(results.hits.total))
def findDescriptionByTerm(term):
    """Print transaction descriptions whose ``description_1`` phrase-matches ``term``.

    At most the first 1000 hits are fetched. Each hit is printed as one
    tab-separated line (id, description_1, category, classified_by), followed
    by the total hit count reported by the search.
    """
    # TransactionDescription.search() is already limited to that document's
    # index and uses the module-level connection; no separate client needed.
    search = TransactionDescription.search().query(
        'match_phrase', description_1=term)
    results = search[0:1000].execute()
    # Hits come back wrapped as TransactionDescription instances.
    for transactionDescription in results:
        # str.format() instead of "+": a non-string value (e.g. a field that
        # reads back as None) is printed rather than raising TypeError.
        print("{}\t{}\t{}\t{}".format(
            transactionDescription.meta.id,
            transactionDescription.description_1,
            transactionDescription.category,
            transactionDescription.classified_by,
        ))
    print("Total Hits : " + str(results.hits.total))
def findAndClassify(term, category):
    """Assign ``category`` to every merchant term whose title matches ``term``.

    Each matching MerchantTerm is printed (id, title, previous category,
    previous classified_by; tab-separated), then updated with the new
    category and with ``term`` as ``classified_by``, and saved. Finally the
    number of processed hits is printed.
    """
    search = MerchantTerm.search().query('match_phrase', title=term)
    processed = 0
    # scan() streams every hit, as the other bulk functions in this file do.
    # The former [0:100000] slice asked for a result window larger than
    # Elasticsearch's default index.max_result_window (10000), which such a
    # cluster rejects.
    for merchantTerm in search.scan():
        processed += 1
        # str.format() instead of "+": a not-yet-classified hit (field reads
        # back as None) is printed rather than raising TypeError.
        print("{}\t{}\t{}\t{}".format(
            merchantTerm.meta.id,
            merchantTerm.title,
            merchantTerm.category,
            merchantTerm.classified_by,
        ))
        merchantTerm.category = category
        merchantTerm.classified_by = term
        merchantTerm.save()
    print("Total Hits : " + str(processed))
def addTerm(term, category):
    """Store one ClassificationTerm per comma-separated entry of ``term``.

    Each entry is stripped of surrounding whitespace and saved with the
    stripped ``category`` as its classification and a weight of 0. The
    stripped term is also used as the document id, so saving the same term
    again overwrites the existing rule. Blank entries (e.g. from a trailing
    comma) are skipped.
    """
    classification = category.strip()
    for rawTerm in term.split(','):
        cleaned = rawTerm.strip()
        if not cleaned:
            continue
        theTerm = ClassificationTerm()
        theTerm.dc_term = cleaned
        theTerm.dc_term_classification = classification
        theTerm.dc_term_weight = 0
        # The document id is set through ``meta.id`` (as in
        # cleanMerchantSignatures); assigning ``_id`` as a plain attribute
        # would presumably add a body field named "_id" instead of setting
        # the id -- TODO confirm against the installed elasticsearch_dsl.
        theTerm.meta.id = cleaned
        theTerm.save()
def findDescriptionAndClassify(term, category, match_type):
    """Record classification rules and apply them to debit card transactions.

    ``term`` is a comma-separated list of terms. The terms are first stored
    via ``addTerm``; then, for each term, every DebitCardTransaction whose
    ``dc_description`` matches is printed and, when its category differs from
    the lower-cased ``category``, reassigned and saved.

    ``match_type`` is 'match' for loose and 'match_phrase' for tight
    matching. Any other value falls back to 'match_phrase', which is what
    this function always used before the parameter was honoured.
    """
    addTerm(term, category)
    if match_type in ('match', 'match_phrase'):
        query_type = match_type
    else:
        query_type = 'match_phrase'
    category = category.strip().lower()
    # Carried over from the original query; presumably only influences how
    # scan() batches its requests -- TODO confirm.
    window = 5000
    for theTerm in term.split(','):
        theTerm = theTerm.strip()
        if not theTerm:
            # Blank entry (e.g. trailing comma): nothing to search for.
            continue
        # The search is already limited to the DebitCardTransaction index.
        s = DebitCardTransaction.search()
        s = s.query(query_type, dc_description=theTerm)[0:window]
        for count, aMatch in enumerate(s.scan(), 1):
            # str.format() instead of "+": an unclassified hit (field reads
            # back as None) is printed rather than raising TypeError.
            print("{}:{}\t{}\t{}\t{}".format(
                count,
                aMatch.meta.id,
                aMatch.dc_description,
                aMatch.dc_category,
                aMatch.dc_classified_by,
            ))
            if aMatch.dc_category != category:
                print('reassigning from', aMatch.dc_category, '>', category)
                aMatch.dc_category = category
                aMatch.dc_classified_by = theTerm
                aMatch.save()
def cleanMerchantSignatures():
    """Copy every MerchantSignature into the cleaned-signatures index.

    Each signature is printed and then saved as a MerchantSignatureCleaned
    document whose id is the signature's description, so signatures sharing
    a description end up in a single cleaned document.
    """
    for count, aMatch in enumerate(MerchantSignature.search().scan(), 1):
        # str.format() instead of "+": an unclassified signature (field reads
        # back as None) is printed rather than raising TypeError.
        print("{}:{}\t{}\t{}\t{}".format(
            count,
            aMatch.meta.id,
            aMatch.ms_description,
            aMatch.ms_category,
            aMatch.ms_classified_by,
        ))
        ms_clean = MerchantSignatureCleaned()
        ms_clean.ms_description = aMatch.ms_description
        ms_clean.ms_category = aMatch.ms_category
        ms_clean.ms_classified_by = aMatch.ms_classified_by
        # ms_transactionCount is not a declared MerchantSignature field; it
        # only exists on signatures that have been through
        # analyzeTransactionsForMerchantSignatures, so tolerate its absence.
        ms_clean.ms_transactionCount = getattr(
            aMatch, 'ms_transactionCount', None)
        ms_clean.meta.id = aMatch.ms_description
        ms_clean.save()
def findDebitCardMerchantsByCategory(category):
    """Export the distinct merchant descriptions of one category.

    Collects the ``dc_description`` of every DebitCardTransaction whose
    ``dc_category`` phrase-matches ``category``, then prints the distinct
    values in sorted order and writes them, one per line, to
    ``<category>_merchants.txt`` in the current directory.
    """
    s = DebitCardTransaction.search()
    # The [0:5000] slice is carried over from the original query; presumably
    # it only influences how scan() batches its requests -- TODO confirm.
    s = s.query('match_phrase', dc_category=category)[0:5000]
    # Hits without a description are skipped: None can neither be sorted
    # alongside strings nor written to the file.
    merchantNames = {
        aMatch.dc_description
        for aMatch in s.scan()
        if aMatch.dc_description is not None
    }
    # Open the file only once the data is in hand, and close it reliably;
    # the original never closed the handle and truncated the previous export
    # even when the search failed.
    with open(category + '_merchants.txt', 'w') as outFile:
        for merchantName in sorted(merchantNames):
            print(merchantName)
            outFile.write(merchantName + '\n')
def findDebitCardMerchantsTermClassifications():
    """Print which category each classifying term was applied with.

    Scans all DebitCardTransaction documents, printing
    ``classified_by:category`` for each one, then prints the distinct
    ``term : category`` pairs sorted by term. When a term occurs with several
    categories, the last one seen wins.
    """
    classifications = dict()
    s = DebitCardTransaction.search()
    # The [0:5000] slice is carried over from the original query; presumably
    # it only influences how scan() batches its requests -- TODO confirm.
    s = s.query('match_all')[0:5000]
    for aMatch in s.scan():
        classifications[aMatch.dc_classified_by] = aMatch.dc_category
        # str.format() instead of "+": an unclassified hit (field reads back
        # as None) is printed rather than raising TypeError.
        print("{}:{}".format(aMatch.dc_classified_by, aMatch.dc_category))
    # Iterate over items(): sorting the dict itself yields only the keys, so
    # unpacking each key into (k, v) failed. Sorting on str(key) keeps a None
    # key from breaking the comparison.
    for k, v in sorted(classifications.items(), key=lambda kv: str(kv[0])):
        print("{} : {}".format(k, v))
def analyzeTransactionsForMerchantSignatures():
    """Store on each merchant signature how many transactions match it.

    For every MerchantSignature, counts the DebitCardTransaction documents
    whose ``dc_description`` phrase-matches the signature's description,
    prints the signature and the count, writes the count to
    ``ms_transactionCount`` and saves the signature.
    """
    s = MerchantSignature.search()
    s = s.query('match_all')
    for aMatch in s.scan():
        # str.format() instead of "+": an unclassified signature (category
        # reads back as None) is printed rather than raising TypeError.
        print("{}:{}".format(aMatch.ms_description.lower(), aMatch.ms_category))
        s2 = DebitCardTransaction.search()
        s2 = s2.query("match_phrase", dc_description=aMatch.ms_description)
        # Only the number of matches is needed, so ask for a count instead
        # of executing the search and fetching hits.
        transactionCount = s2.count()
        print(str(transactionCount))
        aMatch.ms_transactionCount = transactionCount
        aMatch.save()
def exportRules(filename):
    """Write every ClassificationTerm to ``filename`` as a command line.

    Each rule becomes one line of the form
    ``find_desc_and_classify -c <classification> -t <term>``; the line is
    also printed. An existing file is overwritten.
    """
    s = ClassificationTerm.search()
    # The with-block closes the file even when the scan or a write fails.
    with open(filename, 'w') as f:
        for aMatch in s.scan():
            outstr = ('find_desc_and_classify -c ' + aMatch.dc_term_classification
                      + ' -t ' + aMatch.dc_term + '\n')
            print(outstr)
            f.write(outstr)
def changeTermsClassification(fromClassification, toClassification):
    """Move classification rules from one classification to another.

    Every ClassificationTerm whose ``dc_term_classification`` phrase-matches
    ``fromClassification`` is printed, set to ``toClassification`` and saved.
    """
    matchingRules = ClassificationTerm.search().query(
        "match_phrase", dc_term_classification=fromClassification)
    for rule in matchingRules.scan():
        print('from', rule.dc_term_classification, 'to', toClassification)
        rule.dc_term_classification = toClassification
        rule.save()
def createRulesFromGAMerchants(fromClassification, toClassification):
    """Append classification command lines derived from GAMerchant titles.

    For every GAMerchant whose ``ga_category`` phrase-matches the stripped
    ``fromClassification``, a line of the form
    ``find_desc_and_classify -c <toClassification> -t <title>`` is printed
    and appended to ``ga_merchants_<toClassification>.txt``.
    """
    # The with-block makes sure the appended lines are flushed and the
    # handle is closed; the original never closed the file.
    with open('ga_merchants_' + toClassification + '.txt', 'a') as f:
        s = GAMerchant.search()
        s = s.query("match_phrase", ga_category=fromClassification.strip())
        for aMatch in s.scan():
            outstr = ('find_desc_and_classify -c ' + toClassification
                      + ' -t ' + aMatch.title + '\n')
            print(outstr)
            f.write(outstr)
def classifyMerchantSignature(term, category):
    """Assign ``category`` to merchant signatures matching the given terms.

    ``term`` is a comma-separated list. For each stripped term, every
    MerchantSignature whose ``ms_description`` phrase-matches it is printed
    (with its previous classification), then saved with ``category`` as its
    category and the term as ``ms_classified_by``.
    """
    for theTerm in term.split(','):
        theTerm = theTerm.strip()
        # The search is already limited to the MerchantSignature index.
        s = MerchantSignature.search().query(
            'match_phrase', ms_description=theTerm)
        for count, aMatch in enumerate(s.scan(), 1):
            # str.format() instead of "+": a signature that has not been
            # classified yet (fields read back as None) is printed rather
            # than raising TypeError before it can be classified.
            print("{}:{}\t{}\t{}\t{}".format(
                count,
                aMatch.meta.id,
                aMatch.ms_description,
                aMatch.ms_category,
                aMatch.ms_classified_by,
            ))
            aMatch.ms_category = category
            aMatch.ms_classified_by = theTerm
            aMatch.save()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment