ragingbal · March 17, 2016 15:30
diff --git a/es_dsl_sample.py b/es_dsl_sample.py
 from elasticsearch_dsl.connections import connections
 from elasticsearch import Elasticsearch
 from elasticsearch_dsl import Search


 from datetime import datetime
 from elasticsearch_dsl import DocType, String, Date, Nested, Boolean, analyzer


 class MerchantTerm(DocType):
     """Document stored in the ``merchant_terms`` index.

     ``category`` additionally carries a ``raw`` sub-field declared with
     ``index='not_analyzed'``.
     """

     title = String()
     classified_by = String()
     created_at = Date()
     category = String(fields={'raw': String(index='not_analyzed')})

     class Meta:
         index = 'merchant_terms'

     def save(self, **kwargs):
         """Stamp ``created_at`` with the current local time, then persist.

         The timestamp is refreshed on every call, including re-saves of a
         document that already exists.  All keyword arguments are forwarded
         to ``DocType.save``.

         :returns: whatever ``DocType.save`` returns.
         """
         self.created_at = datetime.now()
         # Hand the base-class result back to the caller instead of dropping it.
         return super().save(**kwargs)

 class MerchantSignature(DocType):
     """Document stored in the ``merchant_signatures`` index.

     ``ms_classified_by`` and ``ms_category`` each carry a ``raw`` sub-field
     declared with ``index='not_analyzed'``.
     """

     # NOTE(review): other functions in this file read and write an
     # ``ms_transactionCount`` attribute on these documents, but no such field
     # is declared here - confirm whether it should be part of the mapping.
     ms_description = String()
     ms_classified_by = String(fields={'raw': String(index='not_analyzed')})
     created_at = Date()
     ms_category = String(fields={'raw': String(index='not_analyzed')})

     class Meta:
         index = 'merchant_signatures'

     def save(self, **kwargs):
         """Stamp ``created_at`` with the current local time, then persist.

         The timestamp is refreshed on every call.  All keyword arguments are
         forwarded to ``DocType.save``.

         :returns: whatever ``DocType.save`` returns.
         """
         self.created_at = datetime.now()
         # Hand the base-class result back to the caller instead of dropping it.
         return super().save(**kwargs)

 class MerchantSignatureCleaned(DocType):
     """Document stored in the ``merchant_signatures_cleaned`` index.

     ``ms_classified_by`` and ``ms_category`` each carry a ``raw`` sub-field
     declared with ``index='not_analyzed'``; ``ms_transactionCount`` is
     declared as a plain ``String``.
     """

     ms_description = String()
     ms_transactionCount = String()
     ms_classified_by = String(fields={'raw': String(index='not_analyzed')})
     created_at = Date()
     ms_category = String(fields={'raw': String(index='not_analyzed')})

     class Meta:
         index = 'merchant_signatures_cleaned'

     def save(self, **kwargs):
         """Stamp ``created_at`` with the current local time, then persist.

         The timestamp is refreshed on every call.  All keyword arguments are
         forwarded to ``DocType.save``.

         :returns: whatever ``DocType.save`` returns.
         """
         self.created_at = datetime.now()
         # Hand the base-class result back to the caller instead of dropping it.
         return super().save(**kwargs)

 class TransactionDescription(DocType):
     """Document stored in the ``transaction_terms`` index.

     Holds three description fields plus ``classified_by`` and ``category``,
     the latter two each with a ``not_analyzed`` ``raw`` sub-field.
     """

     description_1 = String()
     description_2 = String()
     description_3 = String()
     classified_by = String(fields={'raw': String(index='not_analyzed')})
     created_at = Date()
     category = String(fields={'raw': String(index='not_analyzed')})

     class Meta:
         index = 'transaction_terms'

     def save(self, **kwargs):
         """Stamp ``created_at`` with the current local time, then persist.

         The timestamp is refreshed on every call.  All keyword arguments are
         forwarded to ``DocType.save``.

         :returns: whatever ``DocType.save`` returns.
         """
         self.created_at = datetime.now()
         # Hand the base-class result back to the caller instead of dropping it.
         return super().save(**kwargs)

 class DebitCardTransaction(DocType):
     """Document stored in the ``debit_card_transactions`` index.

     Every field except ``dc_description`` and ``created_at`` carries a
     ``raw`` sub-field declared with ``index='not_analyzed'``.
     """

     dc_description = String()
     # term used for classification
     dc_classified_by = String(fields={'raw': String(index='not_analyzed')})
     created_at = Date()
     dc_category = String(fields={'raw': String(index='not_analyzed')})
     # the mcc applied
     dc_mcc = String(fields={'raw': String(index='not_analyzed')})
     # from google maps api
     dc_gm_classification = String(fields={'raw': String(index='not_analyzed')})

     class Meta:
         index = 'debit_card_transactions'

     def save(self, **kwargs):
         """Stamp ``created_at`` with the current local time, then persist.

         The timestamp is refreshed on every call.  All keyword arguments are
         forwarded to ``DocType.save``.

         :returns: whatever ``DocType.save`` returns.
         """
         self.created_at = datetime.now()
         # Hand the base-class result back to the caller instead of dropping it.
         return super().save(**kwargs)

 class ClassificationTerm(DocType):
     """Document stored in the ``debit_card_classification_terms`` index.

     All three ``dc_term*`` fields carry a ``raw`` sub-field declared with
     ``index='not_analyzed'``.
     """

     # term used for classification
     dc_term = String(fields={'raw': String(index='not_analyzed')})
     # classification recorded for the term
     dc_term_classification = String(fields={'raw': String(index='not_analyzed')})
     # NOTE(review): declared as String although addTerm() stores the integer 0
     dc_term_weight = String(fields={'raw': String(index='not_analyzed')})
     # Declared explicitly, as on the sibling document classes, because save()
     # writes this attribute on every call.
     created_at = Date()

     class Meta:
         index = 'debit_card_classification_terms'

     def save(self, **kwargs):
         """Stamp ``created_at`` with the current local time, then persist.

         The timestamp is refreshed on every call.  All keyword arguments are
         forwarded to ``DocType.save``.

         :returns: whatever ``DocType.save`` returns.
         """
         self.created_at = datetime.now()
         # Hand the base-class result back to the caller instead of dropping it.
         return super().save(**kwargs)

 class GAMerchant(DocType):
     """Document stored in the ``ga_merchants`` index.

     This class defines no ``save`` override, so nothing here populates
     ``created_at``.
     """

     created_at = Date()
     ga_matches = String()
     ga_category = String()
     # term used for classification; also indexed un-analyzed as ``title.raw``
     title = String(
         fields={'raw': String(index='not_analyzed')}
     )

     class Meta:
         index = 'ga_merchants'


 # Register the connection used by the document classes in this module.
 # NOTE(review): the host name and the http_auth credentials are hard-coded in
 # source; consider loading them from configuration or the environment.
 connections.create_connection(
     hosts=['es_server'],
     timeout=20,
     http_auth='es_admin:mediovipera',
 )


 def initDB():
     """Call ``MerchantTerm.init()``; no other document class is initialised here."""
     MerchantTerm.init()



 def findByTitle(title):
     """Print up to 10000 ``MerchantTerm`` documents whose title phrase-matches.

     Each hit is printed as a tab-separated line of id, title, category and
     classified_by, followed by the total hit count reported by the search.

     :param title: phrase matched against the ``title`` field.
     """
     # MerchantTerm.search() is already limited to the document's index, so
     # no separate Elasticsearch client has to be constructed.
     search = MerchantTerm.search().query('match_phrase', title=title)
     results = search[0:10000].execute()
     for hit in results:
         row = (hit.meta.id, hit.title, hit.category, hit.classified_by)
         # str() keeps the listing going when a field holds a non-string value
         # such as None, which plain concatenation would reject.
         print("\t".join(str(value) for value in row))
     print("Total Hits : " + str(results.hits.total))


 def findByCategory(category):
     """Print up to 10000 ``MerchantTerm`` documents in a category.

     Hits are selected with a ``match_phrase`` query on ``category`` and
     printed one per line as tab-separated id, title, category and
     classified_by, followed by the total hit count reported by the search.

     :param category: phrase matched against the ``category`` field.
     """
     # MerchantTerm.search() is already limited to the document's index, so
     # no separate Elasticsearch client has to be constructed.
     search = MerchantTerm.search().query('match_phrase', category=category)
     results = search[0:10000].execute()
     for hit in results:
         row = (hit.meta.id, hit.title, hit.category, hit.classified_by)
         # str() keeps the listing going when a field holds a non-string value
         # such as None, which plain concatenation would reject.
         print("\t".join(str(value) for value in row))
     print("Total Hits : " + str(results.hits.total))

 def findDescriptionByCategory(category):
     """Print up to 1000 ``TransactionDescription`` documents in a category.

     Hits are selected with a ``match_phrase`` query on ``category`` and
     printed one per line as tab-separated id, description_1, category and
     classified_by, followed by the total hit count reported by the search.

     :param category: phrase matched against the ``category`` field.
     """
     # TransactionDescription.search() is already limited to the document's
     # index, so no separate Elasticsearch client has to be constructed.
     search = TransactionDescription.search().query('match_phrase', category=category)
     results = search[0:1000].execute()
     for hit in results:
         row = (hit.meta.id, hit.description_1, hit.category, hit.classified_by)
         # str() keeps the listing going when a field holds a non-string value
         # such as None, which plain concatenation would reject.
         print("\t".join(str(value) for value in row))
     print("Total Hits : " + str(results.hits.total))

 def findDescriptionByTerm(term):
     """Print up to 1000 ``TransactionDescription`` documents matching a term.

     Hits are selected with a ``match_phrase`` query on ``description_1`` and
     printed one per line as tab-separated id, description_1, category and
     classified_by, followed by the total hit count reported by the search.

     :param term: phrase matched against the ``description_1`` field.
     """
     # TransactionDescription.search() is already limited to the document's
     # index, so no separate Elasticsearch client has to be constructed.
     search = TransactionDescription.search().query('match_phrase', description_1=term)
     results = search[0:1000].execute()
     for hit in results:
         row = (hit.meta.id, hit.description_1, hit.category, hit.classified_by)
         # str() keeps the listing going when a field holds a non-string value
         # such as None, which plain concatenation would reject.
         print("\t".join(str(value) for value in row))
     print("Total Hits : " + str(results.hits.total))


 def findAndClassify(term, category):
     """Reclassify every ``MerchantTerm`` whose title phrase-matches ``term``.

     Each match is printed as tab-separated id, title, category and
     classified_by (the values before the update) and then saved with
     ``category`` as its category and ``term`` as its ``classified_by``.
     Finally the number of documents processed is printed.

     :param term: phrase matched against ``title``; also stored as the
         classifying term.
     :param category: category assigned to every match.
     """
     search = MerchantTerm.search().query('match_phrase', title=term)
     processed = 0
     # Iterate with scan(), as the other bulk updaters in this file do: the
     # former fixed 0:100000 window is larger than Elasticsearch's default
     # index.max_result_window (10000), which rejects such a request.
     for hit in search.scan():
         row = (hit.meta.id, hit.title, hit.category, hit.classified_by)
         # str() tolerates non-string values such as None on unclassified docs.
         print("\t".join(str(value) for value in row))
         hit.category = category
         hit.classified_by = term
         hit.save()
         processed += 1
     print("Total Hits : " + str(processed))


 def addTerm(term, category):
     """Save one ``ClassificationTerm`` per comma-separated entry of ``term``.

     Every non-empty entry is stripped and stored as ``dc_term`` with the
     stripped ``category`` as ``dc_term_classification`` and a weight of 0.
     The stripped entry is also used as the document id.

     :param term: comma-separated list of terms.
     :param category: classification recorded for every term.
     """
     classification = category.strip()
     for raw in term.split(','):
         name = raw.strip()
         if not name:
             # Ignore empty entries such as the one left by a trailing comma.
             continue
         doc = ClassificationTerm()
         doc.dc_term = name
         doc.dc_term_classification = classification
         doc.dc_term_weight = 0
         # Set the id through ``meta.id`` - the form this file already uses
         # for MerchantSignatureCleaned - so the term itself keys the document.
         doc.meta.id = name
         doc.save()

 def findDescriptionAndClassify(term, category, match_type):
     """Record classification terms and apply them to debit card transactions.

     ``addTerm(term, category)`` is called first.  Then, for every non-empty
     comma-separated entry of ``term``, each ``DebitCardTransaction`` whose
     ``dc_description`` phrase-matches the entry is printed; when its
     ``dc_category`` differs from the stripped, lower-cased ``category`` it is
     updated (category and ``dc_classified_by``) and saved.

     :param term: comma-separated list of terms.
     :param category: target category; compared and stored stripped and
         lower-cased.
     :param match_type: documented by the author as 'match' for loose and
         'match_phrase' for tight matching.
         NOTE(review): the argument is accepted but never used - every query
         is issued as 'match_phrase'.  Left unchanged because this function
         rewrites data in bulk; confirm the intended behaviour before wiring
         it in.
     """
     addTerm(term, category)
     category = category.strip().lower()

     for raw in term.split(','):
         theTerm = raw.strip()
         if not theTerm:
             # Nothing to match for empty entries (e.g. a trailing comma).
             continue

         # NOTE(review): the 0:5000 slice is kept from the original query;
         # presumably it only influences the scroll batch size - confirm.
         search = DebitCardTransaction.search().query(
             'match_phrase', dc_description=theTerm)[0:5000]
         for count, aMatch in enumerate(search.scan(), 1):
             print(str(count) + ":" + aMatch.meta.id + "\t" + aMatch.dc_description + "\t" + aMatch.dc_category + "\t" + aMatch.dc_classified_by)
             if aMatch.dc_category != category:
                 print('reassigning from' , aMatch.dc_category , '>', category )
                 aMatch.dc_category = category
                 aMatch.dc_classified_by = theTerm
                 aMatch.save()



 def cleanMerchantSignatures():
     """Copy every ``MerchantSignature`` into ``MerchantSignatureCleaned``.

     Each scanned signature is printed (running number, id, description,
     category, classified_by) and saved as a ``MerchantSignatureCleaned``
     document whose id is the signature's ``ms_description``.
     """
     for count, aMatch in enumerate(MerchantSignature.search().scan(), 1):
         print(str(count) + ":" + aMatch.meta.id + "\t" + aMatch.ms_description + "\t" + aMatch.ms_category + "\t" + aMatch.ms_classified_by)
         ms_clean = MerchantSignatureCleaned()
         ms_clean.ms_description = aMatch.ms_description
         ms_clean.ms_category = aMatch.ms_category
         ms_clean.ms_classified_by = aMatch.ms_classified_by
         # ms_transactionCount is not a declared MerchantSignature field; it is
         # only present on documents that had it assigned, so copy it when it
         # exists instead of aborting the whole run on the first one without.
         transaction_count = getattr(aMatch, 'ms_transactionCount', None)
         if transaction_count is not None:
             ms_clean.ms_transactionCount = transaction_count
         ms_clean.meta.id = aMatch.ms_description
         ms_clean.save()

 def findDebitCardMerchantsByCategory(category):
     """Print and write the distinct descriptions of transactions in a category.

     Transactions whose ``dc_category`` phrase-matches ``category`` are
     scanned, their ``dc_description`` values de-duplicated and sorted, and
     each one is printed and written on its own line to
     ``<category>_merchants.txt`` (opened in 'w' mode).

     :param category: phrase matched against ``dc_category``; also the prefix
         of the output file name.
     """
     # NOTE(review): the 0:5000 slice is kept from the original query;
     # presumably it only influences the scroll batch size - confirm.
     search = DebitCardTransaction.search().query(
         'match_phrase', dc_category=category)[0:5000]
     merchantNames = {aMatch.dc_description for aMatch in search.scan()}

     # The context manager guarantees the file is flushed and closed; the
     # file is only (re)created once the scan has completed.
     with open(category + '_merchants.txt', 'w') as outFile:
         for merchantName in sorted(merchantNames):
             print(merchantName)
             outFile.write(merchantName + '\n')
 				
 def findDebitCardMerchantsTermClassifications():
     """Print which category each classifying term is associated with.

     Every ``DebitCardTransaction`` is scanned; its ``dc_classified_by`` and
     ``dc_category`` are printed and recorded (a later transaction overwrites
     an earlier one with the same term).  Afterwards the recorded pairs are
     printed sorted by term.
     """
     classifications = {}
     # NOTE(review): the 0:5000 slice is kept from the original query;
     # presumably it only influences the scroll batch size - confirm.
     search = DebitCardTransaction.search().query('match_all')[0:5000]
     for aMatch in search.scan():
         classifications[aMatch.dc_classified_by] = aMatch.dc_category
         print(aMatch.dc_classified_by + ':' + aMatch.dc_category)
     # Iterate the (term, category) items: sorting the dict itself yields
     # only its keys, which cannot be unpacked into a pair.
     for k, v in sorted(classifications.items()):
         print(k + " : " + v)



 def analyzeTransactionsForMerchantSignatures():
     """Store on each merchant signature how many transactions match it.

     For every scanned ``MerchantSignature`` a ``match_phrase`` query on
     ``dc_description`` is run against ``DebitCardTransaction``; the reported
     total is printed, assigned to ``ms_transactionCount`` and the signature
     is saved.
     """
     signatures = MerchantSignature.search().query('match_all')
     for signature in signatures.scan():
         print(signature.ms_description.lower() + ':' + signature.ms_category)
         lookup = DebitCardTransaction.search().query(
             "match_phrase", dc_description=signature.ms_description)
         response = lookup.execute()
         print (str(response.hits.total))
         signature.ms_transactionCount = response.hits.total
         signature.save()


 def exportRules(filename):
     """Write every ``ClassificationTerm`` as a ``find_desc_and_classify`` line.

     Each scanned term produces one line of the form
     ``find_desc_and_classify -c <classification> -t <term>``, which is
     printed and written to ``filename`` (opened in 'w' mode).

     :param filename: path of the file to (over)write.
     """
     search = ClassificationTerm.search()
     # The context manager closes the file even when the scan raises.
     with open(filename, 'w') as f:
         for aMatch in search.scan():
             outstr = 'find_desc_and_classify -c ' + aMatch.dc_term_classification + ' -t ' + aMatch.dc_term + '\n'
             print(outstr)
             f.write(outstr)

 def changeTermsClassification(fromClassification, toClassification):
     """Move classification terms from one classification to another.

     Every ``ClassificationTerm`` whose ``dc_term_classification``
     phrase-matches ``fromClassification`` is reported, updated to
     ``toClassification`` and saved.
     """
     matching = ClassificationTerm.search().query(
         "match_phrase", dc_term_classification=fromClassification)
     for rule in matching.scan():
         print('from',rule.dc_term_classification,'to',toClassification)
         rule.dc_term_classification = toClassification
         rule.save()

 def createRulesFromGAMerchants(fromClassification, toClassification):
     """Append ``find_desc_and_classify`` lines built from ``GAMerchant`` titles.

     Every ``GAMerchant`` whose ``ga_category`` phrase-matches the stripped
     ``fromClassification`` yields one line
     ``find_desc_and_classify -c <toClassification> -t <title>``, which is
     printed and appended to ``ga_merchants_<toClassification>.txt``.

     :param fromClassification: phrase matched against ``ga_category``.
     :param toClassification: classification written into each line and used
         in the output file name.
     """
     # The context manager flushes and closes the file, which the function
     # previously left open.
     with open('ga_merchants_' + toClassification + '.txt', 'a') as f:
         search = GAMerchant.search().query(
             "match_phrase", ga_category=fromClassification.strip())
         for aMatch in search.scan():
             outstr = 'find_desc_and_classify -c ' + toClassification + ' -t ' + aMatch.title + '\n'
             print(outstr)
             f.write(outstr)

 def classifyMerchantSignature(term, category):
     """Assign ``category`` to merchant signatures matching any listed term.

     ``term`` is split on commas; for each non-empty, stripped entry every
     ``MerchantSignature`` whose ``ms_description`` phrase-matches it is
     printed (with its values before the update) and saved with
     ``ms_category`` set to ``category`` and ``ms_classified_by`` set to the
     entry.  Matching always uses ``match_phrase`` and ``category`` is stored
     exactly as given.

     :param term: comma-separated list of terms.
     :param category: category assigned to every match.
     """
     for raw in term.split(','):
         theTerm = raw.strip()
         if not theTerm:
             # Nothing to match for empty entries (e.g. a trailing comma).
             continue

         search = MerchantSignature.search().query(
             'match_phrase', ms_description=theTerm)
         # The running number restarts at 1 for every term.
         for count, aMatch in enumerate(search.scan(), 1):
             print(str(count) + ":" + aMatch.meta.id + "\t" + aMatch.ms_description + "\t" + aMatch.ms_category + "\t" + aMatch.ms_classified_by)
             aMatch.ms_category = category
             aMatch.ms_classified_by = theTerm
             aMatch.save()
	from elasticsearch_dsl.connections import connections
	from elasticsearch import Elasticsearch
	from elasticsearch_dsl import Search


	from datetime import datetime
	from elasticsearch_dsl import DocType, String, Date, Nested, Boolean, analyzer


	class MerchantTerm(DocType):
	    """Document stored in the ``merchant_terms`` index.

	    ``category`` additionally carries a ``raw`` sub-field declared with
	    ``index='not_analyzed'``.
	    """

	    title = String()
	    classified_by = String()
	    created_at = Date()
	    category = String(fields={'raw': String(index='not_analyzed')})

	    class Meta:
	        index = 'merchant_terms'

	    def save(self, **kwargs):
	        """Stamp ``created_at`` with the current local time, then persist.

	        The timestamp is refreshed on every call, including re-saves of a
	        document that already exists.  All keyword arguments are forwarded
	        to ``DocType.save``.

	        :returns: whatever ``DocType.save`` returns.
	        """
	        self.created_at = datetime.now()
	        # Hand the base-class result back to the caller instead of dropping it.
	        return super().save(**kwargs)

	class MerchantSignature(DocType):
	    """Document stored in the ``merchant_signatures`` index.

	    ``ms_classified_by`` and ``ms_category`` each carry a ``raw`` sub-field
	    declared with ``index='not_analyzed'``.
	    """

	    # NOTE(review): other functions in this file read and write an
	    # ``ms_transactionCount`` attribute on these documents, but no such field
	    # is declared here - confirm whether it should be part of the mapping.
	    ms_description = String()
	    ms_classified_by = String(fields={'raw': String(index='not_analyzed')})
	    created_at = Date()
	    ms_category = String(fields={'raw': String(index='not_analyzed')})

	    class Meta:
	        index = 'merchant_signatures'

	    def save(self, **kwargs):
	        """Stamp ``created_at`` with the current local time, then persist.

	        The timestamp is refreshed on every call.  All keyword arguments are
	        forwarded to ``DocType.save``.

	        :returns: whatever ``DocType.save`` returns.
	        """
	        self.created_at = datetime.now()
	        # Hand the base-class result back to the caller instead of dropping it.
	        return super().save(**kwargs)

	class MerchantSignatureCleaned(DocType):
	    """Document stored in the ``merchant_signatures_cleaned`` index.

	    ``ms_classified_by`` and ``ms_category`` each carry a ``raw`` sub-field
	    declared with ``index='not_analyzed'``; ``ms_transactionCount`` is
	    declared as a plain ``String``.
	    """

	    ms_description = String()
	    ms_transactionCount = String()
	    ms_classified_by = String(fields={'raw': String(index='not_analyzed')})
	    created_at = Date()
	    ms_category = String(fields={'raw': String(index='not_analyzed')})

	    class Meta:
	        index = 'merchant_signatures_cleaned'

	    def save(self, **kwargs):
	        """Stamp ``created_at`` with the current local time, then persist.

	        The timestamp is refreshed on every call.  All keyword arguments are
	        forwarded to ``DocType.save``.

	        :returns: whatever ``DocType.save`` returns.
	        """
	        self.created_at = datetime.now()
	        # Hand the base-class result back to the caller instead of dropping it.
	        return super().save(**kwargs)

	class TransactionDescription(DocType):
	    """Document stored in the ``transaction_terms`` index.

	    Holds three description fields plus ``classified_by`` and ``category``,
	    the latter two each with a ``not_analyzed`` ``raw`` sub-field.
	    """

	    description_1 = String()
	    description_2 = String()
	    description_3 = String()
	    classified_by = String(fields={'raw': String(index='not_analyzed')})
	    created_at = Date()
	    category = String(fields={'raw': String(index='not_analyzed')})

	    class Meta:
	        index = 'transaction_terms'

	    def save(self, **kwargs):
	        """Stamp ``created_at`` with the current local time, then persist.

	        The timestamp is refreshed on every call.  All keyword arguments are
	        forwarded to ``DocType.save``.

	        :returns: whatever ``DocType.save`` returns.
	        """
	        self.created_at = datetime.now()
	        # Hand the base-class result back to the caller instead of dropping it.
	        return super().save(**kwargs)

	class DebitCardTransaction(DocType):
	    """Document stored in the ``debit_card_transactions`` index.

	    Every field except ``dc_description`` and ``created_at`` carries a
	    ``raw`` sub-field declared with ``index='not_analyzed'``.
	    """

	    dc_description = String()
	    # term used for classification
	    dc_classified_by = String(fields={'raw': String(index='not_analyzed')})
	    created_at = Date()
	    dc_category = String(fields={'raw': String(index='not_analyzed')})
	    # the mcc applied
	    dc_mcc = String(fields={'raw': String(index='not_analyzed')})
	    # from google maps api
	    dc_gm_classification = String(fields={'raw': String(index='not_analyzed')})

	    class Meta:
	        index = 'debit_card_transactions'

	    def save(self, **kwargs):
	        """Stamp ``created_at`` with the current local time, then persist.

	        The timestamp is refreshed on every call.  All keyword arguments are
	        forwarded to ``DocType.save``.

	        :returns: whatever ``DocType.save`` returns.
	        """
	        self.created_at = datetime.now()
	        # Hand the base-class result back to the caller instead of dropping it.
	        return super().save(**kwargs)

	class ClassificationTerm(DocType):
	    """Document stored in the ``debit_card_classification_terms`` index.

	    All three ``dc_term*`` fields carry a ``raw`` sub-field declared with
	    ``index='not_analyzed'``.
	    """

	    # term used for classification
	    dc_term = String(fields={'raw': String(index='not_analyzed')})
	    # classification recorded for the term
	    dc_term_classification = String(fields={'raw': String(index='not_analyzed')})
	    # NOTE(review): declared as String although addTerm() stores the integer 0
	    dc_term_weight = String(fields={'raw': String(index='not_analyzed')})
	    # Declared explicitly, as on the sibling document classes, because save()
	    # writes this attribute on every call.
	    created_at = Date()

	    class Meta:
	        index = 'debit_card_classification_terms'

	    def save(self, **kwargs):
	        """Stamp ``created_at`` with the current local time, then persist.

	        The timestamp is refreshed on every call.  All keyword arguments are
	        forwarded to ``DocType.save``.

	        :returns: whatever ``DocType.save`` returns.
	        """
	        self.created_at = datetime.now()
	        # Hand the base-class result back to the caller instead of dropping it.
	        return super().save(**kwargs)

	class GAMerchant(DocType):
	    """Document stored in the ``ga_merchants`` index.

	    This class defines no ``save`` override, so nothing here populates
	    ``created_at``.
	    """

	    created_at = Date()
	    ga_matches = String()
	    ga_category = String()
	    # term used for classification; also indexed un-analyzed as ``title.raw``
	    title = String(
	        fields={'raw': String(index='not_analyzed')}
	    )

	    class Meta:
	        index = 'ga_merchants'


	# Register the connection used by the document classes in this module.
	# NOTE(review): the host name and the http_auth credentials are hard-coded in
	# source; consider loading them from configuration or the environment.
	connections.create_connection(
	    hosts=['es_server'],
	    timeout=20,
	    http_auth='es_admin:mediovipera',
	)


	def initDB():
	    """Call ``MerchantTerm.init()``; no other document class is initialised here."""
	    MerchantTerm.init()



	def findByTitle(title):
	    """Print up to 10000 ``MerchantTerm`` documents whose title phrase-matches.

	    Each hit is printed as a tab-separated line of id, title, category and
	    classified_by, followed by the total hit count reported by the search.

	    :param title: phrase matched against the ``title`` field.
	    """
	    # MerchantTerm.search() is already limited to the document's index, so
	    # no separate Elasticsearch client has to be constructed.
	    search = MerchantTerm.search().query('match_phrase', title=title)
	    results = search[0:10000].execute()
	    for hit in results:
	        row = (hit.meta.id, hit.title, hit.category, hit.classified_by)
	        # str() keeps the listing going when a field holds a non-string value
	        # such as None, which plain concatenation would reject.
	        print("\t".join(str(value) for value in row))
	    print("Total Hits : " + str(results.hits.total))


	def findByCategory(category):
	    """Print up to 10000 ``MerchantTerm`` documents in a category.

	    Hits are selected with a ``match_phrase`` query on ``category`` and
	    printed one per line as tab-separated id, title, category and
	    classified_by, followed by the total hit count reported by the search.

	    :param category: phrase matched against the ``category`` field.
	    """
	    # MerchantTerm.search() is already limited to the document's index, so
	    # no separate Elasticsearch client has to be constructed.
	    search = MerchantTerm.search().query('match_phrase', category=category)
	    results = search[0:10000].execute()
	    for hit in results:
	        row = (hit.meta.id, hit.title, hit.category, hit.classified_by)
	        # str() keeps the listing going when a field holds a non-string value
	        # such as None, which plain concatenation would reject.
	        print("\t".join(str(value) for value in row))
	    print("Total Hits : " + str(results.hits.total))

	def findDescriptionByCategory(category):
	    """Print up to 1000 ``TransactionDescription`` documents in a category.

	    Hits are selected with a ``match_phrase`` query on ``category`` and
	    printed one per line as tab-separated id, description_1, category and
	    classified_by, followed by the total hit count reported by the search.

	    :param category: phrase matched against the ``category`` field.
	    """
	    # TransactionDescription.search() is already limited to the document's
	    # index, so no separate Elasticsearch client has to be constructed.
	    search = TransactionDescription.search().query('match_phrase', category=category)
	    results = search[0:1000].execute()
	    for hit in results:
	        row = (hit.meta.id, hit.description_1, hit.category, hit.classified_by)
	        # str() keeps the listing going when a field holds a non-string value
	        # such as None, which plain concatenation would reject.
	        print("\t".join(str(value) for value in row))
	    print("Total Hits : " + str(results.hits.total))

	def findDescriptionByTerm(term):
	    """Print up to 1000 ``TransactionDescription`` documents matching a term.

	    Hits are selected with a ``match_phrase`` query on ``description_1`` and
	    printed one per line as tab-separated id, description_1, category and
	    classified_by, followed by the total hit count reported by the search.

	    :param term: phrase matched against the ``description_1`` field.
	    """
	    # TransactionDescription.search() is already limited to the document's
	    # index, so no separate Elasticsearch client has to be constructed.
	    search = TransactionDescription.search().query('match_phrase', description_1=term)
	    results = search[0:1000].execute()
	    for hit in results:
	        row = (hit.meta.id, hit.description_1, hit.category, hit.classified_by)
	        # str() keeps the listing going when a field holds a non-string value
	        # such as None, which plain concatenation would reject.
	        print("\t".join(str(value) for value in row))
	    print("Total Hits : " + str(results.hits.total))


	def findAndClassify(term, category):
	    """Reclassify every ``MerchantTerm`` whose title phrase-matches ``term``.

	    Each match is printed as tab-separated id, title, category and
	    classified_by (the values before the update) and then saved with
	    ``category`` as its category and ``term`` as its ``classified_by``.
	    Finally the number of documents processed is printed.

	    :param term: phrase matched against ``title``; also stored as the
	        classifying term.
	    :param category: category assigned to every match.
	    """
	    search = MerchantTerm.search().query('match_phrase', title=term)
	    processed = 0
	    # Iterate with scan(), as the other bulk updaters in this file do: the
	    # former fixed 0:100000 window is larger than Elasticsearch's default
	    # index.max_result_window (10000), which rejects such a request.
	    for hit in search.scan():
	        row = (hit.meta.id, hit.title, hit.category, hit.classified_by)
	        # str() tolerates non-string values such as None on unclassified docs.
	        print("\t".join(str(value) for value in row))
	        hit.category = category
	        hit.classified_by = term
	        hit.save()
	        processed += 1
	    print("Total Hits : " + str(processed))


	def addTerm(term, category):
	    """Save one ``ClassificationTerm`` per comma-separated entry of ``term``.

	    Every non-empty entry is stripped and stored as ``dc_term`` with the
	    stripped ``category`` as ``dc_term_classification`` and a weight of 0.
	    The stripped entry is also used as the document id.

	    :param term: comma-separated list of terms.
	    :param category: classification recorded for every term.
	    """
	    classification = category.strip()
	    for raw in term.split(','):
	        name = raw.strip()
	        if not name:
	            # Ignore empty entries such as the one left by a trailing comma.
	            continue
	        doc = ClassificationTerm()
	        doc.dc_term = name
	        doc.dc_term_classification = classification
	        doc.dc_term_weight = 0
	        # Set the id through ``meta.id`` - the form this file already uses
	        # for MerchantSignatureCleaned - so the term itself keys the document.
	        doc.meta.id = name
	        doc.save()

	def findDescriptionAndClassify(term, category, match_type):
	    """Record classification terms and apply them to debit card transactions.

	    ``addTerm(term, category)`` is called first.  Then, for every non-empty
	    comma-separated entry of ``term``, each ``DebitCardTransaction`` whose
	    ``dc_description`` phrase-matches the entry is printed; when its
	    ``dc_category`` differs from the stripped, lower-cased ``category`` it is
	    updated (category and ``dc_classified_by``) and saved.

	    :param term: comma-separated list of terms.
	    :param category: target category; compared and stored stripped and
	        lower-cased.
	    :param match_type: documented by the author as 'match' for loose and
	        'match_phrase' for tight matching.
	        NOTE(review): the argument is accepted but never used - every query
	        is issued as 'match_phrase'.  Left unchanged because this function
	        rewrites data in bulk; confirm the intended behaviour before wiring
	        it in.
	    """
	    addTerm(term, category)
	    category = category.strip().lower()

	    for raw in term.split(','):
	        theTerm = raw.strip()
	        if not theTerm:
	            # Nothing to match for empty entries (e.g. a trailing comma).
	            continue

	        # NOTE(review): the 0:5000 slice is kept from the original query;
	        # presumably it only influences the scroll batch size - confirm.
	        search = DebitCardTransaction.search().query(
	            'match_phrase', dc_description=theTerm)[0:5000]
	        for count, aMatch in enumerate(search.scan(), 1):
	            print(str(count) + ":" + aMatch.meta.id + "\t" + aMatch.dc_description + "\t" + aMatch.dc_category + "\t" + aMatch.dc_classified_by)
	            if aMatch.dc_category != category:
	                print('reassigning from' , aMatch.dc_category , '>', category )
	                aMatch.dc_category = category
	                aMatch.dc_classified_by = theTerm
	                aMatch.save()



	def cleanMerchantSignatures():
	    """Copy every ``MerchantSignature`` into ``MerchantSignatureCleaned``.

	    Each scanned signature is printed (running number, id, description,
	    category, classified_by) and saved as a ``MerchantSignatureCleaned``
	    document whose id is the signature's ``ms_description``.
	    """
	    for count, aMatch in enumerate(MerchantSignature.search().scan(), 1):
	        print(str(count) + ":" + aMatch.meta.id + "\t" + aMatch.ms_description + "\t" + aMatch.ms_category + "\t" + aMatch.ms_classified_by)
	        ms_clean = MerchantSignatureCleaned()
	        ms_clean.ms_description = aMatch.ms_description
	        ms_clean.ms_category = aMatch.ms_category
	        ms_clean.ms_classified_by = aMatch.ms_classified_by
	        # ms_transactionCount is not a declared MerchantSignature field; it is
	        # only present on documents that had it assigned, so copy it when it
	        # exists instead of aborting the whole run on the first one without.
	        transaction_count = getattr(aMatch, 'ms_transactionCount', None)
	        if transaction_count is not None:
	            ms_clean.ms_transactionCount = transaction_count
	        ms_clean.meta.id = aMatch.ms_description
	        ms_clean.save()

	def findDebitCardMerchantsByCategory(category):
	    """Print and write the distinct descriptions of transactions in a category.

	    Transactions whose ``dc_category`` phrase-matches ``category`` are
	    scanned, their ``dc_description`` values de-duplicated and sorted, and
	    each one is printed and written on its own line to
	    ``<category>_merchants.txt`` (opened in 'w' mode).

	    :param category: phrase matched against ``dc_category``; also the prefix
	        of the output file name.
	    """
	    # NOTE(review): the 0:5000 slice is kept from the original query;
	    # presumably it only influences the scroll batch size - confirm.
	    search = DebitCardTransaction.search().query(
	        'match_phrase', dc_category=category)[0:5000]
	    merchantNames = {aMatch.dc_description for aMatch in search.scan()}

	    # The context manager guarantees the file is flushed and closed; the
	    # file is only (re)created once the scan has completed.
	    with open(category + '_merchants.txt', 'w') as outFile:
	        for merchantName in sorted(merchantNames):
	            print(merchantName)
	            outFile.write(merchantName + '\n')

	def findDebitCardMerchantsTermClassifications():
	    """Print which category each classifying term is associated with.

	    Every ``DebitCardTransaction`` is scanned; its ``dc_classified_by`` and
	    ``dc_category`` are printed and recorded (a later transaction overwrites
	    an earlier one with the same term).  Afterwards the recorded pairs are
	    printed sorted by term.
	    """
	    classifications = {}
	    # NOTE(review): the 0:5000 slice is kept from the original query;
	    # presumably it only influences the scroll batch size - confirm.
	    search = DebitCardTransaction.search().query('match_all')[0:5000]
	    for aMatch in search.scan():
	        classifications[aMatch.dc_classified_by] = aMatch.dc_category
	        print(aMatch.dc_classified_by + ':' + aMatch.dc_category)
	    # Iterate the (term, category) items: sorting the dict itself yields
	    # only its keys, which cannot be unpacked into a pair.
	    for k, v in sorted(classifications.items()):
	        print(k + " : " + v)



	def analyzeTransactionsForMerchantSignatures():
	    """Store on each merchant signature how many transactions match it.

	    For every scanned ``MerchantSignature`` a ``match_phrase`` query on
	    ``dc_description`` is run against ``DebitCardTransaction``; the reported
	    total is printed, assigned to ``ms_transactionCount`` and the signature
	    is saved.
	    """
	    signatures = MerchantSignature.search().query('match_all')
	    for signature in signatures.scan():
	        print(signature.ms_description.lower() + ':' + signature.ms_category)
	        lookup = DebitCardTransaction.search().query(
	            "match_phrase", dc_description=signature.ms_description)
	        response = lookup.execute()
	        print (str(response.hits.total))
	        signature.ms_transactionCount = response.hits.total
	        signature.save()


	def exportRules(filename):
	    """Write every ``ClassificationTerm`` as a ``find_desc_and_classify`` line.

	    Each scanned term produces one line of the form
	    ``find_desc_and_classify -c <classification> -t <term>``, which is
	    printed and written to ``filename`` (opened in 'w' mode).

	    :param filename: path of the file to (over)write.
	    """
	    search = ClassificationTerm.search()
	    # The context manager closes the file even when the scan raises.
	    with open(filename, 'w') as f:
	        for aMatch in search.scan():
	            outstr = 'find_desc_and_classify -c ' + aMatch.dc_term_classification + ' -t ' + aMatch.dc_term + '\n'
	            print(outstr)
	            f.write(outstr)

	def changeTermsClassification(fromClassification, toClassification):
	    """Move classification terms from one classification to another.

	    Every ``ClassificationTerm`` whose ``dc_term_classification``
	    phrase-matches ``fromClassification`` is reported, updated to
	    ``toClassification`` and saved.
	    """
	    matching = ClassificationTerm.search().query(
	        "match_phrase", dc_term_classification=fromClassification)
	    for rule in matching.scan():
	        print('from',rule.dc_term_classification,'to',toClassification)
	        rule.dc_term_classification = toClassification
	        rule.save()

	def createRulesFromGAMerchants(fromClassification, toClassification):
	    """Append ``find_desc_and_classify`` lines built from ``GAMerchant`` titles.

	    Every ``GAMerchant`` whose ``ga_category`` phrase-matches the stripped
	    ``fromClassification`` yields one line
	    ``find_desc_and_classify -c <toClassification> -t <title>``, which is
	    printed and appended to ``ga_merchants_<toClassification>.txt``.

	    :param fromClassification: phrase matched against ``ga_category``.
	    :param toClassification: classification written into each line and used
	        in the output file name.
	    """
	    # The context manager flushes and closes the file, which the function
	    # previously left open.
	    with open('ga_merchants_' + toClassification + '.txt', 'a') as f:
	        search = GAMerchant.search().query(
	            "match_phrase", ga_category=fromClassification.strip())
	        for aMatch in search.scan():
	            outstr = 'find_desc_and_classify -c ' + toClassification + ' -t ' + aMatch.title + '\n'
	            print(outstr)
	            f.write(outstr)

	def classifyMerchantSignature(term, category):
	    """Assign ``category`` to merchant signatures matching any listed term.

	    ``term`` is split on commas; for each non-empty, stripped entry every
	    ``MerchantSignature`` whose ``ms_description`` phrase-matches it is
	    printed (with its values before the update) and saved with
	    ``ms_category`` set to ``category`` and ``ms_classified_by`` set to the
	    entry.  Matching always uses ``match_phrase`` and ``category`` is stored
	    exactly as given.

	    :param term: comma-separated list of terms.
	    :param category: category assigned to every match.
	    """
	    for raw in term.split(','):
	        theTerm = raw.strip()
	        if not theTerm:
	            # Nothing to match for empty entries (e.g. a trailing comma).
	            continue

	        search = MerchantSignature.search().query(
	            'match_phrase', ms_description=theTerm)
	        # The running number restarts at 1 for every term.
	        for count, aMatch in enumerate(search.scan(), 1):
	            print(str(count) + ":" + aMatch.meta.id + "\t" + aMatch.ms_description + "\t" + aMatch.ms_category + "\t" + aMatch.ms_classified_by)
	            aMatch.ms_category = category
	            aMatch.ms_classified_by = theTerm
	            aMatch.save()
No results found