|
#!/usr/bin/env python |
|
from __future__ import division |
|
import json |
|
from pprint import pprint |
|
import sys, getopt |
|
import re |
|
|
|
|
|
class Match(object):
    """A matched entry together with its similarity weight.

    Attributes (stored exactly as passed to the constructor):
        match:   identifier of the matched entry.
        ratio:   numeric similarity weight; instances are ordered by it.
        matchIn: the attribute names on which the entry matched.

    Ordering is defined on ``ratio`` only.  ``__eq__`` and ``__hash__`` are
    deliberately not overridden, so hashing behaviour is unchanged.
    """

    def __init__(self, match, ratio, matchIn):
        self.match = match
        self.ratio = ratio
        self.matchIn = matchIn

    def __repr__(self):
        # The trailing newline is part of the representation, so printing an
        # instance leaves a blank line after it.
        return '{}: {} has weight {} and matched in {}\n'.format(
            self.__class__.__name__,
            self.match,
            self.ratio,
            self.matchIn)

    def __cmp__(self, other):
        """Three-way comparison on ``ratio``: return -1, 0 or 1.

        Only consulted by Python 2.  Built from the comparison operators
        rather than ``ratio.__gt__`` so that it can report "less than" and
        works for any mix of int and float ratios.
        """
        return (self.ratio > other.ratio) - (self.ratio < other.ratio)

    # Rich ordering methods: Python 3 ignores __cmp__, so these make
    # instances orderable there as well.
    def __lt__(self, other):
        if not isinstance(other, Match):
            return NotImplemented
        return self.ratio < other.ratio

    def __le__(self, other):
        if not isinstance(other, Match):
            return NotImplemented
        return self.ratio <= other.ratio

    def __gt__(self, other):
        if not isinstance(other, Match):
            return NotImplemented
        return self.ratio > other.ratio

    def __ge__(self, other):
        if not isinstance(other, Match):
            return NotImplemented
        return self.ratio >= other.ratio
|
|
|
def _attribute_bonus(att):
    """Return the fractional bonus encoded in an attribute name.

    The bonus is ``1 / (ord(c) - 96)`` where ``c`` is the single character
    following the first ``-`` in ``att`` (so ``a`` gives 1, ``b`` gives 0.5,
    and so on).  Names that do not carry such a single-character suffix, or
    whose suffix would make the divisor zero, get no bonus (0.0) instead of
    raising.
    """
    parts = att.split('-')
    if len(parts) < 2 or len(parts[1]) != 1:
        return 0.0
    divisor = ord(parts[1]) - 96  # 96 is ord('a') - 1
    if divisor == 0:
        return 0.0
    # 1.0 keeps this a true division even without ``__future__.division``.
    return 1.0 / divisor


def sku_match(match, against):
    """Score how similar ``against`` is to ``match`` by shared attribute values.

    Every attribute of ``against`` whose value equals the value stored under
    the same key in ``match`` (and is not ``None`` there) adds ``1`` plus the
    attribute's bonus (see ``_attribute_bonus``) to the weight.  Attributes
    missing from ``match`` are treated as non-matching.

    Args:
        match:   mapping of attribute name to value for the reference entry.
        against: mapping of attribute name to value for the candidate entry.

    Returns:
        A dict ``{'weight': <number>, 'matchIn': <list of attribute names>}``.
        ``weight`` is 0 when nothing matched; a larger weight means a closer
        match.
    """
    weight = 0
    matchIn = []
    for att in against:
        # .get(): an attribute absent from ``match`` simply cannot match.
        a_att = match.get(att)
        b_att = against[att]
        if a_att is not None and b_att == a_att:
            matchIn.append(att)
            weight = weight + 1 + _attribute_bonus(att)

    return {'weight': weight, 'matchIn': matchIn}
|
|
|
def similar(file, match):
    """Print the ten entries of a JSON file most similar to entry ``match``.

    Loads ``file`` as JSON, scores every other entry against the entry stored
    under key ``match`` using ``sku_match`` (brute force over all entries),
    and prints the ten highest-weighted results.  Entries with a weight of 0
    are ignored.

    Args:
        file:  path of the JSON file to load.
        match: key of the reference entry inside the loaded JSON.

    Returns:
        None.  All results are written to standard output.
    """
    # Single-argument print(...) behaves the same on Python 2 and Python 3;
    # the doubled spaces reproduce the separators of the original
    # comma-separated print statement.
    print('search for match with  {}  in  {}'.format(match, file))

    with open(file) as data_file:
        skus = json.load(data_file)

    try:
        reference = skus[match]
    except KeyError:
        # An unknown key used to escape as an unhandled KeyError traceback.
        print('sku {} not found in {}'.format(match, file))
        return

    matched = []
    for sku in skus:
        if sku == match:
            continue  # never compare the reference entry with itself
        result = sku_match(reference, skus[sku])
        if result['weight'] > 0:
            matched.append(Match(sku, result['weight'], result['matchIn']))

    if not matched:
        print('no match found')
        return

    # Highest ratio first, truncated to the top ten.
    topTen = sorted(matched, key=lambda m: m.ratio, reverse=True)[:10]
    print('top matched docs :')
    for item in topTen:
        print(item)
|
|
|
# Command-line entry point: match.py -i <inputfile.json> -m <match>
#   -h            print usage and exit
#   -i / --ifile  path of the JSON file to search
#   -m / --match  key of the reference entry
# ``-t <value>`` is still accepted by the option string but is not used.
if __name__ == '__main__':
    usage = 'match.py -i <inputfile.json> -m <match>'
    argv = sys.argv[1:]

    # Start from None so that an omitted option is detected below instead of
    # surfacing as a NameError at the call to similar().
    inputfile = None
    match = None

    try:
        opts, args = getopt.getopt(argv, "hi:m:t:", ["ifile=", "match="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-m", "--match"):
            match = arg

    if inputfile is None or match is None:
        # Both options are required; same exit status as a parse error.
        print(usage)
        sys.exit(2)

    similar(inputfile, match)