Last active
June 29, 2020 13:48
-
-
Save alabrashJr/909257239356434b8d93d12851d9ba1d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pysolr | |
from xml.etree import ElementTree | |
import argparse | |
# Tag names that exclude an element from the flattened product row whenever
# one of them appears anywhere on the element's path.
skip = {
    "shippingDetails",
    "priceDetail",
    "dailyOfferCategory",
    "stockInfo_availability",
    "abroad",
    "imageDetail",
    "bigImageUrl",
    "specialOffer",
    "editorsChoice",
    "shippingDate",
    "shippingTime",
    "types",
    "stockInfo",
    "hasVariant",
    "member",
    "images",
    "shippingFee",
    "image",
    "globalTradeItemNumber",
    "quantity",
    "soldQuantity",
    "variants",
    "subTitle",
    'windowOptionFlag',
    'productLine',
    'affiliate',
}
def str2bool(v):
    """Translate a yes/no style command-line token into a ``bool``.

    Values that are already ``bool`` are returned untouched. Strings are
    compared case-insensitively against the accepted spellings.

    Raises:
        argparse.ArgumentTypeError: if the text is not a recognised spelling.
    """
    if isinstance(v, bool):
        return v

    token = v.lower()
    if token in ('yes', 'true', 't', 'y', '1'):
        return True
    if token in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')
def args():
    """Parse the script's command line and return the resulting namespace.

    Positional arguments, in order:
      * ``solr_adress``   - URL of the Solr core (spelling is part of the
        attribute name the rest of the script reads).
      * ``filenames``     - comma-separated list of XML paths.
      * ``always_commit`` - optional boolean token converted by ``str2bool``;
        defaults to ``True`` when omitted.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "solr_adress",
        help="set solr address such as \'http://127.0.0.1:8983/solr/sherlock\'",
    )
    cli.add_argument(
        "filenames",
        help="set input xml paths and split them by comma path1,path2",
    )
    commit_options = {
        "type": str2bool,
        "nargs": '?',
        "const": True,
        "default": True,
        "help": "set solr commit mode yes/no , true/false,1/0",
    }
    cli.add_argument("always_commit", **commit_options)
    return cli.parse_args()
def insert(row):
    """Hand ``row`` to the module-level ``solr`` client for indexing."""
    documents = row
    solr.add(docs=documents)
def update(id, updated_dic: dict):
    """Overwrite fields of the document whose ``id`` field matches ``id``.

    The document is fetched with an ``id:<id>`` query through the
    module-level ``solr`` client, the key/value pairs of ``updated_dic`` are
    applied to the first hit, and the fetched documents are sent back with
    ``solr.add``.

    Args:
        id: identifier used in the ``id:`` query; converted with ``str`` so
            non-string ids do not break the concatenation.
        updated_dic: mapping of field name to new value.

    Raises:
        IndexError: if the query returns no document.
    """
    result = list(solr.search(q='id:' + str(id)))
    if not result:
        raise IndexError('no Solr document found for id %r' % (id,))
    # Iterating the dict directly yields keys only; apply real key/value pairs.
    for field, value in updated_dic.items():
        result[0][field] = value
    solr.add(docs=result)
def delete(id):
    """Ask the module-level ``solr`` client to delete the document ``id``."""
    doc_id = id
    solr.delete(id=doc_id)
def delete_all():
    """Issue a delete-by-query with the query string ``*`` via ``solr``."""
    everything = "*"
    solr.delete(q=everything)
def query(q=None):
    """Search through the module-level ``solr`` client.

    With a truthy ``q`` the hits are returned as a plain list. Without one,
    the raw object returned by ``solr.search(q="*")`` is handed back
    unchanged, so the return type differs between the two cases.
    """
    if not q:
        return solr.search(q="*")
    hits = solr.search(q)
    return list(hits)
# Rows of products that had no category name; they are collected here instead
# of being yielded. The (misspelt) name is kept because it is the module-level
# name this function has always appended to.
warinings = []


def _solr_field_name(key, value):
    """Return the field name under which a flattened product key is indexed."""
    if key == "price":
        return key + "_fns"
    if isinstance(value, list):
        return "attr_" + key
    if key not in ("id", "title"):
        return key + "_sns"
    return key


def iter_xml_all_category(filename, skip=skip):
    """Stream ``<product>`` elements from an XML file as flat dictionaries.

    The file is parsed incrementally. For every element inside a product
    that has non-blank text and whose path contains no tag listed in
    ``skip``, the text is stored under a key built by joining the tag path
    below ``product`` with ``_``. A key seen more than once in the same
    product holds a list of all its texts.

    Args:
        filename: path (or file object) accepted by ``ElementTree.iterparse``.
        skip: collection of tag names; an element is ignored when any tag on
            its path is in this collection.

    Yields:
        ``(category, doc)`` tuples, one per product that has a
        ``categories_category_name`` entry. ``category`` is the first
        category name. ``doc`` is the row with its keys renamed: ``price``
        becomes ``price_fns``, list-valued keys get an ``attr_`` prefix,
        ``id`` and ``title`` are kept, every other key gets a ``_sns`` suffix.

    Products without a category name are appended to the module-level
    ``warinings`` list instead of being yielded.
    """
    doc = ElementTree.iterparse(filename, events=('start', 'end'))
    # Consume the root's start event so the root can be emptied as we go.
    _, root = next(doc)
    row = {}
    path = []
    for event, element in doc:
        if event == "start":
            path.append(element.tag)
            continue

        tag = element.tag
        if tag == "products":
            continue

        if tag == "product":
            if "categories_category_name" in row:
                names = row["categories_category_name"]
                # A single category is stored as a plain string; indexing it
                # would return only its first character.
                category = names[0] if isinstance(names, list) else names
                yield (category, {_solr_field_name(k, v): v for k, v in row.items()})
            else:
                warinings.append(row)
            row = {}
            path = []
            # Drop the finished product so memory use stays flat on big files.
            root.clear()
            continue

        text = element.text
        if text and text.strip("\n ") and set(path).isdisjoint(skip):
            key = "_".join(path[1:])
            if key not in row:
                row[key] = text
            elif isinstance(row[key], list):
                row[key].append(text)
            else:
                row[key] = [row[key], text]

        # Pop the innermost entry; removing by value would drop the outermost
        # occurrence when a tag name repeats along the path.
        if path and path[-1] == tag:
            path.pop()
def get_n(ite, n: int):
    """Return up to ``n`` items taken from ``ite`` as a list.

    Items are consumed from ``ite``; a later call continues where this one
    stopped. When fewer than ``n`` items remain, the shorter list is returned
    rather than an error: calling ``next`` inside a generator expression
    would turn exhaustion into ``RuntimeError``.

    Args:
        ite: iterator (or any iterable) to draw from.
        n: maximum number of items; values below zero are treated as zero.
    """
    from itertools import islice

    return list(islice(ite, max(n, 0)))
def upload_xml(filename,):
    """Index every product parsed from ``filename``, one ``insert`` per product.

    The category yielded alongside each document is not used here.
    """
    for _category, document in iter_xml_all_category(filename):
        insert(document)
if __name__ == '__main__':
    cli = args()
    # ``solr`` is bound at module level because every helper above reads it.
    # Example address: http://127.0.0.1:8983/solr/sherlock
    solr = pysolr.Solr(cli.solr_adress, always_commit=cli.always_commit)
    xml_paths = cli.filenames.split(",")
    for xml_path in xml_paths:
        upload_xml(filename=xml_path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment