Skip to content

Instantly share code, notes, and snippets.

@alabrashJr
Last active June 29, 2020 13:48
Show Gist options
  • Save alabrashJr/909257239356434b8d93d12851d9ba1d to your computer and use it in GitHub Desktop.
Save alabrashJr/909257239356434b8d93d12851d9ba1d to your computer and use it in GitHub Desktop.
import argparse
import itertools
from xml.etree import ElementTree

import pysolr
# Tag names whose content is ignored by iter_xml_all_category: a text node is
# dropped whenever any tag on its element path appears in this set.
skip = {
    "shippingDetails", "priceDetail", "dailyOfferCategory",
    "stockInfo_availability", "abroad", "imageDetail", "bigImageUrl",
    "specialOffer", "editorsChoice", "shippingDate", "shippingTime",
    "types", "stockInfo", "hasVariant", "member", "images", "shippingFee",
    "image", "globalTradeItemNumber", "quantity", "soldQuantity",
    "variants", "subTitle", 'windowOptionFlag', 'productLine', 'affiliate',
}
def str2bool(v):
    """Convert a command-line style yes/no token to a ``bool``.

    Accepted spellings (case-insensitive, surrounding whitespace ignored):
    ``yes/true/t/y/1`` for ``True`` and ``no/false/f/n/0`` for ``False``.
    Real ``bool`` values are returned unchanged.  Non-string values are
    compared through their ``str()`` form, so ``1`` and ``0`` work as well.

    Raises:
        argparse.ArgumentTypeError: if the value is not one of the
            recognised spellings (this includes ``None``).
    """
    if isinstance(v, bool):
        return v
    # Normalise once; str() keeps non-string input from failing with an
    # AttributeError on .lower() and routes it to the intended error below.
    token = str(v).strip().lower()
    if token in ('yes', 'true', 't', 'y', '1'):
        return True
    if token in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')
def args():
    """Parse the script's command-line arguments.

    Positional arguments, in order:
      * ``solr_adress``   - URL of the Solr core.
      * ``filenames``     - comma-separated list of input XML paths.
      * ``always_commit`` - optional boolean token converted by ``str2bool``;
        defaults to ``True`` when omitted.

    Returns:
        The ``argparse.Namespace`` produced by ``parse_args()``.
    """
    cli = argparse.ArgumentParser()
    # (name, keyword options) for each positional, registered in this order.
    positionals = (
        ("solr_adress",
         dict(help="set solr address such as \'http://127.0.0.1:8983/solr/sherlock\'")),
        ("filenames",
         dict(help="set input xml paths and split them by comma path1,path2")),
        ("always_commit",
         dict(type=str2bool, nargs='?', const=True, default=True,
              help="set solr commit mode yes/no , true/false,1/0")),
    )
    for name, options in positionals:
        cli.add_argument(name, **options)
    return cli.parse_args()
def insert(row):
    """Send one document (or a list of documents) to the ``solr`` client.

    Args:
        row: a single document dict, or an iterable of document dicts.

    A lone dict is wrapped in a list before being handed to ``solr.add``:
    the client's ``add`` is documented as taking a list of documents, and
    wrapping keeps the call valid regardless of whether the installed
    pysolr version performs that normalisation itself.
    """
    docs = [row] if isinstance(row, dict) else row
    solr.add(docs=docs)
def update(id, updated_dic: dict):
    """Overwrite fields of the indexed document matching ``id``.

    The document is fetched with an ``id:<id>`` query, the entries of
    ``updated_dic`` are merged into the first hit, and the fetched
    documents are sent back through ``solr.add``.

    Args:
        id: identifier used in the ``id:`` query; converted with ``str()``.
        updated_dic: mapping of field name -> new value.

    Raises:
        IndexError: if the query returns no document.
    """
    result = list(solr.search(q='id:' + str(id)))
    if not result:
        # Same exception type the bare result[0] lookup produced, but with a
        # message that says what actually went wrong.
        raise IndexError("no Solr document matches id %r" % (id,))
    # dict.update walks key/value pairs; iterating the dict directly (as the
    # previous loop did) yields only keys and fails to unpack.
    result[0].update(updated_dic)
    solr.add(docs=result)
def delete(id):
    """Ask the module-level ``solr`` client to delete the document ``id``."""
    doc_id = id
    solr.delete(id=doc_id)
def delete_all():
    """Issue a delete-by-query with the query ``*`` on the ``solr`` client.

    NOTE(review): presumably this matches every document in the core -
    confirm against the Solr query parser in use.
    """
    everything = "*"
    solr.delete(q=everything)
def query(q=None):
    """Run a search on the module-level ``solr`` client.

    Args:
        q: query string; when falsy the query ``*`` is used instead.

    Returns:
        A ``list`` of the hits when ``q`` is given; otherwise the raw
        object returned by ``solr.search`` (not converted to a list).
    """
    if not q:
        return solr.search(q="*")
    return list(solr.search(q))
def _rename_for_solr(row):
    """Return a copy of ``row`` with its keys decorated for indexing."""
    renamed = {}
    for key, value in row.items():
        if key == "price":
            key = key + "_fns"
        elif isinstance(value, list):
            key = "attr_" + key
        elif key not in ("id", "title"):
            key = key + "_sns"
        renamed[key] = value
    return renamed


def iter_xml_all_category(filename, skip=skip):
    """Stream ``<product>`` records out of an XML file.

    The file is parsed incrementally.  For every element inside a product
    that carries non-blank text, the text is stored under a key built by
    joining the tag path below ``product`` with ``_`` (for example
    ``categories_category_name``).  A key seen more than once collects its
    values in a list.  Text is ignored when any tag on its path is in
    ``skip``.

    When a product closes, its keys are renamed: ``price`` gets the suffix
    ``_fns``, list-valued keys get the prefix ``attr_``, and every other
    key except ``id`` and ``title`` gets the suffix ``_sns``.

    Args:
        filename: path (or file object) accepted by ``ElementTree.iterparse``.
        skip: iterable of tag names to ignore; defaults to the module-level
            ``skip`` set.

    Yields:
        ``(category, row)`` tuples, where ``category`` is the first value
        collected under ``categories_category_name`` and ``row`` is the
        renamed field dict.

    Side effects:
        Products without ``categories_category_name`` are not yielded; their
        raw (un-renamed) field dict is appended to the module-level list
        ``warinings``, which is created on first use if it does not exist.
    """
    global warinings
    skipped_tags = frozenset(skip)  # accept any iterable, not only sets
    doc = ElementTree.iterparse(filename, events=('start', 'end'))
    # The first event is the start of the root element; consuming it here
    # keeps the root tag out of ``path``.
    _, root = next(doc)
    row = {}
    path = []
    for event, element in doc:
        tag = element.tag
        if event == "start":
            path.append(tag)
            continue

        if tag == "product":
            finished, row, path = row, {}, []
            # Drop the parsed children from the root so memory use stays
            # flat while streaming large files.
            root.clear()
            if "categories_category_name" in finished:
                names = finished["categories_category_name"]
                # A product with a single category stores a plain string;
                # indexing it would return only its first character.
                category = names[0] if isinstance(names, list) else names
                yield (category, _rename_for_solr(finished))
            else:
                try:
                    warinings.append(finished)
                except NameError:
                    # The global list is not defined anywhere else.
                    warinings = [finished]
        elif tag != "products":
            text = element.text
            # strip() with no argument also discards tab/CR indentation, so
            # container elements in tab-indented files are not mistaken for
            # fields.
            if text and text.strip() and skipped_tags.isdisjoint(path):
                field = "_".join(path[1:])
                if field not in row:
                    row[field] = text
                elif isinstance(row[field], list):
                    row[field].append(text)
                else:
                    row[field] = [row[field], text]
            # Pop the innermost occurrence: list.remove would drop the
            # outermost one when a tag is nested inside itself.
            if path and path[-1] == tag:
                path.pop()
            elif tag in path:
                path.remove(tag)
def get_n(ite, n: int):
    """Return the next ``n`` items of ``ite`` as a list.

    Fewer than ``n`` items are returned when ``ite`` runs out first; an
    exhausted iterator gives ``[]``.  A negative ``n`` is treated as zero.

    Calling ``next()`` inside a generator expression would instead turn
    exhaustion into ``RuntimeError`` (PEP 479), so ``islice`` is used.
    """
    return list(itertools.islice(ite, max(n, 0)))
def upload_xml(filename,):
    """Index every product record of ``filename``.

    Each row produced by ``iter_xml_all_category`` is passed to ``insert``
    one at a time; the category half of each pair is not used.
    """
    for _category, document in iter_xml_all_category(filename):
        insert(document)
if __name__ == '__main__':
    # Script entry point: connect to Solr, then upload each listed XML file.
    arg = args()
    # Example address: http://127.0.0.1:8983/solr/sherlock
    # ``solr`` is the module-level client used by insert/update/delete/query.
    solr = pysolr.Solr(arg.solr_adress, always_commit=arg.always_commit)
    for filename in arg.filenames.split(","):
        # Tolerate "a.xml, b.xml" and trailing commas instead of trying to
        # open " b.xml" or an empty path.
        filename = filename.strip()
        if not filename:
            continue
        upload_xml(filename=filename)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment