kabir0st · February 8, 2020 04:57
diff --git a/simple_parser.py b/simple_parser.py
 import xml.etree.ElementTree as et
 import json 
 # Template holding the default value of every field in an output record.
 # It is copied for each element; it is never mutated or appended itself.
 json_format = {
     'amud': '', 'daf': '', 'perek_id': 1, 'text_eng': [], 'text_heb': [], 'type': 0
 }


 def _update_location(eng, state):
     """Refresh 'daf', 'amud' and 'perek_id' in *state* from *eng*.

     The values are read from fixed child positions below the first child of
     *eng*. When a position does not exist, or holds no text, the value
     already in *state* is kept, so an element without that information
     inherits it from the element processed before it.
     """
     # Daf and amud are the 2nd and 4th whitespace-separated words of the text.
     try:
         daf_texts = eng[0][0][1][1].text.split()
         state['daf'] = str(daf_texts[1])
         state['amud'] = str(daf_texts[3])
         print(state)
     except (IndexError, AttributeError):
         # IndexError: a node or word is missing; AttributeError: text is None.
         pass

     # The perek id is the 2nd word of the text.
     try:
         state['perek_id'] = str(eng[0][1][1][1].text.split()[1])
     except (IndexError, AttributeError):
         pass


 def extract_records(root):
     """Return one record dict per ``Story/Talmud_eng`` element under *root*.

     *root* may be an ``ElementTree`` or an ``Element``; only ``Story``
     elements that are direct children of the root are visited.

     Every record is a separate dict with the keys of ``json_format``:

     * ``daf``, ``amud``, ``perek_id`` -- see ``_update_location``;
     * ``text_heb`` -- text of every ``Talmud_Heb`` descendant;
     * ``text_eng`` -- non-empty text of every direct child whose tag
       contains ``Talmud_`` and is not ``Talmud_Heb``, in document order.
     """
     # Running daf/amud/perek_id values that carry over between elements.
     state = dict(json_format)
     records = []
     for story in root.findall('Story'):
         for eng in story.findall('Talmud_eng'):
             _update_location(eng, state)

             heb = eng.findall('.//Talmud_Heb')
             print(heb)

             # Build a fresh dict with fresh lists for every element.
             # Appending one shared dict would make every entry of the result
             # show the state of the last element only.
             record = dict(state)
             record['text_heb'] = [node.text for node in heb]
             # Scan the direct children in document order rather than looping
             # over a set of tag names, whose order varies between runs.
             record['text_eng'] = [
                 child.text
                 for child in eng
                 if isinstance(child.tag, str)
                 and 'Talmud_' in child.tag
                 and child.tag != 'Talmud_Heb'
                 and child.text
             ]
             records.append(record)
     return records


 def main():
     """Parse ``xm.xml`` and write the extracted records to ``output.json``."""
     output_json = extract_records(et.parse('xm.xml'))
     with open('output.json', 'w') as output_file:
         json.dump(output_json, output_file)


 # Guarded so that importing the module does not touch the filesystem;
 # running the file as a script converts xm.xml exactly as before.
 if __name__ == '__main__':
     main()
	import xml.etree.ElementTree as et
	import json
	# Template holding the default value of every field in an output record.
	# It is copied for each element; it is never mutated or appended itself.
	json_format = {
	    'amud': '', 'daf': '', 'perek_id': 1, 'text_eng': [], 'text_heb': [], 'type': 0
	}


	def _update_location(eng, state):
	    """Refresh 'daf', 'amud' and 'perek_id' in *state* from *eng*.

	    The values are read from fixed child positions below the first child of
	    *eng*. When a position does not exist, or holds no text, the value
	    already in *state* is kept, so an element without that information
	    inherits it from the element processed before it.
	    """
	    # Daf and amud are the 2nd and 4th whitespace-separated words of the text.
	    try:
	        daf_texts = eng[0][0][1][1].text.split()
	        state['daf'] = str(daf_texts[1])
	        state['amud'] = str(daf_texts[3])
	        print(state)
	    except (IndexError, AttributeError):
	        # IndexError: a node or word is missing; AttributeError: text is None.
	        pass

	    # The perek id is the 2nd word of the text.
	    try:
	        state['perek_id'] = str(eng[0][1][1][1].text.split()[1])
	    except (IndexError, AttributeError):
	        pass


	def extract_records(root):
	    """Return one record dict per ``Story/Talmud_eng`` element under *root*.

	    *root* may be an ``ElementTree`` or an ``Element``; only ``Story``
	    elements that are direct children of the root are visited.

	    Every record is a separate dict with the keys of ``json_format``:

	    * ``daf``, ``amud``, ``perek_id`` -- see ``_update_location``;
	    * ``text_heb`` -- text of every ``Talmud_Heb`` descendant;
	    * ``text_eng`` -- non-empty text of every direct child whose tag
	      contains ``Talmud_`` and is not ``Talmud_Heb``, in document order.
	    """
	    # Running daf/amud/perek_id values that carry over between elements.
	    state = dict(json_format)
	    records = []
	    for story in root.findall('Story'):
	        for eng in story.findall('Talmud_eng'):
	            _update_location(eng, state)

	            heb = eng.findall('.//Talmud_Heb')
	            print(heb)

	            # Build a fresh dict with fresh lists for every element.
	            # Appending one shared dict would make every entry of the result
	            # show the state of the last element only.
	            record = dict(state)
	            record['text_heb'] = [node.text for node in heb]
	            # Scan the direct children in document order rather than looping
	            # over a set of tag names, whose order varies between runs.
	            record['text_eng'] = [
	                child.text
	                for child in eng
	                if isinstance(child.tag, str)
	                and 'Talmud_' in child.tag
	                and child.tag != 'Talmud_Heb'
	                and child.text
	            ]
	            records.append(record)
	    return records


	def main():
	    """Parse ``xm.xml`` and write the extracted records to ``output.json``."""
	    output_json = extract_records(et.parse('xm.xml'))
	    with open('output.json', 'w') as output_file:
	        json.dump(output_json, output_file)


	# Guarded so that importing the module does not touch the filesystem;
	# running the file as a script converts xm.xml exactly as before.
	if __name__ == '__main__':
	    main()