Skip to content

Instantly share code, notes, and snippets.

@kabir0st
Last active February 8, 2020 04:57
Show Gist options
  • Save kabir0st/f8158ef935999c01ff619ae1c30659df to your computer and use it in GitHub Desktop.
Save kabir0st/f8158ef935999c01ff619ae1c30659df to your computer and use it in GitHub Desktop.
import xml.etree.ElementTree as et
import json
# Template holding the default value of every field in an output record.
json_format = {
    'amud': '',
    'daf': '',
    'perek_id': 1,
    'text_eng': [],
    'text_heb': [],
    'type': 0,
}


def extract_records(root):
    """Build one output record per ``Talmud_eng`` element found under *root*.

    Args:
        root: An ``ElementTree`` or ``Element`` whose ``Story`` children
            contain ``Talmud_eng`` elements.

    Returns:
        A list of independent dicts shaped like ``json_format``. ``daf``,
        ``amud`` and ``perek_id`` keep the value from the previous record
        whenever the current element does not provide them; ``text_eng`` and
        ``text_heb`` start empty for every record.
    """
    # Running state: the scalar fields persist across records, the two text
    # lists are replaced after each record is emitted.
    current = dict(json_format, text_eng=[], text_heb=[])
    records = []

    for story in root.findall('Story'):
        for eng in story.findall('Talmud_eng'):
            # daf / amud: read from a fixed position in the element tree and
            # taken as the 2nd and 4th whitespace-separated words.
            try:
                daf_texts = eng[0][0][1][1].text.split()
                current['daf'] = str(daf_texts[1])
                current['amud'] = str(daf_texts[3])
                print(current)
            except (IndexError, AttributeError):
                # Node missing, without text, or with too few words:
                # best-effort, keep whatever value is already set.
                pass

            # perek_id: second word of another fixed-position node.
            try:
                perek_id_text = eng[0][1][1][1].text
                current['perek_id'] = str(perek_id_text.split()[1])
            except (IndexError, AttributeError):
                pass

            # Hebrew text: every Talmud_Heb descendant, at any depth.
            heb = eng.findall('.//Talmud_Heb')
            print(heb)
            for node in heb:
                current['text_heb'].append(node.text)

            # English text: direct children whose tag contains 'Talmud_'
            # (other than Talmud_Heb), grouped by tag name. dict.fromkeys
            # de-duplicates the names while keeping first-appearance order,
            # so the result does not depend on set/hash ordering.
            tag_names = dict.fromkeys(t.tag for t in eng.findall('.//*'))
            for tag_name in tag_names:
                if 'Talmud_' in tag_name and tag_name != 'Talmud_Heb':
                    for tag in eng.findall(tag_name):
                        if tag.text:
                            current['text_eng'].append(tag.text)

            # Append a snapshot, not the running dict itself; otherwise every
            # entry in the result would alias one and the same object.
            records.append(dict(current))

            # Start the next record with fresh text lists.
            current['text_eng'] = []
            current['text_heb'] = []

    return records


def main():
    """Convert ``xm.xml`` into ``output.json``."""
    data = et.parse('xm.xml')
    output_json = extract_records(data)
    with open('output.json', 'w') as output_file:
        json.dump(output_json, output_file)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment