amon-ra · February 18, 2021 16:51
diff --git a/newsml-import.py b/newsml-import.py
 import xml.etree.ElementTree as ET
 import os 
 import re
 import sys
 import traceback
 from datetime import datetime

 # Creates a csv to import with wp-all-import and redirect (last two columns)
 # CSV_FORMAT is written as the first (header) line of the output file; the
 # rows returned by parse_article / parse_opinion follow the same field order.
 CSV_FORMAT = "id;owner;title;subtitle;abstract;content;date;image;tags;category;slug;link;link2"
 # Path of the CSV file the script writes.
 FNAME = "data.csv"

 def parse_article(item):
     """Flatten one NewsML article document into a CSV row.

     Args:
         item: Parsed document exposing ``find`` with ElementTree path
             syntax (the script passes the tree returned by ``ET.parse``).

     Returns:
         A list of 13 strings ordered as in ``CSV_FORMAT``: id, owner,
         title, subtitle, abstract, content, date, image, tags, category,
         slug, link, link2. Title, subtitle, abstract and content are
         wrapped in double quotes; ``tags`` and ``image`` are ``''`` when
         the corresponding attribute (or, for the image, the element) is
         absent.

     Raises:
         AttributeError: A mandatory element is missing from the document.
         KeyError: ``Euid``, ``Value`` or the article ``Href`` is missing.
         IndexError: The article link has fewer than three path segments.
         TypeError, ValueError: The date is empty or does not start with
             ``YYYYMMDD``.
     """
     def quoted(text):
         # Hand-rolled CSV quoting: embedded double quotes are swapped for a
         # typographic one so the field can be wrapped in plain quotes.
         return '"' + text.replace('"', '”') + '"'

     c = item.find('./NewsItem/NewsComponent/NewsComponent[1]/NewsComponent')
     cat = c.find('./DescriptiveMetadata/Property[1]').attrib['Value']
     post_id = c.attrib['Euid']

     # Only the leading YYYYMMDD part is parsed; the raw value is what ends
     # up in the "date" column.
     date = c.find('./DescriptiveMetadata/DateLineDate').text
     dt = datetime.strptime(date[:8], "%Y%m%d")

     title = c.find('./NewsLines/HeadLine').text or ''
     subtitle = c.find('./NewsLines/SubHeadLine').text or ''
     abstract = c.find('./ContentItem/DataContent/nitf/body/body.head/abstract/p').text or ''
     content = c.find('./ContentItem/DataContent/nitf/body/body.content').text or ''

     owner = c.find('./ContentItem/DataContent/nitf/body/body.head/rights/rights.owner').text or 'periodicoclm'
     # Collapse every spelling that mentions the site name into one owner.
     if 'periodicoclm' in owner.lower():
         owner = 'periodicoclm'

     # A keyword without a "key" attribute used to leave an attrib dict in
     # the row, which broke the caller's ';'.join(); fall back to ''.
     keyword = c.find('./ContentItem/DataContent/nitf/head/docdata/key-list/keyword')
     tags = keyword.attrib.get('key', '')

     link = c.find('./ContentItem').attrib['Href'].replace('http://www.periodicoclm.es/', '')
     slug = link.split('/')[2]
     # Redirect target: /YYYY/MM/DD/<slug>/
     link2 = '/' + dt.strftime('%Y/%m/%d') + '/' + slug + '/'

     # The image is optional: a missing element or a missing Href both yield
     # '' (a missing Href previously left the attrib dict in the row).
     picture = item.find('./NewsItem/NewsComponent/NewsComponent[2]/NewsComponent/NewsComponent/ContentItem')
     image = picture.attrib.get('Href', '') if picture is not None else ''

     return [post_id, owner, quoted(title), quoted(subtitle), quoted(abstract),
             quoted(content), date, image, tags, cat, slug, link, link2]

 def parse_opinion(item):
     """Flatten one NewsML opinion document into a CSV row.

     Same extraction as for articles, except that the category is always
     ``'opinion'`` instead of being read from the document.

     Args:
         item: Parsed document exposing ``find`` with ElementTree path
             syntax (the script passes the tree returned by ``ET.parse``).

     Returns:
         A list of 13 strings ordered as in ``CSV_FORMAT``: id, owner,
         title, subtitle, abstract, content, date, image, tags, category,
         slug, link, link2. Title, subtitle, abstract and content are
         wrapped in double quotes; ``tags`` and ``image`` are ``''`` when
         the corresponding attribute (or, for the image, the element) is
         absent.

     Raises:
         AttributeError: A mandatory element is missing from the document.
         KeyError: ``Euid`` or the opinion ``Href`` is missing.
         IndexError: The link has fewer than three path segments.
         TypeError, ValueError: The date is empty or does not start with
             ``YYYYMMDD``.
     """
     def quoted(text):
         # Hand-rolled CSV quoting: embedded double quotes are swapped for a
         # typographic one so the field can be wrapped in plain quotes.
         return '"' + text.replace('"', '”') + '"'

     cat = 'opinion'
     c = item.find('./NewsItem/NewsComponent/NewsComponent[1]/NewsComponent')
     post_id = c.attrib['Euid']

     # Only the leading YYYYMMDD part is parsed; the raw value is what ends
     # up in the "date" column.
     date = c.find('./DescriptiveMetadata/DateLineDate').text
     dt = datetime.strptime(date[:8], "%Y%m%d")

     title = c.find('./NewsLines/HeadLine').text or ''
     subtitle = c.find('./NewsLines/SubHeadLine').text or ''
     abstract = c.find('./ContentItem/DataContent/nitf/body/body.head/abstract/p').text or ''
     content = c.find('./ContentItem/DataContent/nitf/body/body.content').text or ''

     owner = c.find('./ContentItem/DataContent/nitf/body/body.head/rights/rights.owner').text or 'periodicoclm'
     # Collapse every spelling that mentions the site name into one owner.
     if 'periodicoclm' in owner.lower():
         owner = 'periodicoclm'

     # A keyword without a "key" attribute used to leave an attrib dict in
     # the row, which broke the caller's ';'.join(); fall back to ''.
     keyword = c.find('./ContentItem/DataContent/nitf/head/docdata/key-list/keyword')
     tags = keyword.attrib.get('key', '')

     link = c.find('./ContentItem').attrib['Href'].replace('http://www.periodicoclm.es/', '')
     slug = link.split('/')[2]
     # Redirect target: /YYYY/MM/DD/<slug>/
     link2 = '/' + dt.strftime('%Y/%m/%d') + '/' + slug + '/'

     # The image is optional: a missing element or a missing Href both yield
     # '' (a missing Href previously left the attrib dict in the row).
     picture = item.find('./NewsItem/NewsComponent/NewsComponent[2]/NewsComponent/NewsComponent/ContentItem')
     image = picture.attrib.get('Href', '') if picture is not None else ''

     return [post_id, owner, quoted(title), quoted(subtitle), quoted(abstract),
             quoted(content), date, image, tags, cat, slug, link, link2]


 # Header of an aggregated NewsML document. It is only referenced by the
 # disabled index.xml generation kept in the comments below.
 ret = """<?xml version="1.0" encoding="utf-8"?>
 <NewsML Version="1.2">
 """

 # Convert every article*/opinion* XML file found in periodicoclm/ into one
 # ';'-separated row of FNAME. The context manager closes the file even if
 # listing the directory fails, and the explicit encoding keeps the
 # non-ASCII '”' emitted by the parsers writable whatever the locale is.
 with open(FNAME, 'w', encoding='utf-8') as f:
     print(CSV_FORMAT, file=f)
     for xml_file in os.listdir("periodicoclm/"):
         if xml_file == "index.xml":
             continue
         print(xml_file)
         try:
             line = []
             tree = ET.parse("periodicoclm/" + xml_file)
             if xml_file.startswith('article'):
                 line = parse_article(tree)
             elif xml_file.startswith('opinion'):
                 line = parse_opinion(tree)
             # Files that are neither articles nor opinions produce no row.
             if line:
                 print(';'.join(line), file=f)
             # line = ET.tostring(root[1], encoding='utf8').decode('utf8')
             # ret += re.sub(r'^<\?xml version.*\?>','',line)
             # print(ret)
         except Exception as e:
             # Best effort: report the failing file and carry on with the rest.
             print(e)
             traceback.print_exc()

 # ret +="""
 # </NewsML>
 # """
 # f = open("periodicoclm/index.xml", "w")
 # f.write(ret)
	import xml.etree.ElementTree as ET
	import os
	import re
	import sys
	import traceback
	from datetime import datetime

	# Creates a csv to import with wp-all-import and redirect (last two columns)
	# CSV_FORMAT is written as the first (header) line of the output file; the
	# rows returned by parse_article / parse_opinion follow the same field order.
	CSV_FORMAT = "id;owner;title;subtitle;abstract;content;date;image;tags;category;slug;link;link2"
	# Path of the CSV file the script writes.
	FNAME = "data.csv"

	def parse_article(item):
	    """Flatten one NewsML article document into a CSV row.

	    Args:
	        item: Parsed document exposing ``find`` with ElementTree path
	            syntax (the script passes the tree returned by ``ET.parse``).

	    Returns:
	        A list of 13 strings ordered as in ``CSV_FORMAT``: id, owner,
	        title, subtitle, abstract, content, date, image, tags, category,
	        slug, link, link2. Title, subtitle, abstract and content are
	        wrapped in double quotes; ``tags`` and ``image`` are ``''`` when
	        the corresponding attribute (or, for the image, the element) is
	        absent.

	    Raises:
	        AttributeError: A mandatory element is missing from the document.
	        KeyError: ``Euid``, ``Value`` or the article ``Href`` is missing.
	        IndexError: The article link has fewer than three path segments.
	        TypeError, ValueError: The date is empty or does not start with
	            ``YYYYMMDD``.
	    """
	    def quoted(text):
	        # Hand-rolled CSV quoting: embedded double quotes are swapped for a
	        # typographic one so the field can be wrapped in plain quotes.
	        return '"' + text.replace('"', '”') + '"'

	    c = item.find('./NewsItem/NewsComponent/NewsComponent[1]/NewsComponent')
	    cat = c.find('./DescriptiveMetadata/Property[1]').attrib['Value']
	    post_id = c.attrib['Euid']

	    # Only the leading YYYYMMDD part is parsed; the raw value is what ends
	    # up in the "date" column.
	    date = c.find('./DescriptiveMetadata/DateLineDate').text
	    dt = datetime.strptime(date[:8], "%Y%m%d")

	    title = c.find('./NewsLines/HeadLine').text or ''
	    subtitle = c.find('./NewsLines/SubHeadLine').text or ''
	    abstract = c.find('./ContentItem/DataContent/nitf/body/body.head/abstract/p').text or ''
	    content = c.find('./ContentItem/DataContent/nitf/body/body.content').text or ''

	    owner = c.find('./ContentItem/DataContent/nitf/body/body.head/rights/rights.owner').text or 'periodicoclm'
	    # Collapse every spelling that mentions the site name into one owner.
	    if 'periodicoclm' in owner.lower():
	        owner = 'periodicoclm'

	    # A keyword without a "key" attribute would otherwise leave an attrib
	    # dict in the row and break the caller's ';'.join(); fall back to ''.
	    keyword = c.find('./ContentItem/DataContent/nitf/head/docdata/key-list/keyword')
	    tags = keyword.attrib.get('key', '')

	    link = c.find('./ContentItem').attrib['Href'].replace('http://www.periodicoclm.es/', '')
	    slug = link.split('/')[2]
	    # Redirect target: /YYYY/MM/DD/<slug>/
	    link2 = '/' + dt.strftime('%Y/%m/%d') + '/' + slug + '/'

	    # The image is optional: a missing element or a missing Href both
	    # yield '' instead of leaking the attrib dict into the row.
	    picture = item.find('./NewsItem/NewsComponent/NewsComponent[2]/NewsComponent/NewsComponent/ContentItem')
	    image = picture.attrib.get('Href', '') if picture is not None else ''

	    return [post_id, owner, quoted(title), quoted(subtitle), quoted(abstract),
	            quoted(content), date, image, tags, cat, slug, link, link2]

	def parse_opinion(item):
	    """Flatten one NewsML opinion document into a CSV row.

	    Same extraction as for articles, except that the category is always
	    ``'opinion'`` instead of being read from the document.

	    Args:
	        item: Parsed document exposing ``find`` with ElementTree path
	            syntax (the script passes the tree returned by ``ET.parse``).

	    Returns:
	        A list of 13 strings ordered as in ``CSV_FORMAT``: id, owner,
	        title, subtitle, abstract, content, date, image, tags, category,
	        slug, link, link2. Title, subtitle, abstract and content are
	        wrapped in double quotes; ``tags`` and ``image`` are ``''`` when
	        the corresponding attribute (or, for the image, the element) is
	        absent.

	    Raises:
	        AttributeError: A mandatory element is missing from the document.
	        KeyError: ``Euid`` or the opinion ``Href`` is missing.
	        IndexError: The link has fewer than three path segments.
	        TypeError, ValueError: The date is empty or does not start with
	            ``YYYYMMDD``.
	    """
	    def quoted(text):
	        # Hand-rolled CSV quoting: embedded double quotes are swapped for a
	        # typographic one so the field can be wrapped in plain quotes.
	        return '"' + text.replace('"', '”') + '"'

	    cat = 'opinion'
	    c = item.find('./NewsItem/NewsComponent/NewsComponent[1]/NewsComponent')
	    post_id = c.attrib['Euid']

	    # Only the leading YYYYMMDD part is parsed; the raw value is what ends
	    # up in the "date" column.
	    date = c.find('./DescriptiveMetadata/DateLineDate').text
	    dt = datetime.strptime(date[:8], "%Y%m%d")

	    title = c.find('./NewsLines/HeadLine').text or ''
	    subtitle = c.find('./NewsLines/SubHeadLine').text or ''
	    abstract = c.find('./ContentItem/DataContent/nitf/body/body.head/abstract/p').text or ''
	    content = c.find('./ContentItem/DataContent/nitf/body/body.content').text or ''

	    owner = c.find('./ContentItem/DataContent/nitf/body/body.head/rights/rights.owner').text or 'periodicoclm'
	    # Collapse every spelling that mentions the site name into one owner.
	    if 'periodicoclm' in owner.lower():
	        owner = 'periodicoclm'

	    # A keyword without a "key" attribute would otherwise leave an attrib
	    # dict in the row and break the caller's ';'.join(); fall back to ''.
	    keyword = c.find('./ContentItem/DataContent/nitf/head/docdata/key-list/keyword')
	    tags = keyword.attrib.get('key', '')

	    link = c.find('./ContentItem').attrib['Href'].replace('http://www.periodicoclm.es/', '')
	    slug = link.split('/')[2]
	    # Redirect target: /YYYY/MM/DD/<slug>/
	    link2 = '/' + dt.strftime('%Y/%m/%d') + '/' + slug + '/'

	    # The image is optional: a missing element or a missing Href both
	    # yield '' instead of leaking the attrib dict into the row.
	    picture = item.find('./NewsItem/NewsComponent/NewsComponent[2]/NewsComponent/NewsComponent/ContentItem')
	    image = picture.attrib.get('Href', '') if picture is not None else ''

	    return [post_id, owner, quoted(title), quoted(subtitle), quoted(abstract),
	            quoted(content), date, image, tags, cat, slug, link, link2]


	# Header of an aggregated NewsML document. It is only referenced by the
	# disabled index.xml generation kept in the comments below.
	ret = """<?xml version="1.0" encoding="utf-8"?>
	<NewsML Version="1.2">
	"""

	# Convert every article*/opinion* XML file found in periodicoclm/ into one
	# ';'-separated row of FNAME. The context manager closes the file even if
	# listing the directory fails, and the explicit encoding keeps the
	# non-ASCII '”' emitted by the parsers writable whatever the locale is.
	with open(FNAME, 'w', encoding='utf-8') as f:
	    print(CSV_FORMAT, file=f)
	    for xml_file in os.listdir("periodicoclm/"):
	        if xml_file == "index.xml":
	            continue
	        print(xml_file)
	        try:
	            line = []
	            tree = ET.parse("periodicoclm/" + xml_file)
	            if xml_file.startswith('article'):
	                line = parse_article(tree)
	            elif xml_file.startswith('opinion'):
	                line = parse_opinion(tree)
	            # Files that are neither articles nor opinions produce no row.
	            if line:
	                print(';'.join(line), file=f)
	            # line = ET.tostring(root[1], encoding='utf8').decode('utf8')
	            # ret += re.sub(r'^<\?xml version.*\?>','',line)
	            # print(ret)
	        except Exception as e:
	            # Best effort: report the failing file and carry on with the rest.
	            print(e)
	            traceback.print_exc()

	# ret +="""
	# </NewsML>
	# """
	# f = open("periodicoclm/index.xml", "w")
	# f.write(ret)