Last active: February 18, 2021 16:51
-
-
Save amon-ra/a2996b0e6bc2b3810fa57f2b66ba61e9 to your computer and use it in GitHub Desktop.
NewsML to CSV
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import re
import sys
import traceback
import xml.etree.ElementTree as ET
from datetime import datetime
# Header row of the generated CSV. The file is meant to be imported with
# wp-all-import; the last two columns (link, link2) are used for redirects.
CSV_FORMAT = ";".join(
    (
        "id",
        "owner",
        "title",
        "subtitle",
        "abstract",
        "content",
        "date",
        "image",
        "tags",
        "category",
        "slug",
        "link",
        "link2",
    )
)

# Name of the CSV file written by this script.
FNAME = "data.csv"
# Typographic closing quote that replaces ASCII double quotes inside quoted
# CSV fields, so the field delimiters stay unambiguous.
_CURLY_QUOTE = "\u201d"
# Absolute site prefix stripped from article links to make them relative.
_SITE_PREFIX = "http://www.periodicoclm.es/"
# Owner used when the document names none, or names any periodicoclm variant.
_DEFAULT_OWNER = "periodicoclm"

# Component holding the story itself (headline, body, metadata).
_STORY_PATH = "./NewsItem/NewsComponent/NewsComponent[1]/NewsComponent"
# Content item of the attached image, when the document has one.
_IMAGE_PATH = (
    "./NewsItem/NewsComponent/NewsComponent[2]"
    "/NewsComponent/NewsComponent/ContentItem"
)
# NITF payload, relative to the story component.
_NITF_PATH = "./ContentItem/DataContent/nitf"


def _text(node, path, default=""):
    """Return the text of the first element matching *path*, or *default*.

    *default* is returned both when no element matches and when the matched
    element has no (or empty) text.
    """
    found = node.find(path)
    if found is None or not found.text:
        return default
    return found.text


def _attr(node, path, name, default=""):
    """Return attribute *name* of the first element matching *path*.

    *default* is returned when the element or the attribute is missing.
    """
    found = node.find(path)
    if found is None:
        return default
    return found.get(name, default)


def _quoted(value):
    """Wrap *value* in double quotes, replacing embedded ones with a curly quote."""
    return '"' + value.replace('"', _CURLY_QUOTE) + '"'


def _parse_item(item, category=None):
    """Convert one parsed NewsML document into a CSV row.

    Args:
        item: Parsed document offering ``find`` relative to the ``NewsML``
            root (the script passes an ``ElementTree``).
        category: Fixed category for the row. When ``None`` the category is
            read from the ``Value`` attribute of the first
            ``DescriptiveMetadata/Property`` element.

    Returns:
        A list of 13 strings in the column order of ``CSV_FORMAT``: id, owner,
        title, subtitle, abstract, content, date, image, tags, category, slug,
        link, link2. Title, subtitle, abstract and content are already wrapped
        in double quotes; the other fields are emitted raw.

    Raises:
        ValueError: If the story component, its ``Euid`` attribute or a usable
            date is missing, or if the link is too short to contain a slug.
    """
    story = item.find(_STORY_PATH)
    if story is None:
        raise ValueError("NewsML document has no story NewsComponent")

    post_id = story.get("Euid")
    if post_id is None:
        raise ValueError("story NewsComponent has no Euid attribute")

    if category is None:
        category = _attr(story, "./DescriptiveMetadata/Property[1]", "Value")

    # Only the first eight characters (YYYYMMDD) are parsed; the raw value is
    # still what ends up in the "date" column. strptime raises ValueError for
    # a missing or malformed date.
    date = _text(story, "./DescriptiveMetadata/DateLineDate")
    day = datetime.strptime(date[:8], "%Y%m%d")

    title = _text(story, "./NewsLines/HeadLine")
    subtitle = _text(story, "./NewsLines/SubHeadLine")
    abstract = _text(story, _NITF_PATH + "/body/body.head/abstract/p")
    # NOTE(review): only the leading text of body.content is read, exactly as
    # before; presumably the body is a single text/CDATA node -- confirm.
    content = _text(story, _NITF_PATH + "/body/body.content")

    owner = _text(
        story,
        _NITF_PATH + "/body/body.head/rights/rights.owner",
        _DEFAULT_OWNER,
    )
    if _DEFAULT_OWNER in owner.lower():
        owner = _DEFAULT_OWNER

    # Always a string: a keyword element without a "key" attribute used to
    # leak a dict into the row and break the caller's ';'.join().
    tags = _attr(story, _NITF_PATH + "/head/docdata/key-list/keyword", "key")

    link = _attr(story, "./ContentItem", "Href").replace(_SITE_PREFIX, "")
    segments = link.split("/")
    if len(segments) < 3:
        raise ValueError("cannot derive slug from link {!r}".format(link))
    slug = segments[2]
    link2 = "/" + day.strftime("%Y/%m/%d") + "/" + slug + "/"

    # The image is optional: a missing element or Href yields an empty column.
    image = _attr(item, _IMAGE_PATH, "Href")

    # NOTE(review): the unquoted fields (owner, tags, ...) would shift columns
    # if they ever contained ';' -- presumably they never do; verify.
    return [
        post_id,
        owner,
        _quoted(title),
        _quoted(subtitle),
        _quoted(abstract),
        _quoted(content),
        date,
        image,
        tags,
        category,
        slug,
        link,
        link2,
    ]


def parse_article(item):
    """Build the CSV row for an article; the category comes from the document."""
    return _parse_item(item)


def parse_opinion(item):
    """Build the CSV row for an opinion piece; the category is always 'opinion'."""
    return _parse_item(item, category="opinion")
# Header of a combined NewsML document. It is only needed by a currently
# disabled feature that concatenated every parsed item into
# "periodicoclm/index.xml"; the CSV export below does not use it.
ret = """<?xml version="1.0" encoding="utf-8"?>
<NewsML Version="1.2">
"""

# Convert every article*/opinion* NewsML file in "periodicoclm/" into one row
# of the CSV file FNAME. A file that fails to parse is reported on stdout with
# its traceback and skipped, so one bad document does not abort the export.
#
# The file is opened with an explicit UTF-8 encoding because the parsed rows
# contain non-ASCII text (embedded double quotes are replaced by a curly
# quote), which would fail to encode under a non-UTF-8 default locale.
with open(FNAME, 'w', encoding='utf-8') as f:
    print(CSV_FORMAT, file=f)
    # Sorted so the row order is reproducible; os.listdir() order is arbitrary.
    for xml_file in sorted(os.listdir("periodicoclm/")):
        if xml_file == "index.xml":
            continue
        print(xml_file)
        if xml_file.startswith('article'):
            parse = parse_article
        elif xml_file.startswith('opinion'):
            parse = parse_opinion
        else:
            # Neither an article nor an opinion: no row would be produced, so
            # do not parse the file at all.
            continue
        try:
            tree = ET.parse("periodicoclm/" + xml_file)
            line = parse(tree)
            if line:
                print(';'.join(line), file=f)
        except Exception as e:
            # Top-level boundary: report the failure and carry on with the
            # next file.
            print(e)
            traceback.print_exc()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment