Last active
April 25, 2019 20:19
-
-
Save joffilyfe/dfc4b4629dbd4f7a99b0108ce4eb0ef5 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import itertools | |
from lxml import etree | |
from documentstore_migracao.config import BASE_PATH | |
def find_assets_nodes(xmltree: etree.ElementTree) -> iter:
    """Find every asset reference in the document.

    Searches the tree for the asset-bearing tags listed below and keeps only
    the elements that carry an ``xlink:href`` attribute. The result is a
    single iterator over all matching elements, grouped by tag in the order
    the search paths are listed.
    """
    xlink_ns = {"xlink": "http://www.w3.org/1999/xlink"}
    search_paths = (
        ".//ext-link[@xlink:href]",
        ".//graphic[@xlink:href]",
        ".//inline-graphic[@xlink:href]",
        ".//inline-supplementary-material[@xlink:href]",
        ".//media[@xlink:href]",
        ".//supplementary-material[@xlink:href]",
    )

    # Start one search per path up front, then expose them as one stream.
    matches = []
    for xpath in search_paths:
        matches.append(xmltree.iterfind(xpath, namespaces=xlink_ns))
    return itertools.chain.from_iterable(matches)
def get_assets_list(assets_nodes: iter) -> list:
    """Build a list of digital-asset dictionaries from asset nodes.

    Each entry has the form ``{"id": ..., "url": ...}``: ``url`` is the
    node's ``xlink:href`` value and ``id`` is the last ``/``-separated
    segment of that value with its file extension removed.
    """
    href_attr = "{http://www.w3.org/1999/xlink}href"

    def describe(element) -> dict:
        # Derive the asset id from the file name at the end of the link.
        url = element.get(href_attr)
        filename = url.split("/")[-1]
        stem, _extension = os.path.splitext(filename)
        return {"id": stem, "url": url}

    return [describe(element) for element in assets_nodes]
# Sample SPS package document used to exercise the asset helpers.
path = os.path.join(
    BASE_PATH,
    "xml/sps_packages/S0066-782X2004001200001/1678-4170-abc-82-s06-1-14.xml",
)

# Parse the document, then collect an iterator over its asset nodes.
xml = etree.parse(path)
assets_iterator = find_assets_nodes(xmltree=xml)
from documentstore_migracao.utils.xylose_converter import parse_date | |
from uuid import uuid4 | |
def get_document_bundle_manifest(document: etree.ElementTree, document_url: str) -> dict:
    """Build a Kernel-format manifest from an XML document.

    Args:
        document: parsed XML tree of the article.
        document_url: URL stored as the ``data`` of the manifest's single
            version.

    Returns:
        A dict ``{"id": ..., "versions": [version]}``. ``version`` holds
        ``data`` (the document URL), ``timestamp`` (the parsed creation date)
        and ``assets``, which maps each asset id to
        ``[[timestamp, asset_url]]``.

    Raises:
        ValueError: if the document has no ``epub-ppub`` publication date, or
            if that date lacks a year or a month.
    """
    # A document without a usable scielo-id (element missing or empty) gets a
    # random UUID instead of being rejected.
    _id = document.findtext(".//article-id[@pub-id-type='scielo-id']") or str(uuid4())

    # TODO: which date is the right one to use as "creation_date"?
    date = document.find(".//pub-date[@pub-type='epub-ppub']")
    if date is None:
        raise ValueError("A creation date is required")

    # findtext returns None/"" for a missing/empty element, so an incomplete
    # date is reported as a ValueError instead of an AttributeError on .text.
    year = date.findtext(".//year")
    month = date.findtext(".//month")
    if not year or not month:
        raise ValueError("A creation date with year and month is required")
    _creation_date = parse_date("%s-%s" % (year, month))

    # Every asset starts with a single entry stamped with the creation date.
    assets = {
        asset["id"]: [[_creation_date, asset["url"]]]
        for asset in get_assets_list(find_assets_nodes(document))
    }
    _version = {"data": document_url, "assets": assets, "timestamp": _creation_date}
    return {"id": _id, "versions": [_version]}
# Build the manifest for the sample document; the returned dict is not stored.
get_document_bundle_manifest(
    document=xml,
    document_url="http://scielo-bucket.scielo.br/document.xml",
)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
+------------------------------+       +------------------------------+
|                              |       |                              |
|                              |       |                              |
|                              |       |        Converte XMLS         |
|         Extrai XMLS          +------>|          Extraídos           |
|                              |       |                              |
|                              |       |                              |
|                              |       |                              |
+------------------------------+       +---------------+--------------+
                                                       |
                                                       |
                                                       |
+------------------------------+       +---------------v--------------+
|                              |       |                              |
|                              |       |                              |
|       Gera pacotes SPS       |       |                              |
|       dos XMLS válidos       |<------+  Valida os XMLS convertidos  |
|                              |       |                              |
|                              |       |                              |
|                              |       |                              |
+---------------+--------------+       +------------------------------+
                |
                |
                |
+---------------v--------------+       +------------------------------+
|                              |       |                              |
|                              |       |                              |
|         Aplica o XML         |       |    Envia pacotes SPS para    |
|         Constructor          +------>|       o Object Storage       |
|                              |       |                              |
|                              |       |                              |
|                              |       |                              |
+------------------------------+       +--------------+---------------+
                                                      |
                                                      v
                                       +--------------+----------------+
                                       |                               |
                                       |                               |
                                       |    Gera o manifesto do XML    |
                                       |      e registra no Banco      |
                                       |           de dados            |
                                       |                               |
                                       |                               |
                                       +-------------------------------+
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
  "id": "e8ec3b28-825d-4b60-9711-7dbc106ecfbf",
  "versions": [
    {
      "data": "http://scielo-bucket.scielo.br/document.xml",
      "assets": {
        "rad": [
          [
            "2004-04-01T00:00:00.000000Z",
            "http://www.vtmed.org/vascular/rad.htm"
          ]
        ],
        "1678-4170-abc-82-s06-1-14-glogo": [
          [
            "2004-04-01T00:00:00.000000Z",
            "1678-4170-abc-82-s06-1-14-glogo.gif"
          ]
        ],
        "1678-4170-abc-82-s06-1-14-ga01img01": [
          [
            "2004-04-01T00:00:00.000000Z",
            "1678-4170-abc-82-s06-1-14-ga01img01.gif"
          ]
        ],
        "1678-4170-abc-82-s06-1-14-ga01img02": [
          [
            "2004-04-01T00:00:00.000000Z",
            "1678-4170-abc-82-s06-1-14-ga01img02.gif"
          ]
        ],
        "1678-4170-abc-82-s06-1-14-ga01img03": [
          [
            "2004-04-01T00:00:00.000000Z",
            "1678-4170-abc-82-s06-1-14-ga01img03.gif"
          ]
        ],
        "1678-4170-abc-82-s06-1-14-ga01img04": [
          [
            "2004-04-01T00:00:00.000000Z",
            "1678-4170-abc-82-s06-1-14-ga01img04.gif"
          ]
        ],
        "1678-4170-abc-82-s06-1-14-ga01img05": [
          [
            "2004-04-01T00:00:00.000000Z",
            "1678-4170-abc-82-s06-1-14-ga01img05.gif"
          ]
        ],
        "1678-4170-abc-82-s06-1-14-ga01img06": [
          [
            "2004-04-01T00:00:00.000000Z",
            "1678-4170-abc-82-s06-1-14-ga01img06.gif"
          ]
        ],
        "1678-4170-abc-82-s06-1-14-ga01img07": [
          [
            "2004-04-01T00:00:00.000000Z",
            "1678-4170-abc-82-s06-1-14-ga01img07.gif"
          ]
        ]
      },
      "timestamp": "2004-04-01T00:00:00.000000Z"
    }
  ]
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment