Last active
November 5, 2019 17:56
-
-
Save joffilyfe/6359c01d5321e2f6abf180bc406cb0ad to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
S1517-106X2013000200012 | |
S1517-106X2012000100007 | |
S1414-753X2003000300008 | |
S1414-753X2001000800007 | |
S1414-753X1999000200004 | |
S1414-753X2013000400006 | |
S1414-753X2009000100004 | |
S1414-753X2014000300005 | |
S1414-753X2014000300013 | |
S1414-753X2013000100006 | |
S1414-753X2012000200002 | |
S0365-05962007000600013 | |
S0365-05962007000400002 | |
S0365-05962007000400012 | |
S0365-05962006000300011 | |
S0365-05962004000100003 | |
S0365-05962003000100004 | |
S0365-05962008000400001 | |
S0365-05962011000200034 | |
S0365-05962011000200035 | |
S0365-05962011000200036 | |
S0365-05962011000200037 | |
S0365-05962011000500030 | |
S0365-05962011000500031 | |
S0365-05962011000500032 | |
S0365-05962011000500033 | |
S0365-05962011000400046 | |
S0365-05962011000100002 | |
S0365-05962011000100003 | |
S0365-05962011000100004 | |
S0365-05962011000100005 | |
S0365-05962011000100006 | |
S0365-05962011000100007 | |
S0365-05962011000100008 | |
S0365-05962011000100009 | |
S0365-05962011000100010 | |
S0365-05962011000100011 | |
S0365-05962011000100012 | |
S0365-05962011000100013 | |
S0365-05962011000100014 | |
S0365-05962011000100015 | |
S0365-05962011000100016 | |
S0365-05962011000100017 | |
S0365-05962011000100018 | |
S0365-05962011000100019 | |
S0365-05962011000100020 | |
S0365-05962011000100021 | |
S0365-05962011000100022 | |
S0365-05962011000100023 | |
S0365-05962011000100024 | |
S0365-05962011000100025 | |
S0365-05962011000100026 | |
S0365-05962011000100027 | |
S0365-05962011000100028 | |
S0365-05962011000100029 | |
S0365-05962011000100030 | |
S0365-05962011000100031 | |
S0365-05962011000100032 | |
S0365-05962011000100033 | |
S0365-05962011000100034 | |
S0365-05962011000100035 | |
S0365-05962011000100036 | |
S0365-05962011000700001 | |
S0365-05962011000700002 | |
S0365-05962011000700003 | |
S0365-05962011000700004 | |
S0365-05962011000700005 | |
S0365-05962011000700006 | |
S0365-05962011000700007 | |
S0365-05962011000700008 | |
S0365-05962011000700009 | |
S0365-05962011000700010 | |
S0365-05962011000700011 | |
S0365-05962011000700012 | |
S0365-05962011000700013 | |
S0365-05962011000700014 | |
S0365-05962011000700015 | |
S0365-05962011000700016 | |
S0365-05962011000700017 | |
S0365-05962011000700018 | |
S0365-05962011000700019 | |
S0365-05962011000700020 | |
S0365-05962011000700021 | |
S0365-05962011000700022 | |
S0365-05962011000700023 | |
S0365-05962011000700024 | |
S0365-05962011000700025 | |
S0365-05962011000700026 | |
S0365-05962011000700027 | |
S0365-05962011000700028 | |
S0365-05962011000700029 | |
S0365-05962011000700030 | |
S0365-05962011000700031 | |
S0365-05962011000700032 | |
S0365-05962011000700033 | |
S0365-05962011000700034 | |
S0365-05962011000700035 | |
S0365-05962011000700036 | |
S0365-05962011000700037 | |
S0365-05962011000700038 | |
S0365-05962011000700039 | |
S0365-05962011000700040 | |
S0365-05962011000700041 | |
S0365-05962011000700042 | |
S0365-05962011000700043 | |
S0365-05962011000700044 | |
S0365-05962011000700045 | |
S0365-05962011000700046 | |
S0365-05962011000700047 | |
S0365-05962011000700048 | |
S0365-05962011000700049 | |
S0365-05962011000700050 | |
S0365-05962012000100014 | |
S0365-05962012000100025 | |
S0365-05962012000100009 | |
S0365-05962012000500011 | |
S0365-05962012000500024 | |
S0001-37652000000200009 | |
S0001-37652002000100002 | |
S0001-37652002000300012 | |
S0001-37652004000200004 | |
S0001-37652006000400007 | |
S0001-37652007000200015 | |
S0001-37652009000100016 | |
S0001-37652010000200020 | |
S0001-37652010000400028 | |
S0001-37652011000100021 | |
S0001-37652011000300021 | |
S0001-37652012000100011 | |
S0001-37652012000300022 | |
S0001-37652012000400029 | |
S0301-80592000000400001 | |
S0301-80592000000400002 | |
S0301-80592000000400003 | |
S0301-80592000000400004 | |
S0301-80592000000400005 | |
S0301-80592000000400006 | |
S0301-80592000000400007 | |
S0301-80592000000400008 | |
S0301-80592000000400009 | |
S0301-80592000000400010 | |
S0301-80592000000400011 | |
S0301-80592000000400012 | |
S0301-80592000000400013 | |
S0301-80592000000400014 | |
S0301-80592000000400015 | |
S0301-80592000000400016 | |
S0301-80592000000400017 | |
S0301-80592000000400018 | |
S0301-80592000000400019 | |
S0301-80592000000400020 | |
S0301-80592000000400021 | |
S0301-80592000000400022 | |
S0301-80592000000400023 | |
S0301-80592000000400024 | |
S0301-80592000000400025 | |
S0301-80592000000400026 | |
S0301-80592000000400027 | |
S0301-80592000000400028 | |
S0301-80592000000400029 | |
S0301-80592000000400030 | |
S0301-80592000000300001 | |
S0301-80592000000300002 | |
S0301-80592000000300003 | |
S0301-80592000000300004 | |
S0301-80592000000300005 | |
S0301-80592000000300006 | |
S0301-80592000000300007 | |
S0301-80592000000300008 | |
S0301-80592000000300009 | |
S0301-80592000000300010 | |
S0301-80592000000300011 | |
S0301-80592000000300012 | |
S0301-80592000000300013 | |
S0301-80592000000300014 | |
S0301-80592000000300015 | |
S0301-80592000000300016 | |
S0301-80592000000300017 | |
S0301-80592000000300018 | |
S0301-80592000000300019 | |
S0301-80592000000300020 | |
S0301-80592000000300021 | |
S0301-80592000000300022 | |
S0301-80592000000300023 | |
S0301-80592000000300024 | |
S0301-80592000000300025 | |
S0301-80592000000200001 | |
S0301-80592000000200002 | |
S0301-80592000000200003 | |
S0301-80592000000200004 | |
S0301-80592000000200005 | |
S0301-80592000000200006 | |
S0301-80592000000200007 | |
S0301-80592000000200008 | |
S0301-80592000000200009 | |
S0301-80592000000200010 | |
S0301-80592000000200011 | |
S0301-80592000000200012 | |
S0301-80592000000200013 | |
S0301-80592000000200014 | |
S0301-80592000000200015 | |
S0301-80592000000200016 | |
S0301-80592000000200017 | |
S0301-80592000000200018 | |
S0301-80592000000200019 | |
S0301-80592000000200020 | |
S0301-80592000000200021 | |
S0301-80592000000200022 | |
S0301-80592000000100001 | |
S0301-80592000000100002 | |
S0301-80592000000100003 | |
S0301-80592000000100004 | |
S0301-80592000000100005 | |
S0301-80592000000100006 | |
S0301-80592000000100007 | |
S0301-80592000000100008 | |
S0301-80592000000100009 | |
S0301-80592000000100010 | |
S0301-80592000000100011 | |
S0301-80592000000100012 | |
S0301-80592000000100013 | |
S0301-80592000000100014 | |
S0301-80592000000100015 | |
S0301-80592000000100016 | |
S0301-80592000000100017 | |
S0301-80592000000100018 | |
S0301-80592000000100019 | |
S0301-80592000000100020 | |
S0301-80592000000100021 | |
S0301-80592000000100022 | |
S0301-80592000000100023 | |
S0301-80591999000300001 | |
S0301-80591999000300002 | |
S0301-80591999000300003 | |
S0301-80591999000300004 | |
S0301-80591999000300005 | |
S0301-80591999000300006 | |
S0301-80591999000300007 | |
S0301-80591999000300008 | |
S0301-80591999000300009 | |
S0301-80591999000300010 | |
S0301-80591999000300011 | |
S0301-80591999000300012 | |
S0301-80591999000300013 | |
S0301-80591999000300014 | |
S0301-80591999000300015 | |
S0301-80591999000300016 | |
S0301-80591999000300017 | |
S0301-80591999000300018 | |
S0301-80591999000300019 | |
S0301-80591999000300020 | |
S0301-80591999000300021 | |
S0301-80591999000300022 | |
S0301-80591999000300023 | |
S0301-80591999000300024 | |
S0301-80591999000300025 | |
S0301-80591999000300026 | |
S0301-80591999000300027 | |
S0301-80591999000400001 | |
S0301-80591999000400002 | |
S0301-80591999000400003 | |
S0301-80591999000400004 | |
S0301-80591999000400005 | |
S0301-80591999000400006 | |
S0301-80591999000400007 | |
S0301-80591999000400008 | |
S0301-80591999000400009 | |
S0301-80591999000400010 | |
S0301-80591999000400011 | |
S0301-80591999000400012 | |
S0301-80591999000400013 | |
S0301-80591999000400014 | |
S0301-80591999000400015 | |
S0301-80591999000400016 | |
S0301-80591999000400017 | |
S0301-80591999000400018 | |
S0301-80591999000400019 | |
S0301-80591999000400020 | |
S0301-80591999000400021 | |
S0301-80591999000100001 | |
S0301-80591999000100002 | |
S0301-80591999000100003 | |
S0301-80591999000100004 | |
S0301-80591999000100005 | |
S0301-80591999000100006 | |
S0301-80591999000100007 | |
S0301-80591999000100008 | |
S0301-80591999000100009 | |
S0301-80591999000100010 | |
S0301-80591999000100011 | |
S0301-80591999000100012 | |
S0301-80591999000100013 | |
S0301-80591999000100014 | |
S0301-80591999000100015 | |
S0301-80591999000100016 | |
S0301-80591999000100017 | |
S0301-80591999000100018 | |
S0301-80591999000100019 | |
S0301-80591999000100020 | |
S0301-80591999000100021 | |
S0301-80591999000200001 | |
S0301-80591999000200002 | |
S0301-80591999000200003 | |
S0301-80591999000200004 | |
S0301-80591999000200005 | |
S0301-80591999000200006 | |
S0301-80591999000200007 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import html | |
import requests | |
import json | |
import argparse | |
import logging | |
from typing import List | |
from lxml import etree | |
from itertools import chain | |
from copy import deepcopy | |
from io import StringIO, BytesIO | |
# ArticleMeta endpoint template; the article PID is interpolated into ``code``.
ARTICLE_META_URL = "http://articlemeta.scielo.org/api/v1/article/?format=json&body=true&code=%s"

# Shared lxml HTML parser, created once and reused for every reference fragment.
html_parser = etree.HTMLParser()

# Module-level logger; handlers are attached by ``_config_logging``.
logger = logging.getLogger(__name__)
def _config_logging(logging_level="INFO", logging_file=None):
    """Attach a formatted handler to the module logger and return the logger.

    Params:
        logging_level (str): Level name, one of DEBUG, INFO, WARNING, ERROR
            or CRITICAL. Any other value falls back to "INFO".
        logging_file (str): Optional path. When given, records are appended
            to that file; otherwise a stream handler is used.

    Returns:
        logging.Logger: The module-level ``logger`` with the new handler.
    """
    level_names = ("DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL")
    level_by_name = {name: getattr(logging, name) for name in level_names}
    level = level_by_name.get(logging_level, "INFO")

    record_format = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    )

    # The logger and the handler are both set to the same threshold.
    logger.setLevel(level)

    handler = (
        logging.FileHandler(logging_file, mode="a")
        if logging_file
        else logging.StreamHandler()
    )
    handler.setFormatter(record_format)
    handler.setLevel(level)

    logger.addHandler(handler)
    return logger
def get_nested(node, *path, default=""):
    """Walk ``node`` through successive keys/indexes and return what is found.

    Params:
        node: Nested structure of subscriptable objects (dicts, lists, ...).
        *path: Keys and/or indexes applied one after another to ``node``.
        default: Value returned when the path cannot be followed.

    Returns:
        The value at the end of ``path``, or ``default`` when any step is
        missing (``KeyError``/``IndexError``) or lands on something that
        cannot be subscripted that way (``TypeError``, e.g. ``None`` or a
        string indexed by a key).
    """
    for step in path:
        try:
            node = node[step]
        except (IndexError, KeyError, TypeError):
            # A malformed intermediate value is treated like a missing one.
            return default
    return node
def get_citations_nodes_by_references_comments(tree):
    """Return the citation nodes delimited by reference comments in the article.

    The serialized ``tree`` is scanned for ``<!-- ref -->`` ...
    ``<!-- end-ref -->`` pairs. Each delimited fragment is parsed with the
    shared HTML parser and reduced to a single ``<font>`` element.

    Idea taken from the XIS transformer (ptf?):
    https://github.com/scieloorg/Web/blob/5c55a8596ac581fb9246a68d9a84501fd7ed8f66/cgi-bin/ScieloXML/paragraphs.xis#L265

    Params:
        tree (etree.Element): Article body.

    Returns:
        list: One element per reference fragment, with its comment nodes
        removed. Fragments that yield neither ``<font>`` nor ``<body>`` are
        skipped.
    """

    def _get_references(body: str) -> list:
        """Slice every ``<!-- ref -->`` ... ``<!-- end-ref -->`` span out of body."""
        start_tag = "<!-- ref -->"
        end_tag = "<!-- end-ref -->"
        references = []

        start_index = body.find(start_tag)
        while start_index >= 0:
            # Look for the closing marker only after the opening one, so a
            # stray earlier end marker cannot produce an empty fragment.
            end_index = body.find(end_tag, start_index)
            if end_index < 0:
                # Unterminated reference: stop here instead of looping forever.
                break
            stop = end_index + len(end_tag)
            references.append(body[start_index:stop])
            start_index = body.find(start_tag, stop)

        return references

    parents = []
    body = etree.tostring(tree).decode()
    body = body.replace("<br/>", "")

    for reference in _get_references(body):
        node = etree.parse(StringIO(reference), parser=html_parser)
        root = node.find(".//font")

        if root is None:
            # No <font> wrapper: reuse the parser-generated <body> as one.
            root = node.find(".//body")
            if root is None:
                continue
            root.tag = "font"

        # NOTE(review): lxml's remove() also discards the removed node's tail
        # text; confirm that dropping text following an inner comment is
        # intended.
        for comment in root.xpath(".//comment()"):
            comment.getparent().remove(comment)
        parents.append(root)

    return parents
def wrap_citations_into_dict(raw_citations):
    """Return the citations as Python dictionaries.

    Each dictionary has the keys ``number``, ``citation`` and ``text``:

    * ``number``: the reference number captured from the citation, or
      ``None`` when no number could be found;
    * ``citation``: the citation's HTML content (whitespace collapsed, HTML
      entities unescaped) with the leading number removed;
    * ``text``: the serialized citation element after all of its inner tags
      have been stripped.

    Note that the elements in ``raw_citations`` are modified in place: their
    inner tags are stripped while ``text`` is computed.

    Params:
        raw_citations (List[etree.Element]): Raw citations, each wrapped in
            its own element (<font>, <*>).

    Returns:
        citations (List[Dict]): One dictionary per citation, for example:
            `[{"number": "1", "text": "Citation text", "citation": "<font>Citation text</font>"}]`.
    """
    citation_regex = re.compile(r"(?P<number>\d+)?\.?\s?(?P<citation>.*)")
    citation_number_regex = re.compile(r".*>(?P<number>\d+?)\.\s.*", re.MULTILINE)

    def _get_full_content(node) -> str:
        """Return the HTML content of an element as a single-line string."""
        inner_font = node.find(".//font")
        if inner_font is not None:
            node = inner_font

        # tostring() already emits a child's whole subtree plus its tail, so
        # only direct children are serialized; walking every descendant would
        # repeat nested elements in the output.
        children = (etree.tostring(child).decode() for child in node.xpath("./*"))
        content = "".join(chain([node.text or ""], children, [node.tail or ""]))

        content = re.sub(r"[\s\n]+", " ", content).strip()
        return html.unescape(content)  # turn HTML entities into characters

    def _get_citation_number(node) -> str:
        """Return the reference number found in the serialized node, or None."""
        match = citation_number_regex.match(etree.tostring(node).decode())
        if match:
            return match.groupdict()["number"]
        return None

    def _get_citation_text_only(node) -> str:
        """Strip the node's inner tags (in place) and return it serialized."""
        etree.strip_tags(node, "*")
        return html.unescape(etree.tostring(node).decode())

    citations = []
    for raw_citation in raw_citations:
        # Both values below must be read before the tags are stripped.
        match = citation_regex.match(_get_full_content(raw_citation))
        number = _get_citation_number(raw_citation)

        if match is None:
            continue

        groups = match.groupdict()
        groups["text"] = _get_citation_text_only(raw_citation)

        # Fall back to the number found in the raw markup when the cleaned
        # content does not start with one.
        if groups.get("number") is None and number is not None:
            groups["number"] = number

        citations.append(groups)

    return citations
def _get_article_body_as_etree(bodies: dict): | |
"""Retorna o corpo do artigo como árvore etree""" | |
for body in bodies.values(): | |
try: | |
return etree.fromstring("<body>" + body + "</body>") | |
except etree.XMLSyntaxError: | |
pass | |
return None | |
def _text_has_all_citation_words(words, text) -> bool: | |
"""Verifica se todas as palavras estão em um determinado texto""" | |
if len(words) == 0: | |
return False | |
for word in words: | |
if word not in text: | |
return False | |
return True | |
def _get_citation_words(citation: dict) -> list:
    """Return the lowercase words found in the citation title.

    The title is read from the first non-empty field among ``v12``, ``v18``,
    ``v30`` and ``v801``. An empty list is returned when the title is empty
    or contains no word characters.
    """
    text = (
        get_nested(citation, "v12", 0, "_")
        or get_nested(citation, "v18", 0, "_")
        or get_nested(citation, "v30", 0, "_")  # type article
        or get_nested(citation, "v801", 0, "_")
    ).lower()

    # re.split leaves empty strings when the title starts or ends with a
    # separator. They are dropped because "" is contained in every text and
    # would make a punctuation-only title match any citation.
    return [word for word in re.split(r"[\W\s]+", text) if word]
def _citation_and_text_authors_matches(citation: dict, citation_text: str) -> bool: | |
"""Verifica se os autores de uma citação estão no texto informado""" | |
authors = citation.get("v10", []) or citation.get("v16", []) | |
matches = 0 | |
for author in authors: | |
surname = author.get("s", "").lower() | |
name = author.get("n", "").lower() | |
if surname in citation_text or name in citation_text: | |
matches += 1 | |
if matches >= (len(authors) // 2) + 1: | |
return True | |
return False | |
def get_mixed_citations(
    citations: List[dict],
    bodies: dict,
    pid: str,
    collection: str = "scl",
    debug_mode=False,
) -> List[dict]:
    """Return the `mixed_citation` content for the citations of an article.

    Starting from the article's citations in `isis2json` format and from its
    HTML body, a list of `mixed_citation` entries is built. It contains only
    the citations that could be matched against the text extracted from the
    HTML body. A citation matches the first extracted reference that
    contains all of its title words or most of its authors.

    Params:
        citations (List[dict]): Citations in isis2json format.
        bodies (dict): Article bodies in HTML (languages pt, es, en, etc).
        pid (str): Article identifier in the ISIS/ArticleMeta database.
        collection (str): Acronym of the collection being processed.
        debug_mode (bool): When true, the indexes of the citations left
            unmatched are logged at DEBUG level.

    Returns:
        mixed_citations (List[dict]): Citations matched against the text
        extracted from the article's HTML body. The list is empty when
        there are no citations, no bodies, or no body could be parsed.
    """
    # Missing values arrive either as None or as an empty container/string.
    if not citations or not bodies:
        return []

    body_etree = _get_article_body_as_etree(bodies)
    if body_etree is None:
        return []

    raw_citations = get_citations_nodes_by_references_comments(body_etree)
    wrapped_citations = wrap_citations_into_dict(raw_citations)

    # Lowercase each extracted reference once, not once per citation.
    candidates = [
        (wrapped["text"].lower(), wrapped["citation"])
        for wrapped in wrapped_citations
    ]

    mixed_citations = []
    unmatched_indexes = []

    for citation_index, citation in enumerate(citations):
        words = _get_citation_words(citation)

        for wrapped_text, mixed in candidates:
            if _text_has_all_citation_words(
                words, wrapped_text
            ) or _citation_and_text_authors_matches(citation, wrapped_text):
                mixed_citations.append(
                    {
                        "mixed": mixed,
                        "pid": pid,
                        "collection": collection,
                        "order": citation_index + 1,
                    }
                )
                break
        else:
            # No extracted reference matched this citation.
            unmatched_indexes.append(citation_index)

    if debug_mode and unmatched_indexes:
        logger.debug(
            "Índices das citações não preenchidas: %s, PID: %s",
            unmatched_indexes,
            pid,
        )

    return mixed_citations
def main():
    """Command-line entry point.

    Reads one PID per line from the input file, fetches each article from
    ArticleMeta and writes every matched mixed citation to the output file
    as one JSON document per line. A failure while processing one PID is
    logged and does not stop the remaining PIDs.
    """
    parser = argparse.ArgumentParser(description="Gerador de citações mixadas")
    parser.add_argument(
        "input",
        type=argparse.FileType("r"),
        help="Arquivo com um PID por linha representando os artigos a serem processados",
    )
    parser.add_argument(
        "output",
        type=argparse.FileType("w"),
        help="Arquivo onde as citações serão armazenadas",
    )
    parser.add_argument(
        "--collection", "-c", default="scl", help="Coleção SciELO a ser processada"
    )
    parser.add_argument(
        "--logging_level",
        "-l",
        default="INFO",
        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
        help="Logggin level",
    )
    args = parser.parse_args()

    collection = args.collection
    _config_logging(logging_level=args.logging_level)
    debug_mode = args.logging_level == "DEBUG"

    # Both files are closed on exit, including when an error propagates.
    with args.input as pid_file, args.output as output:
        for line in pid_file:
            pid = line.strip()
            if not pid:
                # Blank lines would otherwise trigger a request without a PID.
                continue

            try:
                article = requests.get(ARTICLE_META_URL % pid, timeout=10).json()

                if article is None:
                    logger.info("Could not fetch %s.", pid)
                    continue

                for mixed_citation in get_mixed_citations(
                    article.get("citations", []),
                    article.get("body", ""),
                    pid,
                    collection,
                    debug_mode=debug_mode,
                ):
                    output.write(json.dumps(mixed_citation) + "\n")
            except Exception as e:
                # Best effort: report the failing PID and move on to the next.
                logger.error("%s. %s", pid, e)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment