Last active
April 14, 2020 20:30
-
-
Save joffilyfe/bfc656e878dee6376234e6b1edc668be to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse
import json
import logging
import os
import sys
from typing import Optional
# Configure the root logger once at import time: timestamped messages,
# DEBUG level and above.
logging.basicConfig(format="%(asctime)s - %(message)s", level=logging.DEBUG)

# Module-scoped logger used by main() to report problems with input files.
logger = logging.getLogger(__name__)
def _first_subfield(record: dict, tag: str) -> Optional[str]:
    """Return the ``"_"`` entry of the first occurrence of ``tag`` in ``record``.

    Records are expected to map a tag to a list of dicts, e.g.
    ``{"v704": [{"_": "..."}]}``. ``None`` is returned when the tag is
    absent, when its occurrence list is empty or not a list, or when the
    first occurrence is not a dict, instead of raising.
    """
    occurrences = record.get(tag)
    if not isinstance(occurrences, list) or not occurrences:
        return None
    first = occurrences[0]
    if not isinstance(first, dict):
        return None
    return first.get("_")


def _build_mixed_citation(paragraph: object) -> Optional[dict]:
    """Build the output entry for one paragraph record, or ``None`` to skip it.

    A record is kept only when it carries ``v880``, ``v704`` and a ``v888``
    value made solely of decimal digits. The emitted ``order`` is the
    ``v888`` value minus one.
    """
    if not isinstance(paragraph, dict):
        return None

    article_pid = _first_subfield(paragraph, "v880")
    raw_mixed_citation = _first_subfield(paragraph, "v704")
    raw_index = _first_subfield(paragraph, "v888")

    if article_pid is None or raw_mixed_citation is None:
        return None
    # isdecimal() rather than isdigit(): isdigit() also accepts characters
    # such as superscript digits, which int() rejects with ValueError.
    if not isinstance(raw_index, str) or not raw_index.isdecimal():
        return None

    return {
        "mixed": raw_mixed_citation,
        "order": int(raw_index) - 1,
        "collection": "scl",
    }


def main():
    """Collect mixed citations from per-PID paragraph files into one output.

    Command-line arguments:
        pids: text file with one PID per line.
        paragraphs: directory holding ``<pid>.json`` files, one JSON record
            per line.
        output: file that receives one JSON object per retained citation,
            with the keys ``mixed``, ``order`` and ``collection``.

    Missing paragraph files and lines that are not valid JSON are logged and
    skipped; blank lines in either input are ignored.
    """
    parser = argparse.ArgumentParser(
        description="Programa para agrupar as mixed citations a partir"
        " de arquivos de parágrafos"
    )
    parser.add_argument(
        "pids",
        help="Lista de pids que serão utilizados para extrair os parágrafos"
        " de arquivos JSON",
        type=argparse.FileType("r"),
    )
    parser.add_argument(
        "paragraphs",
        help="Diretório contendo os arquivos de parágrafos no formatos JSON.",
    )
    parser.add_argument(
        "output",
        help="Arquivo com o resultado do processamento.",
        type=argparse.FileType("w"),
    )
    args = parser.parse_args()

    try:
        for raw_pid in args.pids:
            pid = raw_pid.strip()
            if not pid:
                # A blank line would otherwise resolve to "<dir>/.json".
                continue

            paragraph_file = os.path.abspath(
                os.path.join(args.paragraphs, pid + ".json")
            )
            if not os.path.exists(paragraph_file):
                logger.error("Paragraph file '%s' does not exist.", paragraph_file)
                continue

            # NOTE(review): opened with the platform default encoding, as
            # before -- confirm whether UTF-8 should be forced here.
            with open(paragraph_file, "r") as f:
                # Stream the file line by line instead of loading it whole.
                for line_number, line in enumerate(f, start=1):
                    if not line.strip():
                        continue
                    try:
                        paragraph = json.loads(line)
                    except json.JSONDecodeError:
                        logger.error(
                            "Invalid JSON in '%s' at line %d.",
                            paragraph_file,
                            line_number,
                        )
                        continue

                    mixed_citation = _build_mixed_citation(paragraph)
                    if mixed_citation is not None:
                        args.output.write(json.dumps(mixed_citation) + "\n")
    finally:
        # argparse.FileType maps "-" to the standard streams; leave those
        # open and close only the files opened on our behalf.
        for stream in (args.pids, args.output):
            if stream is not sys.stdin and stream is not sys.stdout:
                stream.close()
# Run the command-line entry point only when executed as a script,
# not when the module is imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment