Created
July 5, 2021 01:09
-
-
Save Lucs1590/4e6c4659ee64afb38c3357d744ec100e to your computer and use it in GitHub Desktop.
This is a test in which I try to load data translated from a JSON document (with 700,000 entries) into a Neo4j database.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from py2neo import Graph, Node | |
from time import time | |
import json | |
import re | |
def main():
    """Fetch the Open Food Facts ingredient list and store it in the graph.

    Connects to the database, downloads the ingredients JSON document,
    inserts the entries found under its ``"tags"`` key as ``Ingredient``
    nodes keyed by ``name``, and finally prints the elapsed wall-clock time.
    """
    started_at = time()
    graph = connect_database()
    payload = get_data('https://world.openfoodfacts.org/ingredients.json')
    insert_data(graph, payload["tags"], "Ingredient", "name")
    elapsed = time() - started_at
    print("Execution Time: ", elapsed)
def connect_database():
    """Open a py2neo ``Graph`` connection to the Neo4j server.

    Connection settings are taken from the environment so that no
    credentials need to live in the source file:

    * ``NEO4J_HOST``     - server address (defaults to ``54.173.133.27``).
    * ``NEO4J_PORT``     - server port (defaults to ``7687``, the standard
      Neo4j Bolt port).
    * ``NEO4J_PASSWORD`` - password; required.

    Returns:
        The ``Graph`` object built from those settings.

    Raises:
        KeyError: If ``NEO4J_PASSWORD`` is not set.
        ValueError: If ``NEO4J_PORT`` is not an integer.
    """
    # Imported locally so this function stays self-contained.
    import os

    return Graph(
        host=os.environ.get("NEO4J_HOST", '54.173.133.27'),
        port=int(os.environ.get("NEO4J_PORT", "7687")),
        password=os.environ["NEO4J_PASSWORD"],
    )
def get_data(url, type_request="GET", headers=None, querystring=None):
    """Send an HTTP request and return the JSON-decoded response body.

    Args:
        url: Address to request.
        type_request: HTTP method name handed to ``requests.request``.
        headers: Optional mapping of request headers; defaults to none.
        querystring: Optional mapping of URL query parameters; defaults to
            none.

    Returns:
        The object obtained by decoding the response text as JSON.

    Raises:
        json.JSONDecodeError: If the response text is not valid JSON.
    """
    # Use None sentinels instead of ``{}`` defaults: a mutable default is a
    # single object shared by every call of the function.
    headers = {} if headers is None else headers
    querystring = {} if querystring is None else querystring

    response = requests.request(
        type_request, url, headers=headers, params=querystring)
    return json.loads(response.text)
def insert_data(connection, dataset, label, attribute):
    """Translate every record of ``dataset`` and merge it into the graph.

    For each record, ``data[attribute]`` is passed through
    ``translate_data``; the resulting text becomes the ``name`` property of
    a node with the given ``label``. Nodes whose text could not be
    translated additionally carry ``english=True``. Each stored text is
    printed after it has been merged.

    Args:
        connection: Graph object exposing ``merge(node)``.
        dataset: Iterable of mappings that contain the key ``attribute``.
        label: Node label, also used as the merge primary label.
        attribute: Key read from each record, also used as the merge
            primary key.
    """
    for data in dataset:
        translated_data, english = translate_data(data[attribute])

        # Skip records whose text is empty or whitespace only. The check
        # must look at the translated *value*; comparing the
        # ``translate_data`` function itself against a string is always
        # true and filters nothing.
        if not translated_data.strip():
            continue

        # NOTE(review): the text is always stored under the ``name``
        # property while the merge key is ``attribute`` - presumably
        # callers are expected to pass attribute="name"; confirm.
        if english:
            ingredient = Node(label, name=translated_data, english=english)
        else:
            ingredient = Node(label, name=translated_data)

        # Tell py2neo which label/property pair identifies the node so
        # that merge() updates an existing node instead of duplicating it.
        ingredient.__primarylabel__ = label
        ingredient.__primarykey__ = attribute
        connection.merge(ingredient)
        print(translated_data)
def translate_data(data):
    """Translate ``data`` from English ("en") to Portuguese ("pt").

    The text is first cleaned with ``filter_data`` and then sent to the
    SYSTRAN translation endpoint hosted on RapidAPI.

    Args:
        data: Raw text to translate.

    Returns:
        A ``(text, english)`` tuple. When the response contains a
        translation, ``text`` is that translation lower-cased and
        ``english`` is ``False``. Otherwise ``text`` is the filtered input
        with ``"%20"`` replaced by spaces and ``english`` is ``True``.
    """
    data = filter_data(data)
    if not data:
        # Nothing left after filtering: avoid a pointless API round trip.
        return data, True

    url = "https://systran-systran-platform-for-language-processing-v1.p.rapidapi.com/translation/text/translate"
    querystring = {"source": "en", "target": "pt", "input": data}
    # NOTE(review): this API key is committed in the source file; it should
    # be rotated and loaded from configuration instead.
    headers = {
        'x-rapidapi-host': "systran-systran-platform-for-language-processing-v1.p.rapidapi.com",
        'x-rapidapi-key': "b2448ece4bmsh2e999bf748c5de3p1b0cb4jsn39b94190ee41"
    }
    translated_data = get_data(url, "GET", headers, querystring)

    try:
        return str(translated_data["outputs"][0]["output"]).lower(), False
    except (KeyError, IndexError, TypeError):
        # The response did not have the expected
        # {"outputs": [{"output": ...}]} shape (e.g. an error payload):
        # fall back to the untranslated text and mark it as English.
        return data.replace("%20", " "), True
def filter_data(data):
    """Return the word-like fragments of ``data`` as one lower-case string.

    A fragment starts with a letter from the pattern's character class
    (matched case-insensitively), is followed by one non-digit character
    and then by any run of word characters. All fragments found are joined
    with single spaces; an input without fragments yields ``""``.
    """
    pattern = re.compile(
        r"[a-zA-Záàâãéèêíïóôõöúçñ][^0-9]\w*", re.IGNORECASE)
    fragments = pattern.findall(data)
    joined = " ".join(fragments)
    return joined.lower()
if __name__ == "__main__":
    # Run the import job only when this file is executed as a script,
    # not when it is imported as a module.
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment