Created
January 20, 2016 00:33
-
-
Save u1735067/b06cec878f44f9a2d403 to your computer and use it in GitHub Desktop.
ArangoDB : test d'import (unitaire et en masse, beaucoup plus rapide)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!python3 | |
import sys, glob, os, re, datetime, time, json | |
import traceback | |
import arango | |
from arango import Arango | |
# Connect to the ArangoDB server on localhost and select the 'jeu' database.
client = Arango(host="localhost", port=8529)
db = client.db('jeu')

# One collection per kind of JSON export. The same names are used further
# down as file-name suffixes when globbing the data directory.
types = [
    'combinaisons',
    'masse-enjeu',
    'participants',
    'rapports-definitifs',
    'courses',
    'tirelire',
    'reunion',
    'pronostics-detailles',
    'performances-detaillees',
]

for t in types:
    try:
        db.create_collection(t)
    except Exception as exc:
        # Best effort: creation is expected to fail when the collection is
        # already there. Catching Exception (not a bare except) lets Ctrl-C
        # through, and the note on stderr keeps real failures visible.
        print('Collection '+t+' not created: '+str(exc), file=sys.stderr)
#files = sorted(glob.glob('data/20130311*'), key=os.path.basename)
# Expected base name: <8-digit date>_[R<n>-][C<n>-]<type>.json
# 'key' captures everything before the type, trailing separator included.
# Raw string: '\S' and '\.' are regex escapes, not Python string escapes.
file_mask = re.compile(r'(?P<key>(?P<date>[0-9]{8})_(R(?P<reunion>[0-9]+)-)?(C(?P<course>[0-9]+)-)?)(?P<type>\S+)\.json')


def process_file(file, data):
    """Turn one decoded JSON export into a document ready for import.

    The date, meeting number ("reunion"), race number ("course") and
    document type are all taken from the file name, then merged into the
    payload according to the type.

    Args:
        file: Path of the source file; only its base name is inspected.
        data: The decoded JSON content. Dict payloads are modified in place.

    Returns:
        The document to store. It always carries 'dateYmd' (the file-name
        date as an int) and '_key' (the file-name prefix without its
        trailing separator).

    Raises:
        ValueError: If the file name does not follow the expected pattern,
            or its type is not one of the known ones.
    """
    res = file_mask.match(os.path.basename(file))
    if res is None:
        # Without this guard the failure is an opaque AttributeError on None.
        raise ValueError('unrecognised file name: '+str(file))

    dateYmd = res.group('date')

    # The captured key always ends with the separator that precedes the type.
    key = res.group('key')
    if key.endswith(('-', '_')):
        key = key[:-1]

    reunion = res.group('reunion')
    if reunion is not None:
        reunion = int(reunion)
    course = res.group('course')
    if course is not None:
        course = int(course)

    kind = res.group('type')

    # Epoch seconds of local midnight for that date, with '000' appended
    # (a millisecond-style value kept as a string).
    midnight = datetime.datetime.strptime(dateYmd, '%Y%m%d')
    timestamp = str(int(time.mktime(midnight.timetuple())))+'000'

    if kind == 'courses':
        data = data['programme']
    elif kind == 'reunion':
        data['date'] = data['dateReunion']
        data['numeroReunion'] = reunion
    elif kind == 'combinaisons':
        data['date'] = data['dateProgramme']
    elif kind == 'tirelire':
        pass  # stored as is
    elif kind in ('participants', 'pronostics-detailles', 'masse-enjeu',
                  'rapports-definitifs', 'performances-detaillees'):
        # These two payloads are wrapped under a named attribute first,
        # so the metadata below lands on the wrapper.
        if kind == 'masse-enjeu':
            data = {'enjeu': data}
        elif kind == 'rapports-definitifs':
            data = {'rapports': data}
        data['date'] = timestamp
        if kind != 'performances-detaillees':
            # presumably milliseconds, i.e. UTC+1 -- TODO confirm
            data['timezoneOffset'] = 3600000
        data['numeroReunion'] = reunion
        data['numeroCourse'] = course
    else:
        raise ValueError('type inconnu')

    data['dateYmd'] = int(dateYmd)
    data['_key'] = key
    return data
# Bulk import: for each document type, read the matching files from data/,
# convert them with process_file() and send them to the type's collection
# in batches of commit_interval documents.
commit_interval = 5000


def _progress(done, total):
    """Format 'done/total (pct%)'; an empty total counts as 100%."""
    pct = int(done/total*100) if total else 100
    return str(done)+'/'+str(total)+' ('+str(pct)+'%)'


i = 0  # files handled so far, all types together
nb_files = len(glob.glob('data/*.json'))
for doc_type in types:
    j = 0  # files handled so far for this type
    collection = db.col(doc_type)
    files = sorted(glob.glob('data/*'+doc_type+'.json'), key=os.path.basename)
    docs = []
    for file in files:
        try:
            with open(file, 'r', encoding="utf-8") as f:
                data = json.load(f)
            docs.append(process_file(file, data))
            print('Read file '+file)
            # Flush on the batch size itself. Testing the file counter here
            # fired on the very first file, because it is only incremented
            # further down.
            if len(docs) >= commit_interval:
                print('Read '+str(commit_interval)+' files, sending')
                # Detach the batch first so a failed send is not retried on
                # every following file.
                batch, docs = docs, []
                collection.import_documents(batch, complete=False, details=True)
        except arango.exceptions.DocumentCreateError as e:
            if e.args and e.args[0] == 'cannot create document, unique constraint violated':
                print('Failure with '+file+' : already added')
            else:
                print('/!\\ Failure with '+file+' : ')
                raise
        except Exception:
            print('/!\\ Failure with '+file+' : ')
            raise
        i += 1
        j += 1
        if j % 500 == 0:
            print('Done '+doc_type+' : '+_progress(j, len(files))+' / '+_progress(i, nb_files), file=sys.stderr)
    # Final report for this type. _progress() copes with a type that has no
    # files, where the plain division raised ZeroDivisionError.
    print('Done '+doc_type+' : '+_progress(j, len(files))+' / '+_progress(i, nb_files), file=sys.stderr)
    if docs:
        # Send whatever is left over from the last, partial batch.
        collection.import_documents(docs, complete=False, details=True)
''' | |
FOR i IN reunion | |
FILTER i.meteo != null | |
RETURN {'date': i.dateYmd, 'meteo': i.meteo} | |
https://www.arangodb.com/download/ | |
! https://github.com/joowani/python-arango | |
https://github.com/saeschdivara/ArangoPy | |
https://docs.arangodb.com/cookbook/XCopyInstallWindows.html | |
http://127.0.0.1:8529/_db/jeu/_admin/aardvark/standalone.html#collection/combinaisons/20141123_R2-C8 | |
http://api.mongodb.org/python/current/api/pymongo/database.html | |
https://stackoverflow.com/questions/15478127/remove-final-character-from-string-python | |
https://stackoverflow.com/questions/19801727/convert-datetime-to-unix-timestamp-and-convert-it-back-in-python | |
https://stackoverflow.com/questions/23086383/how-to-test-nonetype-in-python | |
https://docs.arangodb.com/HttpBulkImports/index.html | |
https://www.arangodb.com/2012/09/bulk-insert-benchmark-tool/ | |
http://vschart.com/compare/arangodb/vs/mongodb/vs/couchbase | |
https://docs.arangodb.com/IndexHandling/Geo.html | |
http://fr.slideshare.net/arangodb/introduction-to-column-oriented-databases | |
https://dzone.com/articles/introducing-arangodb | |
https://dzone.com/articles/practical-tips-to-reduce-sql-server-database-table | |
http://blog.sqlauthority.com/2015/11/30/sql-server-practical-tips-to-reduce-sql-server-database-table-size-experts-opinion/ | |
''' |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!python3 | |
import sys, glob, os, re, datetime, time, json | |
import traceback | |
import arango | |
from arango import Arango | |
# Connect to the ArangoDB server on localhost and select the 'jeu' database.
client = Arango(host="localhost", port=8529)
db = client.db('jeu')

# Make sure one collection exists per kind of JSON export.
for t in ['combinaisons', 'masse-enjeu', 'participants', 'rapports-definitifs', 'courses', 'tirelire', 'reunion', 'pronostics-detailles', 'performances-detaillees']:
    try:
        db.create_collection(t)
    except Exception as exc:
        # Best effort: creation is expected to fail when the collection is
        # already there. Catching Exception (not a bare except) lets Ctrl-C
        # through, and the note on stderr keeps real failures visible.
        print('Collection '+t+' not created: '+str(exc), file=sys.stderr)

# Every entry of data/, ordered by base name.
files = sorted(glob.glob('data/*'), key=os.path.basename)
#files = sorted(glob.glob('data/20130311*'), key=os.path.basename)
# Expected base name: <8-digit date>_[R<n>-][C<n>-]<type>.json
# 'key' captures everything before the type, trailing separator included.
# Raw string: '\S' and '\.' are regex escapes, not Python string escapes.
file_mask = re.compile(r'(?P<key>(?P<date>[0-9]{8})_(R(?P<reunion>[0-9]+)-)?(C(?P<course>[0-9]+)-)?)(?P<type>\S+)\.json')


def process_file(file, data):
    """Convert one decoded JSON export and insert it as a single document.

    The date, meeting number ("reunion"), race number ("course") and
    document type are taken from the file name, merged into the payload
    according to the type, and the result is created in the collection
    named after the type.

    Args:
        file: Path of the source file; only its base name is inspected.
        data: The decoded JSON content. Dict payloads are modified in place.

    Raises:
        ValueError: If the file name does not follow the expected pattern,
            or its type is not one of the known ones.
    """
    res = file_mask.match(os.path.basename(file))
    if res is None:
        # Without this guard the failure is an opaque AttributeError on None.
        raise ValueError('unrecognised file name: '+str(file))

    dateYmd = res.group('date')

    # The captured key always ends with the separator that precedes the type.
    key = res.group('key')
    if key.endswith(('-', '_')):
        key = key[:-1]

    reunion = res.group('reunion')
    if reunion is not None:
        reunion = int(reunion)
    course = res.group('course')
    if course is not None:
        course = int(course)

    kind = res.group('type')

    # Epoch seconds of local midnight for that date, with '000' appended
    # (a millisecond-style value kept as a string).
    midnight = datetime.datetime.strptime(dateYmd, '%Y%m%d')
    timestamp = str(int(time.mktime(midnight.timetuple())))+'000'

    if kind == 'courses':
        data = data['programme']
    elif kind == 'reunion':
        data['date'] = data['dateReunion']
        data['numeroReunion'] = reunion
    elif kind == 'combinaisons':
        data['date'] = data['dateProgramme']
    elif kind == 'tirelire':
        pass  # stored as is
    elif kind in ('participants', 'pronostics-detailles', 'masse-enjeu',
                  'rapports-definitifs', 'performances-detaillees'):
        # These two payloads are wrapped under a named attribute first,
        # so the metadata below lands on the wrapper.
        if kind == 'masse-enjeu':
            data = {'enjeu': data}
        elif kind == 'rapports-definitifs':
            data = {'rapports': data}
        data['date'] = timestamp
        if kind != 'performances-detaillees':
            # presumably milliseconds, i.e. UTC+1 -- TODO confirm
            data['timezoneOffset'] = 3600000
        data['numeroReunion'] = reunion
        data['numeroCourse'] = course
    else:
        raise ValueError('type inconnu')

    # Stored as an integer so that range filters and sorting on this
    # attribute compare numerically rather than as text.
    data['dateYmd'] = int(dateYmd)
    data['_key'] = key

    collection = db.col(kind)
    collection.create_document(data)
# Unit import: load each file, hand it to process_file() (which inserts it)
# and report progress on stderr every 500 files.
i = 0
for file in files:
    try:
        with open(file, 'r', encoding="utf-8") as f:
            data = json.load(f)
        process_file(file, data)
        print('Done with '+file)
    except arango.exceptions.DocumentCreateError as e:
        # A duplicate key only means the file was imported by an earlier
        # run; anything else is a real error and stops the import.
        if e.args and e.args[0] == 'cannot create document, unique constraint violated':
            print('Failure with '+file+' : already added')
        else:
            print('/!\\ Failure with '+file+' : ')
            raise
    except Exception:
        # Name the offending file before letting the error propagate.
        print('/!\\ Failure with '+file+' : ')
        raise
    i += 1
    if i % 500 == 0:
        print('Done: '+str(i)+'/'+str(len(files))+' ('+str(int(i/len(files)*100))+'%)', file=sys.stderr)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment