u1735067 · January 20, 2016 00:33
diff --git a/importArango.py b/importArango.py
 #!python3

 import sys, glob, os, re, datetime, time, json
 import traceback
 import arango
 from arango import Arango
 # Connect to the ArangoDB server on localhost and select the 'jeu' database.
 client = Arango(host="localhost", port=8529)
 db = client.db('jeu')

 # Names of the collections to create, one per kind of data file.
 types = ['combinaisons', 'masse-enjeu', 'participants', 'rapports-definitifs', 'courses', 'tirelire', 'reunion', 'pronostics-detailles', 'performances-detaillees']
 for t in types:
 	try:
 		db.create_collection(t)
 	except Exception:
 		# Best effort: a failure here is ignored, presumably because the
 		# collection already exists (the driver error is not inspected).
 		# Exception instead of a bare except so that Ctrl-C and SystemExit
 		# are no longer swallowed.
 		continue

 #files = sorted(glob.glob('data/20130311*'), key=os.path.basename)
 # Raw string: '\S' and '\.' are regex escapes, not Python string escapes.
 file_mask = re.compile(r'(?P<key>(?P<date>[0-9]{8})_(R(?P<reunion>[0-9]+)-)?(C(?P<course>[0-9]+)-)?)(?P<type>\S+)\.json')

 def process_file(file, data):
 	"""Turn the parsed content of one data file into a document ready for import.

 	The base name of ``file`` must look like ``YYYYMMDD_[R<n>-][C<n>-]<type>.json``.
 	The date, reunion number (R), course number (C) and type are read from the
 	name and merged into ``data`` according to the type.

 	Args:
 		file: path of the JSON file; only its base name is inspected.
 		data: the decoded JSON content. It is modified in place for most types
 			and wrapped in a new dict for 'masse-enjeu' and 'rapports-definitifs'.

 	Returns:
 		The document (a dict) with 'dateYmd' (int) and '_key' set.

 	Raises:
 		ValueError: if the file name does not match ``file_mask``.
 		Exception: 'type inconnu' if the type part of the name is not handled.
 		KeyError: if 'programme', 'dateReunion' or 'dateProgramme' is required
 			by the type but missing from ``data``.
 	"""
 	res = file_mask.match(os.path.basename(file))
 	if res is None:
 		raise ValueError('file name does not match the expected pattern: ' + str(file))

 	dateYmd = res.group('date')

 	# The 'key' group still carries the separator that precedes the type.
 	key = res.group('key')
 	if key.endswith(('-', '_')):
 		key = key[:-1]

 	reunion = res.group('reunion')
 	if reunion is not None:
 		reunion = int(reunion)
 	course = res.group('course')
 	if course is not None:
 		course = int(course)
 	doc_type = res.group('type')

 	# Local midnight of the day as epoch milliseconds, kept as a string.
 	# Marked "Win" originally: presumably mktime is used because
 	# strftime('%s') is not portable to Windows.
 	day = datetime.datetime.strptime(dateYmd, '%Y%m%d')
 	timestamp = str(int(time.mktime(day.timetuple()))) + '000'

 	# Types whose payload is wrapped under a single field before being completed.
 	wrappers = {'masse-enjeu': 'enjeu', 'rapports-definitifs': 'rapports'}
 	per_course = ('participants', 'pronostics-detailles', 'masse-enjeu',
 		'rapports-definitifs', 'performances-detaillees')

 	if doc_type == 'courses':
 		data = data['programme']
 	elif doc_type == 'reunion':
 		data['date'] = data['dateReunion']
 		data['numeroReunion'] = reunion
 	elif doc_type == 'combinaisons':
 		data['date'] = data['dateProgramme']
 	elif doc_type == 'tirelire':
 		pass  # stored as is
 	elif doc_type in per_course:
 		if doc_type in wrappers:
 			data = {wrappers[doc_type]: data}
 		data['date'] = timestamp
 		if doc_type != 'performances-detaillees':
 			# One hour in milliseconds.
 			# NOTE(review): fixed offset whatever the date - confirm intended.
 			data['timezoneOffset'] = 3600000
 		data['numeroReunion'] = reunion
 		data['numeroCourse'] = course
 	else:
 		raise Exception('type inconnu')

 	data['dateYmd'] = int(dateYmd)
 	data['_key'] = key

 	return data


 def _percent(done, total):
 	"""Return done/total as a whole percentage; 100 when total is 0."""
 	if not total:
 		return 100
 	return int(done/total*100)


 def _progress(label, done, total, done_all, total_all):
 	"""Build the progress line for one type: per-type count, then overall count."""
 	return ('Done '+label+' : '+str(done)+'/'+str(total)+' ('+str(_percent(done, total))+'%) / '
 		+str(done_all)+'/'+str(total_all)+' ('+str(_percent(done_all, total_all))+'%)')


 # Number of documents accumulated before they are sent in one import call.
 commit_interval = 5000
 i = 0  # files handled so far, all types together
 nb_files = len(glob.glob('data/*.json'))
 for doc_type in types:
 	j = 0  # files handled so far for this type
 	collection = db.col(doc_type)
 	files = sorted(glob.glob('data/*'+doc_type+'.json'), key=os.path.basename)
 	docs = []

 	for path in files:
 		try:
 			with open(path, 'r', encoding="utf-8") as f:
 				data = json.load(f)
 			docs.append(process_file(path, data))

 			print('Read file '+path)

 			# j is only incremented further down, hence the +1: flush once
 			# commit_interval files have been read, not on the very first one.
 			if ((j + 1) % commit_interval == 0):
 				print('Read '+str(commit_interval)+' files, sending')
 				collection.import_documents(docs, complete=False, details=True)
 				docs = []

 		except arango.exceptions.DocumentCreateError as e:
 			if e.args and e.args[0] == 'cannot create document, unique constraint violated':
 				print('Failure with '+path+' : already added')
 			else:
 				print('/!\\ Failure with '+path+' : ')
 				raise

 		except Exception:
 			# Name the offending file, then let the error stop the run.
 			print('/!\\ Failure with '+path+' : ')
 			raise

 		i += 1
 		j += 1
 		if (j % 500 == 0):
 			print(_progress(doc_type, j, len(files), i, nb_files), file=sys.stderr)

 	print(_progress(doc_type, j, len(files), i, nb_files), file=sys.stderr)
 	# Send whatever is left over from the last, incomplete batch.
 	if docs:
 		collection.import_documents(docs, complete=False, details=True)

 '''

 FOR i IN reunion
  FILTER i.meteo != null
  RETURN {'date': i.dateYmd, 'meteo': i.meteo}


 https://www.arangodb.com/download/
 ! https://github.com/joowani/python-arango
 https://github.com/saeschdivara/ArangoPy
 https://docs.arangodb.com/cookbook/XCopyInstallWindows.html
 http://127.0.0.1:8529/_db/jeu/_admin/aardvark/standalone.html#collection/combinaisons/20141123_R2-C8

 http://api.mongodb.org/python/current/api/pymongo/database.html
 https://stackoverflow.com/questions/15478127/remove-final-character-from-string-python
 https://stackoverflow.com/questions/19801727/convert-datetime-to-unix-timestamp-and-convert-it-back-in-python
 https://stackoverflow.com/questions/23086383/how-to-test-nonetype-in-python
 https://docs.arangodb.com/HttpBulkImports/index.html
 https://www.arangodb.com/2012/09/bulk-insert-benchmark-tool/

 http://vschart.com/compare/arangodb/vs/mongodb/vs/couchbase
 https://docs.arangodb.com/IndexHandling/Geo.html

 http://fr.slideshare.net/arangodb/introduction-to-column-oriented-databases
 https://dzone.com/articles/introducing-arangodb
 https://dzone.com/articles/practical-tips-to-reduce-sql-server-database-table
 http://blog.sqlauthority.com/2015/11/30/sql-server-practical-tips-to-reduce-sql-server-database-table-size-experts-opinion/



 '''
diff --git a/importArango.v1.py b/importArango.v1.py
 #!python3

 import sys, glob, os, re, datetime, time, json
 import traceback
 import arango
 from arango import Arango
 # Connect to the ArangoDB server on localhost and select the 'jeu' database.
 client = Arango(host="localhost", port=8529)
 db = client.db('jeu')

 # Create one collection per kind of data file.
 for t in ['combinaisons', 'masse-enjeu', 'participants', 'rapports-definitifs', 'courses', 'tirelire', 'reunion', 'pronostics-detailles', 'performances-detaillees']:
 	try:
 		db.create_collection(t)
 	except Exception:
 		# Best effort: a failure here is ignored, presumably because the
 		# collection already exists (the driver error is not inspected).
 		# Exception instead of a bare except so that Ctrl-C and SystemExit
 		# are no longer swallowed.
 		continue

 # Every entry of the data directory, ordered by base name.
 files = sorted(glob.glob('data/*'), key=os.path.basename)
 #files = sorted(glob.glob('data/20130311*'), key=os.path.basename)
 # Raw string: '\S' and '\.' are regex escapes, not Python string escapes.
 file_mask = re.compile(r'(?P<key>(?P<date>[0-9]{8})_(R(?P<reunion>[0-9]+)-)?(C(?P<course>[0-9]+)-)?)(?P<type>\S+)\.json')

 def process_file(file, data):
 	"""Complete the parsed content of one data file and store it as a document.

 	The base name of ``file`` must look like ``YYYYMMDD_[R<n>-][C<n>-]<type>.json``.
 	The date, reunion number (R), course number (C) and type are read from the
 	name and merged into ``data`` according to the type; the result is created
 	in the collection named after the type.

 	Args:
 		file: path of the JSON file; only its base name is inspected.
 		data: the decoded JSON content. It is modified in place for most types
 			and wrapped in a new dict for 'masse-enjeu' and 'rapports-definitifs'.

 	Returns:
 		None.

 	Raises:
 		ValueError: if the file name does not match ``file_mask``.
 		Exception: 'type inconnu' if the type part of the name is not handled.
 		KeyError: if 'programme', 'dateReunion' or 'dateProgramme' is required
 			by the type but missing from ``data``.
 		Whatever ``create_document`` raises is propagated to the caller.
 	"""
 	res = file_mask.match(os.path.basename(file))
 	if res is None:
 		raise ValueError('file name does not match the expected pattern: ' + str(file))

 	dateYmd = res.group('date')

 	# The 'key' group still carries the separator that precedes the type.
 	key = res.group('key')
 	if key.endswith(('-', '_')):
 		key = key[:-1]

 	reunion = res.group('reunion')
 	if reunion is not None:
 		reunion = int(reunion)
 	course = res.group('course')
 	if course is not None:
 		course = int(course)
 	doc_type = res.group('type')

 	# Local midnight of the day as epoch milliseconds, kept as a string.
 	# Marked "Win" originally: presumably mktime is used because
 	# strftime('%s') is not portable to Windows.
 	day = datetime.datetime.strptime(dateYmd, '%Y%m%d')
 	timestamp = str(int(time.mktime(day.timetuple()))) + '000'

 	# Types whose payload is wrapped under a single field before being completed.
 	wrappers = {'masse-enjeu': 'enjeu', 'rapports-definitifs': 'rapports'}
 	per_course = ('participants', 'pronostics-detailles', 'masse-enjeu',
 		'rapports-definitifs', 'performances-detaillees')

 	if doc_type == 'courses':
 		data = data['programme']
 	elif doc_type == 'reunion':
 		data['date'] = data['dateReunion']
 		data['numeroReunion'] = reunion
 	elif doc_type == 'combinaisons':
 		data['date'] = data['dateProgramme']
 	elif doc_type == 'tirelire':
 		pass  # stored as is
 	elif doc_type in per_course:
 		if doc_type in wrappers:
 			data = {wrappers[doc_type]: data}
 		data['date'] = timestamp
 		if doc_type != 'performances-detaillees':
 			# One hour in milliseconds.
 			# NOTE(review): fixed offset whatever the date - confirm intended.
 			data['timezoneOffset'] = 3600000
 		data['numeroReunion'] = reunion
 		data['numeroCourse'] = course
 	else:
 		raise Exception('type inconnu')

 	# This version keeps dateYmd as the string taken from the file name.
 	data['dateYmd'] = dateYmd
 	data['_key'] = key
 	collection = db.col(doc_type)
 	collection.create_document(data)


 # Load every file in turn; a document that already exists is reported and
 # skipped, any other error names the file and stops the run.
 i = 0
 for file in files:
 	try:
 		with open(file, 'r', encoding="utf-8") as f:
 			data = json.load(f)
 		process_file(file, data)
 		print('Done with '+file)
 	except arango.exceptions.DocumentCreateError as e:
 		# e.args is checked first so an error without a message cannot raise IndexError.
 		if e.args and e.args[0] == 'cannot create document, unique constraint violated':
 			print('Failure with '+file+' : already added')
 		else:
 			print('/!\\ Failure with '+file+' : ')
 			raise
 	except Exception:
 		print('/!\\ Failure with '+file+' : ')
 		raise
 	i += 1
 	# Progress report on stderr every 500 files.
 	if (i % 500 == 0):
 		print('Done: '+str(i)+'/'+str(len(files))+' ('+str(int(i/len(files)*100))+'%)', file=sys.stderr)
	#!python3

	import sys, glob, os, re, datetime, time, json
	import traceback
	import arango
	from arango import Arango
	# Connect to the ArangoDB server on localhost and select the 'jeu' database.
	client = Arango(host="localhost", port=8529)
	db = client.db('jeu')

	# Names of the collections to create, one per kind of data file.
	types = ['combinaisons', 'masse-enjeu', 'participants', 'rapports-definitifs', 'courses', 'tirelire', 'reunion', 'pronostics-detailles', 'performances-detaillees']
	for t in types:
		try:
			db.create_collection(t)
		except Exception:
			# Best effort: a failure here is ignored, presumably because the
			# collection already exists (the driver error is not inspected).
			# Exception instead of a bare except so that Ctrl-C and SystemExit
			# are no longer swallowed.
			continue

	#files = sorted(glob.glob('data/20130311*'), key=os.path.basename)
	# Raw string: '\S' and '\.' are regex escapes, not Python string escapes.
	file_mask = re.compile(r'(?P<key>(?P<date>[0-9]{8})_(R(?P<reunion>[0-9]+)-)?(C(?P<course>[0-9]+)-)?)(?P<type>\S+)\.json')

	def process_file(file, data):
		"""Turn the parsed content of one data file into a document ready for import.

		The base name of ``file`` must look like ``YYYYMMDD_[R<n>-][C<n>-]<type>.json``.
		The date, reunion number (R), course number (C) and type are read from the
		name and merged into ``data`` according to the type.

		Args:
			file: path of the JSON file; only its base name is inspected.
			data: the decoded JSON content. It is modified in place for most types
				and wrapped in a new dict for 'masse-enjeu' and 'rapports-definitifs'.

		Returns:
			The document (a dict) with 'dateYmd' (int) and '_key' set.

		Raises:
			ValueError: if the file name does not match ``file_mask``.
			Exception: 'type inconnu' if the type part of the name is not handled.
			KeyError: if 'programme', 'dateReunion' or 'dateProgramme' is required
				by the type but missing from ``data``.
		"""
		res = file_mask.match(os.path.basename(file))
		if res is None:
			raise ValueError('file name does not match the expected pattern: ' + str(file))

		dateYmd = res.group('date')

		# The 'key' group still carries the separator that precedes the type.
		key = res.group('key')
		if key.endswith(('-', '_')):
			key = key[:-1]

		reunion = res.group('reunion')
		if reunion is not None:
			reunion = int(reunion)
		course = res.group('course')
		if course is not None:
			course = int(course)
		doc_type = res.group('type')

		# Local midnight of the day as epoch milliseconds, kept as a string.
		# Marked "Win" originally: presumably mktime is used because
		# strftime('%s') is not portable to Windows.
		day = datetime.datetime.strptime(dateYmd, '%Y%m%d')
		timestamp = str(int(time.mktime(day.timetuple()))) + '000'

		# Types whose payload is wrapped under a single field before being completed.
		wrappers = {'masse-enjeu': 'enjeu', 'rapports-definitifs': 'rapports'}
		per_course = ('participants', 'pronostics-detailles', 'masse-enjeu',
			'rapports-definitifs', 'performances-detaillees')

		if doc_type == 'courses':
			data = data['programme']
		elif doc_type == 'reunion':
			data['date'] = data['dateReunion']
			data['numeroReunion'] = reunion
		elif doc_type == 'combinaisons':
			data['date'] = data['dateProgramme']
		elif doc_type == 'tirelire':
			pass  # stored as is
		elif doc_type in per_course:
			if doc_type in wrappers:
				data = {wrappers[doc_type]: data}
			data['date'] = timestamp
			if doc_type != 'performances-detaillees':
				# One hour in milliseconds.
				# NOTE(review): fixed offset whatever the date - confirm intended.
				data['timezoneOffset'] = 3600000
			data['numeroReunion'] = reunion
			data['numeroCourse'] = course
		else:
			raise Exception('type inconnu')

		data['dateYmd'] = int(dateYmd)
		data['_key'] = key

		return data


	def _percent(done, total):
		"""Return done/total as a whole percentage; 100 when total is 0."""
		if not total:
			return 100
		return int(done/total*100)


	def _progress(label, done, total, done_all, total_all):
		"""Build the progress line for one type: per-type count, then overall count."""
		return ('Done '+label+' : '+str(done)+'/'+str(total)+' ('+str(_percent(done, total))+'%) / '
			+str(done_all)+'/'+str(total_all)+' ('+str(_percent(done_all, total_all))+'%)')


	# Number of documents accumulated before they are sent in one import call.
	commit_interval = 5000
	i = 0  # files handled so far, all types together
	nb_files = len(glob.glob('data/*.json'))
	for doc_type in types:
		j = 0  # files handled so far for this type
		collection = db.col(doc_type)
		files = sorted(glob.glob('data/*'+doc_type+'.json'), key=os.path.basename)
		docs = []

		for path in files:
			try:
				with open(path, 'r', encoding="utf-8") as f:
					data = json.load(f)
				docs.append(process_file(path, data))

				print('Read file '+path)

				# j is only incremented further down, hence the +1: flush once
				# commit_interval files have been read, not on the very first one.
				if ((j + 1) % commit_interval == 0):
					print('Read '+str(commit_interval)+' files, sending')
					collection.import_documents(docs, complete=False, details=True)
					docs = []

			except arango.exceptions.DocumentCreateError as e:
				if e.args and e.args[0] == 'cannot create document, unique constraint violated':
					print('Failure with '+path+' : already added')
				else:
					print('/!\\ Failure with '+path+' : ')
					raise

			except Exception:
				# Name the offending file, then let the error stop the run.
				print('/!\\ Failure with '+path+' : ')
				raise

			i += 1
			j += 1
			if (j % 500 == 0):
				print(_progress(doc_type, j, len(files), i, nb_files), file=sys.stderr)

		print(_progress(doc_type, j, len(files), i, nb_files), file=sys.stderr)
		# Send whatever is left over from the last, incomplete batch.
		if docs:
			collection.import_documents(docs, complete=False, details=True)

	'''

	FOR i IN reunion
	FILTER i.meteo != null
	RETURN {'date': i.dateYmd, 'meteo': i.meteo}


	https://www.arangodb.com/download/
	! https://github.com/joowani/python-arango
	https://github.com/saeschdivara/ArangoPy
	https://docs.arangodb.com/cookbook/XCopyInstallWindows.html
	http://127.0.0.1:8529/_db/jeu/_admin/aardvark/standalone.html#collection/combinaisons/20141123_R2-C8

	http://api.mongodb.org/python/current/api/pymongo/database.html
	https://stackoverflow.com/questions/15478127/remove-final-character-from-string-python
	https://stackoverflow.com/questions/19801727/convert-datetime-to-unix-timestamp-and-convert-it-back-in-python
	https://stackoverflow.com/questions/23086383/how-to-test-nonetype-in-python
	https://docs.arangodb.com/HttpBulkImports/index.html
	https://www.arangodb.com/2012/09/bulk-insert-benchmark-tool/

	http://vschart.com/compare/arangodb/vs/mongodb/vs/couchbase
	https://docs.arangodb.com/IndexHandling/Geo.html

	http://fr.slideshare.net/arangodb/introduction-to-column-oriented-databases
	https://dzone.com/articles/introducing-arangodb
	https://dzone.com/articles/practical-tips-to-reduce-sql-server-database-table
	http://blog.sqlauthority.com/2015/11/30/sql-server-practical-tips-to-reduce-sql-server-database-table-size-experts-opinion/



	'''