Skip to content

Instantly share code, notes, and snippets.

@mloayzagahona
Forked from dimo414/cat.py
Created March 23, 2018 17:56
Show Gist options
  • Save mloayzagahona/1150897e791323d2edce83e593ba4144 to your computer and use it in GitHub Desktop.
Save mloayzagahona/1150897e791323d2edce83e593ba4144 to your computer and use it in GitHub Desktop.
Fast File Concatenation in Python
'''
Tests different methods of concatenating files in Python.
'''
from __future__ import print_function
import json,os,shutil,subprocess
import util
def verify(file, expected):
    '''Count the JSON objects stored in ``file`` and report a mismatch.

    Each chunk produced by util.yield_json() is parsed with json.loads()
    purely to validate it; the parsed value is discarded. A message is
    printed when the number of parsed objects differs from ``expected``.
    '''
    found = 0
    with open(file) as handle:
        for chunk in util.yield_json(handle):
            json.loads(chunk)  # parse only to prove the chunk is valid JSON
            found += 1
    if found == expected:
        return
    print("Found %d objects, but expected %d! "
          "File didn't write correctly." % (found, expected))
@util.timed
def cat_files_py(jdir, file, op):
    '''Concatenate every entry of ``jdir`` into ``file`` using pure Python.

    ``op`` selects the copy strategy:
      * 1 - readlines() on the source, writelines() on the output
      * 2 - a single read() followed by a single write()
      * anything else - shutil.copyfileobj()

    The name of the strategy is printed (without a trailing newline)
    before any copying starts.
    '''
    if not op:
        using = "ShUtil"
    elif op == 1:
        using = "Readlines"
    else:
        using = "Read"
    print("Using %s, " % using, end='')

    with open(file, 'wb') as out:
        for name in os.listdir(jdir):
            with open(os.path.join(jdir, name), 'rb') as src:
                if op == 1:
                    out.writelines(src.readlines())
                elif op == 2:
                    out.write(src.read())
                else:
                    shutil.copyfileobj(src, out)
@util.timed
def cat_files_sys(jdir, file, secondary):
    '''Concatenate the files of ``jdir`` into ``file`` with a shell command.

    The command depends on the platform (os.name) and on ``secondary``:
      * Windows: ``copy`` when ``secondary`` is true, otherwise ``type``
      * elsewhere: ``cat`` when ``secondary`` is true, otherwise
        ``find ... | xargs cat``

    The name of the chosen tool is printed (without a trailing newline)
    before the command runs.

    Returns the exit status reported by subprocess.call().

    NOTE: both paths are interpolated into a shell string, so they are
    only wrapped in double quotes, not escaped. Do not pass untrusted
    paths to this function.
    '''
    if os.name == 'nt':
        if secondary:
            print("Using Copy, ", end='')
            cmd = "copy \"%s\\*\" \"%s\" 1> nul 2>&1"
        else:
            print("Using Type, ", end='')
            cmd = "type \"%s\\*\" > \"%s\" 2> nul"
    else:
        if secondary:
            print("Using Cat, ", end='')
            cmd = "cat \"%s\"/* > \"%s\""
        else:
            print("Using Xargs, ", end='')
            # The output path is quoted like in every other command so a
            # destination containing spaces is not split by the shell.
            cmd = "find \"%s\" -type f -print0 | xargs -0 cat > \"%s\""
    # shell=True is required: the commands rely on globbing, pipes and
    # redirection performed by the shell.
    return subprocess.call(cmd % (jdir, file), shell=True)
if __name__ == '__main__':
    testdir = "/tmp/json"
    cachefile = "delme.txt"
    # The number of directory entries is what verify() is told to expect.
    count = len(os.listdir(testdir))

    # Pure-Python strategies, selected by op 0, 1 and 2.
    for op in (0, 1, 2):
        util.clear_cache()
        cat_files_py(testdir, cachefile, op)
        verify(cachefile, count)

    # Shell-command strategies: secondary=True first, then False.
    for secondary in (True, False):
        util.clear_cache()
        cat_files_sys(testdir, cachefile, secondary)
        verify(cachefile, count)
'''
Tests speed difference between seeking over many thousands of JSON files and reading one directly
'''
import hashlib,json,os,random,string,time
import util
def create_json_files(jdir, num=50000, static="Static_String"):
    '''Populate ``jdir`` with ``num`` small, indented JSON files.

    ``jdir`` is created if it does not exist yet. Each file is named
    ``<sha1>.txt`` where the digest is the SHA-1 of ``static`` followed by
    the file's index, and holds one JSON object with the keys ``id`` (the
    same digest), ``arg``, ``arg2``, ``time`` (time.time() at creation)
    and ``text`` (10 random strings of 100 lowercase letters each).

    Raises OSError if ``jdir`` cannot be created and is not already a
    directory.
    '''
    try:
        os.makedirs(jdir)
    except OSError:
        # An existing directory is the expected "already set up" case;
        # any other failure (permissions, a file in the way, ...) must
        # not be hidden from the caller.
        if not os.path.isdir(jdir):
            raise

    for index in range(num):
        crypt = hashlib.sha1((static + str(index)).encode('utf-8')).hexdigest()
        text = [''.join(random.choice(string.ascii_lowercase) for _ in range(100))
                for _ in range(10)]
        record = dict(id=crypt, arg="Argument", arg2="AnotherArgument",
                      time=time.time(), text=text)
        with open(os.path.join(jdir, crypt + ".txt"), 'w') as out:
            # indent=1 puts the closing '}' of the root object at column 0
            # on its own line.
            json.dump(record, out, indent=1, sort_keys=True)
@util.timed
def read_files(jdir):
    '''Open every entry of ``jdir`` and parse it with json.load().

    The parsed objects are thrown away; the function exists only so the
    cost of reading the files one by one can be timed.
    '''
    for name in os.listdir(jdir):
        path = os.path.join(jdir, name)
        with open(path) as handle:
            json.load(handle)
@util.timed
def concatenate(jdir, file):
    '''Write the contents of every entry of ``jdir`` into ``file``.

    Sources are read in binary mode with readlines() and appended to the
    output with writelines(), in the order os.listdir() returns them.
    '''
    with open(file, 'wb') as out:
        for name in os.listdir(jdir):
            source_path = os.path.join(jdir, name)
            with open(source_path, 'rb') as src:
                lines = src.readlines()
                out.writelines(lines)
@util.timed
def read_file(file, expected):
    '''Parse every JSON object stored in the single file ``file``.

    Chunks come from util.yield_json(); each one is parsed with
    json.loads() and counted. A message is printed when the count differs
    from ``expected``.
    '''
    with open(file) as handle:
        # map() parses each chunk lazily; sum() just tallies them.
        parsed = map(json.loads, util.yield_json(handle))
        total = sum(1 for _ in parsed)
    if total == expected:
        return
    print("Found %d objects, but expected %d! "
          "File didn't write correctly." % (total, expected))
if __name__ == '__main__':
    testdir = "/tmp/json"
    cachefile = "delme.txt"

    # One-time setup, intentionally disabled (no need to run it more than once):
    #   util.clear_cache()
    #   create_json_files(testdir)

    # 1) Read every small file individually.
    util.clear_cache()
    read_files(testdir)

    # 2) Merge them into a single file.
    util.clear_cache()
    concatenate(testdir, cachefile)

    # 3) Read the merged file back and check the object count.
    util.clear_cache()
    expected = len(os.listdir(testdir))
    read_file(cachefile, expected)
'''
Utilities for testing file concatenation in Python.
'''
import subprocess,time
def timed(f):
    '''Decorator that prints how long each call to ``f`` took.

    The wrapper forwards positional and keyword arguments unchanged,
    prints ``"<name> took <seconds>"`` after ``f`` returns, and passes the
    return value of ``f`` through. Nothing is printed when ``f`` raises;
    the exception propagates to the caller.
    '''
    # Imported locally so this block does not depend on the module's
    # import line.
    import functools

    @functools.wraps(f)  # keep f's __name__ and docstring on the wrapper
    def func(*args, **kwargs):
        start = time.time()
        ret = f(*args, **kwargs)
        took = time.time() - start
        print("%s took %f" % (f.__name__, took))
        return ret
    return func
def yield_json(lines):
    '''Iterate over lines of text, yielding one string per JSON object.

    Expects the objects to be indented, such that a root object ends with
    a '}' at the first index of a line. Whatever follows that '}' on the
    same line becomes the start of the next object. Lines collected after
    the last closing '}' are never yielded.
    '''
    pending = []
    for line in lines:
        if not line or line[0] != '}':
            # Still inside the current object.
            pending.append(line)
            continue
        # End of object: close it and start the next buffer with the
        # remainder of this line.
        pending.append('}')
        finished, pending = pending, [line[1:]]
        yield ''.join(finished)
# Cache dropping is disabled by default; set to True to enable it.
use_clean_cache = False


def clear_cache():
    '''Attempt to drop the disk caches on Linux - must be run as root.

    Does nothing unless the module-level ``use_clean_cache`` flag is true.
    '''
    if not use_clean_cache:
        return
    subprocess.call("sync; echo 3 > /proc/sys/vm/drop_caches", shell=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment