Skip to content

Instantly share code, notes, and snippets.

Created December 13, 2011 00:07
Show Gist options
  • Save anonymous/1469760 to your computer and use it in GitHub Desktop.
Save anonymous/1469760 to your computer and use it in GitHub Desktop.
Two scripts for 1. generating a git repo with lots of files and 2. generating and timing a series of small, synthetic commits to a repo
#!/usr/local/bin/python -t
# Originally by Joshua Redstone
#
# Hacky script to generate gobs of files to test git operations as a function
# of the number of files in the repo.
#
# At a minimum, pass --baseDir:
# ./loadGen.py --baseDir=foo
# This creates a git repo containing lots of files in the directory 'foo'.
#
import sys
import os
import random
import subprocess
import time
import optparse
def scanFiles(gitRoot):
    """Return a tuple (allFiles, allDirs) for the working tree under gitRoot.

    allFiles holds the path of every file found and allDirs the path of every
    directory, starting with gitRoot itself.  All paths are built by joining
    onto gitRoot, so they are relative or absolute exactly as gitRoot is.

    The top-level '.git' directory is excluded entirely.  It is pruned from
    the walk in place, so os.walk never descends into it, and siblings whose
    names merely start with '.git' (for example '.github') are scanned like
    any other directory.
    """
    print("Walking tree rooted at %s..." % gitRoot)
    allFiles = []
    allDirs = [gitRoot]
    for root, subFolders, files in os.walk(gitRoot):
        if root == gitRoot and '.git' in subFolders:
            # Removing the entry from the list os.walk handed us stops the
            # (top-down) walk from visiting git's own metadata at all.
            subFolders.remove('.git')
        allFiles.extend(os.path.join(root, name) for name in files)
        allDirs.extend(os.path.join(root, name) for name in subFolders)
    return (allFiles, allDirs)
# Command-line handling.  The numeric options are declared with type='int' so
# that values given on the command line are parsed as integers; without it
# optparse hands back strings and the xrange()/division below fail.
parser = optparse.OptionParser()
parser.add_option('--numFiles', type='int', help='Num files to create', default=1000000)
parser.add_option('--filesPerDir', type='int', help='Max files per directory', default=1000)
parser.add_option('--baseDir', help='Root of git repo to create')
(options, args) = parser.parse_args()
if options.baseDir is None:
    # parser.error prints the usage text and exits with status 2.
    parser.error("--baseDir is required")
if options.filesPerDir <= 0:
    parser.error("--filesPerDir must be a positive integer")

# Create the directory if needed and make it a git repository.
if not os.path.isdir(options.baseDir):
    os.makedirs(options.baseDir)
subprocess.check_call(["git", "init", options.baseDir])

# Files already present count toward numFiles, so numbering resumes at the
# current file count instead of starting again from zero.
(allFiles, allDirs) = scanFiles(options.baseDir)
print("Generating files....")
for count in xrange(len(allFiles), options.numFiles):
    # Each directory holds filesPerDir consecutive file numbers.
    dirNum = count // options.filesPerDir
    dirName = os.path.join(options.baseDir, "dir-num-%08d" % dirNum)
    if not os.path.isdir(dirName):
        os.mkdir(dirName)
    fileName = os.path.join(dirName, "file-num-%08d" % count)
    with open(fileName, "w") as fh:
        fh.write("some contents for file %s" % fileName)
    if (count % 10000) == 0:
        print("Created file number %d" % count)

# Stage and commit everything from inside the repository.
os.chdir(options.baseDir)
print("Git adding new files")
subprocess.check_call(["git", "add", "."])
print("Git committing new files")
subprocess.check_call(["git", "commit", "-m", "some new files", "--no-status", "--quiet"])
#!/usr/local/bin/python -t
# Originally by Joshua Redstone
#
# Script to generate small, synthetic commits to a git repo and time the
# operations. There's cruft in this script around running gdb or evaluating
# pygit2
#
# run with:
# git reset --hard HEAD && simulate.py . ~/local/git/git
# where '.' is the root of the git repo and the second arg is the path to the
# git binary to use.
#
#
import sys
import os
import random
import subprocess
import time
import math
# Set usePygit to True (and enable the pygit2 import below) to commit through
# pygit2 instead of spawning the git binary.
usePygit = False
#import pygit2

# Default git binary; an optional second command-line argument overrides it.
gitProg = "/usr/bin/git"

if len(sys.argv) < 2:
    sys.stderr.write("usage: %s <gitRoot> [gitProg]\n" % sys.argv[0])
    sys.exit(2)
if len(sys.argv) > 2:
    gitProg = sys.argv[2]
    if os.sep in gitProg:
        # A path (as opposed to a bare command name) is resolved now, while
        # the current directory is still the one the user typed it in.
        gitProg = os.path.abspath(gitProg)

# Enter the repository and record its absolute location.  Keeping the
# relative argument would make later uses of gitRoot resolve against the new
# working directory, which is only correct when the argument is '.'.
os.chdir(sys.argv[1])
gitRoot = os.getcwd()
print("Reading dict...")
# Word list used to build random text: one entry per line of the system
# dictionary, surrounding whitespace stripped and blank lines dropped.
with open('/usr/share/dict/words', 'r') as dictFile:
    words = [entry for entry in (line.strip() for line in dictFile) if entry]
if not words:
    # Picking a random word from an empty list cannot work, so stop here.
    sys.exit("no words found in /usr/share/dict/words")
def genLine(maxLen=75):
    """Build a space-separated string of randomly chosen dictionary words.

    Words are drawn from the module-level ``words`` list and appended until
    the text is at least maxLen characters long, so the result may run past
    maxLen by part of the final word.
    """
    line = ""
    while len(line) < maxLen:
        separator = " " if line else ""
        pick = words[random.randint(0, len(words) - 1)]
        line = line + separator + pick
    return line
# Command file handed to gdb (via --command) by the optional tracing code in
# the commit loop: turn off paging, then dump a backtrace of every thread.
profFile = "/tmp/gdb-poorman"
with open(profFile, "w") as gdbScript:
    gdbScript.writelines([
        "set pagination 0\n",
        "thread apply all bt\n",
    ])
print("Walking tree rooted at %s..." % gitRoot)
# Build up the lists of all files and all directories currently in the repo.
# allDirs starts with the root itself so new files can also land there.
allFiles = []
allDirs = [gitRoot]
for root, subFolders, files in os.walk(gitRoot):
    if root == gitRoot and '.git' in subFolders:
        # Prune git's metadata directory in place: os.walk then never enters
        # it, and look-alike names such as '.github' are no longer skipped.
        subFolders.remove('.git')
    allFiles.extend(os.path.join(root, name) for name in files)
    allDirs.extend(os.path.join(root, name) for name in subFolders)
if usePygit:
    repo = pygit2.Repository(os.path.join(gitRoot, '.git'))

# Duration of the previous commit; only used to pick a random delay before
# attaching gdb when traceWithGdb is enabled.
lastCommitSecs = 1
# Files created by this run.  They join the pool of files eligible for
# modification in later iterations.
newFiles = []
commitCount = 0
# Flip to True to attach gdb to each 'git commit' and dump its backtraces.
traceWithGdb = False


def _clock():
    """Return the current local time formatted as the prefix of a progress line."""
    return time.strftime(" %H:%M:%S", time.localtime())


def _scribbleOnFile(filename):
    """Overwrite some text inside filename, then append 1 to 4 generated strings.

    The overwrite starts at offset 0 for files of at most 80 bytes and at a
    random offset up to size-80 otherwise.  The appended strings are written
    back to back with no newline between them.
    """
    size = os.path.getsize(filename)
    with open(filename, 'r+') as fh:
        if size <= 80:
            fh.seek(0)
        else:
            fh.seek(random.randint(0, size - 80))
        fh.write(genLine()[:75] + "\n")
        # write a few lines at the end
        fh.seek(0, os.SEEK_END)
        for _ in xrange(random.randint(1, 4)):
            fh.write(genLine())


def _createNewFile(path):
    """Create path containing 10 to 15 generated lines, each newline-terminated."""
    with open(path, 'w') as fh:
        for _ in xrange(random.randint(10, 15)):
            fh.write(genLine())
            fh.write("\n")


# Endless loop: each pass modifies a few files, sometimes adds one, stages the
# changes and times the resulting commit.  Stop it with an interrupt.
while True:
    tt = time.time()
    print("%s %0.3f: Bonus commit %5d" % (
        time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(tt)),
        tt - math.floor(tt), commitCount))
    # An optional 'read-tree HEAD' / 'update-index --refresh' at this point
    # slowed things down by roughly 8x, so neither is run.
    print(_clock() + ": Modifying files")
    modifiedFiles = []
    totalNumFiles = len(allFiles) + len(newFiles)
    numFilesToMod = random.randint(1, 5)
    if totalNumFiles == 0:
        # Nothing exists yet to modify (empty repository).
        numFilesToMod = 0
    for i in xrange(numFilesToMod):
        # Pick uniformly from the pre-existing files plus the ones created
        # by earlier iterations.
        fileIdx = random.randint(0, totalNumFiles - 1)
        if fileIdx < len(allFiles):
            filename = allFiles[fileIdx]
        else:
            filename = newFiles[fileIdx - len(allFiles)]
        modifiedFiles.append(filename)
        print(" Modifying file %s" % filename)
        _scribbleOnFile(filename)

    # Every third commit create a new file
    if (commitCount % 3) == 0:
        idx = random.randint(0, len(allDirs) - 1)
        inDir = allDirs[idx]
        newName = os.path.join(
            inDir,
            "new-file-%010d-%010d" % (commitCount, random.randint(0, 10000000)))
        print(" Adding new file %s(%d %s)" % (newName, idx, inDir))
        _createNewFile(newName)
        modifiedFiles.append(newName)
        # Remember the file so later iterations can choose it for modification.
        newFiles.append(newName)

    # Now do the actual commit
    print(_clock() + ": Adding files..")
    if usePygit:
        index = repo.index
        for filen in modifiedFiles:
            # pygit2 wants paths relative to the repository root.
            assert filen.startswith(gitRoot + "/")
            modf = filen[(len(gitRoot) + 1):]
            index.add(modf)
        index.write()
        author = pygit2.Signature('Test', '[email protected]', int(time.time()), 0)
        print(author)
        masterSha = repo.lookup_reference('refs/heads/master')
    else:
        subprocess.check_call([gitProg, "add"] + modifiedFiles)

    msg = genLine(50)
    print(_clock() + ": Committing with msg \"%s\".." % msg)
    starts = time.time()
    if usePygit:
        repo.create_commit("refs/heads/master", author, author, msg,
                           index.create_tree(), [masterSha.oid])
        ends = time.time()
    else:
        comm = [gitProg, "commit", "--no-status", "-m", msg]
        proc = subprocess.Popen(comm)
        pid = proc.pid
        if traceWithGdb:
            # Let the commit run for a random fraction of the previous
            # commit's duration, then attach gdb to capture backtraces.
            time.sleep(random.uniform(0.1, lastCommitSecs))
            print("popen")
            traceProc = subprocess.Popen(["gdb", "--command=" + profFile, "-batch",
                                          "--pid=%d" % (pid), "-l", "2", gitProg])
            print("About to wait for tracepid %d of pid %d" % (traceProc.pid, pid))
        # Wait outside of any assert so the commit is always reaped and timed,
        # even when Python runs with assertions disabled (-O).
        returnCode = proc.wait()
        if returnCode != 0:
            raise subprocess.CalledProcessError(returnCode, comm)
        ends = time.time()
    lastCommitSecs = ends - starts
    print(" Commit took %.3f seconds. Whole op took %.3f seconds" % (ends - starts,
                                                                     ends - tt))
    commitCount += 1
print("Should never get here")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment