Skip to content

Instantly share code, notes, and snippets.

@larsxschneider
Last active October 8, 2016 13:09
Show Gist options
  • Save larsxschneider/b587862d1044e42d9935b7646f6536a2 to your computer and use it in GitHub Desktop.
Save larsxschneider/b587862d1044e42d9935b7646f6536a2 to your computer and use it in GitHub Desktop.
Scripts to help migrate source code to Git / Git LFS
#!/usr/bin/env python
#
# Script to identify extensions in a Git repository that should go to LFS
#
import os
# Root of the scan: the directory the script is started from.
cwd = os.getcwd()
# Size threshold in MB; add_file() counts files strictly larger than this
# as "large".
min_size_mb = 0.5
# Statistics collected by add_file(): maps a label (e.g. "bin - zip") to a
# dict of counters and sizes for the files recorded under that label.
result = {}
def is_binary(filename):
    """Return True if the given file looks binary, i.e. contains a NUL byte.

    The file is read in chunks of 1024 bytes and scanning stops at the first
    NUL byte found.

    @param filename: path of the file to inspect.
    @return: True if a NUL byte was found, False otherwise (an empty file is
        reported as not binary).
    @raise EnvironmentError: if the file does not exist or cannot be accessed.
    @attention: found @ http://bytes.com/topic/python/answers/21222-determine-file-type-binary-text on 6/08/2010
    @author: Trent Mick <[email protected]>
    @author: Jorge Orpinel <[email protected]>
    """
    CHUNKSIZE = 1024
    with open(filename, 'rb') as fin:
        while True:
            chunk = fin.read(CHUNKSIZE)
            # The file is opened in binary mode, so the needle must be a
            # bytes literal: a str needle raises TypeError on Python 3.
            if b'\0' in chunk:
                return True
            if len(chunk) < CHUNKSIZE:
                # Short (or empty) read: end of file reached without a NUL.
                return False
def add_file(ext, size_mb):
    """Record one file of `size_mb` megabytes under the label `ext`.

    Updates the module-level `result` dict in place: the overall count and
    size always grow, the "large" count and size grow only when the file is
    bigger than `min_size_mb`, and the smallest/largest size seen for the
    label are tracked under 'min' and 'max'.
    """
    stats = result.setdefault(
        ext,
        {'count_large': 0, 'size_large': 0, 'count_all': 0, 'size_all': 0}
    )

    stats['count_all'] += 1
    stats['size_all'] += size_mb

    if size_mb > min_size_mb:
        stats['count_large'] += 1
        stats['size_large'] += size_mb

    # The first file recorded for a label initialises both extremes.
    if 'max' not in stats or size_mb > stats['max']:
        stats['max'] = size_mb
    if 'min' not in stats or size_mb < stats['min']:
        stats['min'] = size_mb
def print_line(ext, share_large, count_large, count_all, size_all, min, max):
    """Print one row of the report table to stdout.

    `ext` is left-aligned in a 30-character column; every other value is
    converted with str() and right-aligned in a 10-character column. Values
    wider than their column are printed in full, not truncated.

    The parameter names `min` and `max` shadow the builtins; they are kept
    because they are part of the function's signature.
    """
    cells = [ext.ljust(30)]
    for value in (share_large, count_large, count_all, size_all, min, max):
        cells.append(str(value).rjust(10))
    # Single-argument print(...) emits the same text on Python 2 and 3,
    # unlike the Python-2-only print statement.
    print(''.join(cells))
# Walk the tree below `cwd` and record every non-empty file twice: once under
# the '** all **' label and once under its "<bin|txt> - <extension>" label.
git_dir = os.path.join(cwd, '.git')
for root, dirs, files in os.walk(cwd):
    if root == cwd:
        # Prune only the repository's own .git directory. A plain prefix test
        # on the path would also drop .gitignore, .gitattributes, .github/...
        dirs[:] = [d for d in dirs if d != '.git']
    for basename in files:
        filename = os.path.join(root, basename)
        if filename == git_dir:
            # `.git` can be a regular file (worktree/submodule checkout).
            continue
        try:
            size_mb = float(os.path.getsize(filename)) / 1024 / 1024
            if size_mb <= 0:
                continue  # empty files carry no information for LFS
            file_type = "bin" if is_binary(filename) else "txt"
            add_file('** all **', size_mb)
            # Derive the extension from the file name only, so that dots in
            # directory names cannot leak into the label, and use the last
            # dot so that "a.tar.gz" is counted once, as "gz".
            _, dot, ext = basename.rpartition('.')
            if dot:
                add_file(file_type + " - " + ext, size_mb)
        except Exception as e:
            # Best effort: report unreadable files and keep scanning.
            print(e)
# Report: one row per label that has at least one "large" file, ordered by
# the total size of its large files (biggest first).
print_line('Extension', 'LShare', 'LCount', 'Count', 'Size', 'Min', 'Max')
print_line('-------', '-------', '-------', '-------', '-------', '-------', '-------')
for ext in sorted(result, key=lambda x: result[x]['size_large'], reverse=True):
    stats = result[ext]
    if stats['count_large'] > 0:
        # Floor division keeps the share a whole-number percentage on both
        # Python 2 and Python 3 ('/' would yield a float on Python 3).
        large_share = 100 * stats['count_large'] // stats['count_all']
        print_line(
            ext,
            str(large_share) + ' %',
            stats['count_large'],
            stats['count_all'],
            int(stats['size_all']),
            int(stats['min']),
            int(stats['max'])
        )
#!/bin/bash
#
# Script that searches the entire history of a Git repository for
# (potentially) unwanted directories. E.g. 3rd party directories,
# temp, build or Perforce stream directories.
# The script also prints the number of files under each directory
# to see the impact on the Git tree. Directories that no longer exist
# in the working tree are listed as deleted with a count of 99999.
#

# `git log --name-only` prints paths relative to the repository root, so the
# existence checks below only work from there. Switch to the root when it can
# be determined; otherwise keep running from the current directory.
if REPO_ROOT=$(git rev-parse --show-toplevel 2>/dev/null) && [ -n "$REPO_ROOT" ]; then
    cd "$REPO_ROOT" || exit 1
fi

# core.quotepath=off makes git print non-ASCII path names verbatim instead of
# as quoted octal escapes, which would never match a path on disk.
git -c core.quotepath=off log --all --name-only --pretty=format: \
    | awk -F'[^/]*$' '{print $1}' \
    | sort -u \
    | grep -i \
        -e 3p \
        -e 3rd \
        -e artifacts \
        -e assemblies \
        -e backup \
        -e bin \
        -e build \
        -e components \
        -e debug \
        -e deploy \
        -e generated \
        -e install \
        -e lib \
        -e modules \
        -e obj \
        -e output \
        -e packages \
        -e party \
        -e recycle.bin \
        -e release \
        -e resources \
        -e streams \
        -e temp \
        -e third \
        -e tmp \
        -e tools \
        -e util \
        -e vendor \
        -e x64 \
        -e x86 \
    | while IFS= read -r dir; do
        # Reading line by line (instead of expanding an unquoted variable)
        # keeps directory names containing glob characters intact.
        if [ -e "$dir" ]; then
            # "./" guards against names starting with "-"; tr strips the
            # padding some wc implementations put in front of the count.
            FILE_COUNT=$(find "./$dir" -type f | wc -l | tr -d '[:space:]')
            echo "$FILE_COUNT $dir"
        else
            echo "99999 !! DELETED !! $dir"
        fi
    done | sort -n
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment