Last active
April 19, 2024 13:44
-
-
Save sueszli/c8bd7ec5d821e281be9cabcf2fa51fef to your computer and use it in GitHub Desktop.
bypassing github storage service
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import hashlib | |
import sys | |
import pathlib | |
import subprocess | |
""" | |
github commits are restricted to 25-50 MiB, varying based on the push method [^1].
to handle files beyond this limit, git lfs (large file storage) pointers are necessary, referencing an external lfs server [^2].
however, this method incurs a monthly cloud storage fee to github [^3].

this is a failed attempt at bypassing the file size limit by committing a large file in small chunks.
it fails because github checks the size of the resulting file, not the size of each individual commit:

> remote: warning: File huge-ass-file.tar is 60.00 MB; this is larger than GitHub's recommended maximum file size of 50.00 MB
> remote: error: Trace: 2fa983a46f7b5205ea9bbef6e118069f7426f07618935e67ed6225df9647d617
> remote: error: See https://gh.io/lfs for more information.
> ...
> remote: error: File huge-ass-file.tar is 150.00 MB; this exceeds GitHub's file size limit of 100.00 MB
> remote: error: File huge-ass-file.tar is 200.00 MB; this exceeds GitHub's file size limit of 100.00 MB

[^1]: docs: https://docs.github.com/en/repositories/working-with-files/managing-large-files/about-large-files-on-github#file-size-limits
[^2]: nice comment: wokwokwok, 2021 on hackernews, https://news.ycombinator.com/item?id=27134972#:~:text=of%20such%20projects-,wokwokwok,-on%20May%2013
[^3]: https://docs.github.com/en/billing/managing-billing-for-git-large-file-storage/about-billing-for-git-large-file-storage
""" | |
def assert_matching_checksums(filepath1: pathlib.Path, filepath2: pathlib.Path) -> None:
    """Verify that two files have identical MD5 digests.

    Each file is hashed in fixed-size pieces instead of being loaded whole,
    so memory use stays flat no matter how large the files are.

    Args:
        filepath1: path of the first file (``str`` is accepted as well).
        filepath2: path of the second file (``str`` is accepted as well).

    Raises:
        AssertionError: if the two digests differ.
    """

    def md5_of(path: pathlib.Path) -> str:
        # Stream the file through the hash one piece at a time.
        digest = hashlib.md5()
        with open(path, "rb") as stream:
            for piece in iter(lambda: stream.read(1024 * 1024), b""):
                digest.update(piece)
        return digest.hexdigest()

    print("verifying checksums...")
    checksum1 = md5_of(pathlib.Path(filepath1))
    checksum2 = md5_of(pathlib.Path(filepath2))
    if checksum1 != checksum2:
        # Raised explicitly (not via `assert`) so the check survives `python -O`.
        raise AssertionError(f"checksums do not match: {checksum1} != {checksum2}")
    print(f"checksums match: {checksum1} == {checksum2}")
def assert_matching_filesizes(filepath1: pathlib.Path, filepath2: pathlib.Path) -> None:
    """Verify that two files have the same size in bytes.

    Args:
        filepath1: path of the first file (``str`` is accepted as well).
        filepath2: path of the second file (``str`` is accepted as well).

    Raises:
        AssertionError: if the two sizes differ.
    """
    print("verifying file sizes...")
    # Coerce to Path so plain string paths work here too.
    filesize1 = pathlib.Path(filepath1).stat().st_size
    filesize2 = pathlib.Path(filepath2).stat().st_size
    if filesize1 != filesize2:
        # Raised explicitly (not via `assert`) so the check survives `python -O`.
        raise AssertionError(f"file sizes do not match: {filesize1} != {filesize2}")
    print(f"file sizes match: {filesize1} == {filesize2}")
if __name__ == "__main__":
    # Copies the file given as the only command-line argument into the current
    # directory (which must be the root of a git repository) in 30 MiB pieces,
    # committing and pushing after every piece, then verifies the local copy
    # against the source.
    if len(sys.argv) != 2:
        sys.exit(f"usage: {sys.argv[0]} <path-to-file>")
    file = pathlib.Path(sys.argv[1])
    destination = pathlib.Path(file.name)

    # Validate with explicit exits rather than `assert`, which `python -O` strips.
    if not pathlib.Path(".git").exists():
        sys.exit("put this script inside the git directory you want to copy the file to")
    if not file.exists():
        sys.exit(f"file does not exist: {file}")
    if not file.is_file():
        sys.exit(f"not a file: {file}")
    if (file.parent / ".git").exists():
        sys.exit(f"{file} should not be in a .git directory")
    if destination.resolve() == file.resolve():
        # Truncating the destination below would wipe the source file.
        sys.exit(f"{file} resolves to the copy destination in this directory")

    filesize = file.stat().st_size
    print(f"{file.name} size: {filesize}")

    print("copying and committing chunks to github...")
    chunk_size = 30 * 1024 * 1024
    # Ceiling division: an exact multiple of chunk_size needs no extra chunk.
    num_chunks = -(-filesize // chunk_size)

    # Start from an empty destination so reruns do not append to stale data.
    with open(destination, "wb"):
        pass

    # The source is opened once and read sequentially.
    with open(file, "rb") as source:
        for i in range(1, num_chunks + 1):
            chunk = source.read(chunk_size)
            if not chunk:
                # Only reachable if the source shrank after its size was taken.
                print(f"no more chunks to read at iteration {i}")
                break

            # append to file in this directory
            with open(destination, "ab") as g:
                g.write(chunk)

            # push to github; check=True aborts on the first failing git
            # command instead of carrying on and reporting success
            subprocess.run(["git", "add", "--", file.name], check=True)
            subprocess.run(
                ["git", "commit", "-m", f"git lfs exploit auto commit: {file.name} - {i}/{num_chunks}"],
                check=True,
            )
            subprocess.run(["git", "push"], check=True)
            print(f"\033[92mprogress: {i}/{num_chunks} \033[0m")

    assert_matching_checksums(file, destination)
    assert_matching_filesizes(file, destination)
    print(f"finished! {file.name} pushed to github")
for some reason, with the version above the checksums don't match and the file ends up corrupted.
here's a much nicer version:
split:
#!/usr/bin/env bash
# split: copy a large file, cut it into 50 MiB chunks and push every chunk to
# the current git repository in its own commit.
#
# usage:    run from the repository root with the file path as the only argument
# requires: a .gitignore that lists tmp/ and data-merged/
# result:   data/ holds <name>.md5 and the chunks <name>-chunk-aa, -ab, ...
#
# All expansions are quoted so paths containing spaces or glob characters work,
# and every step that can fail aborts the script instead of reporting success.

file_path=${1:-}
if [ -z "$file_path" ]; then echo "file path not given"; exit 1; fi
if [ ! -f "$file_path" ]; then echo "file not found"; exit 1; fi
if [ ! -s "$file_path" ]; then echo "file is empty"; exit 1; fi
echo "file found: $file_path"
file_name=$(basename -- "$file_path")

# validate .gitignore
if [ ! -f .gitignore ]; then echo ".gitignore not found"; exit 1; fi
if ! grep -q "tmp/" .gitignore; then echo "tmp/ not in .gitignore"; exit 1; fi
if ! grep -q "data-merged/" .gitignore; then echo "data-merged/ not in .gitignore"; exit 1; fi

# create tmp directory
rm -rf tmp
mkdir tmp || { echo "failed to create tmp directory"; exit 1; }
echo "created tmp directory"

# copy file to tmp directory
cp -- "$file_path" tmp/ || { echo "failed to copy $file_path to tmp directory"; exit 1; }
echo "copied $file_path to tmp directory"

# create checksum file
checksum=$(md5sum "tmp/${file_name}" | awk '{ print $1 }')
if [ -z "$checksum" ]; then echo "failed to compute checksum"; exit 1; fi
echo "$checksum" > "tmp/${file_name}.md5"
echo "created checksum file: ${file_name}.md5"

# split file into chunks in tmp directory
chunk_size=$((50 * 1024 * 1024))
split -b "$chunk_size" "tmp/${file_name}" "tmp/${file_name}-chunk-" || { echo "failed to split $file_name"; exit 1; }

# create data directory
rm -rf data
mkdir data || { echo "failed to create data directory"; exit 1; }
echo "created data directory"

# copy checksum
mv -- "tmp/${file_name}.md5" data/ || { echo "failed to move checksum file"; exit 1; }

# iterate over chunks, push to git
# (the glob is expanded into an array once, instead of parsing `ls` output)
chunks=(tmp/"${file_name}"-chunk-*)
num_chunks=${#chunks[@]}
counter=0
for chunk in "${chunks[@]}"; do
    counter=$((counter + 1))
    progress_str="$counter/$num_chunks"
    chunk_name=$(basename -- "$chunk")

    mv -- "$chunk" data/ || { echo "failed to move $chunk_name"; exit 1; }
    git add . || { echo "git add failed for $chunk_name"; exit 1; }
    # `git commit` exits non-zero when nothing is staged (e.g. an identical rerun);
    # that case is not an error, so only commit when there is something to record
    if git diff --cached --quiet; then
        echo "nothing new to commit for $chunk_name"
    else
        git commit -m "auto commit: $chunk_name $progress_str" || { echo "git commit failed for $chunk_name"; exit 1; }
    fi
    git push || { echo "git push failed for $chunk_name"; exit 1; }
    echo "🟢 pushed $chunk_name $progress_str"
done

echo "🟢 done"
exit 0
merge:
#!/usr/bin/env bash
# merge: concatenate the chunks in data/ into data-merged/merged.tar and verify
# the result against the MD5 checksum stored next to the chunks.
#
# usage:    run from the repository root, no arguments
# requires: a .gitignore that lists tmp/ and data-merged/, and a data/ directory
#           holding the *-chunk-* files plus exactly one *.md5 file

# validate .gitignore
if [ ! -f .gitignore ]; then echo ".gitignore not found"; exit 1; fi
if ! grep -q "tmp/" .gitignore; then echo "tmp/ not in .gitignore"; exit 1; fi
if ! grep -q "data-merged/" .gitignore; then echo "data-merged/ not in .gitignore"; exit 1; fi

# validate ./data/* files
if [ ! -d data ]; then echo "data/ directory not found"; exit 1; fi
chunks=(data/*-chunk-*)
checksum_files=(data/*.md5)
# an unmatched glob stays literal, so test that the first entry really exists;
# fail when EITHER the chunks OR the checksum file are missing
if [ ! -e "${chunks[0]}" ] || [ ! -e "${checksum_files[0]}" ]; then echo "invalid files found in data/"; exit 1; fi
# several checksum files would be concatenated into a meaningless expected value
if [ "${#checksum_files[@]}" -ne 1 ]; then echo "expected exactly one .md5 file in data/"; exit 1; fi

# create data-merged directory
rm -rf data-merged
mkdir data-merged || { echo "failed to create data-merged directory"; exit 1; }
echo "created data-merged directory"

# merge chunks into data-merged directory
# (glob expansion is sorted, which matches the aa, ab, ... suffix order)
cat -- "${chunks[@]}" > data-merged/merged.tar || { echo "failed to merge chunks"; exit 1; }
echo "merged chunks into data-merged/merged.tar"

# validate checksum
expected_checksum=$(cat -- "${checksum_files[0]}")
actual_checksum=$(md5sum data-merged/merged.tar | awk '{ print $1 }')
# quoted operands: an empty or multi-word value must count as a mismatch,
# not as a `[` syntax error that falls through to the success message
if [ "$expected_checksum" != "$actual_checksum" ]; then echo "checksum mismatch"; exit 1; fi
echo "checksum matched: $expected_checksum == $actual_checksum"

echo "🟢 done"
exit 0
update: the script above worked just fine, and it was surprisingly easy to set up.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
here's what i meant: