Last active
June 30, 2022 17:19
-
-
Save lstrojny/6d29aea45179668725f43650fa46c4e7 to your computer and use it in GitHub Desktop.
Use git-filter-repo to import lfs objects
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
from os import path | |
import shutil | |
import subprocess | |
from functools import lru_cache | |
import re | |
from fnmatch import translate | |
from hashlib import sha256 | |
from tempfile import NamedTemporaryFile | |
from collections.abc import Iterable | |
from vendor import git_filter_repo as fr | |
def chunked(size: int, chunk_size: int) -> Iterable[int]:
    """Yield the lengths of consecutive chunks that together cover ``size``.

    Every yielded value equals ``chunk_size`` except possibly the last one,
    which is whatever remains. Nothing is yielded when ``size`` is zero or
    negative.

    Raises:
        ValueError: if ``chunk_size`` is not positive (raised on first
            iteration, since this is a generator). Without this check the
            loop would never terminate because ``remaining`` could not shrink.
    """
    if chunk_size <= 0:
        raise ValueError('chunk_size must be positive, got {}'.format(chunk_size))
    remaining = size
    while remaining > 0:
        yield min(remaining, chunk_size)
        remaining -= chunk_size
def git(commands: list[str], dir: str = None) -> None:
    """Invoke ``git`` with the given arguments.

    Args:
        commands: Arguments placed after ``git`` (and after ``-C <dir>`` when
            a directory is given).
        dir: Optional directory handed to git through ``-C``.

    The command is run through ``run_command`` with ``GIT_LFS_SKIP_SMUDGE=1``
    added to the environment.
    """
    prefix = ['git'] if dir is None else ['git', '-C', dir]
    run_command([*prefix, *commands], env={'GIT_LFS_SKIP_SMUDGE': '1'})
def run_command(args: list[str], env=None, cwd=None) -> None:
    """Run an external command and fail loudly if it exits non-zero.

    Args:
        args: Program and arguments, passed to ``subprocess.run`` as a list.
        env: Optional extra environment variables; they are layered on top of
            the current process environment and win on conflicts.
        cwd: Optional working directory for the child process.

    Raises:
        subprocess.CalledProcessError: if the command returns a non-zero
            exit status (``check=True``).
    """
    child_env = dict(os.environ)
    if env:
        child_env.update(env)
    subprocess.run(args, check=True, env=child_env, cwd=cwd)
def _read_lfs_patterns(gitattributes_path: str) -> tuple[str, ...]:
    """Return the path patterns a .gitattributes file marks with ``filter=lfs``.

    Each line is split on arbitrary whitespace (spaces or tabs); the first
    field is the pattern and the remaining fields are attributes. Blank lines
    and ``#`` comment lines are skipped.
    """
    patterns = []
    with open(gitattributes_path) as gitattributes:
        for line in gitattributes:
            fields = line.split()
            if not fields or fields[0].startswith('#'):
                continue
            if 'filter=lfs' in fields[1:]:
                patterns.append(fields[0])
    return tuple(patterns)


def lfs_import(source_dir, target_dir) -> None:
    """Rewrite ``source_dir`` into ``target_dir``, turning matching blobs into LFS pointers.

    A new repository with branch ``prod`` is initialised at ``target_dir`` and
    git-filter-repo replays the ``prod`` ref of ``source_dir`` into it. For
    every changed file whose name matches a ``filter=lfs`` pattern from the
    source's ``.gitattributes``:

    * if the blob already starts with the LFS pointer header, the object named
      by its ``oid sha256`` line is copied from the source's
      ``.git/lfs/objects`` tree into the target's;
    * otherwise the blob content is written to the target's
      ``.git/lfs/objects`` tree under its SHA-256 and the blob is replaced by
      a freshly generated pointer file.

    Args:
        source_dir: Path of the existing repository to read from.
        target_dir: Path where the rewritten repository is created.

    Raises:
        RuntimeError: if the ``git cat-file --batch`` stream is not terminated
            by a newline after an object's content.
        subprocess.CalledProcessError: if ``git init`` fails.
    """
    # Resolve both paths up front: the working directory changes below, and a
    # relative path would otherwise be re-interpreted relative to target_dir.
    source_dir = path.abspath(source_dir)
    target_dir = path.abspath(target_dir)

    git(['init', '-b', 'prod', target_dir])
    cwd = os.getcwd()
    try:
        os.chdir(target_dir)
        glob_expressions = _read_lfs_patterns(path.join(source_dir, '.gitattributes'))
        options = [
            '--source', source_dir,
            '--target', target_dir,
            '--quiet',
            '--refs', 'prod',
        ]
        lfs_pointer_header = b'version https://git-lfs.github.com/spec/v1'
        lfs_object_content = (
            'version https://git-lfs.github.com/spec/v1\n'
            'oid sha256:{}\n'
            'size {}\n'
        )
        # Maps an original blob id to the id of the pointer blob replacing it,
        # so each blob is read and converted only once.
        migrated_lfs_blobs = {}
        lfs_object_path = '.git/lfs/objects'

        # The context manager closes the pipes and waits for the child even
        # when the filter run raises.
        with subprocess.Popen(['git', '-C', source_dir, 'cat-file', '--batch'],
                              stdin=subprocess.PIPE,
                              stdout=subprocess.PIPE) as cat_file_proc:

            def migrate_lfs_commits(commit: fr.Commit, metadata: dict[str, object]):
                change: fr.FileChange
                for change in commit.file_changes:
                    if change.type == b'D':
                        continue  # deleted files have no remaining content to filter
                    if change.mode in (b'120000', b'160000'):
                        continue  # symlinks and submodules aren't text files we can filter
                    if change.blob_id in migrated_lfs_blobs:
                        change.blob_id = migrated_lfs_blobs[change.blob_id]
                        continue
                    if not match_expressions(change.filename.decode(), glob_expressions):
                        continue

                    # Ask the long-running cat-file process for the blob.
                    orig_rev = fr.ID_TO_HASH[change.blob_id]
                    cat_file_proc.stdin.write(orig_rev + b'\n')
                    cat_file_proc.stdin.flush()
                    _, _, objsize = cat_file_proc.stdout.readline().split()
                    remaining = int(objsize)
                    checksum = sha256()
                    migrate = True
                    with NamedTemporaryFile() as tmp:
                        for position, chunk in enumerate(chunked(remaining, 2 ** 13)):
                            bytes_chunk = cat_file_proc.stdout.read(chunk)
                            # Maybe it already is an LFS object
                            if position == 0 and bytes_chunk.startswith(lfs_pointer_header):
                                migrate = False
                            checksum.update(bytes_chunk)
                            tmp.write(bytes_chunk)

                        # The newline after the content must always be consumed,
                        # otherwise the next header read is off by one byte. This
                        # must not be an assert: asserts vanish under ``python -O``.
                        terminator = cat_file_proc.stdout.read(1)
                        if terminator != b'\n':
                            raise RuntimeError(
                                'unexpected byte {!r} after object {!r} in cat-file output'.format(
                                    terminator, orig_rev))

                        if not migrate:
                            # Already a pointer: keep the blob and carry over the
                            # LFS object it refers to.
                            tmp.seek(0)
                            for lfs_line in tmp:
                                if lfs_line.startswith(b'oid sha256'):
                                    sha256_checksum = lfs_line.strip()[11:].decode()
                                    sub_folder = path.join(lfs_object_path, sha256_checksum[0:2],
                                                           sha256_checksum[2:4])
                                    os.makedirs(path.join(target_dir, sub_folder), exist_ok=True)
                                    source = path.join(source_dir, sub_folder, sha256_checksum)
                                    shutil.copy(source, path.join(target_dir, sub_folder, sha256_checksum))
                                    print('LFS {}: "{}" preserved'.format(change.blob_id,
                                                                          change.filename.decode()))
                                    break
                            continue

                        # Make sure everything is on disk before copying by name.
                        tmp.flush()
                        sha256_checksum = checksum.hexdigest()
                        content = lfs_object_content.format(sha256_checksum, objsize.decode()).encode()
                        lfs_blob = fr.Blob(content)
                        repo_filter.insert(lfs_blob)
                        migrated_lfs_blobs[change.blob_id] = lfs_blob.id
                        change.blob_id = lfs_blob.id
                        sub_folder = path.join(target_dir, lfs_object_path, sha256_checksum[0:2],
                                               sha256_checksum[2:4])
                        os.makedirs(sub_folder, exist_ok=True)
                        shutil.copy(tmp.name, path.join(sub_folder, sha256_checksum))
                        print('LFS {}: "{}" imported ({}, {}, {}, {})'.format(change.blob_id,
                                                                              change.filename.decode(),
                                                                              sha256_checksum, orig_rev,
                                                                              objsize, remaining))

            # The callback refers to repo_filter by name; it is bound here,
            # before run() triggers the first callback invocation.
            repo_filter = fr.RepoFilter(fr.FilteringOptions.parse_args(options),
                                        commit_callback=migrate_lfs_commits)
            repo_filter.run()
    finally:
        os.chdir(cwd)
@lru_cache
def compile_expressions(expressions: tuple[str, ...]):
    """Compile glob expressions into a single matcher function.

    Args:
        expressions: Glob patterns in ``fnmatch`` syntax. Must be a tuple so
            the result can be cached.

    Returns:
        The bound ``match`` method of one compiled regular expression that
        accepts a name if any of the globs accepts it. With no expressions
        the returned matcher rejects every name.
    """
    if not expressions:
        # Joining zero alternatives would give '()', which matches *every*
        # name; an empty negative lookahead never matches instead.
        return re.compile('(?!)').match
    regexes = [translate(expression) for expression in expressions]
    return re.compile('({})'.format('|'.join(regexes))).match
@lru_cache
def match_expressions(name: str, expressions: tuple[str]) -> bool:
    """Tell whether a repository path is covered by any of the glob expressions.

    The path is tested suffix by suffix: first the file name alone, then the
    last two components, and so on up to the full path. The first suffix the
    compiled matcher accepts makes the result ``True``.

    Args:
        name: Path inside the repository, with ``/`` as separator.
        expressions: Glob patterns; must be hashable for caching.
    """
    match = compile_expressions(tuple(expressions))
    # This implementation is not fully compliant with gitattributes but good enough for my use case
    components = name.split('/')
    for start in range(len(components) - 1, -1, -1):
        # Join with '/' rather than os.path.join: git paths and gitattributes
        # patterns always use forward slashes, whatever the host OS separator.
        if match('/'.join(components[start:])):
            return True
    return False
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment