Synchronizes a local directory of gzipped content with the site output directory, so that synchronization with an Amazon S3 bucket skips files that have not changed even though the MD5 sums of freshly gzipped copies would differ.
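One reason freshly gzipped files get new MD5 sums even when their content is unchanged is that the gzip format stores a modification timestamp in its header, so compressing identical input twice produces different bytes. A minimal standalone sketch of this effect (not part of the script below; the sample content is illustrative):

import gzip, hashlib, time
from io import BytesIO

def gz(data):
    buf = BytesIO()
    # GzipFile writes the current time into the gzip header's MTIME field
    with gzip.GzipFile(fileobj=buf, mode='wb') as f:
        f.write(data)
    return buf.getvalue()

first = gz(b'unchanged content')
time.sleep(1)
second = gz(b'unchanged content')
print(hashlib.md5(first).hexdigest() == hashlib.md5(second).hexdigest())  # False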
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
AWS S3 Gzip compression utility
Author: Dmitriy Sukharev
Modified: 2013-09-11
-------
Synchronizes a local directory of gzipped content with the site output
directory to avoid redundant Amazon S3 synchronization requests when files
have not changed but the MD5 sums of the gzipped files differ. This script is
part of the article
http://sukharevd.net/gzipping-website-in-amazon-s3-bucket.html
'''
'''
Algorithm:
Precondition: the previously compressed publication is in the publication
directory, and its SHA-512 sums are stored in the SHA512SUM file there.
1. Read SHA512SUM into a dictionary.
2. For each file in the output directory:
   -- If its SHA-512 sum differs or is missing from the dictionary, update
      the publication directory with a gzipped version of the file.
3. Rewrite SHA512SUM.
'''
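# The SHA512SUM digest file read and rewritten below is plain text with one
# whitespace-separated entry per line, "<sha512 hex digest> <relative path>".
# Illustrative example (digests shortened; file names are hypothetical):
#
#   3f28b1c6...9a0d index.html
#   b109f3bb...61f2 css/style.css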
import os
import sys
import gzip
import hashlib
import shutil

if len(sys.argv) != 3:
    print 'Command should have 2 arguments: output dir and publication dir'
    sys.exit(1)

OUTPUT_DIR = sys.argv[1]
PUBLICATION_DIR = sys.argv[2]
HASH_SUM_FILE = os.path.join(sys.argv[2], 'SHA512SUM')
GZIPPED_EXTENSIONS = ('html', 'js', 'css', 'xml')


def read_hash_codes(filename):
    '''Reads the digest file into a {relative path: sha512 hex digest} dict.'''
    hashes = {}
    try:
        with open(filename) as digest_file:
            for line in digest_file:
                parts = line.split()
                assert len(parts) == 2
                hashes[parts[1]] = parts[0]
        return hashes
    except (IOError, AssertionError):
        # Missing or malformed digest file: treat every file as changed.
        return {}


def update_gzipped_publications(output_dir, publication_dir):
    '''Gzips changed compressible files into publication_dir; copies the rest.

    Uses the module-level hashes dictionary to detect changes.
    '''
    for root, _, files in os.walk(output_dir):
        for name in files:
            filename = os.path.join(root, name)
            relpath = os.path.relpath(filename, output_dir)
            if relpath.endswith(GZIPPED_EXTENSIONS):
                # Reads the whole file into memory; can be a problem for big files.
                with open(filename) as source:
                    current_hash = hashlib.sha512(source.read()).hexdigest()
                if hashes.get(relpath) != current_hash:
                    # Content is new or changed: re-gzip it into the publication dir.
                    publicated_file = os.path.join(publication_dir, relpath)
                    directory_of_file = os.path.dirname(publicated_file)
                    if not os.path.exists(directory_of_file):
                        os.makedirs(directory_of_file)
                    with gzip.open(publicated_file, 'w') as fw:
                        with open(filename) as fr:
                            blocksize = 65536
                            buf = fr.read(blocksize)
                            while len(buf) > 0:
                                fw.write(buf)
                                buf = fr.read(blocksize)
                    hashes[relpath] = current_hash
                    print filename + ' renewed'
            else:
                # Non-compressible files are copied as-is on every run.
                publicated_file = os.path.join(publication_dir, relpath)
                directory_of_file = os.path.dirname(publicated_file)
                if not os.path.exists(directory_of_file):
                    os.makedirs(directory_of_file)
                shutil.copy(filename, directory_of_file)


def rewrite_hash_codes(hash_sum_file, hashes):
    '''Writes the digest file back, one "<digest> <relative path>" line per file.'''
    with open(hash_sum_file, 'w') as fw:
        for key in hashes:
            fw.write(hashes[key] + ' ' + key + '\n')


hashes = read_hash_codes(os.path.abspath(HASH_SUM_FILE))
update_gzipped_publications(os.path.abspath(OUTPUT_DIR), os.path.abspath(PUBLICATION_DIR))
rewrite_hash_codes(os.path.abspath(HASH_SUM_FILE), hashes)
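A minimal usage sketch, assuming the script is saved as s3gzip.py, the generated site lives in output/, and gzipped copies are kept in publication/ (all three names, and the bucket name, are hypothetical). Note that the gzipped objects still need a Content-Encoding: gzip header when they are served from S3; this script does not set it.

python s3gzip.py output publication
s3cmd sync publication/ s3://example-bucket/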
@sukharevd This is a version that works with Python 3, plus a few pylint fixes.