Last active
November 17, 2022 00:50
-
-
Save 123andy/ade63284496877e1ad40eb5f03265ad2 to your computer and use it in GitHub Desktop.
A Plugin for Gzipping old Edocs for local EDOC storage on REDCap
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/** | |
This is a plugin that was used to gzip old edocs that were created prior to when REDCap introduced Gzip. | |
Which was with version 6 in 2014! So, if you've only been REDCapping for a while, then this might not | |
have that much importance. If, like us, you've got an old instance with lots of old files, this could save | |
you some substantial disk space | |
Compressing old .csv, .r, .xls, .sas exports can free up about 90% of their previous storage space. | |
On our server, prior to gzipping all our older exports, we were using 28 GB for older files. After running this | |
plugin over and over again in batches of 10,000 files, I was able to compress some 157,618 previously uncompressed | |
edoc files and go from around 28GB to 3GB for all these files. This is still a small percentage of our total | |
edocs of 1.1TB... So, no miracles here but at lease it is consistent. | |
*/ | |
require_once "../redcap_connect.php"; | |
// Only allowed projects or superuser | |
if (!SUPER_USER) { | |
die("superuser required"); | |
} | |
class EdocZipper { | |
public $conn; | |
const ZIPPABLE_SUFFIXES = ['csv','sas','sps','do','r','xml']; | |
public $errors = []; | |
public $missing_files = []; | |
public $deleted_files = []; | |
public $size_change = []; | |
public $archive_path; | |
public $last_doc_id; | |
// Find docs that are marked a server_side deleted but are still present | |
public function getDeletedEdocs() { | |
$sql = "select * from redcap_edocs_metadata rem where date_deleted_server is not null"; | |
$q = db_query($sql); | |
$should_be_deleted_files = []; | |
$i=0; | |
while ($row = db_fetch_assoc($q)) { | |
$i++; | |
$stored_name = $row['stored_name']; | |
if (file_exists(EDOC_PATH . $stored_name)) { | |
$should_be_deleted_files[] = "rm -f " . EDOC_PATH . $stored_name; | |
} | |
} | |
echo "\nScanned $i files and found " . count($should_be_deleted_files) . " that are still present"; | |
echo "<pre>" . implode("\n",$should_be_deleted_files) . "</pre>"; | |
} | |
// Find docs that are marked for deletion but not marked as deleted | |
public function getDeletedEdocs2() { | |
$sql = "select * from redcap_edocs_metadata rem where delete_date is not null and date_deleted_server is null"; | |
$q = db_query($sql); | |
$should_be_deleted_files = []; | |
$i=0; | |
while ($row = db_fetch_assoc($q)) { | |
$i++; | |
$stored_name = $row['stored_name']; | |
if (file_exists(EDOC_PATH . $stored_name)) { | |
// $should_be_deleted_files[] = "rm -f " . EDOC_PATH . $stored_name; | |
$should_be_deleted_files[] = "$stored_name / " . $row['delete_date']; | |
} | |
} | |
echo "\nScanned $i files and found " . count($should_be_deleted_files) . " that present but missing date_deleted_server"; | |
echo "<pre>" . implode("\n",$should_be_deleted_files) . "</pre>"; | |
} | |
public function getArchiveFolder($makeIfMissing=true) { | |
$this->archive_path = EDOC_PATH . "gzipped"; | |
if(is_dir($this->archive_path)) { | |
// already created | |
return $this->archive_path; | |
} else if ($makeIfMissing) { | |
exec("mkdir $this->archive_path"); | |
return $this->getArchiveFolder(false); | |
} else { | |
throw new Exception("Unable to create/find archive path: $this->archive_path"); | |
} | |
} | |
public function getArchiveCount() { | |
// Get number of files in the archive folder | |
$result = exec("ls -1q " . $this->getArchiveFolder() ." | wc -l"); | |
return intval($result); | |
} | |
public function getCountUnzippedFiles($starting_after = 0) { | |
$sql = "select count(*), sum(doc_size) from redcap_edocs_metadata where file_extension in " . | |
"('" . implode("','", self::ZIPPABLE_SUFFIXES) . "') " . | |
"and gzipped = 0 and doc_id > $starting_after order by doc_id"; | |
$q = db_query($sql); | |
$row = db_fetch_row($q); | |
$count = $row[0]; | |
$size = $row[1]; | |
return array($count, $size); | |
} | |
public function getUnzippedFiles($count = 100, $starting_after = 0) { | |
$sql = "select * from redcap_edocs_metadata where file_extension in " . | |
"('" . implode("','", self::ZIPPABLE_SUFFIXES) . "') " . | |
"and gzipped = 0 and doc_id > " . $starting_after . " order by doc_id limit " . $count; | |
$q = db_query($sql); | |
echo "$sql\n"; | |
$rows = []; | |
while($row=db_fetch_assoc($q)) { | |
$rows[] = $row; | |
} | |
return $rows; | |
} | |
public function updateEdoc($doc_id, $stored_name, $gzipped = 1) { | |
$sql = "update redcap_edocs_metadata set stored_name='" . | |
db_real_escape_string($stored_name) . "', gzipped=1 " . | |
"where doc_id=$doc_id"; | |
$q = db_query($sql); | |
if ($q === false) { | |
$this->errors[] = "Error updating $sql"; | |
} | |
return $q; | |
} | |
public function processFiles($edocRows) { | |
$archive_path = $this->getArchiveFolder(); | |
$i = 0; | |
$success = []; | |
foreach ($edocRows as $row) { | |
$i++; | |
$doc_id = $row['doc_id']; | |
$this->last_doc_id = $doc_id; | |
$stored_name = $row['stored_name']; | |
$delete_date = $row['delete_date']; | |
$date_deleted_server = $row['date_deleted_server']; | |
$size_before = $row['doc_size']; | |
if ($row['doc_size'] == 0) { | |
echo "[$i:$doc_id] Has zero size - skipping\n"; | |
continue; | |
} | |
if (file_exists(EDOC_PATH . $stored_name)) { | |
if (!empty($date_deleted_server)) { | |
$this->errors[] = "[$i:$doc_id] WARNING: Edoc source found even though it is marked as deleted: $stored_name / $delete_date"; | |
continue; | |
} | |
if (!empty($delete_date)) { | |
// logically deleted -- ignore | |
continue; | |
} | |
} else { | |
// EDOC file not found | |
if (empty($delete_date)) { | |
$this->errors[] = "[$i:$doc_id] FILE NOT FOUND at " . EDOC_PATH . "$stored_name : project " . $row['project_id']; | |
continue; | |
} else { | |
// deleted - skip | |
continue; | |
} | |
} | |
// File should exist! | |
$filename_tmp = Files::copyEdocToTemp($doc_id,true,true); | |
if (!file_exists($filename_tmp)) { | |
$this->errors[] = "[$i:$doc_id] Unable to create temp file from $stored_name as $filename_tmp"; | |
continue; | |
} | |
// Read file | |
$content = file_get_contents($filename_tmp); | |
if (empty($content)) { | |
$this->errors[] = "[$i:$doc_id] UNABLE TO READ CONTENT from $filename_tmp -- skipping"; | |
unlink($filename_tmp); | |
continue; | |
} | |
// Make .gz file | |
$temp_file = APP_PATH_TEMP . $stored_name . ".gz"; | |
$b = file_put_contents($temp_file, gzcompress($content,9)); | |
if ($b) { | |
echo "[$i:$doc_id] Saved gzipped edoc to temp $temp_file\n"; | |
} else { | |
$this->errors[] = "[$i:$doc_id] Unable to make gz file for doc $doc_id"; | |
continue; | |
} | |
$old_edoc_file = EDOC_PATH . $stored_name; | |
$new_edoc_file = EDOC_PATH . $stored_name . ".gz"; | |
if (file_exists($new_edoc_file)) { | |
$this->errors[] = "[$i:$doc_id] $new_edoc_file ALREADY EXISTS - skipping\n"; | |
continue; | |
} else { | |
// Get the size of the compressed folder | |
$size_after = filesize($temp_file); | |
// Move the temp file to edocs | |
$cmd = "mv $temp_file $new_edoc_file"; | |
$result = exec($cmd); | |
if (!empty($result)) { | |
echo "[$i:$doc_id] Failed to move new temp gz edoc to edocs: " . json_encode($result) . "\n"; | |
continue; | |
} else { | |
echo "[$i:$doc_id] Created $new_edoc_file\n"; | |
} | |
unlink($temp_file); | |
unlink($filename_tmp); | |
echo "[$i:$doc_id] Removed temp files\n"; | |
// Move original to new gzipped folder | |
$result = exec("mv $old_edoc_file $archive_path/$stored_name"); | |
if (!empty($result)) { | |
$this->errors[] = "[$i:$doc_id] Error archiving old edoc: " . json_encode($result); | |
continue; | |
} else { | |
echo "[$i:$doc_id] Archived $stored_name to $archive_path \n"; | |
} | |
$this->size_change[$doc_id] = [ | |
$size_before, | |
$size_after | |
]; | |
// Update edoc | |
$this->updateEdoc($row['doc_id'], $stored_name . ".gz",1); | |
$success[] = $doc_id; | |
} | |
$i++; | |
} | |
return $success; | |
} | |
} | |
global $edoc_storage_option; | |
//exit(json_encode($edoc_storage_option)); | |
if ($edoc_storage_option !== "0") { | |
throw new Exception("Only file-based edocs supported"); | |
} | |
$ez = new EdocZipper(); | |
$count = isset($_POST['count']) && is_numeric($_POST['count']) ? $_POST['count'] : 100; | |
$starting_after = isset($_POST['starting_after']) && is_numeric($_POST['starting_after']) ? $_POST['starting_after'] : 0; | |
list($file_count, $total_size) = $ez->getCountUnzippedFiles($starting_after); | |
?> | |
<html> | |
<head> | |
<style type="text/css"> | |
pre { | |
max-height: 200px; | |
max-width: 95%; | |
overflow: scroll; | |
border: 1px solid gray; | |
font-size: smaller; | |
} | |
</style> | |
</head> | |
<body> | |
<h1> | |
EDOC Gzip Utility | |
</h1> | |
<h2> | |
Gzip any old Edocs that were created before compression was added to REDCap | |
</h2> | |
<div> | |
This script will convert older edocs files that were not saved with compression to ones | |
with compression. This can significantly reduce the size of your edocs folder. | |
This will only process files if your edocs folder is in a server directory -- it does not | |
support webdav, s3, gcp, or other storage options. | |
</div> | |
<div><strong>DO NOT RUN THIS UNLESS YOU KNOW WHAT YOU ARE DOING!</strong></div> | |
<hr/> | |
<form id="form" name="form" method="post"> | |
<label for="count">Number to process:</label><input type="number" value="<?php echo $count ?>" name="count"/> | |
<label for="count">Start after (counting up) doc_id:</label><input type="number" name="starting_after" id="starting_after"/> | |
<button id="btn">Gzip Some Files</button> | |
<div>There are currently <?php echo $file_count ?> potential files to compress (starting after <?php echo $starting_after ?>) | |
with a total unzipped size of <?php echo floor($total_size/1024/1024) ?> MB</div> | |
</form> | |
<hr/> | |
<div> | |
<p> | |
This script automatically archives the original (non .gz) files to a sub-folder at | |
<code><?php echo $ez->getArchiveFolder() ?></code> in case something went badly wrong. | |
If all goes well, you can/should clean up and remove this folder when satisfied. | |
</p> | |
<p> | |
Currently, this folder contains <strong><?php echo $ez->getArchiveCount() ?></strong> files. | |
</p> | |
</div> | |
<hr/> | |
<div> | |
<?php | |
if ($_SERVER['REQUEST_METHOD'] == 'POST') { | |
$count = is_numeric($_POST['count']) ? $_POST['count'] : 100; | |
if ($count == -1) { | |
$ez->getDeletedEdocs(); | |
exit(); | |
} | |
if ($count == -2) { | |
$ez->getDeletedEdocs2(); | |
exit(); | |
} | |
$files = $ez->getUnzippedFiles($count, $starting_after); | |
echo "<h3>Processing $count of " . count($files) . " files that are gzippable and not currently " . | |
"zipped starting after $starting_after</h3>"; | |
echo "<pre>"; | |
$success = $ez->processFiles($files); | |
echo "</pre>"; | |
echo "<h3>Successfully processed " . count($success) . " edocs</h3>"; | |
echo "<pre>"; | |
echo "\n" . json_encode($success) . "\n"; | |
file_put_contents(APP_PATH_TEMP . "zipper/edoc_zipper_success.log", "\n" . json_encode($success), FILE_APPEND); | |
echo "</pre>"; | |
if (!empty($ez->deleted_files)) { | |
echo "<h4>" . count($ez->deleted_files) . " deleted edocs:</h4><pre>" . json_encode($ez->deleted_files) . "</pre>"; | |
file_put_contents(APP_PATH_TEMP . "zipper/edoc_zipper_deleted_files.log", "\n" . json_encode($ez->deleted_files), FILE_APPEND); | |
} | |
if (!empty($ez->errors)) { | |
echo "<h4>" . count($ez->errors) . " ERRORS:</h4><pre>" . print_r($ez->errors, true) . "</pre>"; | |
file_put_contents(APP_PATH_TEMP . "zipper/edoc_zipper_errors.log", "\n" . json_encode($ez->errors), FILE_APPEND); | |
} | |
if (!empty($ez->missing_files)) { | |
echo "<h4>" . count($ez->missing_files) . " MISSING FILES:</h4><pre>" . print_r($ez->missing_files, true) . "</pre>"; | |
file_put_contents(APP_PATH_TEMP . "zipper/edoc_zipper_missing_files.log", "\n" . json_encode($ez->missing_files), FILE_APPEND); | |
} | |
if (!empty($ez->size_change)) { | |
$total_before = 0; | |
$total_after = 0; | |
$total_delta = 0; | |
foreach ($ez->size_change as $d) { | |
list($before, $after) = $d; | |
$total_before = $total_before + $before; | |
$total_after = $total_after + $after; | |
$total_delta = $total_delta + ($before - $after); | |
} | |
$total_before = floor($total_before / 1024 / 1024); | |
$total_after = floor($total_after / 1024 / 1024); | |
$total_delta = floor($total_delta / 1024 / 1024); | |
$size_count = count($ez->size_change); | |
echo "<h4>Size Change:</h4>" . | |
"<pre> Count: $size_count files\nBefore: $total_before MB\n After: $total_after MB\nTotals: $total_delta MB Saved</pre>"; | |
file_put_contents(APP_PATH_TEMP . "zipper/edoc_zipper_size_change.log", "\n" . json_encode($ez->size_change), FILE_APPEND); | |
} | |
if (!empty($ez->last_doc_id)) { | |
echo "<div>Last doc_id processed: " . $ez->last_doc_id . "</div>"; | |
?> | |
<script> | |
document.getElementById('starting_after').value = '<?php echo $ez->last_doc_id ?>'; | |
</script> | |
<?php | |
} | |
} | |
?> | |
</div> | |
<script> | |
let f = document.getElementById('form'); | |
f.addEventListener('submit', function (event) { | |
document.getElementById('btn').insertAdjacentHTML('afterend', '<span>...running...</span>'); | |
document.getElementById('btn').remove(); | |
}); | |
</script> | |
</body> | |
</html> | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment