Skip to content

Instantly share code, notes, and snippets.

@123andy
Last active November 17, 2022 00:50
Show Gist options
  • Save 123andy/ade63284496877e1ad40eb5f03265ad2 to your computer and use it in GitHub Desktop.
Save 123andy/ade63284496877e1ad40eb5f03265ad2 to your computer and use it in GitHub Desktop.
A Plugin for Gzipping old Edocs for local EDOC storage on REDCap
<?php
/**
This is a plugin that was used to gzip old edocs that were created prior to when REDCap introduced Gzip.
Which was with version 6 in 2014! So, if you've only been REDCapping for a while, then this might not
have that much importance. If, like us, you've got an old instance with lots of old files, this could save
you some substantial disk space
Compressing old .csv, .r, .xls, .sas exports can free up about 90% of their previous storage space.
On our server, prior to gzipping all our older exports, we were using 28 GB for older files. After running this
plugin over and over again in batches of 10,000 files, I was able to compress some 157,618 previously uncompressed
edoc files and go from around 28GB to 3GB for all these files. This is still a small percentage of our total
edocs of 1.1TB... So, no miracles here but at lease it is consistent.
*/
require_once "../redcap_connect.php";
// Only allowed projects or superuser
if (!SUPER_USER) {
die("superuser required");
}
class EdocZipper {
public $conn;
const ZIPPABLE_SUFFIXES = ['csv','sas','sps','do','r','xml'];
public $errors = [];
public $missing_files = [];
public $deleted_files = [];
public $size_change = [];
public $archive_path;
public $last_doc_id;
// Find docs that are marked a server_side deleted but are still present
public function getDeletedEdocs() {
$sql = "select * from redcap_edocs_metadata rem where date_deleted_server is not null";
$q = db_query($sql);
$should_be_deleted_files = [];
$i=0;
while ($row = db_fetch_assoc($q)) {
$i++;
$stored_name = $row['stored_name'];
if (file_exists(EDOC_PATH . $stored_name)) {
$should_be_deleted_files[] = "rm -f " . EDOC_PATH . $stored_name;
}
}
echo "\nScanned $i files and found " . count($should_be_deleted_files) . " that are still present";
echo "<pre>" . implode("\n",$should_be_deleted_files) . "</pre>";
}
// Find docs that are marked for deletion but not marked as deleted
public function getDeletedEdocs2() {
$sql = "select * from redcap_edocs_metadata rem where delete_date is not null and date_deleted_server is null";
$q = db_query($sql);
$should_be_deleted_files = [];
$i=0;
while ($row = db_fetch_assoc($q)) {
$i++;
$stored_name = $row['stored_name'];
if (file_exists(EDOC_PATH . $stored_name)) {
// $should_be_deleted_files[] = "rm -f " . EDOC_PATH . $stored_name;
$should_be_deleted_files[] = "$stored_name / " . $row['delete_date'];
}
}
echo "\nScanned $i files and found " . count($should_be_deleted_files) . " that present but missing date_deleted_server";
echo "<pre>" . implode("\n",$should_be_deleted_files) . "</pre>";
}
public function getArchiveFolder($makeIfMissing=true) {
$this->archive_path = EDOC_PATH . "gzipped";
if(is_dir($this->archive_path)) {
// already created
return $this->archive_path;
} else if ($makeIfMissing) {
exec("mkdir $this->archive_path");
return $this->getArchiveFolder(false);
} else {
throw new Exception("Unable to create/find archive path: $this->archive_path");
}
}
public function getArchiveCount() {
// Get number of files in the archive folder
$result = exec("ls -1q " . $this->getArchiveFolder() ." | wc -l");
return intval($result);
}
public function getCountUnzippedFiles($starting_after = 0) {
$sql = "select count(*), sum(doc_size) from redcap_edocs_metadata where file_extension in " .
"('" . implode("','", self::ZIPPABLE_SUFFIXES) . "') " .
"and gzipped = 0 and doc_id > $starting_after order by doc_id";
$q = db_query($sql);
$row = db_fetch_row($q);
$count = $row[0];
$size = $row[1];
return array($count, $size);
}
public function getUnzippedFiles($count = 100, $starting_after = 0) {
$sql = "select * from redcap_edocs_metadata where file_extension in " .
"('" . implode("','", self::ZIPPABLE_SUFFIXES) . "') " .
"and gzipped = 0 and doc_id > " . $starting_after . " order by doc_id limit " . $count;
$q = db_query($sql);
echo "$sql\n";
$rows = [];
while($row=db_fetch_assoc($q)) {
$rows[] = $row;
}
return $rows;
}
public function updateEdoc($doc_id, $stored_name, $gzipped = 1) {
$sql = "update redcap_edocs_metadata set stored_name='" .
db_real_escape_string($stored_name) . "', gzipped=1 " .
"where doc_id=$doc_id";
$q = db_query($sql);
if ($q === false) {
$this->errors[] = "Error updating $sql";
}
return $q;
}
public function processFiles($edocRows) {
$archive_path = $this->getArchiveFolder();
$i = 0;
$success = [];
foreach ($edocRows as $row) {
$i++;
$doc_id = $row['doc_id'];
$this->last_doc_id = $doc_id;
$stored_name = $row['stored_name'];
$delete_date = $row['delete_date'];
$date_deleted_server = $row['date_deleted_server'];
$size_before = $row['doc_size'];
if ($row['doc_size'] == 0) {
echo "[$i:$doc_id] Has zero size - skipping\n";
continue;
}
if (file_exists(EDOC_PATH . $stored_name)) {
if (!empty($date_deleted_server)) {
$this->errors[] = "[$i:$doc_id] WARNING: Edoc source found even though it is marked as deleted: $stored_name / $delete_date";
continue;
}
if (!empty($delete_date)) {
// logically deleted -- ignore
continue;
}
} else {
// EDOC file not found
if (empty($delete_date)) {
$this->errors[] = "[$i:$doc_id] FILE NOT FOUND at " . EDOC_PATH . "$stored_name : project " . $row['project_id'];
continue;
} else {
// deleted - skip
continue;
}
}
// File should exist!
$filename_tmp = Files::copyEdocToTemp($doc_id,true,true);
if (!file_exists($filename_tmp)) {
$this->errors[] = "[$i:$doc_id] Unable to create temp file from $stored_name as $filename_tmp";
continue;
}
// Read file
$content = file_get_contents($filename_tmp);
if (empty($content)) {
$this->errors[] = "[$i:$doc_id] UNABLE TO READ CONTENT from $filename_tmp -- skipping";
unlink($filename_tmp);
continue;
}
// Make .gz file
$temp_file = APP_PATH_TEMP . $stored_name . ".gz";
$b = file_put_contents($temp_file, gzcompress($content,9));
if ($b) {
echo "[$i:$doc_id] Saved gzipped edoc to temp $temp_file\n";
} else {
$this->errors[] = "[$i:$doc_id] Unable to make gz file for doc $doc_id";
continue;
}
$old_edoc_file = EDOC_PATH . $stored_name;
$new_edoc_file = EDOC_PATH . $stored_name . ".gz";
if (file_exists($new_edoc_file)) {
$this->errors[] = "[$i:$doc_id] $new_edoc_file ALREADY EXISTS - skipping\n";
continue;
} else {
// Get the size of the compressed folder
$size_after = filesize($temp_file);
// Move the temp file to edocs
$cmd = "mv $temp_file $new_edoc_file";
$result = exec($cmd);
if (!empty($result)) {
echo "[$i:$doc_id] Failed to move new temp gz edoc to edocs: " . json_encode($result) . "\n";
continue;
} else {
echo "[$i:$doc_id] Created $new_edoc_file\n";
}
unlink($temp_file);
unlink($filename_tmp);
echo "[$i:$doc_id] Removed temp files\n";
// Move original to new gzipped folder
$result = exec("mv $old_edoc_file $archive_path/$stored_name");
if (!empty($result)) {
$this->errors[] = "[$i:$doc_id] Error archiving old edoc: " . json_encode($result);
continue;
} else {
echo "[$i:$doc_id] Archived $stored_name to $archive_path \n";
}
$this->size_change[$doc_id] = [
$size_before,
$size_after
];
// Update edoc
$this->updateEdoc($row['doc_id'], $stored_name . ".gz",1);
$success[] = $doc_id;
}
$i++;
}
return $success;
}
}
global $edoc_storage_option;
//exit(json_encode($edoc_storage_option));
if ($edoc_storage_option !== "0") {
throw new Exception("Only file-based edocs supported");
}
$ez = new EdocZipper();
$count = isset($_POST['count']) && is_numeric($_POST['count']) ? $_POST['count'] : 100;
$starting_after = isset($_POST['starting_after']) && is_numeric($_POST['starting_after']) ? $_POST['starting_after'] : 0;
list($file_count, $total_size) = $ez->getCountUnzippedFiles($starting_after);
?>
<html>
<head>
<style type="text/css">
pre {
max-height: 200px;
max-width: 95%;
overflow: scroll;
border: 1px solid gray;
font-size: smaller;
}
</style>
</head>
<body>
<h1>
EDOC Gzip Utility
</h1>
<h2>
Gzip any old Edocs that were created before compression was added to REDCap
</h2>
<div>
This script will convert older edocs files that were not saved with compression to ones
with compression. This can significantly reduce the size of your edocs folder.
This will only process files if your edocs folder is in a server directory -- it does not
support webdav, s3, gcp, or other storage options.
</div>
<div><strong>DO NOT RUN THIS UNLESS YOU KNOW WHAT YOU ARE DOING!</strong></div>
<hr/>
<form id="form" name="form" method="post">
<label for="count">Number to process:</label><input type="number" value="<?php echo $count ?>" name="count"/>
<label for="count">Start after (counting up) doc_id:</label><input type="number" name="starting_after" id="starting_after"/>
<button id="btn">Gzip Some Files</button>
<div>There are currently <?php echo $file_count ?> potential files to compress (starting after <?php echo $starting_after ?>)
with a total unzipped size of <?php echo floor($total_size/1024/1024) ?> MB</div>
</form>
<hr/>
<div>
<p>
This script automatically archives the original (non .gz) files to a sub-folder at
<code><?php echo $ez->getArchiveFolder() ?></code> in case something went badly wrong.
If all goes well, you can/should clean up and remove this folder when satisfied.
</p>
<p>
Currently, this folder contains <strong><?php echo $ez->getArchiveCount() ?></strong> files.
</p>
</div>
<hr/>
<div>
<?php
if ($_SERVER['REQUEST_METHOD'] == 'POST') {
$count = is_numeric($_POST['count']) ? $_POST['count'] : 100;
if ($count == -1) {
$ez->getDeletedEdocs();
exit();
}
if ($count == -2) {
$ez->getDeletedEdocs2();
exit();
}
$files = $ez->getUnzippedFiles($count, $starting_after);
echo "<h3>Processing $count of " . count($files) . " files that are gzippable and not currently " .
"zipped starting after $starting_after</h3>";
echo "<pre>";
$success = $ez->processFiles($files);
echo "</pre>";
echo "<h3>Successfully processed " . count($success) . " edocs</h3>";
echo "<pre>";
echo "\n" . json_encode($success) . "\n";
file_put_contents(APP_PATH_TEMP . "zipper/edoc_zipper_success.log", "\n" . json_encode($success), FILE_APPEND);
echo "</pre>";
if (!empty($ez->deleted_files)) {
echo "<h4>" . count($ez->deleted_files) . " deleted edocs:</h4><pre>" . json_encode($ez->deleted_files) . "</pre>";
file_put_contents(APP_PATH_TEMP . "zipper/edoc_zipper_deleted_files.log", "\n" . json_encode($ez->deleted_files), FILE_APPEND);
}
if (!empty($ez->errors)) {
echo "<h4>" . count($ez->errors) . " ERRORS:</h4><pre>" . print_r($ez->errors, true) . "</pre>";
file_put_contents(APP_PATH_TEMP . "zipper/edoc_zipper_errors.log", "\n" . json_encode($ez->errors), FILE_APPEND);
}
if (!empty($ez->missing_files)) {
echo "<h4>" . count($ez->missing_files) . " MISSING FILES:</h4><pre>" . print_r($ez->missing_files, true) . "</pre>";
file_put_contents(APP_PATH_TEMP . "zipper/edoc_zipper_missing_files.log", "\n" . json_encode($ez->missing_files), FILE_APPEND);
}
if (!empty($ez->size_change)) {
$total_before = 0;
$total_after = 0;
$total_delta = 0;
foreach ($ez->size_change as $d) {
list($before, $after) = $d;
$total_before = $total_before + $before;
$total_after = $total_after + $after;
$total_delta = $total_delta + ($before - $after);
}
$total_before = floor($total_before / 1024 / 1024);
$total_after = floor($total_after / 1024 / 1024);
$total_delta = floor($total_delta / 1024 / 1024);
$size_count = count($ez->size_change);
echo "<h4>Size Change:</h4>" .
"<pre> Count: $size_count files\nBefore: $total_before MB\n After: $total_after MB\nTotals: $total_delta MB Saved</pre>";
file_put_contents(APP_PATH_TEMP . "zipper/edoc_zipper_size_change.log", "\n" . json_encode($ez->size_change), FILE_APPEND);
}
if (!empty($ez->last_doc_id)) {
echo "<div>Last doc_id processed: " . $ez->last_doc_id . "</div>";
?>
<script>
document.getElementById('starting_after').value = '<?php echo $ez->last_doc_id ?>';
</script>
<?php
}
}
?>
</div>
<script>
let f = document.getElementById('form');
f.addEventListener('submit', function (event) {
document.getElementById('btn').insertAdjacentHTML('afterend', '<span>...running...</span>');
document.getElementById('btn').remove();
});
</script>
</body>
</html>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment