# Canvas FileZapper. Zap yer files.

# Monkey patch the File class.
# This is to work around a bug in gems/attachment_fu/lib/attachment_fu#detect_mimetype.
# During att.make_childless, Canvas will call attachment.uploaded_data = data, data being a File
# instance. Attachment#uploaded_data= will call detect_mimetype with data, but will fail if data
# does not respond to #content_type. So we add the content_type method, using the same code that
# detect_mimetype would use anyways.
class File
  # @return the value of File.mime_type? for this file (same lookup detect_mimetype would use)
  def content_type
    File.mime_type?(self)
  end
end

class FileZapper
  # This class deletes user-uploaded and system-generated files, to free up space on disk. It can be
  # used to comply with your institutional data retention policies, and to remove old cruft.
  #
  # USE WITH CAUTION. Files are DELETED FROM DISK and cannot be retrieved.
  #
  # Attachment records are not removed. The underlying files are deleted, and Canvas' native de-dup
  # behaviour is used to replace the file with a placeholder. A new placeholder attachment record
  # will be created and set as the root attachment for all deleted attachments.
  #
  # For some fully disposable files like system-generated reports and exports, the files are deleted
  # altogether, and not replaced with placeholders.
  #
  # Only tested with local storage. Behaviour with S3 is unclear.
  #
  # To Do:
  # - Files in account-level groups
  # - Disposable files, like ePub exports, SIS imports, reports etc.
  # - Clear out failed uploads?

  # Content types that get the image placeholder; everything else gets the PDF placeholder.
  IMAGE_CONTENT_TYPES = %w[image/gif image/jpeg image/pjpeg image/png image/x-png image/bmp].freeze

  # @param options [Hash] overrides for the defaults below:
  #   :cutoff_deleted         - manually deleted files older than this are removed from disk
  #   :cutoff_content_export  - content exports created before this are removed
  #   :cutoff_epubs           - stored but not read by any method in this class yet
  #   :placeholder_filename   - base name (no extension) of the placeholder attachments
  def initialize(options={})
    defaults = {
      cutoff_deleted: 1.year.ago,
      cutoff_content_export: 1.year.ago,
      cutoff_epubs: 1.year.ago,
      placeholder_filename: 'OCADU_file_removed_2019',
    }
    @options = defaults.merge(options)
  end

  # Replace every available file in the term's courses, and in groups belonging to those courses,
  # with a placeholder.
  #
  # @param term [EnrollmentTerm, String] a term record, or the SIS source id of one
  # @raise [ArgumentError] if no term matches the given SIS source id
  def replace_course_files(term)
    term = verify_term(term)

    att_ids = Attachment.where(
      context: term.courses,
      file_state: :available
    ).pluck(:id)

    # Get files from course groups
    att_ids.concat Attachment.where(
      context: Group.where(context: term.courses),
      file_state: 'available'
    ).pluck(:id)

    replace_files(att_ids)
  end

  # Remove student assignment submissions for the given term. Optionally also delete files
  # attached to submissions comments and quiz submission attachments.
  #
  # @param term [EnrollmentTerm, String] a term record, or the SIS source id of one
  # @param also [Array<Symbol>, Symbol, nil] extra sources to include: :comments and/or :quizzes
  # @raise [ArgumentError] if no term matches the given SIS source id
  def replace_submissions(term, also=[:comments, :quizzes])
    term = verify_term(term)
    extras = Array(also) # tolerate a single symbol or nil as well as an array

    # Find ALL submissions with attachments for the given terms
    # Pluck attachment IDs (comma-delimited) and flatten them
    att_ids = Submission
      .where(assignment: Assignment.where(context: term.courses))
      .where.not(attachment_ids: '')
      .pluck(:attachment_ids)
      .flat_map { |ids| ids.split(',') }

    # Submission comment attachments
    if extras.include?(:comments)
      att_ids.concat Attachment
        .where(context: Assignment.where(context: term.courses))
        .where.not(workflow_state: :zipped) # Exclude submission exports
        .pluck(:id)
    end

    # Files attached to quiz submissions
    if extras.include?(:quizzes)
      att_ids.concat Attachment.where(
        context: Quizzes::QuizSubmission.where(
          quiz: Quizzes::Quiz.where(context: term.courses)
        )
      ).pluck(:id)
    end

    replace_files(att_ids)
  end

  # Delete the attachment behind every ContentExport created before `cutoff_content_export`, and
  # mark the export itself as deleted.
  def delete_content_exports
    # find_each loads the exports in batches instead of all at once
    ContentExport.where('created_at < ?', @options[:cutoff_content_export]).find_each do |ce|
      log("Deleting ContentExport #{ce.id}")

      # ContentExport#destroy is broken: PG throws a FK violation when trying to delete the attachment row
      # So we manually delete the content and destroy, rather than delete the attachment
      ce.attachment&.tap do |att|
        log("Deleting Attachment #{att.id}")
        destroy_attachment(att)
      end

      ce.workflow_state = 'deleted'
      ce.save!
    end
  end

  # Remove files that have been manually deleted. Any file deleted before `cutoff_deleted` will be
  # removed from disk. We don't need to replace these, because they're not referenced anywhere.
  def delete_deleted_files
    Attachment
      .where(file_state: :deleted)
      .where('deleted_at < ?', @options[:cutoff_deleted])
      .find_each { |att| destroy_attachment(att) }
  end

  private

  # Delete the original file from disk and replace it with a handy placeholder.
  # Adapted from Attachment#destroy_content_and_replace and Attachments::GarbageCollector
  #
  # @param att_ids [Array<Integer, String>] attachment ids; may contain duplicates and numeric strings
  def replace_files(att_ids)
    # Ids split out of Submission#attachment_ids arrive as strings; normalise so duplicates collapse
    unique_ids = att_ids.map(&:to_i).uniq

    unique_ids.each_slice(500) do |ids_batch|
      Attachment.where(id: ids_batch).each do |att|
        log("Deleting attachment #{att.id}")

        # Find the appropriate placeholder root attachment
        new_root = is_image?(att) ? root_image : root_pdf

        if att.root_attachment_id
          # Skip files we've already processed
          next if att.root_attachment_id == new_root.id

          # Don't delete content from child items. Just set the new root, and save the old root
          # for later reloading
          old_root = att.root_attachment
        else
          old_root = nil

          # This will copy the file to a child and make it the new root
          att.make_childless

          # Delete original file. DANGER!
          begin
            att.destroy_content
            att.thumbnail&.destroy
          rescue Errno::ENOENT
            # The file was not found. Nothing left to delete, but leave a trace in the log.
            log("File for attachment #{att.id} was not found on disk")
          end
        end

        att.root_attachment = new_root
        [:filename, :md5, :size, :content_type].each do |key|
          att.send("#{key}=", new_root.send(key))
        end

        # Fix file extension, so the file will open properly
        unless File.extname(att.display_name) == new_root.extension
          att.display_name = att.display_name + new_root.extension
        end

        att.save!

        # Make sure to update associations on the old root_attachment
        old_root&.reload
      end
    end
  end

  # Remove the file from disk and mark the attachment as deleted.
  # Content is only removed when the attachment is itself a root (has no root_attachment_id).
  def destroy_attachment(att)
    unless att.root_attachment_id
      att.make_childless
      att.destroy_content
    end
    att.destroy
  end

  # @return [Attachment] memoized PDF placeholder root, created on first use if none exists
  def root_pdf
    @root_pdf ||= Attachment.find_by(
      filename: placeholder_pdf_filename,
      context: Account.default,
      root_attachment_id: nil
    ) || create_root_pdf
  end

  # @return [Attachment] memoized image placeholder root, created on first use if none exists
  def root_image
    @root_image ||= Attachment.find_by(
      filename: placeholder_image_filename,
      context: Account.default,
      root_attachment_id: nil
    ) || create_root_image
  end

  def create_root_pdf
    create_placeholder('file_removed.pdf', placeholder_pdf_filename, 'application/pdf')
  end

  def create_root_image
    create_placeholder('file_removed.png', placeholder_image_filename, 'image/png')
  end

  # Build and persist a placeholder attachment in the default account from a template file
  # under Rails.root/tmp/files.
  #
  # The template is opened in block form so the handle is always closed, and the record is saved
  # with save! while the handle is still open. A placeholder that fails to save raises here,
  # before replace_files has deleted anything.
  #
  # @param template_name [String] file name under tmp/files
  # @param filename [String] filename to store on the attachment
  # @param content_type [String] content type to store, set after uploaded_data as before
  # @return [Attachment] the saved placeholder
  def create_placeholder(template_name, filename, content_type)
    File.open(Rails.root.join('tmp', 'files', template_name)) do |template|
      Attachment.new do |att|
        att.context = Account.default
        att.filename = filename
        att.uploaded_data = template
        att.content_type = content_type
        att.save!
      end
    end
  end

  # @return [Boolean] whether the attachment's content type is one of IMAGE_CONTENT_TYPES
  def is_image?(att)
    IMAGE_CONTENT_TYPES.include?(att.content_type)
  end

  # Resolve the argument to an EnrollmentTerm.
  #
  # @param term [EnrollmentTerm, String] a term record, or the SIS source id of one
  # @return [EnrollmentTerm]
  # @raise [ArgumentError] if the SIS source id matches no term
  def verify_term(term)
    return term if term.is_a?(EnrollmentTerm)

    EnrollmentTerm.find_by(sis_source_id: term) ||
      raise(ArgumentError, "No EnrollmentTerm found with sis_source_id #{term.inspect}")
  end

  def log(message)
    Rails.logger.info {"---#{message}"}
  end

  def placeholder_pdf_filename
    "#{@options[:placeholder_filename]}.pdf"
  end

  def placeholder_image_filename
    "#{@options[:placeholder_filename]}.png"
  end
end