Created
August 2, 2023 17:12
-
-
Save cnk/df118f051b5d1ebbaa221b707078f4a3 to your computer and use it in GitHub Desktop.
Import documents from file + yml data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import requests | |
from io import BytesIO | |
from collections import OrderedDict | |
from django.core.files import File | |
from wagtail.models import Collection | |
from wagtail.documents import get_document_model | |
from djunk.utils import get_or_generate | |
from core.logging import logger | |
class DocumentMigrator:
    """Create/update documents from YAML-style dicts and export them back.

    ``create`` consumes a mapping with the keys shown in its docstring;
    ``export`` produces a mapping with the same key names.
    """

    # Seconds passed to ``requests.get`` as its ``timeout`` so an unresponsive
    # server cannot hang the import indefinitely.
    download_timeout = 60

    def __get_doc_data(self, yml):
        """Return ``(file_like, filename)`` for the document described by ``yml``.

        ``yml['file']`` is resolved relative to ``<cwd>/documents`` and returned
        as an open binary file handle; the caller is responsible for closing it.
        ``yml['url']`` is downloaded into memory and the last path segment of
        the URL is used as the filename.

        Raises:
            RuntimeError: if ``yml`` has neither a ``file`` nor a ``url`` value.
            requests.HTTPError: if the download returns an error status.
        """
        if yml.get('file'):
            # This import is dealing with files in the local file system.
            local_path = os.path.join(os.getcwd(), 'documents', yml['file'])
            return (open(local_path, 'rb'), yml['file'])
        if yml.get('url'):
            # Retrieve the actual document content from the url.
            response = requests.get(yml['url'], timeout=self.download_timeout)
            # Fail loudly rather than storing an HTTP error page as the document.
            response.raise_for_status()
            return (BytesIO(response.content), yml['url'].split('/')[-1])
        raise RuntimeError('We need either a local file or a url from which we can retrieve the file.')

    def __file_needs_update(self, doc, yml):
        """Return True when ``doc`` needs a new file object.

        That is the case when ``doc`` has no file at all, or when the exported
        ``file_hash`` in ``yml`` differs from ``doc.get_file_hash()``.  A ``yml``
        without ``file_hash`` compares ``None`` against the document's hash.
        """
        if not doc.file:
            return True
        return yml.get('file_hash') != doc.get_file_hash()

    def _resolve_title(self, yml):
        """Return the title to use for the document described by ``yml``.

        ``title_text`` wins whenever the key is present.  Otherwise fall back to
        ``filename`` (the key historically read here), then ``file``, then the
        last segment of ``url``, with ``%20`` turned into spaces.  Returns ``''``
        when none of those are available.
        """
        if 'title_text' in yml:
            return yml['title_text']
        fallback = yml.get('filename') or yml.get('file')
        if not fallback and yml.get('url'):
            fallback = yml['url'].split('/')[-1]
        return (fallback or '').replace('%20', ' ')

    def _resolve_collection(self, site_helper, yml):
        """Return the collection the document should be placed in.

        Looks for a descendant of ``site_helper.collection`` named
        ``yml['collection_name']``.  When no name is given, or no such
        descendant exists, ``site_helper.collection`` is returned so that the
        document is never assigned ``None`` as its collection.
        """
        wanted = yml.get('collection_name')
        if not wanted:
            return site_helper.collection
        match = Collection.objects.descendant_of(site_helper.collection).filter(name=wanted).first()
        if match is None:
            # NOTE(review): assumes the project logger exposes ``warning`` with
            # the same event/kwargs calling style as ``info`` - confirm.
            logger.warning('importer.document.collection-missing', collection_name=wanted)
            return site_helper.collection
        return match

    def create(self, site_helper, yml, dry_run=False):
        """Create or update the document described by ``yml``.

        Expected shape of one entry::

            documents:
              - id: 2
                collection_name: Clip Art
                title_text: foobar
                file: foobar.jpg
                tags:
                  - tag1
                  - tag2

        Args:
            site_helper: provides ``import_id(id)`` and ``collection``.
            yml: mapping for a single document (see above); ``url`` may be
                given instead of ``file``.
            dry_run: when True, only log what would be imported.

        Returns:
            The saved document, or ``None`` on a dry run.
        """
        source = yml.get('file', yml.get('url', 'NO FILE?!'))
        if dry_run:
            logger.info(
                'importer.document.create.dry-run',
                file=source
            )
            return None

        # presumably returns (instance, created_flag) keyed on import_id;
        # verify against djunk.utils.get_or_generate.
        doc, created = get_or_generate(get_document_model(), import_id=site_helper.import_id(yml['id']))
        doc.title = self._resolve_title(yml)
        doc.collection = self._resolve_collection(site_helper, yml)

        doc_data = None
        try:
            if created or self.__file_needs_update(doc, yml):
                doc_data, filename = self.__get_doc_data(yml)
                doc.file = File(doc_data, name=filename.replace('%20', ' '))
                doc.file_size = yml.get('filesize') or doc.file.size
                doc.file_hash = yml.get('file_hash', '')
            doc.save()
            # Called for its side effect after saving.
            # NOTE(review): presumably fills in file_hash when it was left
            # blank above - confirm against the document model.
            doc.get_file_hash()
        finally:
            # The source handle is only needed while the file is being stored;
            # close it so local files do not leak descriptors across a long import.
            if doc_data is not None:
                doc_data.close()

        if yml.get('tags'):
            doc.tags.set(yml['tags'])

        op = 'create' if created else 'update'
        logger.info(
            'importer.document.{}'.format(op),
            file=source
        )
        return doc

    def export(self, doc):
        """Return an ``OrderedDict`` describing ``doc`` in the import format.

        ``collection_name`` is only included when the document's collection has
        ``depth > 2`` (i.e. it is not the default collection), and ``tags`` only
        when the document has at least one tag.
        """
        doc_data = OrderedDict([
            ('id', doc.id),
            ('file', doc.filename),
            ('filesize', doc.file_size),
            ('file_hash', doc.file_hash),
            ('title_text', doc.title),
        ])
        # If this is in a collection other than the default collection, export collection name.
        if doc.collection.depth > 2:
            doc_data['collection_name'] = doc.collection.name
        # Fetch the tags once instead of an exists() check followed by all().
        tag_names = [tag.name for tag in doc.tags.all()]
        if tag_names:
            doc_data['tags'] = tag_names
        return doc_data
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment