Last active
June 22, 2017 07:37
-
-
Save buchi/2c9da7122f32bac138a0c1bce086d63b to your computer and use it in GitHub Desktop.
Check Plone indexes for Unicode values
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Checks all UnIndex-based indexes for Unicode strings.
# Indexes should contain only byte strings.
# Having Unicode strings in indexes may result in UnicodeDecodeErrors during indexing or searching.
from Products.CMFPlone.interfaces import IPloneSiteRoot | |
from Products.CMFCore.utils import getToolByName | |
def get_plone_sites(root):
    """Return every Plone site found below ``root`` as a list.

    Each child yielded by ``root.values()`` is handled as follows:

    * a child whose ``meta_type`` equals ``'Folder'`` is searched recursively;
    * a child providing ``IPloneSiteRoot`` is collected (not descended into);
    * a child whose id is listed in ``root._mount_points`` (if that
      attribute exists) is searched recursively.

    Sites are returned in the order in which they are encountered.
    """
    # Loop-invariant: look the mount point mapping up once per container.
    mount_points = getattr(root, '_mount_points', {})
    result = []
    for obj in root.values():
        # Compare by value: an identity test (`is`) against a string literal
        # only succeeds when both strings happen to be the same object.
        if obj.meta_type == 'Folder':
            result.extend(get_plone_sites(obj))
        elif IPloneSiteRoot.providedBy(obj):
            result.append(obj)
        elif obj.getId() in mount_points:
            result.extend(get_plone_sites(obj))
    return result
# Scan the FieldIndex and KeywordIndex indexes of every Plone site below the
# application root and report Unicode values found in their forward
# (value -> document ids) and inverted (document id -> value) mappings.
#
# `app` is not defined in this file; it has to be provided by whatever
# executes the script (presumably a Zope `run`/debug session -- confirm).
sites = get_plone_sites(app)
for site in sites:
    print("Examining Plone site '%s' ..." % '/'.join(site.getPhysicalPath()))
    ct = getToolByName(site, 'portal_catalog')
    bad_indexes = set()  # names of indexes of this site holding Unicode values
    for name, idx in ct._catalog.indexes.items():
        if idx.meta_type not in ('KeywordIndex', 'FieldIndex'):
            continue
        print("Examining index '%s' ..." % name)

        # Forward index: one entry per distinct indexed value.
        for datum, doc_ids in idx._index.items():
            if not isinstance(datum, unicode):
                continue
            # NOTE(review): older UnIndex versions appear to store a row with
            # a single document as a bare int rather than a set; wrap it so
            # that list() below does not raise TypeError -- confirm.
            if isinstance(doc_ids, int):
                doc_ids = [doc_ids]
            bad_indexes.add(name)
            print("*** Forward index '%s' contains Unicode value: '%s'. Document Ids: %s" % (name, datum, list(doc_ids)))

        # Inverted index: one entry per document. Group the affected document
        # ids by value so that each value is reported once, together with the
        # documents that actually carry it.
        docs_by_value = {}
        for doc_id, datum in idx._unindex.items():
            # NOTE(review): KeywordIndex presumably stores a sequence of
            # keywords per document here, so look inside containers instead
            # of testing only the container itself -- confirm.
            if isinstance(datum, (list, tuple, set, frozenset)):
                values = datum
            else:
                values = (datum,)
            for value in values:
                if isinstance(value, unicode):
                    docs_by_value.setdefault(value, []).append(doc_id)
        for value in sorted(docs_by_value):
            bad_indexes.add(name)
            print("*** Inverted index '%s' contains Unicode value: '%s'. Document Ids: %s" % (name, value, docs_by_value[value]))
    print("***** Bad indexes: %s" % list(bad_indexes))
print("Done.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment