Skip to content

Instantly share code, notes, and snippets.

@adjam
Last active September 14, 2016 19:00
Show Gist options
  • Save adjam/0f6a5e894ba4682407886c1e7f8e9b88 to your computer and use it in GitHub Desktop.
Save adjam/0f6a5e894ba4682407886c1e7f8e9b88 to your computer and use it in GitHub Desktop.
Solr Schema "autogeneration" from Endeca Mappings spreadsheet
#!/usr/bin/env python3
import sys
from lxml import objectify, etree
import itertools
from csv import DictReader
# mnemonic for a widely used dictionary key
EP = 'Endeca_Property'

# valid attributes for a 'fieldType' element
FT_ATTRS = {
    'name',
    'class',
    'multiValued',
    'sortMissingLast',
    'docValues',
    'precisionStep',
    'positionIncrementGap',
}
# Field types found in the Solr 6 default 'managed-schema' schema,
# included as a starting point for the generated schema.
# Each key is the fieldType name; each value is the dict of XML
# attributes for its <fieldType> element (all values kept as strings,
# exactly as they appear in schema XML).
field_types = {'_bbox_coord': {'stored': 'false', 'useDocValuesAsStored': 'false', 'class': 'solr.TrieDoubleField', 'precisionStep': '8', 'docValues': 'true'},
'alphaOnlySort': {'sortMissingLast': 'true', 'class': 'solr.TextField', 'omitNorms': 'true'},
'bbox': {'class': 'solr.BBoxField', 'numberType': '_bbox_coord', 'distanceUnits': 'kilometers', 'geo': 'true'},
'binary': {'class': 'solr.BinaryField'},
'boolean': {'sortMissingLast': 'true', 'class': 'solr.BoolField'},
'currency': {'currencyConfig': 'currency.xml', 'defaultCurrency': 'USD', 'class': 'solr.CurrencyField', 'precisionStep': '8'},
'date': {'positionIncrementGap': '0', 'class': 'solr.TrieDateField', 'precisionStep': '0', 'docValues': 'true'},
'double': {'positionIncrementGap': '0', 'class': 'solr.TrieDoubleField', 'precisionStep': '0', 'docValues': 'true'},
'float': {'positionIncrementGap': '0', 'class': 'solr.TrieFloatField', 'precisionStep': '0', 'docValues': 'true'},
'ignored': {'class': 'solr.StrField', 'stored': 'false', 'docValues': 'false', 'indexed': 'false', 'multiValued': 'true'},
'int': {'positionIncrementGap': '0', 'class': 'solr.TrieIntField', 'precisionStep': '0', 'docValues': 'true'},
'location': {'class': 'solr.LatLonType', 'subFieldSuffix': '_coordinate'},
'location_rpt': {'distErrPct': '0.025', 'class': 'solr.SpatialRecursivePrefixTreeFieldType', 'maxDistErr': '0.001', 'distanceUnits': 'kilometers', 'geo': 'true'},
'long': {'positionIncrementGap': '0', 'class': 'solr.TrieLongField', 'precisionStep': '0', 'docValues': 'true'},
'lowercase': {'positionIncrementGap': '100', 'class': 'solr.TextField'},
'point': {'class': 'solr.PointType', 'dimension': '2', 'subFieldSuffix': '_d'},
'random': {'class': 'solr.RandomSortField', 'indexed': 'true'},
'string': {'sortMissingLast': 'true', 'class': 'solr.StrField', 'docValues': 'true'},
'tdate': {'positionIncrementGap': '0', 'class': 'solr.TrieDateField', 'precisionStep': '6', 'docValues': 'true'},
'tdouble': {'positionIncrementGap': '0', 'class': 'solr.TrieDoubleField', 'precisionStep': '8', 'docValues': 'true'},
'text_en': {'positionIncrementGap': '100', 'class': 'solr.TextField'},
'text_en_splitting': {'positionIncrementGap': '100', 'class': 'solr.TextField', 'autoGeneratePhraseQueries': 'true'},
'text_en_splitting_tight': {'positionIncrementGap': '100', 'class': 'solr.TextField', 'autoGeneratePhraseQueries': 'true'},
'text_general': {'positionIncrementGap': '100', 'class': 'solr.TextField'},
'text_general_rev': {'positionIncrementGap': '100', 'class': 'solr.TextField'},
'text_ws': {'positionIncrementGap': '100', 'class': 'solr.TextField'},
'tfloat': {'positionIncrementGap': '0', 'class': 'solr.TrieFloatField', 'precisionStep': '8', 'docValues': 'true'},
'tint': {'positionIncrementGap': '0', 'class': 'solr.TrieIntField', 'precisionStep': '8', 'docValues': 'true'},
'tlong': {'positionIncrementGap': '0', 'class': 'solr.TrieLongField', 'precisionStep': '8', 'docValues': 'true'}}
def to_ft_elememt(data, E):
    """Build a fieldType element from a dict of attributes.

    :param data: dict of attribute name -> value for the element
    :param E: element factory callable (e.g. ``objectify.E.fieldType``)
    :return: the element produced by calling ``E(**attrs)``

    Attributes not listed in FT_ATTRS are reported on stdout and
    dropped instead of being passed to the factory.
    """
    attrs = {}
    for k, v in data.items():
        if k not in FT_ATTRS:
            print("Unknown attribute {0}".format(k))
            continue
        # Solr field type classes are namespaced, e.g. 'solr.TextField';
        # prefix bare class names with 'solr.'.
        # Bug fix: the original tested and prefixed the *key* 'class'
        # instead of its value ('.' not in 'class' is always true), then
        # looked up data['solr.class'], raising KeyError for any row
        # that had a 'class' attribute.
        if k == 'class' and '.' not in v:
            v = "solr." + v
        attrs[k] = v
    return E(**attrs)
def get_field_types():
    """Create a <fieldType> element for every entry in field_types."""
    elements = []
    for type_name, type_attrs in field_types.items():
        element = objectify.Element("fieldType", **type_attrs)
        element.attrib['name'] = type_name
        elements.append(element)
    return elements
def base_fields(E):
    """Gets the base fields for the schema.

    :param E: objectify element factory
    :return: tuple of <field> elements every schema must contain
    """
    # The unique-key 'id' field is the only mandatory base field so far.
    id_field = E.field(
        name='id',
        type='string',
        indexed='true',
        stored='true',
        required='true',
        multiValued='false',
    )
    return (id_field,)
def base_schema():
    """Creates a base Solr schema etree element."""
    maker = objectify.E
    # Start the <schema> with all known fieldType definitions inline.
    type_elements = get_field_types()
    schema = maker.schema(
        *type_elements,
        name="trln-auto",
        version="1.0-snapshot"
    )
    # Then append the mandatory base <field> elements.
    for field_element in base_fields(maker):
        schema.append(field_element)
    return schema
def load(filename):
    """Loads the spreadsheet (a CSV file) as a list of dict objects."""
    with open(filename) as spreadsheet:
        return list(DictReader(spreadsheet))
def text_field(data, repeatable=False):
    """Generates a solr text field definition from a row in the spreadsheet.

    :param data: dict with 'name', 'labels' and 'tags' keys
    :param repeatable: when True, mark the field multiValued
    :return: (label comment, MARC comment, <field> element) triple
    """
    element = objectify.Element(
        'field',
        type='text',
        name=normalize_name(data['name'])
    )
    if repeatable:
        element.attrib['multiValued'] = "true"
    # XML comments documenting where the field came from; callers append
    # them to the schema just before the field itself.
    label_comment = etree.Comment("Label: " + data['labels'])
    marc_comment = etree.Comment('MARC: ' + data['tags'])
    return label_comment, marc_comment, element
def normalize_name(endeca_name):
    """Replaces spaces in field names with underscores."""
    return '_'.join(endeca_name.split(' '))
def build_schema(rows):
    """Builds a schema object based on rows found in the spreadsheet.

    :param rows: list of dicts as produced by load()
    :return: the populated <schema> objectify element
    """
    schema = base_schema()
    # NOTE(review): groupby only merges *adjacent* rows sharing an
    # Endeca_Property value, so the export is presumably pre-grouped by
    # property — confirm against the spreadsheet.
    for property_name, group in itertools.groupby(rows, lambda row: row[EP]):
        group = list(group)
        marc_tags = ", ".join(row['MARC Tag'] for row in group)
        opac_labels = ",".join(set(row['OPAC Label'] for row in group))
        parts = text_field(
            dict(name=property_name, labels=opac_labels, tags=marc_tags))
        schema.extend(parts)
    objectify.deannotate(schema, cleanup_namespaces=True)
    return schema
if __name__ == '__main__':
    # Conditional expression instead of the legacy `cond and a or b`
    # idiom, which would also (silently) fall back to the default when
    # sys.argv[1] was an empty string.
    csv_path = sys.argv[1] if len(sys.argv) > 1 else "TRLN-Endeca-Mappings.csv"
    schema = build_schema(load(csv_path))
    result = etree.tostring(schema, encoding="unicode", pretty_print=True)
    print(result)
@adjam
Copy link
Author

adjam commented Sep 14, 2016

This is way too simplistic and I'm not sure what it produces will work. But it's a demo of a technique we might use to maintain a schema that stays in sync with our documentation.

Recipe:

  • Start with TRLN-Endeca-Mappings.xls
  • Export the Properties (revised 03-31-15) sheet (minus the blank lines at the top) to a CSV file
  • Run this script passing in the filename (or not, if you exported to TRLN-Endeca-Mappings.csv)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment