Last active
September 14, 2016 19:00
-
-
Save adjam/0f6a5e894ba4682407886c1e7f8e9b88 to your computer and use it in GitHub Desktop.
Solr Schema "autogeneration" from Endeca Mappings spreadsheet
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import sys | |
from lxml import objectify, etree | |
import itertools | |
from csv import DictReader | |
# mnemonic for a widely used dictionary key
EP = 'Endeca_Property'
# valid attributes for a 'fieldType' element
FT_ATTRS = set(['name', 'class', 'multiValued', 'sortMissingLast',
                'docValues', 'precisionStep', 'positionIncrementGap'])
# Field types found in Solr 6 default 'managed schema' schema
# included as a starting point.
# Maps fieldType name -> attribute dict for its <fieldType> element.
# NOTE: several entries carry attributes (stored, omitNorms, distanceUnits,
# geo, ...) that are not in FT_ATTRS above; they pass through get_field_types()
# untouched because that function does not filter by FT_ATTRS.
field_types = {'_bbox_coord': {'stored': 'false', 'useDocValuesAsStored': 'false', 'class': 'solr.TrieDoubleField', 'precisionStep': '8', 'docValues': 'true'},
               'alphaOnlySort': {'sortMissingLast': 'true', 'class': 'solr.TextField', 'omitNorms': 'true'},
               'bbox': {'class': 'solr.BBoxField', 'numberType': '_bbox_coord', 'distanceUnits': 'kilometers', 'geo': 'true'},
               'binary': {'class': 'solr.BinaryField'},
               'boolean': {'sortMissingLast': 'true', 'class': 'solr.BoolField'},
               'currency': {'currencyConfig': 'currency.xml', 'defaultCurrency': 'USD', 'class': 'solr.CurrencyField', 'precisionStep': '8'},
               'date': {'positionIncrementGap': '0', 'class': 'solr.TrieDateField', 'precisionStep': '0', 'docValues': 'true'},
               'double': {'positionIncrementGap': '0', 'class': 'solr.TrieDoubleField', 'precisionStep': '0', 'docValues': 'true'},
               'float': {'positionIncrementGap': '0', 'class': 'solr.TrieFloatField', 'precisionStep': '0', 'docValues': 'true'},
               'ignored': {'class': 'solr.StrField', 'stored': 'false', 'docValues': 'false', 'indexed': 'false', 'multiValued': 'true'},
               'int': {'positionIncrementGap': '0', 'class': 'solr.TrieIntField', 'precisionStep': '0', 'docValues': 'true'},
               'location': {'class': 'solr.LatLonType', 'subFieldSuffix': '_coordinate'},
               'location_rpt': {'distErrPct': '0.025', 'class': 'solr.SpatialRecursivePrefixTreeFieldType', 'maxDistErr': '0.001', 'distanceUnits': 'kilometers', 'geo': 'true'},
               'long': {'positionIncrementGap': '0', 'class': 'solr.TrieLongField', 'precisionStep': '0', 'docValues': 'true'},
               'lowercase': {'positionIncrementGap': '100', 'class': 'solr.TextField'},
               'point': {'class': 'solr.PointType', 'dimension': '2', 'subFieldSuffix': '_d'},
               'random': {'class': 'solr.RandomSortField', 'indexed': 'true'},
               'string': {'sortMissingLast': 'true', 'class': 'solr.StrField', 'docValues': 'true'},
               'tdate': {'positionIncrementGap': '0', 'class': 'solr.TrieDateField', 'precisionStep': '6', 'docValues': 'true'},
               'tdouble': {'positionIncrementGap': '0', 'class': 'solr.TrieDoubleField', 'precisionStep': '8', 'docValues': 'true'},
               'text_en': {'positionIncrementGap': '100', 'class': 'solr.TextField'},
               'text_en_splitting': {'positionIncrementGap': '100', 'class': 'solr.TextField', 'autoGeneratePhraseQueries': 'true'},
               'text_en_splitting_tight': {'positionIncrementGap': '100', 'class': 'solr.TextField', 'autoGeneratePhraseQueries': 'true'},
               'text_general': {'positionIncrementGap': '100', 'class': 'solr.TextField'},
               'text_general_rev': {'positionIncrementGap': '100', 'class': 'solr.TextField'},
               'text_ws': {'positionIncrementGap': '100', 'class': 'solr.TextField'},
               'tfloat': {'positionIncrementGap': '0', 'class': 'solr.TrieFloatField', 'precisionStep': '8', 'docValues': 'true'},
               'tint': {'positionIncrementGap': '0', 'class': 'solr.TrieIntField', 'precisionStep': '8', 'docValues': 'true'},
               'tlong': {'positionIncrementGap': '0', 'class': 'solr.TrieLongField', 'precisionStep': '8', 'docValues': 'true'}}
def to_ft_elememt(data, E):
    """Build a fieldType element from an attribute dict, validating keys.

    (Function name keeps its original spelling for compatibility with any
    external callers.)

    :param data: mapping of fieldType attribute names to string values
    :param E: an element factory (e.g. ``objectify.E.fieldType``)
    :return: the element produced by calling ``E`` with the valid attributes
    """
    attrs = {}
    for k, v in data.items():
        if k in FT_ATTRS:
            # Qualify a bare class name with the 'solr.' package prefix.
            # BUGFIX: the original tested/prefixed the *key* ('.' not in k,
            # k = "solr." + k), which made attrs['solr.class'] = data['solr.class']
            # raise KeyError for every 'class' entry; the value is what needs
            # the prefix.
            if k == 'class' and '.' not in v:
                v = "solr." + v
            attrs[k] = v
        else:
            print("Unknown attribute {0}".format(k))
    return E(**attrs)
def get_field_types():
    """Materialize the module-level ``field_types`` table as a list of
    <fieldType> elements, one per named type."""
    elements = []
    for type_name, attributes in field_types.items():
        element = objectify.Element("fieldType", **attributes)
        # 'name' is set separately because it is the dict key, not an attribute
        element.attrib['name'] = type_name
        elements.append(element)
    return elements
def base_fields(E):
    """Gets the base fields for the schema.

    :param E: an element factory exposing a ``field`` builder
    :return: a tuple containing the required 'id' field element
    """
    id_field = E.field(
        name='id',
        type='string',
        indexed='true',
        stored='true',
        required='true',
        multiValued='false'
    )
    return (id_field,)
def base_schema():
    """Creates a base Solr schema etree element with the default field
    types and the mandatory base fields attached."""
    builder = objectify.E
    schema = builder.schema(
        *get_field_types(),
        name="trln-auto",
        version="1.0-snapshot"
    )
    for field_element in base_fields(builder):
        schema.append(field_element)
    return schema
def load(filename):
    """Loads the spreadsheet (CSV export) as a list of dict objects,
    one per data row, keyed by the header row."""
    with open(filename) as csvfile:
        return list(DictReader(csvfile))
def text_field(data, repeatable=False):
    """Generates a solr text field definition from a row in the spreadsheet.

    :param data: dict with 'name', 'labels', and 'tags' keys
    :param repeatable: when True, mark the field multiValued
    :return: (label comment, MARC comment, field element) triple
    """
    field = objectify.Element(
        'field',
        type='text',
        name=normalize_name(data['name'])
    )
    if repeatable:
        field.attrib['multiValued'] = "true"
    label_comment = etree.Comment("Label: " + data['labels'])
    marc_comment = etree.Comment('MARC: ' + data['tags'])
    return label_comment, marc_comment, field
def normalize_name(endeca_name):
    """Replaces every space in a field name with an underscore."""
    return '_'.join(endeca_name.split(' '))
def build_schema(rows):
    """Builds a schema object based on rows found in the spreadsheet.

    Each distinct Endeca property becomes one text field, annotated with
    comments carrying its OPAC labels and MARC tags.

    :param rows: list of dicts as produced by :func:`load`
    :return: the populated schema element
    """
    schema = base_schema()
    # BUGFIX: itertools.groupby only merges *adjacent* rows with equal keys,
    # so rows must be sorted by the grouping key first; the original relied
    # on the spreadsheet already being ordered and would emit duplicate
    # field definitions for any out-of-order property.
    ordered = sorted(rows, key=lambda row: row[EP])
    for endeca_property, group in itertools.groupby(ordered, lambda row: row[EP]):
        definitions = list(group)
        tags = ", ".join(d['MARC Tag'] for d in definitions)
        # sort the de-duplicated labels so output is deterministic run-to-run
        labels = ",".join(sorted({d['OPAC Label'] for d in definitions}))
        field = text_field(dict(name=endeca_property, labels=labels, tags=tags))
        schema.extend(field)
    objectify.deannotate(schema, cleanup_namespaces=True)
    return schema
if __name__ == '__main__':
    # Use the CSV named on the command line, falling back to the default.
    # A conditional expression replaces the original `cond and a or b`
    # idiom, which silently picks the fallback whenever sys.argv[1] is
    # falsy (e.g. an empty string).
    csv_path = sys.argv[1] if len(sys.argv) > 1 else "TRLN-Endeca-Mappings.csv"
    schema = build_schema(load(csv_path))
    print(etree.tostring(schema, encoding="unicode", pretty_print=True))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is far too simplistic and I'm not sure what it produces will work. But it's a demo of a technique we might use to maintain a schema that's in sync with our documentation.
Recipe: open TRLN-Endeca-Mappings.xls and export the "Properties (revised 03-31-15)" sheet (minus the blank lines at the top) to a CSV file (TRLN-Endeca-Mappings.csv).