Last active
September 14, 2016 19:00
-
-
Save adjam/0f6a5e894ba4682407886c1e7f8e9b88 to your computer and use it in GitHub Desktop.
Solr Schema "autogeneration" from Endeca Mappings spreadsheet
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import sys | |
from lxml import objectify, etree | |
import itertools | |
from csv import DictReader | |
# mnemonic for a widely used dictionary key
EP = 'Endeca_Property'
# valid attributes for a 'fieldType' element
FT_ATTRS = set(['name', 'class', 'multiValued', 'sortMissingLast',
                'docValues', 'precisionStep', 'positionIncrementGap'])
# Field types found in Solr 6 default 'managed schema' schema
# included as a starting point.
# Maps fieldType name -> attribute dict for its <fieldType> element.
# NOTE: several entries carry attributes (stored, omitNorms, distanceUnits,
# geo, ...) that are not in FT_ATTRS above; they pass through get_field_types()
# untouched because that function does not filter by FT_ATTRS.
field_types = {'_bbox_coord': {'stored': 'false', 'useDocValuesAsStored': 'false', 'class': 'solr.TrieDoubleField', 'precisionStep': '8', 'docValues': 'true'},
               'alphaOnlySort': {'sortMissingLast': 'true', 'class': 'solr.TextField', 'omitNorms': 'true'},
               'bbox': {'class': 'solr.BBoxField', 'numberType': '_bbox_coord', 'distanceUnits': 'kilometers', 'geo': 'true'},
               'binary': {'class': 'solr.BinaryField'},
               'boolean': {'sortMissingLast': 'true', 'class': 'solr.BoolField'},
               'currency': {'currencyConfig': 'currency.xml', 'defaultCurrency': 'USD', 'class': 'solr.CurrencyField', 'precisionStep': '8'},
               'date': {'positionIncrementGap': '0', 'class': 'solr.TrieDateField', 'precisionStep': '0', 'docValues': 'true'},
               'double': {'positionIncrementGap': '0', 'class': 'solr.TrieDoubleField', 'precisionStep': '0', 'docValues': 'true'},
               'float': {'positionIncrementGap': '0', 'class': 'solr.TrieFloatField', 'precisionStep': '0', 'docValues': 'true'},
               'ignored': {'class': 'solr.StrField', 'stored': 'false', 'docValues': 'false', 'indexed': 'false', 'multiValued': 'true'},
               'int': {'positionIncrementGap': '0', 'class': 'solr.TrieIntField', 'precisionStep': '0', 'docValues': 'true'},
               'location': {'class': 'solr.LatLonType', 'subFieldSuffix': '_coordinate'},
               'location_rpt': {'distErrPct': '0.025', 'class': 'solr.SpatialRecursivePrefixTreeFieldType', 'maxDistErr': '0.001', 'distanceUnits': 'kilometers', 'geo': 'true'},
               'long': {'positionIncrementGap': '0', 'class': 'solr.TrieLongField', 'precisionStep': '0', 'docValues': 'true'},
               'lowercase': {'positionIncrementGap': '100', 'class': 'solr.TextField'},
               'point': {'class': 'solr.PointType', 'dimension': '2', 'subFieldSuffix': '_d'},
               'random': {'class': 'solr.RandomSortField', 'indexed': 'true'},
               'string': {'sortMissingLast': 'true', 'class': 'solr.StrField', 'docValues': 'true'},
               'tdate': {'positionIncrementGap': '0', 'class': 'solr.TrieDateField', 'precisionStep': '6', 'docValues': 'true'},
               'tdouble': {'positionIncrementGap': '0', 'class': 'solr.TrieDoubleField', 'precisionStep': '8', 'docValues': 'true'},
               'text_en': {'positionIncrementGap': '100', 'class': 'solr.TextField'},
               'text_en_splitting': {'positionIncrementGap': '100', 'class': 'solr.TextField', 'autoGeneratePhraseQueries': 'true'},
               'text_en_splitting_tight': {'positionIncrementGap': '100', 'class': 'solr.TextField', 'autoGeneratePhraseQueries': 'true'},
               'text_general': {'positionIncrementGap': '100', 'class': 'solr.TextField'},
               'text_general_rev': {'positionIncrementGap': '100', 'class': 'solr.TextField'},
               'text_ws': {'positionIncrementGap': '100', 'class': 'solr.TextField'},
               'tfloat': {'positionIncrementGap': '0', 'class': 'solr.TrieFloatField', 'precisionStep': '8', 'docValues': 'true'},
               'tint': {'positionIncrementGap': '0', 'class': 'solr.TrieIntField', 'precisionStep': '8', 'docValues': 'true'},
               'tlong': {'positionIncrementGap': '0', 'class': 'solr.TrieLongField', 'precisionStep': '8', 'docValues': 'true'}}
def to_ft_elememt(data, E):
    """Build a fieldType element from an attribute dict, validating keys.

    (Function name keeps its original spelling for compatibility with any
    external callers.)

    :param data: mapping of fieldType attribute names to string values
    :param E: an element factory (e.g. ``objectify.E.fieldType``)
    :return: the element produced by calling ``E`` with the valid attributes
    """
    attrs = {}
    for k, v in data.items():
        if k in FT_ATTRS:
            # Qualify a bare class name with the 'solr.' package prefix.
            # BUGFIX: the original tested/prefixed the *key* ('.' not in k,
            # k = "solr." + k), which made attrs['solr.class'] = data['solr.class']
            # raise KeyError for every 'class' entry; the value is what needs
            # the prefix.
            if k == 'class' and '.' not in v:
                v = "solr." + v
            attrs[k] = v
        else:
            print("Unknown attribute {0}".format(k))
    return E(**attrs)
def get_field_types():
    """Materialize the module-level ``field_types`` table as a list of
    <fieldType> elements, one per named type."""
    elements = []
    for type_name, attributes in field_types.items():
        element = objectify.Element("fieldType", **attributes)
        # 'name' is set separately because it is the dict key, not an attribute
        element.attrib['name'] = type_name
        elements.append(element)
    return elements
def base_fields(E):
    """Gets the base fields for the schema.

    :param E: an element factory exposing a ``field`` builder
    :return: a tuple containing the required 'id' field element
    """
    id_field = E.field(
        name='id',
        type='string',
        indexed='true',
        stored='true',
        required='true',
        multiValued='false'
    )
    return (id_field,)
def base_schema():
    """Creates a base Solr schema etree element with the default field
    types and the mandatory base fields attached."""
    builder = objectify.E
    schema = builder.schema(
        *get_field_types(),
        name="trln-auto",
        version="1.0-snapshot"
    )
    for field_element in base_fields(builder):
        schema.append(field_element)
    return schema
def load(filename):
    """Loads the spreadsheet (CSV export) as a list of dict objects,
    one per data row, keyed by the header row."""
    with open(filename) as csvfile:
        return list(DictReader(csvfile))
def text_field(data, repeatable=False):
    """Generates a solr text field definition from a row in the spreadsheet.

    :param data: dict with 'name', 'labels', and 'tags' keys
    :param repeatable: when True, mark the field multiValued
    :return: (label comment, MARC comment, field element) triple
    """
    field = objectify.Element(
        'field',
        type='text',
        name=normalize_name(data['name'])
    )
    if repeatable:
        field.attrib['multiValued'] = "true"
    label_comment = etree.Comment("Label: " + data['labels'])
    marc_comment = etree.Comment('MARC: ' + data['tags'])
    return label_comment, marc_comment, field
def normalize_name(endeca_name):
    """Replaces every space in a field name with an underscore."""
    return '_'.join(endeca_name.split(' '))
def build_schema(rows):
    """Builds a schema object based on rows found in the spreadsheet.

    Each distinct Endeca property becomes one text field, annotated with
    comments carrying its OPAC labels and MARC tags.

    :param rows: list of dicts as produced by :func:`load`
    :return: the populated schema element
    """
    schema = base_schema()
    # BUGFIX: itertools.groupby only merges *adjacent* rows with equal keys,
    # so rows must be sorted by the grouping key first; the original relied
    # on the spreadsheet already being ordered and would emit duplicate
    # field definitions for any out-of-order property.
    ordered = sorted(rows, key=lambda row: row[EP])
    for endeca_property, group in itertools.groupby(ordered, lambda row: row[EP]):
        definitions = list(group)
        tags = ", ".join(d['MARC Tag'] for d in definitions)
        # sort the de-duplicated labels so output is deterministic run-to-run
        labels = ",".join(sorted({d['OPAC Label'] for d in definitions}))
        field = text_field(dict(name=endeca_property, labels=labels, tags=tags))
        schema.extend(field)
    objectify.deannotate(schema, cleanup_namespaces=True)
    return schema
if __name__ == '__main__':
    # Use the CSV named on the command line, falling back to the default.
    # A conditional expression replaces the original `cond and a or b`
    # idiom, which silently picks the fallback whenever sys.argv[1] is
    # falsy (e.g. an empty string).
    csv_path = sys.argv[1] if len(sys.argv) > 1 else "TRLN-Endeca-Mappings.csv"
    schema = build_schema(load(csv_path))
    print(etree.tostring(schema, encoding="unicode", pretty_print=True))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is far too simplistic and I'm not sure what it produces will work. But it's a demo of a technique we might use to maintain a schema that's in sync with our documentation.
Recipe: open TRLN-Endeca-Mappings.xls and export the "Properties (revised 03-31-15)" sheet (minus the blank lines at the top) to a CSV file (TRLN-Endeca-Mappings.csv).