aplz · October 16, 2019 09:17 · aplz · Apr 11, 2019
diff --git a/es_german_analyzer.py b/es_german_analyzer.py
 # Analyze a sample sentence with the "german" analyzer and print each
 # token dict found under the response's 'tokens' key.
 # NOTE(review): this text is 100 characters long, but the token listing
 # recorded after this snippet has offsets up to 218 -- presumably it was
 # produced from a longer text; confirm.
 es = Elasticsearch()
 sample_text = (
     "Die junge Informatikerin Katie Bouman machte die "
     "historische Aufnahme eines schwarzen Lochs "
     "möglich."
 )
 analyze_request = {"analyzer": "german", "text": sample_text}
 response = es.indices.analyze(body=analyze_request)
 tokens = response['tokens']
 for token in tokens:
     print(token)
  

 {'token': 'jung', 'start_offset': 4, 'end_offset': 9, 'type': '<ALPHANUM>', 'position': 1}
 {'token': 'informatikerin', 'start_offset': 10, 'end_offset': 24, 'type': '<ALPHANUM>', 'position': 2}
 {'token': 'kati', 'start_offset': 25, 'end_offset': 30, 'type': '<ALPHANUM>', 'position': 3}
 {'token': 'bouman', 'start_offset': 31, 'end_offset': 37, 'type': '<ALPHANUM>', 'position': 4}
 {'token': 'macht', 'start_offset': 38, 'end_offset': 44, 'type': '<ALPHANUM>', 'position': 5}
 {'token': 'historisch', 'start_offset': 49, 'end_offset': 60, 'type': '<ALPHANUM>', 'position': 7}
 {'token': 'aufnahm', 'start_offset': 61, 'end_offset': 69, 'type': '<ALPHANUM>', 'position': 8}
 {'token': 'schwarz', 'start_offset': 76, 'end_offset': 85, 'type': '<ALPHANUM>', 'position': 10}
 {'token': 'loch', 'start_offset': 86, 'end_offset': 91, 'type': '<ALPHANUM>', 'position': 11}
 {'token': 'moglich', 'start_offset': 92, 'end_offset': 99, 'type': '<ALPHANUM>', 'position': 12}
 {'token': 'besonders', 'start_offset': 101, 'end_offset': 110, 'type': '<ALPHANUM>', 'position': 13}
 {'token': 'schw', 'start_offset': 111, 'end_offset': 117, 'type': '<ALPHANUM>', 'position': 14}
 {'token': 'fiel', 'start_offset': 118, 'end_offset': 122, 'type': '<ALPHANUM>', 'position': 15}
 {'token': 'geheimnis', 'start_offset': 135, 'end_offset': 144, 'type': '<ALPHANUM>', 'position': 19}
 {'token': 'lang', 'start_offset': 148, 'end_offset': 153, 'type': '<ALPHANUM>', 'position': 21}
 {'token': 'bewahr', 'start_offset': 157, 'end_offset': 165, 'type': '<ALPHANUM>', 'position': 23}
 {'token': 'famili', 'start_offset': 197, 'end_offset': 204, 'type': '<ALPHANUM>', 'position': 29}
 {'token': 'davon', 'start_offset': 205, 'end_offset': 210, 'type': '<ALPHANUM>', 'position': 30}
 {'token': 'erzahlt', 'start_offset': 211, 'end_offset': 218, 'type': '<ALPHANUM>', 'position': 31}

 # Same sentence without stemming: a custom chain made of the standard
 # tokenizer, the lowercase filter and a stop filter configured with the
 # "_german_" stopword list.
 # NOTE(review): this text is 100 characters long, but the token listing
 # recorded after this snippet has offsets up to 218 -- presumably it was
 # produced from a longer text; confirm.
 es = Elasticsearch()
 german_stop_filter = {"type": "stop", "stopwords": "_german_"}
 sample_text = (
     "Die junge Informatikerin Katie Bouman machte die "
     "historische Aufnahme eines schwarzen Lochs "
     "möglich."
 )
 analyze_request = {
     "tokenizer": "standard",
     "filter": ["lowercase", german_stop_filter],
     "text": sample_text,
 }
 tokens = es.indices.analyze(body=analyze_request)['tokens']
 for token in tokens:
     print(token)
  
 {'token': 'junge', 'start_offset': 4, 'end_offset': 9, 'type': '<ALPHANUM>', 'position': 1}
 {'token': 'informatikerin', 'start_offset': 10, 'end_offset': 24, 'type': '<ALPHANUM>', 'position': 2}
 {'token': 'katie', 'start_offset': 25, 'end_offset': 30, 'type': '<ALPHANUM>', 'position': 3}
 {'token': 'bouman', 'start_offset': 31, 'end_offset': 37, 'type': '<ALPHANUM>', 'position': 4}
 {'token': 'machte', 'start_offset': 38, 'end_offset': 44, 'type': '<ALPHANUM>', 'position': 5}
 {'token': 'historische', 'start_offset': 49, 'end_offset': 60, 'type': '<ALPHANUM>', 'position': 7}
 {'token': 'aufnahme', 'start_offset': 61, 'end_offset': 69, 'type': '<ALPHANUM>', 'position': 8}
 {'token': 'schwarzen', 'start_offset': 76, 'end_offset': 85, 'type': '<ALPHANUM>', 'position': 10}
 {'token': 'lochs', 'start_offset': 86, 'end_offset': 91, 'type': '<ALPHANUM>', 'position': 11}
 {'token': 'möglich', 'start_offset': 92, 'end_offset': 99, 'type': '<ALPHANUM>', 'position': 12}
 {'token': 'besonders', 'start_offset': 101, 'end_offset': 110, 'type': '<ALPHANUM>', 'position': 13}
 {'token': 'schwer', 'start_offset': 111, 'end_offset': 117, 'type': '<ALPHANUM>', 'position': 14}
 {'token': 'fiel', 'start_offset': 118, 'end_offset': 122, 'type': '<ALPHANUM>', 'position': 15}
 {'token': 'geheimnis', 'start_offset': 135, 'end_offset': 144, 'type': '<ALPHANUM>', 'position': 19}
 {'token': 'lange', 'start_offset': 148, 'end_offset': 153, 'type': '<ALPHANUM>', 'position': 21}
 {'token': 'bewahren', 'start_offset': 157, 'end_offset': 165, 'type': '<ALPHANUM>', 'position': 23}
 {'token': 'familie', 'start_offset': 197, 'end_offset': 204, 'type': '<ALPHANUM>', 'position': 29}
 {'token': 'davon', 'start_offset': 205, 'end_offset': 210, 'type': '<ALPHANUM>', 'position': 30}
 {'token': 'erzählt', 'start_offset': 211, 'end_offset': 218, 'type': '<ALPHANUM>', 'position': 31}
	# Analyze a sample sentence with the "german" analyzer and print each
	# token dict found under the response's 'tokens' key.
	# NOTE(review): this text is 100 characters long, but the token listing
	# recorded after this snippet has offsets up to 218 -- presumably it was
	# produced from a longer text; confirm.
	es = Elasticsearch()
	tokens = es.indices.analyze(
		body={"analyzer": "german",
		      "text": "Die junge Informatikerin Katie Bouman machte die "
		              "historische Aufnahme eines schwarzen Lochs "
		              "möglich."})['tokens']
	for token in tokens:
		# The loop body needs one extra indent level; with it flattened to
		# the level of the `for` line the snippet raises IndentationError.
		print(token)


	{'token': 'jung', 'start_offset': 4, 'end_offset': 9, 'type': '<ALPHANUM>', 'position': 1}
	{'token': 'informatikerin', 'start_offset': 10, 'end_offset': 24, 'type': '<ALPHANUM>', 'position': 2}
	{'token': 'kati', 'start_offset': 25, 'end_offset': 30, 'type': '<ALPHANUM>', 'position': 3}
	{'token': 'bouman', 'start_offset': 31, 'end_offset': 37, 'type': '<ALPHANUM>', 'position': 4}
	{'token': 'macht', 'start_offset': 38, 'end_offset': 44, 'type': '<ALPHANUM>', 'position': 5}
	{'token': 'historisch', 'start_offset': 49, 'end_offset': 60, 'type': '<ALPHANUM>', 'position': 7}
	{'token': 'aufnahm', 'start_offset': 61, 'end_offset': 69, 'type': '<ALPHANUM>', 'position': 8}
	{'token': 'schwarz', 'start_offset': 76, 'end_offset': 85, 'type': '<ALPHANUM>', 'position': 10}
	{'token': 'loch', 'start_offset': 86, 'end_offset': 91, 'type': '<ALPHANUM>', 'position': 11}
	{'token': 'moglich', 'start_offset': 92, 'end_offset': 99, 'type': '<ALPHANUM>', 'position': 12}
	{'token': 'besonders', 'start_offset': 101, 'end_offset': 110, 'type': '<ALPHANUM>', 'position': 13}
	{'token': 'schw', 'start_offset': 111, 'end_offset': 117, 'type': '<ALPHANUM>', 'position': 14}
	{'token': 'fiel', 'start_offset': 118, 'end_offset': 122, 'type': '<ALPHANUM>', 'position': 15}
	{'token': 'geheimnis', 'start_offset': 135, 'end_offset': 144, 'type': '<ALPHANUM>', 'position': 19}
	{'token': 'lang', 'start_offset': 148, 'end_offset': 153, 'type': '<ALPHANUM>', 'position': 21}
	{'token': 'bewahr', 'start_offset': 157, 'end_offset': 165, 'type': '<ALPHANUM>', 'position': 23}
	{'token': 'famili', 'start_offset': 197, 'end_offset': 204, 'type': '<ALPHANUM>', 'position': 29}
	{'token': 'davon', 'start_offset': 205, 'end_offset': 210, 'type': '<ALPHANUM>', 'position': 30}
	{'token': 'erzahlt', 'start_offset': 211, 'end_offset': 218, 'type': '<ALPHANUM>', 'position': 31}

	# Omit the stemmer but keep German stopword removal: standard tokenizer,
	# lowercase filter, then a stop filter using the "_german_" stopword list.
	# NOTE(review): this text is 100 characters long, but the token listing
	# recorded after this snippet has offsets up to 218 -- presumably it was
	# produced from a longer text; confirm.
	es = Elasticsearch()
	tokens = es.indices.analyze(
		body={"tokenizer": "standard",
		      "filter": ["lowercase", {"type": "stop", "stopwords": "_german_"}],
		      "text": "Die junge Informatikerin Katie Bouman machte die "
		              "historische Aufnahme eines schwarzen Lochs "
		              "möglich."})['tokens']
	for token in tokens:
		# The loop body needs one extra indent level; with it flattened to
		# the level of the `for` line the snippet raises IndentationError.
		print(token)

	{'token': 'junge', 'start_offset': 4, 'end_offset': 9, 'type': '<ALPHANUM>', 'position': 1}
	{'token': 'informatikerin', 'start_offset': 10, 'end_offset': 24, 'type': '<ALPHANUM>', 'position': 2}
	{'token': 'katie', 'start_offset': 25, 'end_offset': 30, 'type': '<ALPHANUM>', 'position': 3}
	{'token': 'bouman', 'start_offset': 31, 'end_offset': 37, 'type': '<ALPHANUM>', 'position': 4}
	{'token': 'machte', 'start_offset': 38, 'end_offset': 44, 'type': '<ALPHANUM>', 'position': 5}
	{'token': 'historische', 'start_offset': 49, 'end_offset': 60, 'type': '<ALPHANUM>', 'position': 7}
	{'token': 'aufnahme', 'start_offset': 61, 'end_offset': 69, 'type': '<ALPHANUM>', 'position': 8}
	{'token': 'schwarzen', 'start_offset': 76, 'end_offset': 85, 'type': '<ALPHANUM>', 'position': 10}
	{'token': 'lochs', 'start_offset': 86, 'end_offset': 91, 'type': '<ALPHANUM>', 'position': 11}
	{'token': 'möglich', 'start_offset': 92, 'end_offset': 99, 'type': '<ALPHANUM>', 'position': 12}
	{'token': 'besonders', 'start_offset': 101, 'end_offset': 110, 'type': '<ALPHANUM>', 'position': 13}
	{'token': 'schwer', 'start_offset': 111, 'end_offset': 117, 'type': '<ALPHANUM>', 'position': 14}
	{'token': 'fiel', 'start_offset': 118, 'end_offset': 122, 'type': '<ALPHANUM>', 'position': 15}
	{'token': 'geheimnis', 'start_offset': 135, 'end_offset': 144, 'type': '<ALPHANUM>', 'position': 19}
	{'token': 'lange', 'start_offset': 148, 'end_offset': 153, 'type': '<ALPHANUM>', 'position': 21}
	{'token': 'bewahren', 'start_offset': 157, 'end_offset': 165, 'type': '<ALPHANUM>', 'position': 23}
	{'token': 'familie', 'start_offset': 197, 'end_offset': 204, 'type': '<ALPHANUM>', 'position': 29}
	{'token': 'davon', 'start_offset': 205, 'end_offset': 210, 'type': '<ALPHANUM>', 'position': 30}
	{'token': 'erzählt', 'start_offset': 211, 'end_offset': 218, 'type': '<ALPHANUM>', 'position': 31}