Skip to content

Instantly share code, notes, and snippets.

@smerchek
Created May 15, 2013 01:07
Show Gist options
  • Save smerchek/5580962 to your computer and use it in GitHub Desktop.
Save smerchek/5580962 to your computer and use it in GitHub Desktop.

Using the JSON output of a Solr field analysis to visualize the Lucene indexing/analysis pipeline.

{
"responseHeader": {
"status": 0,
"QTime": 4
},
"analysis": {
"field_types": {
"text_en_splitting": {
"index": [
"org.apache.lucene.analysis.core.WhitespaceTokenizer",
[
{
"text": "WiFi",
"raw_bytes": "[57 69 46 69]",
"start": 0,
"end": 4,
"position": 1,
"positionHistory": [
1
],
"type": "word"
}
],
"org.apache.lucene.analysis.core.StopFilter",
[
{
"text": "WiFi",
"raw_bytes": "[57 69 46 69]",
"position": 1,
"positionHistory": [
1,
1
],
"start": 0,
"end": 4,
"type": "word"
}
],
"org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter",
[
{
"text": "Wi",
"raw_bytes": "[57 69]",
"start": 0,
"end": 2,
"type": "word",
"position": 1,
"positionHistory": [
1,
1,
1
]
},
{
"text": "Fi",
"raw_bytes": "[46 69]",
"start": 2,
"end": 4,
"type": "word",
"position": 2,
"positionHistory": [
1,
1,
2
]
},
{
"text": "WiFi",
"raw_bytes": "[57 69 46 69]",
"start": 0,
"end": 4,
"type": "word",
"position": 2,
"positionHistory": [
1,
1,
2
]
}
],
"org.apache.lucene.analysis.core.LowerCaseFilter",
[
{
"text": "wi",
"raw_bytes": "[77 69]",
"position": 1,
"positionHistory": [
1,
1,
1,
1
],
"start": 0,
"end": 2,
"type": "word"
},
{
"text": "fi",
"raw_bytes": "[66 69]",
"position": 2,
"positionHistory": [
1,
1,
2,
2
],
"start": 2,
"end": 4,
"type": "word"
},
{
"text": "wifi",
"raw_bytes": "[77 69 66 69]",
"position": 2,
"positionHistory": [
1,
1,
2,
2
],
"start": 0,
"end": 4,
"type": "word"
}
],
"org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter",
[
{
"text": "wi",
"raw_bytes": "[77 69]",
"org.apache.lucene.analysis.tokenattributes.KeywordAttribute#keyword": false,
"position": 1,
"positionHistory": [
1,
1,
1,
1,
1
],
"start": 0,
"end": 2,
"type": "word"
},
{
"text": "fi",
"raw_bytes": "[66 69]",
"org.apache.lucene.analysis.tokenattributes.KeywordAttribute#keyword": false,
"position": 2,
"positionHistory": [
1,
1,
2,
2,
2
],
"start": 2,
"end": 4,
"type": "word"
},
{
"text": "wifi",
"raw_bytes": "[77 69 66 69]",
"org.apache.lucene.analysis.tokenattributes.KeywordAttribute#keyword": false,
"position": 2,
"positionHistory": [
1,
1,
2,
2,
2
],
"start": 0,
"end": 4,
"type": "word"
}
],
"org.apache.lucene.analysis.en.PorterStemFilter",
[
{
"text": "wi",
"raw_bytes": "[77 69]",
"org.apache.lucene.analysis.tokenattributes.KeywordAttribute#keyword": false,
"position": 1,
"positionHistory": [
1,
1,
1,
1,
1,
1
],
"start": 0,
"end": 2,
"type": "word"
},
{
"text": "fi",
"raw_bytes": "[66 69]",
"org.apache.lucene.analysis.tokenattributes.KeywordAttribute#keyword": false,
"position": 2,
"positionHistory": [
1,
1,
2,
2,
2,
2
],
"start": 2,
"end": 4,
"type": "word"
},
{
"text": "wifi",
"raw_bytes": "[77 69 66 69]",
"org.apache.lucene.analysis.tokenattributes.KeywordAttribute#keyword": false,
"position": 2,
"positionHistory": [
1,
1,
2,
2,
2,
2
],
"start": 0,
"end": 4,
"type": "word"
}
]
]
}
},
"field_names": {}
}
}
<!DOCTYPE html>
<!--
  Visualizes the analysis chain from a Solr field-analysis response.
  Fetches "analysis.json" (relative to this page) and lists, in order, every
  entry of analysis.field_types.text_en_splitting.index inside div.analysis.
-->
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
<title></title>
<!-- Loaded over HTTPS so the browser does not block it as mixed content
     when this page is itself served over HTTPS. -->
<script src="https://d3js.org/d3.v3.min.js" charset="utf-8"></script>
<style type="text/css">
a.link{color: grey;}
a.link:hover{color: black;}
div.gallery{width:800px; height:500px; overflow:scroll}
</style>
</head>
<body>
<!-- Link template; not referenced by the script below. -->
<script id="template" type="text/template">
<a class="link" href="{url}">{title}</a>
</script>
<div class="analysis"></div>
<script type="text/javascript">
// Converts one entry of the "index" array into display text.
// In analysis.json the entries alternate between a string (the class name of
// an analysis stage) and an array of token objects emitted by that stage,
// each carrying a "text" field. Arrays are rendered as their token texts
// joined with " | "; anything else is rendered as its string form.
function describeEntry(entry) {
  if (Array.isArray(entry)) {
    return entry.map(function (token) {
      return token.text;
    }).join(" | ");
  }
  return String(entry);
}

// Two-argument callback form: d3 v3 passes (error, data), which lets a failed
// load be reported instead of throwing on a null response.
d3.json("analysis.json", function (error, json) {
  var container = d3.select(".analysis");

  if (error || !json) {
    container.text("Could not load analysis.json");
    return;
  }

  container.selectAll("div.entry")
      .data(json.analysis.field_types.text_en_splitting.index)
    .enter().append("div")
      .attr("class", "entry")
      // .text() instead of .html(): values from the JSON are shown literally
      // and are never parsed as markup.
      .text(describeEntry);
});
</script>
</body>
</html>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment