Skip to content

Instantly share code, notes, and snippets.

@jprante
Last active August 29, 2015 14:18
Show Gist options
  • Save jprante/22c28851bed9aff8d9da to your computer and use it in GitHub Desktop.
Save jprante/22c28851bed9aff8d9da to your computer and use it in GitHub Desktop.
Combining MySQL blob, PDF, Elasticsearch JDBC plugin, and attachment mapper type plugin
#!/bin/sh
# Demo: index a PDF held in a MySQL BLOB column into Elasticsearch, using the
# JDBC river plugin together with the mapper attachment plugin.
#
# Requirements:
#   - MySQL 5.1+ with LOAD_FILE, see
#     http://dev.mysql.com/doc/refman/5.1/en/string-functions.html#function_load-file
#   - ES 1.4.4+
#   - plugins required: jdbc, mapper attachment
#   - test.pdf is a PDF that can be parsed by Tika
#
# Step 1: (re)create the table and load the PDF into the blob column.
# "if exists" matters on the very first run: a plain "drop table" fails when
# the table is missing, and the mysql client stops at the first failing
# statement (unless --force is given), so the table would never be created.
# NOTE: LOAD_FILE() yields NULL instead of an error when the server cannot
# read the file (missing file, no FILE privilege, secure_file_priv, or a file
# larger than max_allowed_packet) - check the row if nothing gets indexed.
/usr/local/mysql/bin/mysql -u root test <<EOT
drop table if exists test;
create table test (
id integer,
content blob
);
insert into test values (0,LOAD_FILE('/Users/joerg/Desktop/test.pdf'));
EOT
# Step 2: start from a clean slate, then register the JDBC river.
# The river delete answers with a 404 error when no river exists yet; that is
# expected on a first run and can be ignored.
river='localhost:9200/_river/my_test_river'
curl -XDELETE "$river/"
curl -XDELETE 'localhost:9200/mytest'

# River definition:
#   - the SQL aliases "id" to "_id" so each row becomes one document keyed by
#     its id, and feeds the blob column into the "content" field
#   - "content" is mapped as type "attachment"; its "content" sub-field is
#     stored so the extracted text can be requested via "fields" in a search
river_meta='
{
"type" : "jdbc",
"jdbc" : {
"url" : "jdbc:mysql://localhost:3306/test",
"user" : "",
"password" : "",
"sql" : "select id as _id, content as \"content\" from test",
"index" : "mytest",
"type" : "mydocs",
"index_settings" : { },
"type_mapping" : {
"mydocs" : {
"properties" : {
"content" : {
"type" : "attachment",
"path" : "full",
"fields" : {
"content" : {
"type" : "string",
"store" : true
}
}
}
}
}
}
}
}
'
curl -XPOST "$river/_meta" -d "$river_meta"
# Step 3: give the river a few seconds to pull the row and index it.
echo 'sleeping while river should run...'
sleep 5

es='localhost:9200'

# Print the resulting mapping of the target index.
curl -XGET "$es/mytest/_mapping?pretty"

# Search for a word that occurs in the test.pdf file and return only the
# parsed PDF text (the stored "content" field).
search_body='
{
"fields" : [ "content" ],
"query": {
"match": {
"content" : "Online"
}
}
}'
curl -XPOST "$es/mytest/_search?pretty" -d "$search_body"

# Remove the river again now that the demo is done.
curl -XDELETE "$es/_river/my_test_river/"
$ bash mysql-blob-river.sh
{"error":"TypeMissingException[[_all] type[[my_test_river]] missing: No index has the type.]","status":404}{"acknowledged":true}{"_index":"_river","_type":"my_test_river","_id":"_meta","_version":1,"created":true}sleeping while river should run...
{
"mytest" : {
"mappings" : {
"mydocs" : {
"properties" : {
"content" : {
"type" : "attachment",
"path" : "full",
"fields" : {
"content" : {
"type" : "string",
"store" : true
},
"author" : {
"type" : "string"
},
"title" : {
"type" : "string"
},
"name" : {
"type" : "string"
},
"date" : {
"type" : "date",
"format" : "dateOptionalTime"
},
"keywords" : {
"type" : "string"
},
"content_type" : {
"type" : "string"
},
"content_length" : {
"type" : "integer"
},
"language" : {
"type" : "string"
}
}
}
}
}
}
}
}
{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"failed" : 0
},
"hits" : {
"total" : 1,
"max_score" : 0.023972876,
"hits" : [ {
"_index" : "mytest",
"_type" : "mydocs",
"_id" : "0",
"_score" : 0.023972876,
"fields" : {
"content" : [ "\n2933380\n\nAufsatzbestellung\n\nTyp: Online\nMedea-Nummer: 2933380\n" ]
}
} ]
}
}
{"acknowledged":true}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment