Last active
August 29, 2015 14:14
-
-
Save igorbrigadir/89b3dc75fcab0bc10d59 to your computer and use it in GitHub Desktop.
python-tika-server.ipnb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "metadata": { | |
| "name": "", | |
| "signature": "sha256:388fb5be2fc1d54eed3c94abfce3c816b8f2abe8ab8b2384323a15b22ad2de0a" | |
| }, | |
| "nbformat": 3, | |
| "nbformat_minor": 0, | |
| "worksheets": [ | |
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Install and start the Tika server Docker image (run these commands in a terminal; the server is published on port 9998):" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "$ sudo docker pull logicalspark/docker-tikaserver\n", | |
| "$ sudo docker run -d -p 9998:9998 logicalspark/docker-tikaserver" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Check that the server is up (from a terminal):" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "$ curl -X GET http://localhost:9998/tika" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Expected response:\n", | |
| "\n", | |
| "> This is Tika Server. Please PUT" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
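| "Use curl (from a terminal) to upload a document to the `/rmeta` endpoint; a sample of the JSON response follows the command:" | |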
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "$ curl -T foo.doc http://localhost:9998/rmeta" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "[\n", | |
| " {\"Application-Name\":\"Microsoft Office Word\",\n", | |
| " \"Application-Version\":\"15.0000\",\n", | |
| " \"X-Parsed-By\":[\"org.apache.tika.parser.DefaultParser\",\"org.apache.tika.parser.microsoft.ooxml.OOXMLParser\"],\n", | |
| " \"X-TIKA:content\":\"embed_0 \"\n", | |
| " ...\n", | |
| " },\n", | |
| " {\"Content-Encoding\":\"ISO-8859-1\",\n", | |
| " \"Content-Length\":\"8\",\n", | |
| " \"Content-Type\":\"text/plain; charset=ISO-8859-1\",\n", | |
| " \"X-TIKA:content\": ...\n", | |
| " }\n", | |
| " ...\n", | |
| "]" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "... or your favourite Python REST client (here, `requests`):" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "import requests\n", | |
| "import json\n", | |
| "\n", | |
| "url = \"http://localhost:9998/rmeta\"\n", | |
| "filepath = 'foo.doc'\n", | |
| "\n", | |
| "with open(filepath) as fh:\n", | |
| " filedata = fh.read()\n", | |
| " response = requests.put(url, data=filedata, params={'file': filepath})" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "docjson = json.loads(response.text)" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 16 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "The document text is in the `X-TIKA:content` field of the first result:" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "print docjson[0]['X-TIKA:content']" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "foo doc body" | |
| ] | |
| } | |
| ], | |
| "metadata": {} | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment