Created
July 16, 2015 18:59
-
-
Save andheiberg/ab5958c077895131acfd to your computer and use it in GitHub Desktop.
Elasticsearch Tutorial
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests
from elasticsearch import Elasticsearch

es = Elasticsearch()

# Request the top 100 IAMA Reddit posts of all time.
response = requests.get("http://api.reddit.com/r/iama/top/?t=all&limit=100",
                        headers={"User-Agent": "TrackMaven"})
# Fail loudly on an HTTP error status instead of continuing with a payload
# that may not contain the "data" key read below.
response.raise_for_status()

# Attributes of each post that are copied into the Elasticsearch document.
fields = ['title', 'selftext', 'author', 'score',
          'ups', 'downs', 'num_comments', 'url', 'created']

# Loop through the results and add each one to the ES "reddit" index, using
# its position in the listing as the document id.
for i, iama in enumerate(response.json()['data']['children']):
    content = iama['data']
    doc = {field: content[field] for field in fields}
    es.index(index="reddit", doc_type='iama', id=i, body=doc)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
from elasticsearch import Elasticsearch


def _duration_to_seconds(text):
    """Convert a three-token duration string into a total number of seconds.

    The string is split on whitespace into exactly three tokens (hours,
    minutes, seconds).  The trailing 1, 4 and 4 characters respectively are
    dropped before each token is parsed as an integer.
    NOTE(review): this presumably matches values such as "0h 7min. 32sec."
    -- confirm against the actual CSV.

    Raises ValueError if the string does not have exactly three tokens or a
    stripped token is not an integer.
    """
    hours, minutes, seconds = text.split()
    return int(hours[:-1]) * 60 * 60 + int(minutes[:-4]) * 60 + int(seconds[:-4])


# Map the fields of a new "trip" doc_type
mapping = {
    "trip": {
        "properties": {
            "duration": {"type": "integer"},
            "start_date": {"type": "date", "format": "MM/dd/yyyy HH:mm"},
            "start_station": {"type": "string", "index": "not_analyzed"},
            "end_date": {"type": "date", "format": "MM/dd/yyyy HH:mm"},
            "end_station": {"type": "string", "index": "not_analyzed"},
            "bike_id": {"type": "string"},
            "subscriber_type": {"type": "string"}
        }
    }
}

# Create a new "bikeshare" index that includes "trips" with the above mapping
es = Elasticsearch()
es.indices.create("bikeshare")
es.indices.put_mapping(index="bikeshare", doc_type="trip", body=mapping)

# Import a CSV file of trip data - this will take quite a while!
# NOTE(review): 'rb' is the mode the Python 2 csv module expects; on Python 3
# the file would need to be opened in text mode with newline=''.
with open('trips.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile)
    next(reader, None)  # Skip header row (no error if the file is empty)
    # The zero-based row position (header excluded) is the document id.
    for trip_id, row in enumerate(reader):
        content = {
            "duration": _duration_to_seconds(row[0]),
            "start_date": row[1],
            "end_date": row[2],
            "start_station": row[3],
            "end_station": row[4],
            "bike_id": row[5],
            "subscriber_type": row[6]
        }
        es.index(index="bikeshare", doc_type='trip', id=trip_id, body=content)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from elasticsearch import Elasticsearch

es = Elasticsearch()

# NOTE: every print below uses the single-argument parenthesised form, which
# produces the same output on Python 2 and is also valid on Python 3.

# Fetch a specific result
res = es.get(index='reddit', doc_type='iama', id=1)
print(res['_source'])

# Update the index to be able to query against it
es.indices.refresh(index="reddit")

# Query for results: nothing will match this author
res = es.search(index="reddit",
                body={"query": {"match": {"author": "no results here!"}}})
print(res)

# Query for all results (no matching criteria)
res = es.search(index="reddit", body={"query": {"match_all": {}}})
print(res['hits']['total'])
hits = res['hits']['hits']
# Show the second hit's title; skipped if fewer than two hits came back.
if len(hits) > 1:
    print(hits[1]['_source']['title'])

# Query based on text appearing in the title
# (by default matches across capitalization, pluralization, etc)
res = es.search(index="reddit", body={"query": {"match": {"title": "obama"}}})
print(res['hits']['total'])
hits = res['hits']['hits']
# Show the first hit's title; skipped if the query matched nothing.
if hits:
    print(hits[0]['_source']['title'])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment