Last active
December 28, 2015 00:06
-
-
Save PythonicNinja/9b4952b6cbc17572c7db to your computer and use it in GitHub Desktop.
pydrill
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Example: run a raw SQL query against a Drill instance through PyDrill
# and print two fields of every returned row.
from pydrill.client import PyDrill

# Client for the Drill instance at localhost:8047.
drill = PyDrill(host='localhost', port=8047)

# Print whatever is_active() reports before querying.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(drill.is_active())

# Query the Yelp review JSON file through the `dfs.root` workspace,
# limited to the first five rows.
yelp_reviews = drill.query('''
SELECT * FROM
`dfs.root`.`./Users/macbookair/Downloads/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json`
LIMIT 5
''')

# Each result is indexed by column name.
for result in yelp_reviews:
    # '%s %s' reproduces the space-separated output of the original
    # Python 2 statement `print a, b` without needing a __future__ import.
    print('%s %s' % (result['type'], result['date']))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Example: the same Yelp review data queried through a query-builder
# DSL (PyDrillDsl) layered on top of the PyDrill client.
from pydrill.client import PyDrill
from pydrill_dsl import PyDrillDsl, fn

drill = PyDrill(host='localhost', port=8047)
dsl = PyDrillDsl(drill)

# Bind a data source (storage plugin + workspace + file path) to a
# model-like object whose attributes are the file's columns.
Review = dsl.search(
    storage_plugin='dfs',
    workspace='root',
    path='/Users/macbookair/Downloads/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json')

print(Review.fields)
# [u'votes', u'user_id', u'review_id', u'text', u'business_id', u'stars', u'date', u'type']

# SELECT * FROM `dfs.root`.`./Users/macbookair/Downloads/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json`
reviews = Review.select()

# Queries are lazy by default; iterating over one runs it.
for review in reviews:
    # Print type and date, matching the sample output below (the original
    # printed review.type twice, which contradicts that output).
    print('%s %s' % (review.type, review.date))
#review 2007-05-17
#review 2010-03-22
#review 2012-02-14
#review 2012-03-02
#review 2012-05-15
#...

# SELECT votes, type FROM `dfs.root`.`./Users/macbookair/Downloads/yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json` LIMIT 5
reviews = Review.select(Review.votes, Review.type).limit(5)

# Explicitly run the query without needing to iterate.
results = reviews.run()

# Aggregate example: number of reviews per (stars, date), ordered by
# that count, descending.
query = (
    Review.select(Review.stars, Review.date, fn.COUNT(Review.review_id).alias('num_reviews'))
    .group_by(Review.stars, Review.date)
    .order_by(fn.COUNT(Review.review_id).alias('num_reviews').desc())
)

# Building on this, a .join method could combine several data sources.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment