Created
November 20, 2015 20:14
-
-
Save soobrosa/4adf89ce197eb6299eb9 to your computer and use it in GitHub Desktop.
Is Yelp international?
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Download the Yelp academic dataset archive into source/.
#
# The recipe is safe to re-run: the directory is created with -p and the
# download is skipped when the archive is already present. curl writes to a
# temporary ".part" name first so an interrupted transfer is never mistaken
# for a complete archive, and -f turns HTTP errors into a failing rule
# instead of saving the error page as the zip.
source.downloaded:
	mkdir -p source
	test -f source/yelp_dataset_challenge_academic_dataset.zip || { \
		curl -fL -o source/yelp_dataset_challenge_academic_dataset.zip.part "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/yelp_dataset_challenge_academic_dataset.zip" && \
		mv source/yelp_dataset_challenge_academic_dataset.zip.part source/yelp_dataset_challenge_academic_dataset.zip; \
	}
# Extract the downloaded archive so the JSON files end up directly in source/,
# which is where every later rule reads them from.
#
#   -d source  extract into source/ instead of the current directory
#   -j         drop any directory prefix stored in the archive
#              (NOTE(review): the archive's internal layout is not visible
#              here; -j makes the result independent of it -- confirm)
#   -n         never overwrite, so re-running neither prompts nor re-extracts
source.decompressed: source.downloaded
	unzip -n -j source/yelp_dataset_challenge_academic_dataset.zip -d source
#
# Pretty-print the first record of each line-delimited JSON file into
# example/<name>.json. The output directory is created first so the
# redirections do not fail on a fresh checkout; the loop aborts on the
# first file that fails.
#
examples: source.decompressed
	mkdir -p example
	for name in business review user checkin tip; do \
		head -n 1 source/yelp_academic_dataset_$$name.json | python -mjson.tool > example/$$name.json || exit 1; \
	done
#
# Draw a random sample of 10000 lines from every dataset file into
# sample/<name>.json, for quick experiments.
#
# Needs GNU shuf. On macOS it is installed as "gshuf" by
#   $ brew install coreutils
# The recipe prefers gshuf and falls back to plain shuf when gshuf is absent.
#
samples: source.decompressed
	mkdir -p sample
	shuf_bin=$$(command -v gshuf || command -v shuf) || { echo "samples: neither gshuf nor shuf found" >&2; exit 1; }; \
	for name in business review user checkin tip; do \
		"$$shuf_bin" -n 10000 source/yelp_academic_dataset_$$name.json > sample/$$name.json || exit 1; \
	done
#
# Flatten the sampled JSON files (sample/*.json) into CSV tables under
# sample_flattened/.
#
# Tools:
#   https://github.com/jehiah/json2csv
#   http://stedolan.github.io/jq/
#
# Notes:
#  * The rule reads sample/*.json, so it depends on "samples". Because
#    "samples" is always rebuilt, the sample is redrawn on each run.
#  * The target shares its name with the output directory, so it is declared
#    phony; otherwise make could consider the existing directory up to date.
#  * Input is attached to the first command of each pipeline with "cmd < file".
#    The form "< file | cmd" only works in zsh; under /bin/sh it is an empty
#    command and the pipeline would receive no data.
#  * Nested objects (votes, compliments, hours, checkin_info) and the friends
#    array are unrolled by jq into one row per entry.
#
.PHONY: sample_flattened
sample_flattened: samples
	mkdir -p sample_flattened
	json2csv -p=true -k business_id,date,likes,text,user_id < sample/tip.json > sample_flattened/tip.csv
	json2csv -p=true -k business_id,date,likes,user_id < sample/tip.json > sample_flattened/tip_no_text.csv
	json2csv -p=true -k business_id,date,review_id,stars,text,user_id < sample/review.json > sample_flattened/review_compact.csv
	json2csv -p=true -k business_id,date,review_id,stars,user_id < sample/review.json > sample_flattened/review_compact_no_text.csv
	jq -c '{review_id, a: .votes | to_entries[]}' < sample/review.json | jq -c '{review_id: .review_id, key: .a.key, value: .a.value}' | json2csv -p=true -k review_id,key,value > sample_flattened/review_votes.csv
	jq -c '{average_stars, fans, friends: .friends | length, name, review_count, user_id, yelping_since}' < sample/user.json | json2csv -p=true -k average_stars,fans,friends,name,review_count,user_id,yelping_since > sample_flattened/user_compact.csv
	jq -c '{user_id, friend: .friends[]}' < sample/user.json | json2csv -p=true -k user_id,friend > sample_flattened/user_friends.csv
	jq -c '{user_id, a: .compliments | to_entries[]}' < sample/user.json | jq -c '{user_id: .user_id, key: .a.key, value: .a.value}' | json2csv -p=true -k user_id,key,value > sample_flattened/user_compliments.csv
	jq -c '{business_id, category_main: .categories[0], category_sub: .categories[1], city, latitude, longitude, name, neighborhood: .neighborhoods[0], open, review_count, stars, state}' < sample/business.json | json2csv -p=true -k business_id,category_main,category_sub,city,latitude,longitude,name,neighborhood,open,review_count,stars,state > sample_flattened/business.csv
	jq -c '{business_id, a: .hours | to_entries[]}' < sample/business.json | jq -c '{business_id, day: .a.key, b: .a.value | to_entries[]}' | jq -c '{business_id: .business_id, day: .day, key: .b.key, value: .b.value}' | json2csv -p=true -k business_id,day,key,value > sample_flattened/business_hours.csv
	jq -c '{business_id, a: .checkin_info | to_entries[]}' < sample/checkin.json | jq -c '{business_id: .business_id, key: .a.key | split ("-"), value: .a.value}' | jq -c '{business_id: .business_id, key1: .key[0], key2: .key[1], value: .value}' | json2csv -p=true -k business_id,key1,key2,value > sample_flattened/checkin.csv
# Flatten the full dataset files (source/yelp_academic_dataset_*.json) into
# CSV tables under source_flattened/, producing the same set of tables as the
# sample_flattened rule.
#
# Notes:
#  * Every output goes to source_flattened/. The friends and compliments
#    tables previously landed in sample_flattened/, where the compliments
#    file overwrote the sample's table; the friends table is now named
#    user_friends.csv to match the sample rule.
#  * The target shares its name with the output directory, so it is declared
#    phony; otherwise make could consider the existing directory up to date.
#  * Input is attached to the first command of each pipeline with "cmd < file".
#    The form "< file | cmd" only works in zsh; under /bin/sh it is an empty
#    command and the pipeline would receive no data.
.PHONY: source_flattened
source_flattened: source.decompressed
	mkdir -p source_flattened
	json2csv -p=true -k business_id,date,likes,text,user_id < source/yelp_academic_dataset_tip.json > source_flattened/tip.csv
	json2csv -p=true -k business_id,date,likes,user_id < source/yelp_academic_dataset_tip.json > source_flattened/tip_no_text.csv
	json2csv -p=true -k business_id,date,review_id,stars,text,user_id < source/yelp_academic_dataset_review.json > source_flattened/review_compact.csv
	json2csv -p=true -k business_id,date,review_id,stars,user_id < source/yelp_academic_dataset_review.json > source_flattened/review_compact_no_text.csv
	jq -c '{review_id, a: .votes | to_entries[]}' < source/yelp_academic_dataset_review.json | jq -c '{review_id: .review_id, key: .a.key, value: .a.value}' | json2csv -p=true -k review_id,key,value > source_flattened/review_votes.csv
	jq -c '{average_stars, fans, friends: .friends | length, name, review_count, user_id, yelping_since}' < source/yelp_academic_dataset_user.json | json2csv -p=true -k average_stars,fans,friends,name,review_count,user_id,yelping_since > source_flattened/user_compact.csv
	jq -c '{user_id, friend: .friends[]}' < source/yelp_academic_dataset_user.json | json2csv -p=true -k user_id,friend > source_flattened/user_friends.csv
	jq -c '{user_id, a: .compliments | to_entries[]}' < source/yelp_academic_dataset_user.json | jq -c '{user_id: .user_id, key: .a.key, value: .a.value}' | json2csv -p=true -k user_id,key,value > source_flattened/user_compliments.csv
	jq -c '{business_id, category_main: .categories[0], category_sub: .categories[1], city, latitude, longitude, name, neighborhood: .neighborhoods[0], open, review_count, stars, state}' < source/yelp_academic_dataset_business.json | json2csv -p=true -k business_id,category_main,category_sub,city,latitude,longitude,name,neighborhood,open,review_count,stars,state > source_flattened/business.csv
	jq -c '{business_id, a: .hours | to_entries[]}' < source/yelp_academic_dataset_business.json | jq -c '{business_id, day: .a.key, b: .a.value | to_entries[]}' | jq -c '{business_id: .business_id, day: .day, key: .b.key, value: .b.value}' | json2csv -p=true -k business_id,day,key,value > source_flattened/business_hours.csv
	jq -c '{business_id, a: .checkin_info | to_entries[]}' < source/yelp_academic_dataset_checkin.json | jq -c '{business_id: .business_id, key: .a.key | split ("-"), value: .a.value}' | jq -c '{business_id: .business_id, key1: .key[0], key2: .key[1], value: .value}' | json2csv -p=true -k business_id,key1,key2,value > source_flattened/checkin.csv
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment