Created
March 5, 2016 20:47
-
-
Save arsenyinfo/22d5ae4ad09733134c16 to your computer and use it in GitHub Desktop.
This script can be useful if you need to parse products from Product Hunt into a single table. It was tested with Python 3 but will probably work with Python 2.7 as well.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import logging
import os

import pandas as pd
import requests
# Timestamped INFO-level logging so every page fetch is visible on the console.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    datefmt='%H:%M:%S')
logger = logging.getLogger(__name__)

# Value sent in the Authorization header. The raw API token is read from the
# PRODUCT_HUNT_TOKEN environment variable when it is set; otherwise the
# placeholder below has to be replaced by hand before running the script.
token = 'Bearer ' + os.environ.get('PRODUCT_HUNT_TOKEN',
                                   'INSERT_YOUR_TOKEN_HERE')
def parse(page):
    """Yield one flat record per post in *page*.

    Args:
        page: Iterable of post mappings (the list stored under the
            ``posts`` key of the API response).

    Yields:
        dict: The post reduced to the exported columns: tagline, name,
        redirect_url, featured, discussion_url, day and votes_count.

    Raises:
        KeyError: If a post lacks one of the exported fields.
    """
    fields = ('tagline', 'name', 'redirect_url', 'featured',
              'discussion_url', 'day', 'votes_count')
    for item in page:
        # Strict indexing on purpose: a missing column should fail loudly
        # rather than silently produce a ragged table.
        yield {k: item[k] for k in fields}
def get_index(page):
    """Return the smallest post id in *page*, or 0 when there is none.

    The result is used as the next ``older`` cursor; 0 tells the caller
    that there is nothing left to fetch.

    Args:
        page: Iterable of post mappings, or ``None`` when the response
            carried no ``posts`` key.

    Returns:
        The minimum ``id`` found in the page, or 0 for an empty or
        missing page.
    """
    if page is None:
        return 0
    try:
        return min(post.get('id') for post in page)
    except ValueError:
        # min() of an empty sequence: no more posts to page through.
        return 0
# Page backwards through every post, newest first, using the ``older`` cursor,
# then dump the collected rows into a single CSV file.
with requests.Session() as s:
    s.headers['Authorization'] = token
    s.headers['Content-Type'] = 'application/json'
    s.headers['Accept'] = 'application/json'
    s.headers['Host'] = 'api.producthunt.com'

    # Start from an id larger than any existing post so the first request
    # returns the newest page.
    post_id = 1000000000
    data = []
    while post_id:
        url = 'https://api.producthunt.com/v1/posts/all?older={}'.format(post_id)
        logger.info('Fetching posts older than %s', post_id)
        # A timeout keeps a stalled connection from hanging the script.
        response = s.get(url, timeout=30)
        # Fail with a clear HTTP error (e.g. invalid token) instead of a
        # confusing TypeError further down.
        response.raise_for_status()
        # A body without 'posts' is treated as an empty page, which ends
        # the loop.
        page = response.json().get('posts') or []
        data.extend(parse(page))
        # The smallest id on this page becomes the next cursor; 0 stops.
        post_id = get_index(page)

data = pd.DataFrame(data)
data.to_csv('product_hunt.csv', index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment