Created
May 9, 2020 04:39
-
-
Save keithrozario/865c33b27f06e1c3d82021794e913628 to your computer and use it in GitHub Desktop.
AWS "This is My Architecture" Scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import json | |
import csv | |
# Directory search endpoint that is queried for the listing items.
base_url = "https://aws.amazon.com/api/dirs/items/search"

# Query-string parameters sent with every request: items of the
# "this-is-my-architecture" directory, sorted by air date in descending
# order, 100 per page, en_US locale. The fetch loop adds a "page" key to
# this dict before each request.
params = {
    "item.directoryId": "this-is-my-architecture",
    "sort_by": "item.additionalFields.airDate",
    "sort_order": "desc",
    "size": 100,
    "item.locale": "en_US",
}
# Raw items collected from every page of the search API.
posts = []

# Hard cap on the number of pages requested, so the loop always terminates
# even if the API never reports an empty page.
MAX_PAGES = 100
# Seconds to wait on a request; without a timeout `requests` can block
# forever on a stalled connection.
REQUEST_TIMEOUT_SECONDS = 30

# Not the best, but iterate over the first MAX_PAGES pages and stop early
# once the API reports a count of 0.
for page_num in range(MAX_PAGES):
    params['page'] = page_num
    http_response = requests.get(
        base_url, params=params, timeout=REQUEST_TIMEOUT_SECONDS
    )
    # Fail loudly on an HTTP error instead of trying to parse an error body.
    http_response.raise_for_status()
    response = http_response.json()
    posts.extend(response['items'])
    if response['metadata']['count'] == 0:
        break

print(f"Found {len(posts)} posts on this is my architecture")
# Flattened rows for the three CSV files written at the end of the script.
# Rows of all three lists share "post_id" so they can be joined afterwards.
tag_data, category_data, post_data = [], [], []

for post_id, post in enumerate(posts):
    # post_id is simply the position of the post in the fetched list.
    fields = post['item']['additionalFields']
    air_date = fields['airDate']

    post_data.append({
        "post_id": post_id,
        "description": fields['description'],
        "date": air_date,
        "headline": fields['headline'],
        "url": fields['headlineUrl'],
    })

    # "category" is optional: posts without it contribute no category rows.
    # When present it is a '|'-separated string.
    raw_categories = fields.get('category')
    if raw_categories is not None:
        category_data.extend(
            {"post_id": post_id, "category": cat.strip(), "date": air_date}
            for cat in raw_categories.split('|')
        )

    # "youtubeTags" is treated as optional too, so one post without tags
    # does not abort the whole run. When present it is a ','-separated
    # string; tags are normalised to lower case.
    raw_tags = fields.get('youtubeTags')
    if raw_tags is not None:
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # generated rows are identical from one run to the next (a set would
        # yield them in arbitrary order).
        unique_tags = dict.fromkeys(
            tag.strip().lower() for tag in raw_tags.split(',')
        )
        tag_data.extend(
            {"post_id": post_id, "tag": tag, "date": air_date}
            for tag in unique_tags
        )
# One entry per output file: (file name, column order, rows to write).
csv_outputs = (
    ('tags.csv', ['post_id', 'date', 'tag'], tag_data),
    ('categories.csv', ['post_id', 'date', 'category'], category_data),
    ('post.csv',
     ['post_id', 'date', 'headline', 'description', 'url'],
     post_data),
)

for file_name, fieldnames, rows in csv_outputs:
    # newline='' hands line-ending control to the csv module, as its
    # documentation requires. An explicit utf-8 encoding keeps non-ASCII
    # text from failing or being mangled under a platform-default codec.
    with open(file_name, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(
            csv_file, fieldnames=fieldnames, quoting=csv.QUOTE_ALL
        )
        writer.writeheader()
        writer.writerows(rows)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment