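"""
Scrape restaurant listings from the Bluer API (www.bluer.co.kr) and write them
to a CSV file. The default query filters on ribbonType "RIBBON_ONE"; see
scrape_bluer_restaurants() for the request parameters and save_to_csv() for
the exported columns.
"""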
import argparse
import csv
import json
import time
from urllib.parse import urlencode

import requests

def scrape_bluer_restaurants(output_file="bluer_restaurants.csv"):
    """
    Scrape restaurant data from the Bluer API and save it to a CSV file.

    Args:
        output_file (str): Path to the output CSV file
    """
    # Base URL for the API
    base_url = "https://www.bluer.co.kr/api/v1/restaurants"

    # Parameters for the API request
    params = {
        "page": 0,
        "size": 30,
        "query": "",
        "foodType": "",
        "foodTypeDetail": "",
        "feature": "",
        "location": "",
        "locationDetail": "",
        "area": "",
        "areaDetail": "",
        "priceRange": "",
        "ribbonType": "RIBBON_ONE",
        "recommended": "false",
        "isSearchName": "false",
        "tabMode": "single",
        "searchMode": "ribbonType",
        "zone1": "",
        "zone2": "",
        "zone2Lat": "",
        "zone2Lng": ""
    }

    # Headers to mimic a browser request
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept": "application/json, text/plain, */*",
        "Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
        "Referer": "https://www.bluer.co.kr/",
        "Origin": "https://www.bluer.co.kr",
        "Connection": "keep-alive"
    }

    all_restaurants = []
    current_page = 0
    total_pages = None

    print("Starting to scrape Bluer restaurant data...")

    # Loop through pages until there's no more data
    while total_pages is None or current_page < total_pages:
        # Update the page parameter
        params["page"] = current_page

        # Construct the URL with query parameters
        url = f"{base_url}?{urlencode(params)}"
        print(f"Fetching page {current_page + 1}...")

        try:
            # Make the request (with a timeout so a stalled connection
            # doesn't hang the scraper)
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()  # Raise an exception for HTTP errors

            # Parse the JSON response
            data = response.json()

            # Extract the restaurant list from its path in the JSON
            restaurants = data.get("_embedded", {}).get("restaurants", [])
            if not restaurants:
                print("No more restaurants found.")
                break

            # Add this page's restaurants to our list
            all_restaurants.extend(restaurants)

            # Read the paging information once, from the first response
            if total_pages is None:
                page_info = data.get("page", {})
                total_pages = page_info.get("totalPages", 0)
                total_elements = page_info.get("totalElements", 0)
                print(f"Found {total_elements} restaurants across {total_pages} pages.")

            # Move on to the next page
            current_page += 1

            # Be nice to the server with a small delay
            time.sleep(1)

        except requests.exceptions.RequestException as e:
            print(f"Error fetching data: {e}")
            break
        except json.JSONDecodeError:
            print("Error parsing JSON response")
            break

    print(f"Scraped {len(all_restaurants)} restaurants in total.")

    # Save the data to CSV
    if all_restaurants:
        save_to_csv(all_restaurants, output_file)
        print(f"Data saved to {output_file}")
    else:
        print("No data to save.")

def save_to_csv(restaurants, output_file):
    """
    Save restaurant data to a CSV file.

    Args:
        restaurants (list): List of restaurant dictionaries
        output_file (str): Path to the output CSV file
    """
    # Nothing to write
    if not restaurants:
        return

    # Fields to extract, using dot notation for nested keys in the JSON
    fields = [
        "id",
        "headerInfo.nameKR",
        "headerInfo.nameEN",
        "headerInfo.ribbonType",
        "defaultInfo.chefName",
        "defaultInfo.website",
        "defaultInfo.phone",
        "defaultInfo.dayOff",
        "statusInfo.priceRange",
        "statusInfo.menu",
        "statusInfo.businessHours",
        "juso.roadAddrPart1",
        "juso.siNm",
        "juso.sggNm",
        "gps.latitude",
        "gps.longitude",
        "review.review"
    ]

    # Human-readable column headers for the CSV
    header_names = [
        "ID",
        "Name",
        "English Name",
        "Ribbon Type",
        "Chef Name",
        "Website",
        "Phone",
        "Day Off",
        "Price Range",
        "Menu",
        "Business Hours",
        "Address",
        "City",
        "District",
        "Latitude",
        "Longitude",
        "Review"
    ]

    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header_names)

        for restaurant in restaurants:
            row = []
            for field in fields:
                # Handle nested fields with dot notation
                if "." in field:
                    parts = field.split(".")
                    value = restaurant
                    for part in parts:
                        if isinstance(value, dict) and part in value:
                            value = value[part]
                        else:
                            value = ""
                            break
                else:
                    value = restaurant.get(field, "")
                row.append(value)
            writer.writerow(row)
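
# For reference, the dot-notation walk inside save_to_csv can be read as this
# standalone helper. It is a sketch for clarity only and is not wired into the
# functions above; the name get_nested is illustrative, not part of any API.
def get_nested(data, dotted_path, default=""):
    """Resolve a dot-notation path such as "gps.latitude" against nested dicts.

    Example: get_nested({"gps": {"latitude": 37.5}}, "gps.latitude") -> 37.5
    """
    value = data
    for part in dotted_path.split("."):
        # Stop and fall back to the default as soon as the path breaks
        if not (isinstance(value, dict) and part in value):
            return default
        value = value[part]
    return value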

if __name__ == "__main__":
    # Set up command-line argument parsing
    parser = argparse.ArgumentParser(description='Scrape restaurant data from Bluer API')
    parser.add_argument('-o', '--output', type=str, default='bluer_restaurants.csv',
                        help='Output CSV file name (default: bluer_restaurants.csv)')
    args = parser.parse_args()

    # Run the scraper with the specified output file
    scrape_bluer_restaurants(args.output)
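
# Example usage (a sketch; the file name scrape_bluer.py is an assumption,
# adjust it to wherever this script is saved):
#
#   $ python scrape_bluer.py                    # writes bluer_restaurants.csv
#   $ python scrape_bluer.py -o one_ribbon.csv
#
# or, from another module:
#
#   from scrape_bluer import scrape_bluer_restaurants
#   scrape_bluer_restaurants("one_ribbon.csv")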