# Bluer restaurant scraper
# Gist by @toy-crane, created March 11, 2025 00:39

import requests
import csv
import json
import time
import argparse
from urllib.parse import urlencode
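
# Only third-party dependency: requests (pip install requests); csv, json,
# time, argparse, and urllib.parse are all in the standard library.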


def scrape_bluer_restaurants(output_file="bluer_restaurants.csv"):
    """
    Scrape restaurant data from the Bluer API and save it to a CSV file.

    Args:
        output_file (str): Path to the output CSV file
    """
    # Base URL for the API
    base_url = "https://www.bluer.co.kr/api/v1/restaurants"

    # Query parameters for the API request
    params = {
        "page": 0,
        "size": 30,
        "query": "",
        "foodType": "",
        "foodTypeDetail": "",
        "feature": "",
        "location": "",
        "locationDetail": "",
        "area": "",
        "areaDetail": "",
        "priceRange": "",
        "ribbonType": "RIBBON_ONE",
        "recommended": "false",
        "isSearchName": "false",
        "tabMode": "single",
        "searchMode": "ribbonType",
        "zone1": "",
        "zone2": "",
        "zone2Lat": "",
        "zone2Lng": ""
    }
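    # Note: "ribbonType": "RIBBON_ONE" appears to restrict results to
    # one-ribbon listings; this is inferred from the parameter name, not
    # from documented API behavior. Adjust or clear it to fetch other tiers.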

    # Headers to mimic a browser request
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept": "application/json, text/plain, */*",
        "Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
        "Referer": "https://www.bluer.co.kr/",
        "Origin": "https://www.bluer.co.kr",
        "Connection": "keep-alive"
    }

    all_restaurants = []
    current_page = 0
    total_pages = None

    print("Starting to scrape Bluer restaurant data...")

    # Loop through pages until there's no more data
    while total_pages is None or current_page < total_pages:
        # Update the page parameter
        params["page"] = current_page

        # Construct the URL with the query parameters
        url = f"{base_url}?{urlencode(params)}"
        print(f"Fetching page {current_page + 1}...")

        try:
            # Make the request (with a timeout so a stalled connection
            # cannot hang the scraper indefinitely)
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()  # Raise an exception for HTTP errors

            # Parse the JSON response
            data = response.json()

            # Extract the restaurant list from its path in the JSON
            restaurants = data.get("_embedded", {}).get("restaurants", [])
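            # The "_embedded" wrapper and the separate "page" object suggest
            # a HAL-style (Spring Data REST) response; this is inferred from
            # the keys used here, not from official API documentation.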
            if not restaurants:
                print("No more restaurants found.")
                break

            # Add this page's restaurants to the running list
            all_restaurants.extend(restaurants)

            # Read the paging metadata once, from the first response
            if total_pages is None:
                page_info = data.get("page", {})
                total_pages = page_info.get("totalPages", 0)
                total_elements = page_info.get("totalElements", 0)
                print(f"Found {total_elements} restaurants across {total_pages} pages.")

            # Increment the page counter
            current_page += 1

            # Be nice to the server with a small delay
            time.sleep(1)

        except requests.exceptions.RequestException as e:
            print(f"Error fetching data: {e}")
            break
        except json.JSONDecodeError:
            print("Error parsing JSON response")
            break
print(f"Scraped {len(all_restaurants)} restaurants in total.")
# Save data to CSV
if all_restaurants:
save_to_csv(all_restaurants, output_file)
print(f"Data saved to {output_file}")
else:
print("No data to save.")


def save_to_csv(restaurants, output_file):
    """
    Save restaurant data to a CSV file.

    Args:
        restaurants (list): List of restaurant dictionaries
        output_file (str): Path to the output CSV file
    """
    if not restaurants:
        return

    # Fields to extract, using dot notation for keys nested in the JSON
    fields = [
        "id",
        "headerInfo.nameKR",
        "headerInfo.nameEN",
        "headerInfo.ribbonType",
        "defaultInfo.chefName",
        "defaultInfo.website",
        "defaultInfo.phone",
        "defaultInfo.dayOff",
        "statusInfo.priceRange",
        "statusInfo.menu",
        "statusInfo.businessHours",
        "juso.roadAddrPart1",
        "juso.siNm",
        "juso.sggNm",
        "gps.latitude",
        "gps.longitude",
        "review.review"
    ]
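    # Assumed shape of each restaurant object, inferred from the field paths
    # above (the API is otherwise undocumented here):
    #   {"id": ..., "headerInfo": {"nameKR": ..., "nameEN": ...},
    #    "defaultInfo": {...}, "statusInfo": {...}, "juso": {...},
    #    "gps": {"latitude": ..., "longitude": ...}, "review": {...}}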

    # Human-readable CSV column headers, in the same order as fields
    header_names = [
        "ID",
        "Name",
        "English Name",
        "Ribbon Type",
        "Chef Name",
        "Website",
        "Phone",
        "Day Off",
        "Price Range",
        "Menu",
        "Business Hours",
        "Address",
        "City",
        "District",
        "Latitude",
        "Longitude",
        "Review"
    ]

    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header_names)

        for restaurant in restaurants:
            row = []
            for field in fields:
                # Walk nested fields specified with dot notation
                if "." in field:
                    parts = field.split(".")
                    value = restaurant
                    for part in parts:
                        if isinstance(value, dict) and part in value:
                            value = value[part]
                        else:
                            value = ""
                            break
                else:
                    value = restaurant.get(field, "")
                row.append(value)
            writer.writerow(row)


if __name__ == "__main__":
    # Set up command line argument parsing
    parser = argparse.ArgumentParser(description='Scrape restaurant data from the Bluer API')
    parser.add_argument('-o', '--output', type=str, default='bluer_restaurants.csv',
                        help='Output CSV file name (default: bluer_restaurants.csv)')
    args = parser.parse_args()

    # Run the scraper with the specified output file
    scrape_bluer_restaurants(args.output)
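
# Example usage (assuming the file is saved as scrape_bluer.py; the
# filename is illustrative, not part of the gist):
#   python scrape_bluer.py                     # writes bluer_restaurants.csv
#   python scrape_bluer.py -o one_ribbon.csv   # custom output path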