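"""
Scrape restaurant listings from the Bluer API (www.bluer.co.kr) and write them
to a CSV file. The default query filters on ribbonType "RIBBON_ONE"; see
scrape_bluer_restaurants() for the request parameters and save_to_csv() for
the exported columns.
"""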
import argparse
import csv
import json
import time
from urllib.parse import urlencode

import requests

def scrape_bluer_restaurants(output_file="bluer_restaurants.csv"):
    """
    Scrape restaurant data from the Bluer API and save it to a CSV file.

    Args:
        output_file (str): Path to the output CSV file
    """
    # Base URL for the API
    base_url = "https://www.bluer.co.kr/api/v1/restaurants"

    # Parameters for the API request
    params = {
        "page": 0,
        "size": 30,
        "query": "",
        "foodType": "",
        "foodTypeDetail": "",
        "feature": "",
        "location": "",
        "locationDetail": "",
        "area": "",
        "areaDetail": "",
        "priceRange": "",
        "ribbonType": "RIBBON_ONE",
        "recommended": "false",
        "isSearchName": "false",
        "tabMode": "single",
        "searchMode": "ribbonType",
        "zone1": "",
        "zone2": "",
        "zone2Lat": "",
        "zone2Lng": ""
    }

    # Headers to mimic a browser request
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept": "application/json, text/plain, */*",
        "Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
        "Referer": "https://www.bluer.co.kr/",
        "Origin": "https://www.bluer.co.kr",
        "Connection": "keep-alive"
    }

    all_restaurants = []
    current_page = 0
    total_pages = None

    print("Starting to scrape Bluer restaurant data...")

    # Loop through pages until there's no more data
    while total_pages is None or current_page < total_pages:
        # Update the page parameter
        params["page"] = current_page

        # Construct the URL with query parameters
        url = f"{base_url}?{urlencode(params)}"
        print(f"Fetching page {current_page + 1}...")

        try:
            # Make the request (with a timeout so a stalled connection
            # doesn't hang the scraper)
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()  # Raise an exception for HTTP errors

            # Parse the JSON response
            data = response.json()

            # Extract the restaurant list from its path in the JSON
            restaurants = data.get("_embedded", {}).get("restaurants", [])
            if not restaurants:
                print("No more restaurants found.")
                break

            # Add this page's restaurants to our list
            all_restaurants.extend(restaurants)

            # Read the paging information once, from the first response
            if total_pages is None:
                page_info = data.get("page", {})
                total_pages = page_info.get("totalPages", 0)
                total_elements = page_info.get("totalElements", 0)
                print(f"Found {total_elements} restaurants across {total_pages} pages.")

            # Move on to the next page
            current_page += 1

            # Be nice to the server with a small delay
            time.sleep(1)

        except requests.exceptions.RequestException as e:
            print(f"Error fetching data: {e}")
            break
        except json.JSONDecodeError:
            print("Error parsing JSON response")
            break

    print(f"Scraped {len(all_restaurants)} restaurants in total.")

    # Save the data to CSV
    if all_restaurants:
        save_to_csv(all_restaurants, output_file)
        print(f"Data saved to {output_file}")
    else:
        print("No data to save.")

def save_to_csv(restaurants, output_file):
    """
    Save restaurant data to a CSV file.

    Args:
        restaurants (list): List of restaurant dictionaries
        output_file (str): Path to the output CSV file
    """
    # Nothing to write
    if not restaurants:
        return

    # Fields to extract, using dot notation for nested keys in the JSON
    fields = [
        "id",
        "headerInfo.nameKR",
        "headerInfo.nameEN",
        "headerInfo.ribbonType",
        "defaultInfo.chefName",
        "defaultInfo.website",
        "defaultInfo.phone",
        "defaultInfo.dayOff",
        "statusInfo.priceRange",
        "statusInfo.menu",
        "statusInfo.businessHours",
        "juso.roadAddrPart1",
        "juso.siNm",
        "juso.sggNm",
        "gps.latitude",
        "gps.longitude",
        "review.review"
    ]

    # Human-readable column headers for the CSV
    header_names = [
        "ID",
        "Name",
        "English Name",
        "Ribbon Type",
        "Chef Name",
        "Website",
        "Phone",
        "Day Off",
        "Price Range",
        "Menu",
        "Business Hours",
        "Address",
        "City",
        "District",
        "Latitude",
        "Longitude",
        "Review"
    ]

    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header_names)

        for restaurant in restaurants:
            row = []
            for field in fields:
                # Handle nested fields with dot notation
                if "." in field:
                    parts = field.split(".")
                    value = restaurant
                    for part in parts:
                        if isinstance(value, dict) and part in value:
                            value = value[part]
                        else:
                            value = ""
                            break
                else:
                    value = restaurant.get(field, "")
                row.append(value)
            writer.writerow(row)
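
# For reference, the dot-notation walk inside save_to_csv can be read as this
# standalone helper. It is a sketch for clarity only and is not wired into the
# functions above; the name get_nested is illustrative, not part of any API.
def get_nested(data, dotted_path, default=""):
    """Resolve a dot-notation path such as "gps.latitude" against nested dicts.

    Example: get_nested({"gps": {"latitude": 37.5}}, "gps.latitude") -> 37.5
    """
    value = data
    for part in dotted_path.split("."):
        # Stop and fall back to the default as soon as the path breaks
        if not (isinstance(value, dict) and part in value):
            return default
        value = value[part]
    return value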

if __name__ == "__main__":
    # Set up command-line argument parsing
    parser = argparse.ArgumentParser(description='Scrape restaurant data from Bluer API')
    parser.add_argument('-o', '--output', type=str, default='bluer_restaurants.csv',
                        help='Output CSV file name (default: bluer_restaurants.csv)')
    args = parser.parse_args()

    # Run the scraper with the specified output file
    scrape_bluer_restaurants(args.output)
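
# Example usage (a sketch; the file name scrape_bluer.py is an assumption,
# adjust it to wherever this script is saved):
#
#   $ python scrape_bluer.py                    # writes bluer_restaurants.csv
#   $ python scrape_bluer.py -o one_ribbon.csv
#
# or, from another module:
#
#   from scrape_bluer import scrape_bluer_restaurants
#   scrape_bluer_restaurants("one_ribbon.csv")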