@rightson · Last active December 22, 2024 12:37
Better coolpc (original site's UX is horrible)
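
The script below scrapes COOLPC's PC-builder page (https://www.coolpc.com.tw/evaluate.php), walks every <select> element's option groups with BeautifulSoup, and exports the full parts catalog to JSON and Markdown, caching the raw HTML locally so repeat runs don't hit the site. A typical invocation, assuming the file is saved as coolpc.py (the gist doesn't name the file, so that name is illustrative): python coolpc.py --refresh --json coolpc_options.json --md coolpc_options.md
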
import requests
import os
import argparse
import json
import re
from datetime import datetime
from bs4 import BeautifulSoup


def fetch_or_load_html(url, cache_file='coolpc_cache.html', force_refresh=False):
    """
    Fetch HTML from URL or load from cache based on settings
    """
    try:
        # Check if cache exists and should be used
        if not force_refresh and os.path.exists(cache_file):
            cache_time = datetime.fromtimestamp(os.path.getmtime(cache_file))
            print(f"Debug: Found cache file from {cache_time}")
            with open(cache_file, 'r', encoding='cp950', errors='replace') as f:
                return f.read()
        # Fetch from URL
        print(f"Debug: Fetching fresh copy from {url}")
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        # Explicitly set CP950 encoding (the site serves Big5/CP950 text)
        response.encoding = 'cp950'
        # Save to cache using CP950 encoding
        with open(cache_file, 'w', encoding='cp950', errors='replace') as f:
            f.write(response.text)
        print(f"Debug: Saved new cache file at {datetime.now()}")
        return response.text
    except requests.RequestException as e:
        print(f"Error fetching URL: {e}")
        if os.path.exists(cache_file):
            print("Debug: Using existing cache file due to fetch error")
            with open(cache_file, 'r', encoding='cp950', errors='replace') as f:
                return f.read()
        raise


def parse_option_text(text):
    """Parse the option text into title, price, and popularity"""
    title = ""
    price = "N/A"
    popularity = ""
    # Extract price; allow comma-grouped amounts such as $1,234,567
    price_match = re.search(r'\$\d+(?:,\d{3})*', text)
    if price_match:
        price = price_match.group(0)
    # Get popularity indicators
    popularity = '★' if '★' in text else ''
    popularity = '♥' if '♥' in text else popularity
    # Get title: everything before the price, with trailing spaces/commas removed
    if price_match:
        title = text[:price_match.start()].strip(' ,')
    else:
        title = text.strip()
    return title, price, popularity
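
# Example of the parsing (the option string here is made up for illustration;
# real entries come from the site's <option> text):
#   parse_option_text('Intel i5-12400【6核/12緒】2.5GHz, $5590 ★')
#   -> ('Intel i5-12400【6核/12緒】2.5GHz', '$5590', '★')
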
def extract_select_options(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    tbody = soup.find('tbody', id='tbdy')
    if not tbody:
        print("Debug: tbody with id 'tbdy' not found")
        return None, None
    json_results = []
    md_content = []  # Store markdown content
    selects = tbody.find_all('select')
    print(f"Debug: Found {len(selects)} select elements\n")
    for select_num, select in enumerate(selects, 1):
        select_data = {
            'select_num': select_num,
            'name': select.get('name', 'N/A'),
            'groups': []
        }
        # Add to markdown
        md_content.append(f"\n## Select #{select_num} (name: {select.get('name', 'N/A')})")
        optgroups = select.find_all('optgroup')
        if optgroups:
            for optgroup in optgroups:
                current_category = optgroup.get('label', 'No Label')
                group_data = {
                    'title': current_category,
                    'options': []
                }
                # Add to markdown
                md_content.append(f"\n### {current_category}")
                md_content.append("\n| Title | Price | Popularity |")
                md_content.append("|-------|-------|------------|")
                options = optgroup.find_all('option')
                for option in options:
                    if not option.get('disabled'):  # Skip disabled options
                        title, price, popularity = parse_option_text(option.get_text(strip=True))
                        # Add to JSON
                        option_data = {
                            'title': title,
                            'price': price,
                            'popularity': popularity
                        }
                        group_data['options'].append(option_data)
                        # Add to markdown
                        md_content.append(f"| {title} | {price} | {popularity} |")
                select_data['groups'].append(group_data)
                md_content.append("")  # Add blank line after table
        json_results.append(select_data)
    return json_results, "\n".join(md_content)
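
# Shape of the returned json_results (all field values here are placeholders):
# [
#     {'select_num': 1, 'name': '...', 'groups': [
#         {'title': '...', 'options': [
#             {'title': '...', 'price': '$...', 'popularity': '★'},
#         ]},
#     ]},
# ]
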
def save_json_with_encoding(data, filename):
    """Save JSON with proper encoding handling"""
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
    except UnicodeEncodeError:
        # Fallback to CP950 if UTF-8 fails
        with open(filename, 'w', encoding='cp950', errors='replace') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)


def save_markdown(content, filename):
    """Save markdown content with proper encoding"""
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(content)
    except UnicodeEncodeError:
        with open(filename, 'w', encoding='cp950', errors='replace') as f:
            f.write(content)


def main():
    parser = argparse.ArgumentParser(description='Parse COOLPC select options')
    parser.add_argument('--refresh', action='store_true',
                        help='Force refresh cache and fetch new data')
    parser.add_argument('--url', default="https://www.coolpc.com.tw/evaluate.php",
                        help='URL to fetch (default: COOLPC evaluate page)')
    parser.add_argument('--cache', default="coolpc_cache.html",
                        help='Cache file path (default: coolpc_cache.html)')
    parser.add_argument('--json', default="coolpc_options.json",
                        help='Output JSON file path (default: coolpc_options.json)')
    parser.add_argument('--md', default="coolpc_options.md",
                        help='Output Markdown file path (default: coolpc_options.md)')
    args = parser.parse_args()
    try:
        # Show cache status
        if os.path.exists(args.cache) and not args.refresh:
            cache_time = datetime.fromtimestamp(os.path.getmtime(args.cache))
            print(f"Using cached data from: {cache_time}")
        elif args.refresh:
            print("Forcing cache refresh...")
        else:
            print("No cache found, will fetch fresh data...")
        # Fetch/load HTML
        html_content = fetch_or_load_html(args.url, args.cache, args.refresh)
        json_results, md_content = extract_select_options(html_content)
        # Save results
        if json_results:
            save_json_with_encoding(json_results, args.json)
            print(f"\nJSON results saved to {args.json}")
        if md_content:
            save_markdown(md_content, args.md)
            print(f"Markdown results saved to {args.md}")
    except Exception as e:
        print(f"Error: {e}")


if __name__ == "__main__":
    main()
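
A minimal programmatic smoke test, again assuming the script above is saved as coolpc.py (illustrative name, not from the gist):

import coolpc

html = coolpc.fetch_or_load_html('https://www.coolpc.com.tw/evaluate.php')
data, md = coolpc.extract_select_options(html)
if data:
    print(f"Parsed {len(data)} selects; '{data[0]['name']}' has {len(data[0]['groups'])} option groups")
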
A second version of the script takes a different approach to encoding: it decodes the page from Big5 to UTF-8 as soon as it is fetched, so the cache, JSON, and Markdown files are all plain UTF-8, and it renders the ★/♥ popularity markers as emoji.

import requests
import os
import argparse
import json
import re
from datetime import datetime
from bs4 import BeautifulSoup


def fetch_or_load_html(url, cache_file='coolpc_cache.html', force_refresh=False):
    """
    Fetch HTML from URL or load from cache based on settings
    """
    try:
        # Check if cache exists and should be used
        if not force_refresh and os.path.exists(cache_file):
            cache_time = datetime.fromtimestamp(os.path.getmtime(cache_file))
            print(f"Debug: Found cache file from {cache_time}")
            with open(cache_file, 'r', encoding='utf-8') as f:
                return f.read()
        # Fetch from URL
        print(f"Debug: Fetching fresh copy from {url}")
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        # Convert content from Big5 to UTF-8
        content = response.content.decode('big5', errors='replace')
        # Save to cache using UTF-8
        with open(cache_file, 'w', encoding='utf-8') as f:
            f.write(content)
        print(f"Debug: Saved new cache file at {datetime.now()}")
        return content
    except requests.RequestException as e:
        print(f"Error fetching URL: {e}")
        if os.path.exists(cache_file):
            print("Debug: Using existing cache file due to fetch error")
            with open(cache_file, 'r', encoding='utf-8') as f:
                return f.read()
        raise


def parse_option_text(text):
    """Parse the option text into title, price, and popularity"""
    title = ""
    price = "N/A"
    popularity = ""
    # Extract price; allow comma-grouped amounts such as $1,234,567
    price_match = re.search(r'\$\d+(?:,\d{3})*', text)
    if price_match:
        price = price_match.group(0)
    # Map the site's popularity markers to emoji
    if '★' in text:
        popularity = '⭐'  # Unicode star
    elif '♥' in text:
        popularity = '❤️'  # Unicode heart
    # Get title
    if price_match:
        title = text[:price_match.start()].strip(' ,')
    else:
        title = text.strip()
    return title, price, popularity


def extract_select_options(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    tbody = soup.find('tbody', id='tbdy')
    if not tbody:
        print("Debug: tbody with id 'tbdy' not found")
        return None, None
    json_results = []
    md_content = []  # Store markdown content
    selects = tbody.find_all('select')
    print(f"Debug: Found {len(selects)} select elements\n")
    for select_num, select in enumerate(selects, 1):
        select_data = {
            'select_num': select_num,
            'name': select.get('name', 'N/A'),
            'groups': []
        }
        # Add to markdown
        md_content.append(f"\n## Select #{select_num} (name: {select.get('name', 'N/A')})")
        optgroups = select.find_all('optgroup')
        if optgroups:
            for optgroup in optgroups:
                current_category = optgroup.get('label', 'No Label')
                group_data = {
                    'title': current_category,
                    'options': []
                }
                # Add to markdown
                md_content.append(f"\n### {current_category}")
                md_content.append("\n| Title | Price | Popularity |")
                md_content.append("|-------|-------|------------|")
                options = optgroup.find_all('option')
                for option in options:
                    if not option.get('disabled'):  # Skip disabled options
                        title, price, popularity = parse_option_text(option.get_text(strip=True))
                        # Add to JSON
                        option_data = {
                            'title': title,
                            'price': price,
                            'popularity': popularity
                        }
                        group_data['options'].append(option_data)
                        # Add to markdown
                        md_content.append(f"| {title} | {price} | {popularity} |")
                select_data['groups'].append(group_data)
                md_content.append("")  # Add blank line after table
        json_results.append(select_data)
    return json_results, "\n".join(md_content)


def save_json_with_encoding(data, filename):
    """Save JSON with UTF-8 encoding"""
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


def save_markdown(content, filename):
    """Save markdown content with UTF-8 encoding"""
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(content)


def main():
    parser = argparse.ArgumentParser(description='Parse COOLPC select options')
    parser.add_argument('--refresh', action='store_true',
                        help='Force refresh cache and fetch new data')
    parser.add_argument('--url', default="https://www.coolpc.com.tw/evaluate.php",
                        help='URL to fetch (default: COOLPC evaluate page)')
    parser.add_argument('--cache', default="coolpc_cache.html",
                        help='Cache file path (default: coolpc_cache.html)')
    parser.add_argument('--json', default="coolpc_options.json",
                        help='Output JSON file path (default: coolpc_options.json)')
    parser.add_argument('--md', default="coolpc_options.md",
                        help='Output Markdown file path (default: coolpc_options.md)')
    args = parser.parse_args()
    try:
        # Show cache status
        if os.path.exists(args.cache) and not args.refresh:
            cache_time = datetime.fromtimestamp(os.path.getmtime(args.cache))
            print(f"Using cached data from: {cache_time}")
        elif args.refresh:
            print("Forcing cache refresh...")
        else:
            print("No cache found, will fetch fresh data...")
        # Fetch/load HTML
        html_content = fetch_or_load_html(args.url, args.cache, args.refresh)
        json_results, md_content = extract_select_options(html_content)
        # Save results
        if json_results:
            save_json_with_encoding(json_results, args.json)
            print(f"\nJSON results saved to {args.json}")
        if md_content:
            save_markdown(md_content, args.md)
            print(f"Markdown results saved to {args.md}")
    except Exception as e:
        print(f"Error: {e}")


if __name__ == "__main__":
    main()
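
Both versions emit the same JSON schema, so the output can be post-processed directly. A sketch that loads the JSON back and prints the cheapest option in each category (field names match the scraper output above; the int() conversion assumes the "$1,234"-style prices the regex extracts):

import json

with open('coolpc_options.json', encoding='utf-8') as f:
    selects = json.load(f)

for select in selects:
    for group in select['groups']:
        priced = [o for o in group['options'] if o['price'].startswith('$')]
        if priced:
            cheapest = min(priced, key=lambda o: int(o['price'][1:].replace(',', '')))
            print(f"{group['title']}: {cheapest['title']} ({cheapest['price']})")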