Better coolpc (the original site's UX is horrible): a scraper that fetches coolpc.com.tw's evaluate page, parses the component `<select>` menus, and exports the options as JSON and Markdown.
```python
import requests
import os
import argparse
import json
import re
from datetime import datetime
from bs4 import BeautifulSoup


def fetch_or_load_html(url, cache_file='coolpc_cache.html', force_refresh=False):
    """Fetch HTML from URL or load from cache based on settings."""
    try:
        # Check if cache exists and should be used
        if not force_refresh and os.path.exists(cache_file):
            cache_time = datetime.fromtimestamp(os.path.getmtime(cache_file))
            print(f"Debug: Found cache file from {cache_time}")
            with open(cache_file, 'r', encoding='cp950', errors='replace') as f:
                return f.read()

        # Fetch from URL
        print(f"Debug: Fetching fresh copy from {url}")
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        # Explicitly set CP950 encoding (the site serves Big5/CP950)
        response.encoding = 'cp950'

        # Save to cache using CP950 encoding
        with open(cache_file, 'w', encoding='cp950', errors='replace') as f:
            f.write(response.text)
        print(f"Debug: Saved new cache file at {datetime.now()}")
        return response.text

    except requests.RequestException as e:
        print(f"Error fetching URL: {e}")
        if os.path.exists(cache_file):
            print("Debug: Using existing cache file due to fetch error")
            with open(cache_file, 'r', encoding='cp950', errors='replace') as f:
                return f.read()
        raise


def parse_option_text(text):
    """Parse the option text into title, price, and popularity."""
    title = ""
    price = "N/A"
    popularity = ""

    # Extract price
    price_match = re.search(r'\$\d+,?\d*', text)
    if price_match:
        price = price_match.group(0)

    # Get popularity indicators
    popularity = '★' if '★' in text else ''
    popularity = '♥' if '♥' in text else popularity

    # Get title
    if price_match:
        title = text[:price_match.start()].strip(' ,')
    else:
        title = text.strip()

    return title, price, popularity


def extract_select_options(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    tbody = soup.find('tbody', id='tbdy')
    if not tbody:
        print("Debug: tbody with id 'tbdy' not found")
        return None, None

    json_results = []
    md_content = []  # Store markdown content

    selects = tbody.find_all('select')
    print(f"Debug: Found {len(selects)} select elements\n")

    for select_num, select in enumerate(selects, 1):
        select_data = {
            'select_num': select_num,
            'name': select.get('name', 'N/A'),
            'groups': []
        }

        # Add to markdown
        md_content.append(f"\n## Select #{select_num} (name: {select.get('name', 'N/A')})")

        optgroups = select.find_all('optgroup')
        if optgroups:
            for optgroup in optgroups:
                current_category = optgroup.get('label', 'No Label')
                group_data = {
                    'title': current_category,
                    'options': []
                }

                # Add to markdown
                md_content.append(f"\n### {current_category}")
                md_content.append("\n| Title | Price | Popularity |")
                md_content.append("|-------|-------|------------|")

                options = optgroup.find_all('option')
                for option in options:
                    if not option.get('disabled'):  # Skip disabled options
                        title, price, popularity = parse_option_text(option.get_text(strip=True))

                        # Add to JSON
                        option_data = {
                            'title': title,
                            'price': price,
                            'popularity': popularity
                        }
                        group_data['options'].append(option_data)

                        # Add to markdown
                        md_content.append(f"| {title} | {price} | {popularity} |")

                select_data['groups'].append(group_data)
                md_content.append("")  # Add blank line after table

        json_results.append(select_data)

    return json_results, "\n".join(md_content)


def save_json_with_encoding(data, filename):
    """Save JSON with proper encoding handling."""
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
    except UnicodeEncodeError:
        # Fallback to CP950 if UTF-8 fails
        with open(filename, 'w', encoding='cp950', errors='replace') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)


def save_markdown(content, filename):
    """Save markdown content with proper encoding."""
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(content)
    except UnicodeEncodeError:
        with open(filename, 'w', encoding='cp950', errors='replace') as f:
            f.write(content)


def main():
    parser = argparse.ArgumentParser(description='Parse COOLPC select options')
    parser.add_argument('--refresh', action='store_true',
                        help='Force refresh cache and fetch new data')
    parser.add_argument('--url', default="https://www.coolpc.com.tw/evaluate.php",
                        help='URL to fetch (default: COOLPC evaluate page)')
    parser.add_argument('--cache', default="coolpc_cache.html",
                        help='Cache file path (default: coolpc_cache.html)')
    parser.add_argument('--json', default="coolpc_options.json",
                        help='Output JSON file path (default: coolpc_options.json)')
    parser.add_argument('--md', default="coolpc_options.md",
                        help='Output Markdown file path (default: coolpc_options.md)')
    args = parser.parse_args()

    try:
        # Show cache status
        if os.path.exists(args.cache) and not args.refresh:
            cache_time = datetime.fromtimestamp(os.path.getmtime(args.cache))
            print(f"Using cached data from: {cache_time}")
        elif args.refresh:
            print("Forcing cache refresh...")
        else:
            print("No cache found, will fetch fresh data...")

        # Fetch/load HTML
        html_content = fetch_or_load_html(args.url, args.cache, args.refresh)
        json_results, md_content = extract_select_options(html_content)

        # Save results
        if json_results:
            save_json_with_encoding(json_results, args.json)
            print(f"\nJSON results saved to {args.json}")
        if md_content:
            save_markdown(md_content, args.md)
            print(f"Markdown results saved to {args.md}")

    except Exception as e:
        print(f"Error: {e}")


if __name__ == "__main__":
    main()
```
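For reference, a minimal check of `parse_option_text`. The option label below is a hypothetical example shaped like the entries on the evaluate page, not text taken from the site:

```python
# Hypothetical option label; real labels come from the <option> text on evaluate.php.
sample = "Intel i5-12400【6核/12緒】2.5GHz(↑4.4G)/18M/UHD730/65W, $5990 ★"
title, price, popularity = parse_option_text(sample)
print(title)       # Intel i5-12400【6核/12緒】2.5GHz(↑4.4G)/18M/UHD730/65W
print(price)       # $5990
print(popularity)  # ★
```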
A second copy of the script in this gist swaps the CP950 round-trip for a one-time Big5-to-UTF-8 conversion at fetch time, so the cache, JSON, and Markdown outputs are all plain UTF-8, and it maps the popularity marks to emoji:
```python
import requests
import os
import argparse
import json
import re
from datetime import datetime
from bs4 import BeautifulSoup


def fetch_or_load_html(url, cache_file='coolpc_cache.html', force_refresh=False):
    """Fetch HTML from URL or load from cache based on settings."""
    try:
        # Check if cache exists and should be used
        if not force_refresh and os.path.exists(cache_file):
            cache_time = datetime.fromtimestamp(os.path.getmtime(cache_file))
            print(f"Debug: Found cache file from {cache_time}")
            with open(cache_file, 'r', encoding='utf-8') as f:
                return f.read()

        # Fetch from URL
        print(f"Debug: Fetching fresh copy from {url}")
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        # Convert content from Big5 to UTF-8
        content = response.content.decode('big5', errors='replace')

        # Save to cache using UTF-8
        with open(cache_file, 'w', encoding='utf-8') as f:
            f.write(content)
        print(f"Debug: Saved new cache file at {datetime.now()}")
        return content

    except requests.RequestException as e:
        print(f"Error fetching URL: {e}")
        if os.path.exists(cache_file):
            print("Debug: Using existing cache file due to fetch error")
            with open(cache_file, 'r', encoding='utf-8') as f:
                return f.read()
        raise


def parse_option_text(text):
    """Parse the option text into title, price, and popularity."""
    title = ""
    price = "N/A"
    popularity = ""

    # Extract price
    price_match = re.search(r'\$\d+,?\d*', text)
    if price_match:
        price = price_match.group(0)

    # Map the site's popularity marks to emoji
    if '★' in text:
        popularity = '⭐'  # Unicode star
    elif '♥' in text:
        popularity = '❤️'  # Unicode heart

    # Get title
    if price_match:
        title = text[:price_match.start()].strip(' ,')
    else:
        title = text.strip()

    return title, price, popularity


def extract_select_options(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    tbody = soup.find('tbody', id='tbdy')
    if not tbody:
        print("Debug: tbody with id 'tbdy' not found")
        return None, None

    json_results = []
    md_content = []  # Store markdown content

    selects = tbody.find_all('select')
    print(f"Debug: Found {len(selects)} select elements\n")

    for select_num, select in enumerate(selects, 1):
        select_data = {
            'select_num': select_num,
            'name': select.get('name', 'N/A'),
            'groups': []
        }

        # Add to markdown
        md_content.append(f"\n## Select #{select_num} (name: {select.get('name', 'N/A')})")

        optgroups = select.find_all('optgroup')
        if optgroups:
            for optgroup in optgroups:
                current_category = optgroup.get('label', 'No Label')
                group_data = {
                    'title': current_category,
                    'options': []
                }

                # Add to markdown
                md_content.append(f"\n### {current_category}")
                md_content.append("\n| Title | Price | Popularity |")
                md_content.append("|-------|-------|------------|")

                options = optgroup.find_all('option')
                for option in options:
                    if not option.get('disabled'):  # Skip disabled options
                        title, price, popularity = parse_option_text(option.get_text(strip=True))

                        # Add to JSON
                        option_data = {
                            'title': title,
                            'price': price,
                            'popularity': popularity
                        }
                        group_data['options'].append(option_data)

                        # Add to markdown
                        md_content.append(f"| {title} | {price} | {popularity} |")

                select_data['groups'].append(group_data)
                md_content.append("")  # Add blank line after table

        json_results.append(select_data)

    return json_results, "\n".join(md_content)


def save_json_with_encoding(data, filename):
    """Save JSON with UTF-8 encoding."""
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


def save_markdown(content, filename):
    """Save markdown content with UTF-8 encoding."""
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(content)


def main():
    parser = argparse.ArgumentParser(description='Parse COOLPC select options')
    parser.add_argument('--refresh', action='store_true',
                        help='Force refresh cache and fetch new data')
    parser.add_argument('--url', default="https://www.coolpc.com.tw/evaluate.php",
                        help='URL to fetch (default: COOLPC evaluate page)')
    parser.add_argument('--cache', default="coolpc_cache.html",
                        help='Cache file path (default: coolpc_cache.html)')
    parser.add_argument('--json', default="coolpc_options.json",
                        help='Output JSON file path (default: coolpc_options.json)')
    parser.add_argument('--md', default="coolpc_options.md",
                        help='Output Markdown file path (default: coolpc_options.md)')
    args = parser.parse_args()

    try:
        # Show cache status
        if os.path.exists(args.cache) and not args.refresh:
            cache_time = datetime.fromtimestamp(os.path.getmtime(args.cache))
            print(f"Using cached data from: {cache_time}")
        elif args.refresh:
            print("Forcing cache refresh...")
        else:
            print("No cache found, will fetch fresh data...")

        # Fetch/load HTML
        html_content = fetch_or_load_html(args.url, args.cache, args.refresh)
        json_results, md_content = extract_select_options(html_content)

        # Save results
        if json_results:
            save_json_with_encoding(json_results, args.json)
            print(f"\nJSON results saved to {args.json}")
        if md_content:
            save_markdown(md_content, args.md)
            print(f"Markdown results saved to {args.md}")

    except Exception as e:
        print(f"Error: {e}")


if __name__ == "__main__":
    main()
```
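Run it with no arguments to reuse the cache, or with `--refresh` to force a fresh fetch; `--json` and `--md` override the output paths. The functions can also be used directly from Python; a sketch, assuming the script is saved as `coolpc.py` (the gist does not name the file):

```python
# Programmatic use; the module name "coolpc" is an assumption.
from coolpc import fetch_or_load_html, extract_select_options, save_json_with_encoding, save_markdown

html = fetch_or_load_html("https://www.coolpc.com.tw/evaluate.php", force_refresh=True)
json_results, md_content = extract_select_options(html)
if json_results:
    save_json_with_encoding(json_results, "parts.json")
if md_content:
    save_markdown(md_content, "parts.md")
```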