Last active
September 29, 2025 19:41
-
-
Save TechByTom/697d9c5b87121ace3039d82bd4fe9c9c to your computer and use it in GitHub Desktop.
Get Fortune Companies from Fortune.com
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Fortune 500/1000 Company Fetcher | |
| Author's Note: | |
| I have been looking for a complete and updated list of F500 companies for some | |
| work I'm doing, and nothing seems to exist. This should (for now) automatically | |
| retrieve the list from fortune.com - they annoyingly don't have a public plain | |
| text list. It also does you the favor of creating a SQLite database populated | |
| with the info. Enjoy! | |
| This script fetches Fortune 500/1000 companies from fortune.com's API and | |
| creates a SQLite database with company information including rank, revenue, | |
| industry, and URLs. | |
| =============================================================================== | |
| COMMON USAGE EXAMPLES | |
| =============================================================================== | |
| SCRAPING COMMANDS: | |
| ------------------ | |
| # Fetch Fortune 500 (2024, basic data only) | |
| python3 fetch_by_revenue.py 500 | |
| # Fetch Fortune 1000 with detailed information (2024) | |
| python3 fetch_by_revenue.py 1000 --details | |
| # Fetch all companies for 2025 with details | |
| python3 fetch_by_revenue.py all 2025 --details | |
| # Fetch top 100 companies (2024, basic data) | |
| python3 fetch_by_revenue.py 100 | |
| # Fetch Fortune 500 for 2023 with details | |
| python3 fetch_by_revenue.py 500 2023 -d | |
| # Fetch top 2000 companies (2024, basic data) | |
| python3 fetch_by_revenue.py 2000 | |
| DATABASE QUERIES: | |
| ----------------- | |
| # Top 500 US companies by revenue | |
| sqlite3 -header -column fortune500.db " | |
| SELECT ROW_NUMBER() OVER (ORDER BY revenue DESC) as rank, | |
| name, revenue, industry, headquarters, ceo | |
| FROM companies | |
| WHERE country = 'U.S.' AND revenue IS NOT NULL AND revenue > 0 | |
| ORDER BY revenue DESC | |
| LIMIT 500; | |
| " | |
| # Export top 500 US companies to CSV | |
| sqlite3 -header -csv fortune500.db " | |
| SELECT ROW_NUMBER() OVER (ORDER BY revenue DESC) as rank, | |
| name, revenue, profits, employees, industry, headquarters, | |
| ceo, website, ticker, company_type | |
| FROM companies | |
| WHERE country = 'U.S.' AND revenue IS NOT NULL AND revenue > 0 | |
| ORDER BY revenue DESC | |
| LIMIT 500; | |
| " > fortune500_us.csv | |
| # Find all tech companies in top 1000 | |
| sqlite3 -header -column fortune500.db " | |
| SELECT rank, name, revenue, headquarters, ceo | |
| FROM companies | |
| WHERE industry LIKE '%Tech%' | |
| OR industry LIKE '%Software%' | |
| OR industry LIKE '%Internet%' | |
| ORDER BY revenue DESC | |
| LIMIT 50; | |
| " | |
| # Top 10 most profitable companies | |
| sqlite3 -header -column fortune500.db " | |
| SELECT rank, name, revenue, profits, | |
| ROUND(profits * 100.0 / revenue, 2) as profit_margin | |
| FROM companies | |
| WHERE profits IS NOT NULL AND revenue IS NOT NULL | |
| ORDER BY profits DESC | |
| LIMIT 10; | |
| " | |
| # All public companies with market value > $100B | |
| sqlite3 -header -column fortune500.db " | |
| SELECT rank, name, revenue, market_value, ticker | |
| FROM companies | |
| WHERE company_type = 'Public' | |
| AND market_value > 100000 | |
| ORDER BY market_value DESC; | |
| " | |
| # Companies by industry (count and total revenue) | |
| sqlite3 -header -column fortune500.db " | |
| SELECT industry, | |
| COUNT(*) as company_count, | |
| ROUND(SUM(revenue), 0) as total_revenue, | |
| ROUND(AVG(revenue), 0) as avg_revenue | |
| FROM companies | |
| WHERE industry IS NOT NULL AND revenue IS NOT NULL | |
| GROUP BY industry | |
| ORDER BY total_revenue DESC | |
| LIMIT 20; | |
| " | |
| # Find a specific company | |
| sqlite3 -header -column fortune500.db " | |
| SELECT rank, name, revenue, profits, employees, | |
| headquarters, ceo, website, ticker | |
| FROM companies | |
| WHERE name LIKE '%Apple%'; | |
| " | |
| # All companies in a specific state | |
| sqlite3 -header -column fortune500.db " | |
| SELECT rank, name, revenue, headquarters, industry | |
| FROM companies | |
| WHERE headquarters LIKE '%California%' | |
| OR headquarters LIKE '%Calif.%' | |
| OR headquarters LIKE '%CA%' | |
| ORDER BY revenue DESC; | |
| " | |
| # Top 100 by employee count | |
| sqlite3 -header -column fortune500.db " | |
| SELECT rank, name, employees, revenue, industry, headquarters | |
| FROM companies | |
| WHERE employees IS NOT NULL | |
| ORDER BY employees DESC | |
| LIMIT 100; | |
| " | |
| # All US companies ranked 1-1000 by revenue | |
| sqlite3 -header -csv fortune500.db " | |
| SELECT ROW_NUMBER() OVER (ORDER BY revenue DESC) as us_rank, | |
| name, revenue, profits, employees, industry, | |
| headquarters, ceo, website, ticker, company_type | |
| FROM companies | |
| WHERE country = 'U.S.' AND revenue IS NOT NULL AND revenue > 0 | |
| ORDER BY revenue DESC | |
| LIMIT 1000; | |
| " > fortune1000_us_only.csv | |
| # Companies with details fetched vs not fetched | |
| sqlite3 -header -column fortune500.db " | |
| SELECT | |
| COUNT(*) as total, | |
| SUM(CASE WHEN details_fetched = 1 THEN 1 ELSE 0 END) as with_details, | |
| SUM(CASE WHEN details_fetched = 0 OR details_fetched IS NULL THEN 1 ELSE 0 END) as without_details | |
| FROM companies; | |
| " | |
| # Search by CEO name | |
| sqlite3 -header -column fortune500.db " | |
| SELECT rank, name, ceo, revenue, industry, headquarters | |
| FROM companies | |
| WHERE ceo LIKE '%Satya%' | |
| OR ceo LIKE '%Tim Cook%' | |
| ORDER BY revenue DESC; | |
| " | |
| =============================================================================== | |
| """ | |
| import requests | |
| import sqlite3 | |
| import time | |
| import sys | |
| import re | |
| import json | |
| from typing import List, Dict, Optional | |
class FortuneLargeCompaniesFetcher:
    """Fetch company listings from fortune.com and store them in SQLite.

    The listing endpoint (``base_url``) is paged; each company's detail page
    embeds a ``__NEXT_DATA__`` JSON blob that is parsed for extra fields
    (CEO, headquarters, ticker, ...). All database methods operate on the
    SQLite file named by ``db_path``.
    """

    # Default SQLite file. Kept as a class attribute so it is available even
    # on instances created without running __init__; override per instance
    # through the ``db_path`` constructor argument.
    db_path = 'fortune500.db'

    def __init__(self, year: int = 2024, db_path: Optional[str] = None):
        """Create a fetcher for the given ranking ``year``.

        Args:
            year: Ranking year sent to the API as ``rankingYear``.
            db_path: Optional SQLite file path; defaults to ``fortune500.db``.
        """
        self.base_url = "https://fortune.com/api/page/directory/companies"
        self.year = year
        if db_path is not None:
            self.db_path = db_path
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'application/json, text/plain, */*',
            'Referer': 'https://fortune.com/companies/',
        }
        self.session = requests.Session()
        self.session.headers.update(self.headers)

    @staticmethod
    def _to_float(value) -> Optional[float]:
        """Parse a value such as ``"$1,234.5"`` (or a plain number) to float.

        Returns None for falsy input (None, ``""``, 0) and for text that is
        not numeric once ``$`` and ``,`` are stripped.
        """
        if not value:
            return None
        cleaned = str(value).replace('$', '').replace(',', '').strip()
        try:
            return float(cleaned) if cleaned else None
        except ValueError:
            return None

    @staticmethod
    def _to_int(value) -> Optional[int]:
        """Parse a value such as ``"2,100,000"`` to int, or None if not possible."""
        if not value:
            return None
        cleaned = str(value).replace(',', '').strip()
        try:
            return int(cleaned) if cleaned else None
        except ValueError:
            return None

    def fetch_page(self, page: int, min_revenue: Optional[int] = None,
                   use_ranking_filter: bool = True) -> Optional[Dict]:
        """Fetch one page of the company listing.

        Args:
            page: 1-based page number appended to the listing URL.
            min_revenue: If truthy, sent as the ``minRevenue`` query parameter.
            use_ranking_filter: If True, restrict to ``rankingId`` 2
                (Fortune 500/1000) for ``self.year``.

        Returns:
            The decoded JSON body, or None on a non-200 status or any error.
        """
        url = f"{self.base_url}/{page}/"
        params = {}
        # Only use ranking filter if requested (limits to 1000 companies)
        if use_ranking_filter:
            params['rankingId'] = 2  # Fortune 500/1000
            params['rankingYear'] = self.year
        if min_revenue:
            params['minRevenue'] = min_revenue
        try:
            print(f"Fetching page {page} with minRevenue={min_revenue}...")
            response = self.session.get(url, params=params, timeout=10)
            if response.status_code == 200:
                return response.json()
            print(f" Failed with status {response.status_code}")
            return None
        except Exception as e:
            # Deliberately broad: fetching is best-effort and the caller
            # treats None as "stop paging".
            print(f" Error: {e}")
            return None

    def fetch_all_large_companies(self, min_revenue: Optional[int] = None,
                                  max_companies: Optional[int] = None) -> List[Dict]:
        """Page through the listing and return the processed companies.

        Paging stops when a page fails or is empty, when at least
        ``max_companies`` companies have been collected, when the API reports
        no more pages, or when the safety page limit is hit. Pages are kept
        whole, so the result may hold slightly more than ``max_companies``.

        Args:
            min_revenue: Optional minimum revenue filter passed to the API.
            max_companies: Optional target count; None means fetch everything.

        Returns:
            A list of dicts as produced by ``process_company``.
        """
        all_companies: List[Dict] = []
        page = 1
        total_pages = None
        # Don't use ranking filter if we need more than 1000 companies or want all
        # Use ranking filter only when max_companies is set and <= 1000
        use_ranking_filter = max_companies is not None and max_companies <= 1000
        # Safety limit on pages; the arithmetic assumes 30 companies per page.
        # It does not change during the loop, so it is computed once here.
        if max_companies:
            safety_limit = (max_companies // 30) + 10  # Add buffer
        else:
            safety_limit = 300  # Default max

        if min_revenue:
            print(f"Fetching companies with revenue >= ${min_revenue}M")
        elif max_companies:
            print(f"Fetching companies from Fortune API (target: {max_companies})")
        else:
            print("Fetching all companies from Fortune API")
        print("=" * 60)

        while True:
            data = self.fetch_page(page, min_revenue, use_ranking_filter)
            if not data:
                print(f"No data returned for page {page}")
                break
            # Extract companies from response
            items = data.get('items', [])
            if not items:
                print(f"No items found on page {page}")
                break
            # Process each company
            for item in items:
                company = self.process_company(item)
                if company:
                    all_companies.append(company)
            print(f" Page {page}: Found {len(items)} companies (Total: {len(all_companies)})")
            # Check pagination ("or" guards against an explicit null value)
            pagination = data.get('pagination') or {}
            if total_pages is None:
                total_pages = pagination.get('lastPage') or 1
                total_items = pagination.get('total', 0)
                print(f" Total items available: {total_items}")
                print(f" Total pages: {total_pages}")
            # Stop if we have enough companies
            if max_companies and len(all_companies) >= max_companies:
                print(f"Reached target of {max_companies} companies")
                break
            # Check if there are more pages
            if not pagination.get('hasMorePages', False):
                print("No more pages available")
                break
            if page >= total_pages:
                print(f"Reached last page ({total_pages})")
                break
            page += 1
            time.sleep(0.5)  # Be respectful
            if page > safety_limit:
                print(f"Reached safety limit of {safety_limit} pages")
                break
        return all_companies

    def process_company(self, item: Dict) -> Dict:
        """Convert one raw listing item into the dict stored in the database.

        ``revenues`` may be a formatted string (``"$1,234"``) or a number; it
        is parsed to float, or None when missing or unparseable. The permalink
        falls back to ``https://fortune.com`` + ``uri`` when absent.
        """
        # Get the permalink/URL
        permalink = item.get('permalink', '')
        if not permalink and item.get('uri'):
            permalink = 'https://fortune.com' + item.get('uri')
        return {
            'name': item.get('name'),
            'revenue': self._to_float(item.get('revenues', '')),
            'country': item.get('country'),
            'permalink': permalink,
            'slug': item.get('slug'),
            'industry': item.get('industry'),
            'employees': item.get('employees'),
        }

    def fetch_company_details(self, company_url: str) -> Optional[Dict]:
        """Fetch a company page and extract its ``companyInfo`` fields.

        Returns:
            A dict with headquarters, industry, ceo, website, ticker,
            company_type, revenues, profits, market_value and num_employees,
            or None if the page cannot be fetched or lacks the expected data.
        """
        try:
            response = self.session.get(company_url, timeout=10)
            if response.status_code != 200:
                return None
            html = response.text
            # Extract __NEXT_DATA__ JSON
            match = re.search(r'<script id="__NEXT_DATA__"[^>]*>({.*?})</script>', html, re.DOTALL)
            if not match:
                return None
            data = json.loads(match.group(1))
            page_props = data.get('props', {}).get('pageProps', {})
            company = page_props.get('company', {})
            company_info = company.get('companyInfo', {})
            if not company_info:
                return None
            return {
                'headquarters': company_info.get('Headquarters'),
                'industry': company_info.get('Industry'),
                'ceo': company_info.get('CEO'),
                'website': company_info.get('Website'),
                'ticker': company_info.get('Ticker'),
                'company_type': company_info.get('Company type'),
                'revenues': self._to_float(company_info.get('Revenues ($M)')),
                'profits': self._to_float(company_info.get('Profits ($M)')),
                'market_value': self._to_float(company_info.get('Market value ($M)')),
                'num_employees': self._to_int(company_info.get('Number of employees')),
            }
        except Exception as e:
            # Deliberately broad: a malformed page must not abort the whole
            # enrichment run; the caller skips companies that return None.
            print(f"Error fetching details: {e}")
            return None

    def rank_companies(self, cursor, max_rank: int = 1000):
        """Recompute ``rank`` by revenue for the top ``max_rank`` companies.

        All ranks are cleared first; rows with a NULL revenue are never
        ranked. Ties are broken by name. Nothing is committed here: the
        caller owns the connection behind ``cursor`` and must commit.
        """
        print("\n" + "=" * 60)
        print(f"Ranking companies by revenue (Top {max_rank})...")
        print("=" * 60)
        cursor.execute("UPDATE companies SET rank = NULL")
        cursor.execute("""
            WITH ranked_companies AS (
                SELECT
                    id,
                    ROW_NUMBER() OVER (ORDER BY revenue DESC, name ASC) as new_rank
                FROM companies
                WHERE revenue IS NOT NULL
            )
            UPDATE companies
            SET rank = (
                SELECT new_rank
                FROM ranked_companies
                WHERE ranked_companies.id = companies.id
            )
            WHERE id IN (
                SELECT id FROM ranked_companies WHERE new_rank <= ?
            )
        """, (max_rank,))

    def update_database(self, companies: List[Dict]) -> tuple:
        """Insert or update ``companies`` in the ``companies`` table.

        The table is created if missing, and a fixed set of newer columns is
        added to pre-existing tables. A company is matched on name or slug.
        A failure on one company is printed and the rest are still processed.

        Returns:
            ``(added, updated)`` row counts.
        """
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            # Create table if it doesn't exist
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS companies (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    rank INTEGER,
                    name TEXT NOT NULL,
                    revenue REAL,
                    profits REAL,
                    employees INTEGER,
                    industry TEXT,
                    headquarters TEXT,
                    hq_city TEXT,
                    hq_state TEXT,
                    country TEXT,
                    ceo TEXT,
                    website TEXT,
                    domain TEXT,
                    ticker TEXT,
                    company_type TEXT,
                    market_value REAL,
                    fortune_url TEXT,
                    slug TEXT,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    details_fetched BOOLEAN DEFAULT 0
                )
            """)
            # Ensure we have needed columns (for existing databases)
            cursor.execute("PRAGMA table_info(companies)")
            columns = [col[1] for col in cursor.fetchall()]
            # Add missing columns to existing databases
            column_definitions = {
                'slug': 'TEXT',
                'headquarters': 'TEXT',
                'ceo': 'TEXT',
                'ticker': 'TEXT',
                'company_type': 'TEXT',
                'market_value': 'REAL',
                'details_fetched': 'BOOLEAN DEFAULT 0',
            }
            for col_name, col_type in column_definitions.items():
                if col_name not in columns:
                    # Names and types come from the literal dict above, never
                    # from external input, so string-building is safe here.
                    cursor.execute(f"ALTER TABLE companies ADD COLUMN {col_name} {col_type}")

            added = 0
            updated = 0
            for company in companies:
                try:
                    # Check if company exists
                    cursor.execute("""
                        SELECT id, revenue FROM companies
                        WHERE name = ? OR slug = ?
                    """, (company['name'], company.get('slug')))
                    if cursor.fetchone():
                        # Update existing company
                        cursor.execute("""
                            UPDATE companies
                            SET revenue = ?, country = ?, fortune_url = ?,
                                slug = ?, industry = ?, employees = ?
                            WHERE name = ? OR slug = ?
                        """, (
                            company['revenue'], company['country'],
                            company['permalink'], company['slug'],
                            company.get('industry'), company.get('employees'),
                            company['name'], company.get('slug'),
                        ))
                        updated += cursor.rowcount
                    else:
                        # Insert new company
                        cursor.execute("""
                            INSERT INTO companies
                            (name, revenue, country, fortune_url, slug, industry, employees)
                            VALUES (?, ?, ?, ?, ?, ?, ?)
                        """, (
                            company['name'], company['revenue'], company['country'],
                            company['permalink'], company['slug'],
                            company.get('industry'), company.get('employees'),
                        ))
                        added += cursor.rowcount
                except Exception as e:
                    # Best-effort per row: report and carry on with the rest.
                    print(f"Error processing {company.get('name')}: {e}")
            conn.commit()
        finally:
            # Close the connection even if table setup or the commit fails.
            conn.close()
        return added, updated

    def enrich_company_details(self, fetch_details: bool = True):
        """Fetch detail pages for every stored company that still lacks them.

        Companies are processed in rank order, unranked ones last. Progress is
        committed every 50 companies so an interrupted run keeps what it has
        already fetched.

        Args:
            fetch_details: When False the method does nothing and returns 0.

        Returns:
            Number of companies successfully enriched.
        """
        if not fetch_details:
            return 0
        conn = sqlite3.connect(self.db_path)
        try:
            cursor = conn.cursor()
            # Get companies that need details fetched.
            # "rank IS NULL, rank" sorts unranked rows last, equivalent to
            # "rank NULLS LAST" but also valid on SQLite older than 3.30.
            cursor.execute("""
                SELECT id, name, fortune_url
                FROM companies
                WHERE fortune_url IS NOT NULL
                  AND fortune_url != ''
                  AND (details_fetched IS NULL OR details_fetched = 0)
                ORDER BY rank IS NULL, rank
            """)
            companies_to_fetch = cursor.fetchall()
            total = len(companies_to_fetch)
            if total == 0:
                print("\nAll companies already have detailed information")
                return 0
            print(f"\nFetching detailed information for {total} companies...")
            print("=" * 60)
            print("This will take a while (0.5s per company)...")
            enriched = 0
            for idx, (company_id, _name, url) in enumerate(companies_to_fetch, 1):
                if idx % 50 == 0:
                    print(f" Progress: {idx}/{total} ({(idx/total)*100:.1f}%)")
                    conn.commit()  # checkpoint so an interruption keeps progress
                details = self.fetch_company_details(url)
                if details:
                    cursor.execute("""
                        UPDATE companies
                        SET headquarters = ?, industry = ?, ceo = ?,
                            website = ?, ticker = ?, company_type = ?,
                            profits = ?, market_value = ?,
                            employees = COALESCE(?, employees),
                            revenue = COALESCE(?, revenue),
                            details_fetched = 1
                        WHERE id = ?
                    """, (
                        details['headquarters'], details['industry'], details['ceo'],
                        details['website'], details['ticker'], details['company_type'],
                        details['profits'], details['market_value'],
                        details['num_employees'], details['revenues'],
                        company_id,
                    ))
                    enriched += 1
                time.sleep(0.5)  # Be respectful to the server
            conn.commit()
        finally:
            # Runs on the early return above too, so the connection never leaks.
            conn.close()
        print(f"\nSuccessfully enriched {enriched} companies with detailed information")
        return enriched
def test_revenue_thresholds(year: int = 2024):
    """Print how many companies the API reports at several revenue floors.

    Only the first listing page is requested per floor; the count is read
    from that page's pagination ``total``. Floors whose request fails or
    whose response has no pagination block are skipped silently.
    """
    client = FortuneLargeCompaniesFetcher(year)
    revenue_floors = (5000, 7000, 10000, 15000, 20000)

    print("Testing Revenue Thresholds")
    print("=" * 60)

    for threshold in revenue_floors:
        # Page 1 is enough: the total lives in the pagination metadata.
        first_page = client.fetch_page(1, threshold)
        if not first_page or 'pagination' not in first_page:
            continue
        total = first_page['pagination'].get('total', 0)
        print(f"Revenue >= ${threshold:,}M: {total} companies")
def _print_intro_banner():
    """Print the explanatory banner shown when no positional arguments are given."""
    print("=" * 70)
    print("Fortune Company Database Fetcher")
    print("=" * 70)
    print()
    print("This script fetches company data from Fortune.com and ranks by revenue.")
    print()
    print("FORTUNE LISTS:")
    print(" • Fortune 500: Top 500 US companies by revenue (official list)")
    print(" • Fortune 1000: Top 1000 US companies by revenue (official list)")
    print()
    print("EXPANDED DATABASE:")
    print(" • Fortune.com tracks ~8,600 total companies")
    print(" • You can rank any number (e.g., top 2000, 5000, etc.)")
    print(" • Beyond 1000, rankings are calculated from all available companies")
    print()
    print("DEFAULT: Fetches all ~8,600 companies and ranks them by revenue")
    print()
    print("USAGE:")
    print(" python3 fetch_by_revenue.py [max_rank] [year] [flags]")
    print()
    print("FLAGS:")
    print(" -d, --details Fetch detailed info (CEO, HQ, ticker, etc.)")
    print()
    print("EXAMPLES:")
    print(" python3 fetch_by_revenue.py # All companies, year 2024")
    print(" python3 fetch_by_revenue.py 500 # Fortune 500, year 2024")
    print(" python3 fetch_by_revenue.py 1000 # Fortune 1000, year 2024")
    print(" python3 fetch_by_revenue.py 2000 # Top 2000, year 2024")
    print(" python3 fetch_by_revenue.py 500 2025 # Fortune 500, year 2025")
    print(" python3 fetch_by_revenue.py all # All ~8,600 companies")
    print(" python3 fetch_by_revenue.py 500 --details # Fortune 500 with details")
    print()
    print("AVAILABLE YEARS: 2023, 2024, 2025")
    print("=" * 70)


def _print_help():
    """Print the short usage text for -h / --help / help."""
    print("Fortune 500/1000 Company Fetcher")
    print("\nUsage: python3 fetch_by_revenue.py [max_rank] [year] [flags]")
    print("\nArguments:")
    print(" max_rank Number of top companies to rank (default: all)")
    print(" year Year for Fortune data: 2023, 2024, or 2025 (default: 2024)")
    print("\nFlags:")
    print(" -d, --details Fetch detailed company info (CEO, HQ, ticker, etc.)")
    print("\nExamples:")
    print(" python3 fetch_by_revenue.py # All companies, 2024")
    print(" python3 fetch_by_revenue.py 500 # Fortune 500, 2024")
    print(" python3 fetch_by_revenue.py 1000 2025 # Fortune 1000, 2025")
    print(" python3 fetch_by_revenue.py 500 --details # Fortune 500 with details")
    print(" python3 fetch_by_revenue.py all 2024 -d # All companies with details")


def _ask(prompt: str) -> str:
    """Return the stripped, lower-cased answer; '' if stdin is closed (EOF).

    An empty answer selects each prompt's default, so a non-interactive run
    behaves as if the user had just pressed Enter instead of crashing.
    """
    try:
        return input(prompt).strip().lower()
    except EOFError:
        return ''


def main():
    """Command-line entry point: fetch, store, optionally enrich, then rank.

    Usage: ``fetch_by_revenue.py [max_rank|all] [year] [-d|--details]``.
    Exits with status 1 on invalid arguments or when nothing could be
    fetched; in the latter case the existing database is left untouched.
    """
    help_flags = ('-h', '--help', 'help')
    detail_flags = ('-d', '--details', '--detailed')
    cli_args = sys.argv[1:]

    fetch_details = any(arg in detail_flags for arg in cli_args)
    args = [arg for arg in cli_args
            if arg not in detail_flags and arg not in help_flags]

    # Help wins over everything else, with or without positional arguments.
    if any(arg in help_flags for arg in cli_args):
        _print_help()
        sys.exit(0)

    max_rank = None  # None means "fetch and rank all"
    year = 2024

    if not args:
        # Show explanation if no arguments
        _print_intro_banner()
        # Prompt user
        response = _ask("\nContinue with default (all companies, 2024)? [Y/n]: ")
        if response and response not in ('y', 'yes'):
            print("Exiting...")
            sys.exit(0)
    else:
        # Parse max_rank
        if args[0].lower() != 'all':
            try:
                max_rank = int(args[0])
            except ValueError:
                print(f"Error: Invalid rank number '{args[0]}'")
                print("Usage: python3 fetch_by_revenue.py [max_rank] [year] [flags]")
                print("Example: python3 fetch_by_revenue.py 500 2024 --details")
                sys.exit(1)
            if max_rank < 1:
                print("Error: Rank must be a positive number")
                sys.exit(1)
        # Parse year if provided
        if len(args) > 1:
            try:
                year = int(args[1])
            except ValueError:
                print(f"Error: Invalid year '{args[1]}'")
                sys.exit(1)
            if year not in [2023, 2024, 2025]:
                print("Error: Year must be 2023, 2024, or 2025")
                sys.exit(1)

    rank_label = f"Top {max_rank}" if max_rank else "All Companies"
    print(f"\nFortune Large Companies Fetcher ({rank_label}, {year})")
    print("=" * 60)

    # First test thresholds
    test_revenue_thresholds(year)
    print()

    # Fetch companies from Fortune API; max_rank None means fetch all available
    fetcher = FortuneLargeCompaniesFetcher(year)
    companies = fetcher.fetch_all_large_companies(min_revenue=None, max_companies=max_rank)
    if not companies:
        # Continuing would query a table that may not exist yet, or re-rank
        # with max_rank 0 and wipe every existing rank.
        print("\nError: No companies were fetched; database left unchanged.")
        sys.exit(1)
    print(f"\nSuccessfully fetched {len(companies)} companies")

    # Set max_rank to all companies if None
    if max_rank is None:
        max_rank = len(companies)
        print(f"\nRanking all {max_rank} companies")
    elif max_rank > len(companies):
        # Warn if user requested more than available
        print(f"\nWarning: Only {len(companies)} companies fetched.")
        print(f"Ranking top {len(companies)} instead of requested {max_rank}.")
        max_rank = len(companies)

    # Note if fetching more than Fortune 1000
    if max_rank > 1000:
        print("Note: Fetching beyond Fortune 1000 (includes all companies in Fortune database)")

    # Show revenue range
    revenues = [c['revenue'] for c in companies if c['revenue']]
    if revenues:
        print("\nRevenue range:")
        print(f" Min: ${min(revenues):,.0f}M")
        print(f" Max: ${max(revenues):,.0f}M")

    # Show sample companies
    print("\nTop 10 companies by revenue:")
    sorted_companies = sorted(companies,
                              key=lambda x: x.get('revenue', 0) or 0,
                              reverse=True)
    for i, company in enumerate(sorted_companies[:10], 1):
        revenue = company.get('revenue')
        revenue_str = f"${revenue:,.0f}M" if revenue else "N/A"
        print(f" {i}. {company['name']}: {revenue_str}")

    # Update database
    added, updated = fetcher.update_database(companies)
    print("\nDatabase updated:")
    print(f" Added: {added} companies")
    print(f" Updated: {updated} companies")

    # Fetch detailed information if flag is set or ask user
    if fetch_details:
        print(f"\nDetailed information flag set - fetching details for {len(companies)} companies...")
        fetcher.enrich_company_details(fetch_details=True)
    else:
        # Ask user if they want to fetch detailed information
        print("\n" + "=" * 60)
        print("Detailed Company Information")
        print("=" * 60)
        print("Would you like to fetch detailed info for each company?")
        print("(CEO, Headquarters, Ticker, Profits, Market Value, etc.)")
        print("\nThis will fetch individual pages for companies.")
        print(f"Estimated time: ~{len(companies) * 0.5 / 60:.1f} minutes")
        print("\nTip: Use --details flag to skip this prompt")
        response = _ask("\nFetch detailed information? [y/N]: ")
        if response in ['y', 'yes']:
            fetcher.enrich_company_details(fetch_details=True)
        else:
            print("\nSkipping detailed information fetch.")

    # Check database status
    conn = sqlite3.connect('fortune500.db')
    try:
        cursor = conn.cursor()
        # Count companies with significant revenue
        cursor.execute("""
            SELECT COUNT(*) FROM companies
            WHERE revenue >= 3000
        """)
        high_revenue_count = cursor.fetchone()[0]
        cursor.execute("SELECT COUNT(*) FROM companies")
        total_count = cursor.fetchone()[0]
        print("\nDatabase status:")
        print(f" Total companies: {total_count}")
        print(f" Companies with revenue >= $3,000M: {high_revenue_count}")

        # Rank companies by revenue (rank_companies does not commit itself)
        fetcher.rank_companies(cursor, max_rank)
        conn.commit()

        # Show ranking results
        cursor.execute("SELECT COUNT(*) FROM companies WHERE rank IS NOT NULL")
        ranked_count = cursor.fetchone()[0]
        cursor.execute("""
            SELECT rank, name, revenue
            FROM companies
            WHERE rank IS NOT NULL
            ORDER BY rank
            LIMIT 5
        """)
        top_5 = cursor.fetchall()
        cursor.execute("""
            SELECT rank, name, revenue
            FROM companies
            WHERE rank IS NOT NULL
            ORDER BY rank DESC
            LIMIT 5
        """)
        bottom_5 = cursor.fetchall()
        print(f"\nSuccessfully ranked {ranked_count} companies")
        print("\nTop 5:")
        for rank, name, revenue in top_5:
            print(f" {rank}. {name}: ${revenue:,.0f}M")
        print("\nBottom 5:")
        for rank, name, revenue in reversed(bottom_5):
            print(f" {rank}. {name}: ${revenue:,.0f}M")
    finally:
        # Close the connection even if a query above fails.
        conn.close()

    print("\n" + "=" * 60)
    print(f"Fortune Top {max_rank} database ready!")
    print("=" * 60)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment