TechByTom · September 29, 2025 19:41
diff --git a/fetch_by_revenue.py b/fetch_by_revenue.py
 #!/usr/bin/env python3
 """
  Fortune 500/1000 Company Fetcher

  Author's Note:
  I have been looking for a complete and updated list of F500 companies for some
  work I'm doing, and nothing seems to exist. This should (for now) automatically
  retrieve the list from fortune.com - they annoyingly don't have a public plain
  text list. It also does you the favor of creating a SQLite database populated
  with the info. Enjoy!

  This script fetches Fortune 500/1000 companies from fortune.com's API and
  creates a SQLite database with company information including rank, revenue,
  industry, and URLs.

  ===============================================================================
  COMMON USAGE EXAMPLES
  ===============================================================================

  SCRAPING COMMANDS:
  ------------------
  # Fetch Fortune 500 (2024, basic data only)
  python3 fetch_by_revenue.py 500

  # Fetch Fortune 1000 with detailed information (2024)
  python3 fetch_by_revenue.py 1000 --details

  # Fetch all companies for 2025 with details
  python3 fetch_by_revenue.py all 2025 --details

  # Fetch top 100 companies (2024, basic data)
  python3 fetch_by_revenue.py 100

  # Fetch Fortune 500 for 2023 with details
  python3 fetch_by_revenue.py 500 2023 -d

  # Fetch top 2000 companies (2024, basic data)
  python3 fetch_by_revenue.py 2000


  DATABASE QUERIES:
  -----------------

  # Top 500 US companies by revenue
  sqlite3 -header -column fortune500.db "
  SELECT ROW_NUMBER() OVER (ORDER BY revenue DESC) as rank,
         name, revenue, industry, headquarters, ceo
  FROM companies
  WHERE country = 'U.S.' AND revenue IS NOT NULL AND revenue > 0
  ORDER BY revenue DESC
  LIMIT 500;
  "

  # Export top 500 US companies to CSV
  sqlite3 -header -csv fortune500.db "
  SELECT ROW_NUMBER() OVER (ORDER BY revenue DESC) as rank,
         name, revenue, profits, employees, industry, headquarters,
         ceo, website, ticker, company_type
  FROM companies
  WHERE country = 'U.S.' AND revenue IS NOT NULL AND revenue > 0
  ORDER BY revenue DESC
  LIMIT 500;
  " > fortune500_us.csv

  # Find all tech companies in top 1000
  sqlite3 -header -column fortune500.db "
  SELECT rank, name, revenue, headquarters, ceo
  FROM companies
  WHERE industry LIKE '%Tech%'
     OR industry LIKE '%Software%'
     OR industry LIKE '%Internet%'
  ORDER BY revenue DESC
  LIMIT 50;
  "

  # Top 10 most profitable companies
  sqlite3 -header -column fortune500.db "
  SELECT rank, name, revenue, profits,
         ROUND(profits * 100.0 / revenue, 2) as profit_margin
  FROM companies
  WHERE profits IS NOT NULL AND revenue IS NOT NULL
  ORDER BY profits DESC
  LIMIT 10;
  "

  # All public companies with market value > $100B
  sqlite3 -header -column fortune500.db "
  SELECT rank, name, revenue, market_value, ticker
  FROM companies
  WHERE company_type = 'Public'
    AND market_value > 100000
  ORDER BY market_value DESC;
  "

  # Companies by industry (count and total revenue)
  sqlite3 -header -column fortune500.db "
  SELECT industry,
         COUNT(*) as company_count,
         ROUND(SUM(revenue), 0) as total_revenue,
         ROUND(AVG(revenue), 0) as avg_revenue
  FROM companies
  WHERE industry IS NOT NULL AND revenue IS NOT NULL
  GROUP BY industry
  ORDER BY total_revenue DESC
  LIMIT 20;
  "

  # Find a specific company
  sqlite3 -header -column fortune500.db "
  SELECT rank, name, revenue, profits, employees,
         headquarters, ceo, website, ticker
  FROM companies
  WHERE name LIKE '%Apple%';
  "

  # All companies in a specific state
  sqlite3 -header -column fortune500.db "
  SELECT rank, name, revenue, headquarters, industry
  FROM companies
  WHERE headquarters LIKE '%California%'
    OR headquarters LIKE '%Calif.%'
    OR headquarters LIKE '%, CA'
  ORDER BY revenue DESC;
  "

  # Top 100 by employee count
  sqlite3 -header -column fortune500.db "
  SELECT rank, name, employees, revenue, industry, headquarters
  FROM companies
  WHERE employees IS NOT NULL
  ORDER BY employees DESC
  LIMIT 100;
  "

  # All US companies ranked 1-1000 by revenue
  sqlite3 -header -csv fortune500.db "
  SELECT ROW_NUMBER() OVER (ORDER BY revenue DESC) as us_rank,
         name, revenue, profits, employees, industry,
         headquarters, ceo, website, ticker, company_type
  FROM companies
  WHERE country = 'U.S.' AND revenue IS NOT NULL AND revenue > 0
  ORDER BY revenue DESC
  LIMIT 1000;
  " > fortune1000_us_only.csv

  # Companies with details fetched vs not fetched
  sqlite3 -header -column fortune500.db "
  SELECT
    COUNT(*) as total,
    SUM(CASE WHEN details_fetched = 1 THEN 1 ELSE 0 END) as with_details,
    SUM(CASE WHEN details_fetched = 0 OR details_fetched IS NULL THEN 1 ELSE 0 END) as without_details
  FROM companies;
  "

  # Search by CEO name
  sqlite3 -header -column fortune500.db "
  SELECT rank, name, ceo, revenue, industry, headquarters
  FROM companies
  WHERE ceo LIKE '%Satya%'
     OR ceo LIKE '%Tim Cook%'
  ORDER BY revenue DESC;
  "

  ===============================================================================
 """

 import requests
 import sqlite3
 import time
 import sys
 import re
 import json
 from typing import List, Dict, Optional

 class FortuneLargeCompaniesFetcher:
    def __init__(self, year: int = 2024):
        """Store the API endpoint and ranking year and open a shared HTTP session.

        Args:
            year: Ranking year kept on the instance as ``self.year``.
        """
        # Browser-like headers that are installed on the session below.
        browser_headers = dict([
            ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'),
            ('Accept', 'application/json, text/plain, */*'),
            ('Referer', 'https://fortune.com/companies/'),
        ])

        http = requests.Session()
        http.headers.update(browser_headers)

        self.year = year
        self.base_url = "https://fortune.com/api/page/directory/companies"
        self.headers = browser_headers
        self.session = http

    def fetch_page(self, page: int, min_revenue: int = None, use_ranking_filter: bool = True) -> Dict:
        """Fetch a page of companies filtered by minimum revenue"""
        url = f"{self.base_url}/{page}/"
        params = {}

        # Only use ranking filter if requested (limits to 1000 companies)
        if use_ranking_filter:
            params['rankingId'] = 2  # Fortune 500/1000
            params['rankingYear'] = self.year

        if min_revenue:
            params['minRevenue'] = min_revenue

        try:
            print(f"Fetching page {page} with minRevenue={min_revenue}...")
            response = self.session.get(url, params=params, timeout=10)

            if response.status_code == 200:
                return response.json()
            else:
                print(f"  Failed with status {response.status_code}")
                return None

        except Exception as e:
            print(f"  Error: {e}")
            return None

    def fetch_all_large_companies(self, min_revenue: int = None, max_companies: int = None) -> List[Dict]:
        """Fetch all companies above revenue threshold, up to max_companies if specified"""
        all_companies = []
        page = 1
        total_pages = None

        # Don't use ranking filter if we need more than 1000 companies or want all
        # Use ranking filter only when max_companies is set and <= 1000
        use_ranking_filter = max_companies is not None and max_companies <= 1000

        if min_revenue:
            print(f"Fetching companies with revenue >= ${min_revenue}M")
        else:
            if max_companies:
                print(f"Fetching companies from Fortune API (target: {max_companies})")
            else:
                print("Fetching all companies from Fortune API")
        print("=" * 60)

        while True:
            data = self.fetch_page(page, min_revenue, use_ranking_filter)

            if not data:
                print(f"No data returned for page {page}")
                break

            # Extract companies from response
            items = data.get('items', [])
            if not items:
                print(f"No items found on page {page}")
                break

            # Process each company
            for item in items:
                company = self.process_company(item)
                if company:
                    all_companies.append(company)

            print(f"  Page {page}: Found {len(items)} companies (Total: {len(all_companies)})")

            # Check pagination
            pagination = data.get('pagination', {})
            if total_pages is None:
                total_pages = pagination.get('lastPage', 1)
                total_items = pagination.get('total', 0)
                print(f"  Total items available: {total_items}")
                print(f"  Total pages: {total_pages}")

            # Stop if we have enough companies
            if max_companies and len(all_companies) >= max_companies:
                print(f"Reached target of {max_companies} companies")
                break

            # Check if there are more pages
            if not pagination.get('hasMorePages', False):
                print("No more pages available")
                break

            if page >= total_pages:
                print(f"Reached last page ({total_pages})")
                break

            page += 1
            time.sleep(0.5)  # Be respectful

            # Safety limit - increase for larger fetches (30 companies per page)
            if max_companies:
                safety_limit = (max_companies // 30) + 10  # Add buffer
            else:
                safety_limit = 300  # Default max

            if page > safety_limit:
                print(f"Reached safety limit of {safety_limit} pages")
                break

        return all_companies

    def process_company(self, item: Dict) -> Dict:
        """Process a company item from the API"""
        # Extract revenue
        revenue_str = item.get('revenues', '')
        revenue = None
        if revenue_str:
            revenue_clean = revenue_str.replace('$', '').replace(',', '').strip()
            try:
                revenue = float(revenue_clean) if revenue_clean else None
            except ValueError:
                revenue = None

        # Get the permalink/URL
        permalink = item.get('permalink', '')
        if not permalink and item.get('uri'):
            permalink = 'https://fortune.com' + item.get('uri')

        return {
            'name': item.get('name'),
            'revenue': revenue,
            'country': item.get('country'),
            'permalink': permalink,
            'slug': item.get('slug'),
            'industry': item.get('industry'),
            'employees': item.get('employees'),
        }

    def fetch_company_details(self, company_url: str) -> Optional[Dict]:
        """Fetch detailed company information from company page"""
        try:
            response = self.session.get(company_url, timeout=10)
            if response.status_code != 200:
                return None

            html = response.text

            # Extract __NEXT_DATA__ JSON
            match = re.search(r'<script id="__NEXT_DATA__"[^>]*>({.*?})</script>', html, re.DOTALL)
            if not match:
                return None

            data = json.loads(match.group(1))
            page_props = data.get('props', {}).get('pageProps', {})
            company = page_props.get('company', {})
            company_info = company.get('companyInfo', {})

            if not company_info:
                return None

            # Parse numeric fields
            def parse_numeric(value):
                if not value or value == '':
                    return None
                cleaned = str(value).replace('$', '').replace(',', '').strip()
                try:
                    return float(cleaned) if cleaned else None
                except ValueError:
                    return None

            def parse_int(value):
                if not value or value == '':
                    return None
                cleaned = str(value).replace(',', '').strip()
                try:
                    return int(cleaned) if cleaned else None
                except ValueError:
                    return None

            return {
                'headquarters': company_info.get('Headquarters'),
                'industry': company_info.get('Industry'),
                'ceo': company_info.get('CEO'),
                'website': company_info.get('Website'),
                'ticker': company_info.get('Ticker'),
                'company_type': company_info.get('Company type'),
                'revenues': parse_numeric(company_info.get('Revenues ($M)')),
                'profits': parse_numeric(company_info.get('Profits ($M)')),
                'market_value': parse_numeric(company_info.get('Market value ($M)')),
                'num_employees': parse_int(company_info.get('Number of employees')),
            }

        except Exception as e:
            print(f"Error fetching details: {e}")
            return None

    def rank_companies(self, cursor, max_rank: int = 1000):
        """Rank companies by revenue up to max_rank"""
        print("\n" + "=" * 60)
        print(f"Ranking companies by revenue (Top {max_rank})...")
        print("=" * 60)

        cursor.execute("UPDATE companies SET rank = NULL")

        cursor.execute("""
            WITH ranked_companies AS (
                SELECT
                    id,
                    ROW_NUMBER() OVER (ORDER BY revenue DESC, name ASC) as new_rank
                FROM companies
                WHERE revenue IS NOT NULL
            )
            UPDATE companies
            SET rank = (
                SELECT new_rank
                FROM ranked_companies
                WHERE ranked_companies.id = companies.id
            )
            WHERE id IN (
                SELECT id FROM ranked_companies WHERE new_rank <= ?
            )
        """, (max_rank,))

    def update_database(self, companies: List[Dict]) -> tuple:
        """Update database with fetched companies"""
        conn = sqlite3.connect('fortune500.db')
        cursor = conn.cursor()

        # Create table if it doesn't exist
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS companies (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                rank INTEGER,
                name TEXT NOT NULL,
                revenue REAL,
                profits REAL,
                employees INTEGER,
                industry TEXT,
                headquarters TEXT,
                hq_city TEXT,
                hq_state TEXT,
                country TEXT,
                ceo TEXT,
                website TEXT,
                domain TEXT,
                ticker TEXT,
                company_type TEXT,
                market_value REAL,
                fortune_url TEXT,
                slug TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                details_fetched BOOLEAN DEFAULT 0
            )
        """)

        # Ensure we have needed columns (for existing databases)
        cursor.execute("PRAGMA table_info(companies)")
        columns = [col[1] for col in cursor.fetchall()]

        # Add missing columns to existing databases
        column_definitions = {
            'slug': 'TEXT',
            'headquarters': 'TEXT',
            'ceo': 'TEXT',
            'ticker': 'TEXT',
            'company_type': 'TEXT',
            'market_value': 'REAL',
            'details_fetched': 'BOOLEAN DEFAULT 0'
        }

        for col_name, col_type in column_definitions.items():
            if col_name not in columns:
                cursor.execute(f"ALTER TABLE companies ADD COLUMN {col_name} {col_type}")

        added = 0
        updated = 0

        for company in companies:
            try:
                # Check if company exists
                cursor.execute("""
                    SELECT id, revenue FROM companies
                    WHERE name = ? OR slug = ?
                """, (company['name'], company.get('slug')))

                result = cursor.fetchone()

                if result:
                    # Update existing company
                    cursor.execute("""
                        UPDATE companies
                        SET revenue = ?, country = ?, fortune_url = ?,
                            slug = ?, industry = ?, employees = ?
                        WHERE name = ? OR slug = ?
                    """, (
                        company['revenue'], company['country'],
                        company['permalink'], company['slug'],
                        company.get('industry'), company.get('employees'),
                        company['name'], company.get('slug')
                    ))
                    updated += cursor.rowcount
                else:
                    # Insert new company
                    cursor.execute("""
                        INSERT INTO companies
                        (name, revenue, country, fortune_url, slug, industry, employees)
                        VALUES (?, ?, ?, ?, ?, ?, ?)
                    """, (
                        company['name'], company['revenue'], company['country'],
                        company['permalink'], company['slug'],
                        company.get('industry'), company.get('employees')
                    ))
                    added += cursor.rowcount

            except Exception as e:
                print(f"Error processing {company['name']}: {e}")

        conn.commit()
        conn.close()

        return added, updated

    def enrich_company_details(self, fetch_details: bool = True):
        """Fetch detailed information for all companies from their individual pages"""
        if not fetch_details:
            return 0

        conn = sqlite3.connect('fortune500.db')
        cursor = conn.cursor()

        # Get companies that need details fetched
        cursor.execute("""
            SELECT id, name, fortune_url
            FROM companies
            WHERE fortune_url IS NOT NULL
            AND fortune_url != ''
            AND (details_fetched IS NULL OR details_fetched = 0)
            ORDER BY rank NULLS LAST
        """)

        companies_to_fetch = cursor.fetchall()
        total = len(companies_to_fetch)

        if total == 0:
            print("\nAll companies already have detailed information")
            return 0

        print(f"\nFetching detailed information for {total} companies...")
        print("=" * 60)
        print("This will take a while (0.5s per company)...")

        enriched = 0
        for idx, (company_id, name, url) in enumerate(companies_to_fetch, 1):
            if idx % 50 == 0:
                print(f"  Progress: {idx}/{total} ({(idx/total)*100:.1f}%)")

            details = self.fetch_company_details(url)
            if details:
                cursor.execute("""
                    UPDATE companies
                    SET headquarters = ?, industry = ?, ceo = ?,
                        website = ?, ticker = ?, company_type = ?,
                        profits = ?, market_value = ?,
                        employees = COALESCE(?, employees),
                        revenue = COALESCE(?, revenue),
                        details_fetched = 1
                    WHERE id = ?
                """, (
                    details['headquarters'], details['industry'], details['ceo'],
                    details['website'], details['ticker'], details['company_type'],
                    details['profits'], details['market_value'],
                    details['num_employees'], details['revenues'],
                    company_id
                ))
                enriched += 1

            time.sleep(0.5)  # Be respectful to the server

        conn.commit()
        conn.close()

        print(f"\nSuccessfully enriched {enriched} companies with detailed information")
        return enriched

 def test_revenue_thresholds(year: int = 2024):
    """Print how many companies the API reports at several revenue floors.

    Only the first page is requested for each floor; the count is read from
    that page's ``pagination.total``. Floors whose request fails or whose
    response has no ``pagination`` block are skipped silently.

    Args:
        year: Ranking year passed to ``FortuneLargeCompaniesFetcher``.
    """
    client = FortuneLargeCompaniesFetcher(year)

    print("Testing Revenue Thresholds")
    print("=" * 60)

    for floor in (5000, 7000, 10000, 15000, 20000):
        first_page = client.fetch_page(1, floor)
        if not first_page or 'pagination' not in first_page:
            continue
        count = first_page['pagination'].get('total', 0)
        print(f"Revenue >= ${floor:,}M: {count} companies")

 def _print_overview():
    """Print the banner explaining the Fortune lists, usage and examples."""
    print("=" * 70)
    print("Fortune Company Database Fetcher")
    print("=" * 70)
    print()
    print("This script fetches company data from Fortune.com and ranks by revenue.")
    print()
    print("FORTUNE LISTS:")
    print("  • Fortune 500:  Top 500 US companies by revenue (official list)")
    print("  • Fortune 1000: Top 1000 US companies by revenue (official list)")
    print()
    print("EXPANDED DATABASE:")
    print("  • Fortune.com tracks ~8,600 total companies")
    print("  • You can rank any number (e.g., top 2000, 5000, etc.)")
    print("  • Beyond 1000, rankings are calculated from all available companies")
    print()
    print("DEFAULT: Fetches all ~8,600 companies and ranks them by revenue")
    print()
    print("USAGE:")
    print("  python3 fetch_by_revenue.py [max_rank] [year] [flags]")
    print()
    print("FLAGS:")
    print("  -d, --details    Fetch detailed info (CEO, HQ, ticker, etc.)")
    print()
    print("EXAMPLES:")
    print("  python3 fetch_by_revenue.py              # All companies, year 2024")
    print("  python3 fetch_by_revenue.py 500          # Fortune 500, year 2024")
    print("  python3 fetch_by_revenue.py 1000         # Fortune 1000, year 2024")
    print("  python3 fetch_by_revenue.py 2000         # Top 2000, year 2024")
    print("  python3 fetch_by_revenue.py 500 2025     # Fortune 500, year 2025")
    print("  python3 fetch_by_revenue.py all          # All ~8,600 companies")
    print("  python3 fetch_by_revenue.py 500 --details  # Fortune 500 with details")
    print()
    print("AVAILABLE YEARS: 2023, 2024, 2025")
    print("=" * 70)


 def _print_help():
    """Print the short usage text shown for -h / --help / help."""
    print("Fortune 500/1000 Company Fetcher")
    print("\nUsage: python3 fetch_by_revenue.py [max_rank] [year] [flags]")
    print("\nArguments:")
    print("  max_rank    Number of top companies to rank (default: all)")
    print("  year        Year for Fortune data: 2023, 2024, or 2025 (default: 2024)")
    print("\nFlags:")
    print("  -d, --details    Fetch detailed company info (CEO, HQ, ticker, etc.)")
    print("\nExamples:")
    print("  python3 fetch_by_revenue.py                 # All companies, 2024")
    print("  python3 fetch_by_revenue.py 500             # Fortune 500, 2024")
    print("  python3 fetch_by_revenue.py 1000 2025       # Fortune 1000, 2025")
    print("  python3 fetch_by_revenue.py 500 --details   # Fortune 500 with details")
    print("  python3 fetch_by_revenue.py all 2024 -d     # All companies with details")


 def _ask(message, default=''):
    """Return the stripped, lower-cased answer, or ``default`` if stdin is closed."""
    try:
        return input(message).strip().lower()
    except EOFError:
        # Non-interactive run (pipe, cron): behave as if Enter was pressed.
        print()
        return default


 def _parse_rank_and_year(args):
    """Validate the positional arguments and return ``(max_rank, year)``.

    ``max_rank`` is ``None`` for "all". Prints an error and exits with status
    1 when the rank or year is invalid.
    """
    max_rank = None
    year = 2024

    if args and args[0].lower() != 'all':
        try:
            max_rank = int(args[0])
        except ValueError:
            print(f"Error: Invalid rank number '{args[0]}'")
            print("Usage: python3 fetch_by_revenue.py [max_rank] [year] [flags]")
            print("Example: python3 fetch_by_revenue.py 500 2024 --details")
            sys.exit(1)
        if max_rank < 1:
            print("Error: Rank must be a positive number")
            sys.exit(1)

    if len(args) > 1:
        try:
            year = int(args[1])
        except ValueError:
            print(f"Error: Invalid year '{args[1]}'")
            sys.exit(1)
        if year not in (2023, 2024, 2025):
            print("Error: Year must be 2023, 2024, or 2025")
            sys.exit(1)

    return max_rank, year


 def main():
    """Command-line entry point: fetch companies, store them and rank them.

    Reads ``sys.argv`` for ``[max_rank] [year]`` plus the details/help flags,
    fetches the companies, writes them to ``fortune500.db``, optionally
    enriches them with per-company details, then ranks the stored rows by
    revenue and prints a summary. Exits with status 0 after printing help or
    when the user declines the default run, and with status 1 on invalid
    arguments.
    """
    help_flags = ('-h', '--help', 'help')
    detail_flags = ('-d', '--details', '--detailed')
    cli_args = sys.argv[1:]

    fetch_details = any(arg in detail_flags for arg in cli_args)
    wants_help = any(arg in help_flags for arg in cli_args)
    args = [arg for arg in cli_args if arg not in detail_flags and arg not in help_flags]

    if wants_help:
        _print_help()
        sys.exit(0)

    if not args:
        # No positional arguments: explain the defaults and ask before the
        # long "fetch everything" run.
        _print_overview()
        response = _ask("\nContinue with default (all companies, 2024)? [Y/n]: ")
        if response and response != 'y' and response != 'yes':
            print("Exiting...")
            sys.exit(0)
        max_rank = None  # Fetch all
        year = 2024
    else:
        max_rank, year = _parse_rank_and_year(args)

    rank_label = f"Top {max_rank}" if max_rank else "All Companies"
    print(f"\nFortune Large Companies Fetcher ({rank_label}, {year})")
    print("=" * 60)

    # First test thresholds
    test_revenue_thresholds(year)
    print()

    # Fetch companies from Fortune API; max_rank of None means "all available".
    fetcher = FortuneLargeCompaniesFetcher(year)
    companies = fetcher.fetch_all_large_companies(min_revenue=None, max_companies=max_rank)

    print(f"\nSuccessfully fetched {len(companies)} companies")

    if not companies:
        print("\nNo companies were fetched; the database was left unchanged.")
        return

    if max_rank is None:
        max_rank = len(companies)
        print(f"\nRanking all {max_rank} companies")
    elif max_rank > len(companies):
        # Warn if user requested more than available
        print(f"\nWarning: Only {len(companies)} companies fetched.")
        print(f"Ranking top {len(companies)} instead of requested {max_rank}.")
        max_rank = len(companies)

    # Note if fetching more than Fortune 1000
    if max_rank > 1000:
        print("Note: Fetching beyond Fortune 1000 (includes all companies in Fortune database)")

    # Show revenue range
    revenues = [c['revenue'] for c in companies if c['revenue']]
    if revenues:
        print("\nRevenue range:")
        print(f"  Min: ${min(revenues):,.0f}M")
        print(f"  Max: ${max(revenues):,.0f}M")

    # Show sample companies
    print("\nTop 10 companies by revenue:")
    sorted_companies = sorted(companies,
                              key=lambda x: x.get('revenue', 0) or 0,
                              reverse=True)
    for i, company in enumerate(sorted_companies[:10], 1):
        revenue = company.get('revenue')
        revenue_str = f"${revenue:,.0f}M" if revenue else "N/A"
        print(f"  {i}. {company['name']}: {revenue_str}")

    # Update database
    added, updated = fetcher.update_database(companies)
    print("\nDatabase updated:")
    print(f"  Added: {added} companies")
    print(f"  Updated: {updated} companies")

    # Fetch detailed information if flag is set or ask user
    if fetch_details:
        print(f"\nDetailed information flag set - fetching details for {len(companies)} companies...")
        fetcher.enrich_company_details(fetch_details=True)
    else:
        print("\n" + "=" * 60)
        print("Detailed Company Information")
        print("=" * 60)
        print("Would you like to fetch detailed info for each company?")
        print("(CEO, Headquarters, Ticker, Profits, Market Value, etc.)")
        print("\nThis will fetch individual pages for companies.")
        print(f"Estimated time: ~{len(companies) * 0.5 / 60:.1f} minutes")
        print("\nTip: Use --details flag to skip this prompt")
        response = _ask("\nFetch detailed information? [y/N]: ")

        if response in ['y', 'yes']:
            fetcher.enrich_company_details(fetch_details=True)
        else:
            print("\nSkipping detailed information fetch.")

    # Check database status, rank, and report; the connection is closed even
    # if one of the statements fails.
    conn = sqlite3.connect('fortune500.db')
    try:
        cursor = conn.cursor()

        # Count companies with significant revenue
        cursor.execute("""
            SELECT COUNT(*) FROM companies
            WHERE revenue >= 3000
        """)
        high_revenue_count = cursor.fetchone()[0]

        cursor.execute("SELECT COUNT(*) FROM companies")
        total_count = cursor.fetchone()[0]

        print("\nDatabase status:")
        print(f"  Total companies: {total_count}")
        print(f"  Companies with revenue >= $3,000M: {high_revenue_count}")

        # Rank companies by revenue
        fetcher.rank_companies(cursor, max_rank)
        conn.commit()

        # Show ranking results
        cursor.execute("SELECT COUNT(*) FROM companies WHERE rank IS NOT NULL")
        ranked_count = cursor.fetchone()[0]

        cursor.execute("""
            SELECT rank, name, revenue
            FROM companies
            WHERE rank IS NOT NULL
            ORDER BY rank
            LIMIT 5
        """)
        top_5 = cursor.fetchall()

        cursor.execute("""
            SELECT rank, name, revenue
            FROM companies
            WHERE rank IS NOT NULL
            ORDER BY rank DESC
            LIMIT 5
        """)
        bottom_5 = cursor.fetchall()
    finally:
        conn.close()

    print(f"\nSuccessfully ranked {ranked_count} companies")

    print("\nTop 5:")
    for rank, name, revenue in top_5:
        print(f"  {rank}. {name}: ${revenue:,.0f}M")

    print("\nBottom 5:")
    for rank, name, revenue in reversed(bottom_5):
        print(f"  {rank}. {name}: ${revenue:,.0f}M")

    print("\n" + "=" * 60)
    print(f"Fortune Top {max_rank} database ready!")
    print("=" * 60)

 # Run the command-line interface only when this file is executed as a script,
 # not when it is imported as a module.
 if __name__ == "__main__":
    main()
No results found