@ejfox
Created October 12, 2024 16:57
const axios = require('axios');
const Bottleneck = require('bottleneck');
const fs = require('fs');
const createCsvWriter = require('csv-writer').createObjectCsvWriter;
const API_ENDPOINT = 'https://data.ny.gov/resource/uhf3-t34z.json'; // NY open data API (JSON endpoint, so axios returns an array of row objects)
const MAX_RESULTS = 50000; // Maximum number of rows per request
const TOTAL_ROWS = 1500000; // Total number of rows to scrape
const MAX_REQUESTS_PER_HOUR = 1000; // Limit to 1000 requests per hour
const OUTPUT_CSV = 'ny_data_snapshot.csv'; // Snapshot CSV file for immediate use
const FULL_OUTPUT_CSV = 'ny_data_full.csv'; // Full output CSV for the complete run
// Set up a rate limiter (limit to 1000 requests/hour)
const limiter = new Bottleneck({
  minTime: (60 * 60 * 1000) / MAX_REQUESTS_PER_HOUR // Bottleneck's minTime is in milliseconds, so this spaces requests ~3.6s apart
});
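// Alternative sketch (not used here): Bottleneck can also enforce a hard hourly cap
// with a refreshing reservoir instead of only spacing requests apart:
// const hourlyLimiter = new Bottleneck({
//   reservoir: MAX_REQUESTS_PER_HOUR,              // jobs allowed per window
//   reservoirRefreshAmount: MAX_REQUESTS_PER_HOUR, // refill back to the cap
//   reservoirRefreshInterval: 60 * 60 * 1000       // every hour, in milliseconds
// });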
let allData = [];
let totalRequests = 0; // Track number of requests
// Function to make an API request with offset
const fetchData = async (offset = 0) => {
  try {
    const response = await axios.get(API_ENDPOINT, {
      params: {
        $limit: MAX_RESULTS,
        $offset: offset
      }
    });
    return response.data;
  } catch (error) {
    console.error(`Error fetching data: ${error.message}`);
    return [];
  }
};
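// Note: for stable paging, the Socrata API docs recommend also passing an $order
// parameter (e.g. $order: ':id') alongside $offset; without it, page contents are
// not guaranteed to stay consistent between requests.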
// Function to fetch a limited number of rows for the snapshot
const fetchSnapshot = async (offset = 0, maxRequests = MAX_REQUESTS_PER_HOUR) => {
  const data = await limiter.schedule(() => fetchData(offset));
  if (data.length > 0) {
    allData = allData.concat(data);
    totalRequests++;
    console.log(`Fetched ${data.length} records, total so far: ${allData.length}`);
  }
  // Fetch the next batch until we hit the request limit, reach TOTAL_ROWS, or get a short/empty page
  if (data.length === MAX_RESULTS && totalRequests < maxRequests && offset + MAX_RESULTS < TOTAL_ROWS) {
    await fetchSnapshot(offset + MAX_RESULTS, maxRequests);
  } else {
    console.log(`Snapshot complete, writing ${allData.length} records to CSV...`);
    writeToCsv(allData, OUTPUT_CSV);
  }
};
// Function to resume fetching the rest of the data
const fetchRemainingData = async (offset = allData.length, maxRequests = MAX_REQUESTS_PER_HOUR) => {
  const data = await limiter.schedule(() => fetchData(offset));
  if (data.length > 0) {
    allData = allData.concat(data);
    totalRequests++;
    console.log(`Fetched ${data.length} records, total so far: ${allData.length}`);
  }
  // Fetch the next batch until we reach TOTAL_ROWS or get a short/empty page
  if (data.length === MAX_RESULTS && offset + MAX_RESULTS < TOTAL_ROWS) {
    await fetchRemainingData(offset + MAX_RESULTS, maxRequests);
  } else {
    console.log(`All data fetched, writing full ${allData.length} records to CSV...`);
    writeToCsv(allData, FULL_OUTPUT_CSV);
  }
};
// Function to write data to CSV
const writeToCsv = (data, outputFile) => {
  if (data.length === 0) {
    console.warn('No records to write, skipping CSV output.');
    return;
  }
  const csvWriter = createCsvWriter({
    path: outputFile,
    header: Object.keys(data[0]).map((key) => ({ id: key, title: key }))
  });
  csvWriter.writeRecords(data)
    .then(() => {
      console.log(`CSV file successfully written to ${outputFile}!`);
    })
    .catch((error) => {
      console.error(`Error writing CSV: ${error.message}`);
    });
};
// Start fetching a snapshot now
fetchSnapshot();
// You can later resume with something like `fetchRemainingData()`
// This function will allow you to pick up where the snapshot left off, scraping the rest of the data later tonight.
// fetchRemainingData();
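// A minimal sketch of resuming in a fresh process (assumes the snapshot CSV exists;
// `resumeFromSnapshotFile` is a hypothetical helper, not part of the original script):
// derive the offset from the snapshot's row count instead of the in-memory allData.
// const resumeFromSnapshotFile = () => {
//   const rows = fs.readFileSync(OUTPUT_CSV, 'utf8').trim().split('\n').length - 1; // subtract header row
//   fetchRemainingData(rows);
// };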