@ejfox
Created October 12, 2024 16:57
const axios = require('axios');
const Bottleneck = require('bottleneck');
const fs = require('fs');
const createCsvWriter = require('csv-writer').createObjectCsvWriter;
const API_ENDPOINT = 'https://data.ny.gov/resource/uhf3-t34z.json'; // NY open data API (JSON endpoint, so axios returns an array of row objects)
const MAX_RESULTS = 50000; // Maximum number of rows per request
const TOTAL_ROWS = 1500000; // Total number of rows to scrape
const MAX_REQUESTS_PER_HOUR = 1000; // Limit to 1000 requests per hour
const OUTPUT_CSV = 'ny_data_snapshot.csv'; // Snapshot CSV file for immediate use
const FULL_OUTPUT_CSV = 'ny_data_full.csv'; // Full output CSV for the complete run
// Set up a rate limiter (limit to 1000 requests/hour)
const limiter = new Bottleneck({
  minTime: (60 * 60 * 1000) / MAX_REQUESTS_PER_HOUR // Bottleneck's minTime is in milliseconds, so this spaces requests ~3.6s apart
});
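// Alternative sketch (not used here): Bottleneck can also enforce a hard hourly cap
// with a refreshing reservoir instead of only spacing requests apart:
// const hourlyLimiter = new Bottleneck({
//   reservoir: MAX_REQUESTS_PER_HOUR,              // jobs allowed per window
//   reservoirRefreshAmount: MAX_REQUESTS_PER_HOUR, // refill back to the cap
//   reservoirRefreshInterval: 60 * 60 * 1000       // every hour, in milliseconds
// });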
let allData = [];
let totalRequests = 0; // Track number of requests
// Function to make an API request with offset
const fetchData = async (offset = 0) => {
  try {
    const response = await axios.get(API_ENDPOINT, {
      params: {
        $limit: MAX_RESULTS,
        $offset: offset
      }
    });
    return response.data;
  } catch (error) {
    console.error(`Error fetching data: ${error.message}`);
    return [];
  }
};
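// Note: for stable paging, the Socrata API docs recommend also passing an $order
// parameter (e.g. $order: ':id') alongside $offset; without it, page contents are
// not guaranteed to stay consistent between requests.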
// Function to fetch a limited number of rows for the snapshot
const fetchSnapshot = async (offset = 0, maxRequests = MAX_REQUESTS_PER_HOUR) => {
  const data = await limiter.schedule(() => fetchData(offset));
  if (data.length > 0) {
    allData = allData.concat(data);
    totalRequests++;
    console.log(`Fetched ${data.length} records, total so far: ${allData.length}`);
  }
  // Fetch the next batch until we hit the request limit, reach TOTAL_ROWS, or get a short/empty page
  if (data.length === MAX_RESULTS && totalRequests < maxRequests && offset + MAX_RESULTS < TOTAL_ROWS) {
    await fetchSnapshot(offset + MAX_RESULTS, maxRequests);
  } else {
    console.log(`Snapshot complete, writing ${allData.length} records to CSV...`);
    writeToCsv(allData, OUTPUT_CSV);
  }
};
// Function to resume fetching the rest of the data
const fetchRemainingData = async (offset = allData.length, maxRequests = MAX_REQUESTS_PER_HOUR) => {
  const data = await limiter.schedule(() => fetchData(offset));
  if (data.length > 0) {
    allData = allData.concat(data);
    totalRequests++;
    console.log(`Fetched ${data.length} records, total so far: ${allData.length}`);
  }
  // Fetch the next batch until we reach TOTAL_ROWS or get a short/empty page
  if (data.length === MAX_RESULTS && offset + MAX_RESULTS < TOTAL_ROWS) {
    await fetchRemainingData(offset + MAX_RESULTS, maxRequests);
  } else {
    console.log(`All data fetched, writing full ${allData.length} records to CSV...`);
    writeToCsv(allData, FULL_OUTPUT_CSV);
  }
};
// Function to write data to CSV
const writeToCsv = (data, outputFile) => {
  if (data.length === 0) {
    console.warn('No records to write, skipping CSV output.');
    return;
  }
  const csvWriter = createCsvWriter({
    path: outputFile,
    header: Object.keys(data[0]).map((key) => ({ id: key, title: key }))
  });
  csvWriter.writeRecords(data)
    .then(() => {
      console.log(`CSV file successfully written to ${outputFile}!`);
    })
    .catch((error) => {
      console.error(`Error writing CSV: ${error.message}`);
    });
};
// Start fetching a snapshot now
fetchSnapshot();
// You can later resume with something like `fetchRemainingData()`
// This function will allow you to pick up where the snapshot left off, scraping the rest of the data later tonight.
// fetchRemainingData();
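// A minimal sketch of resuming in a fresh process (assumes the snapshot CSV exists;
// `resumeFromSnapshotFile` is a hypothetical helper, not part of the original script):
// derive the offset from the snapshot's row count instead of the in-memory allData.
// const resumeFromSnapshotFile = () => {
//   const rows = fs.readFileSync(OUTPUT_CSV, 'utf8').trim().split('\n').length - 1; // subtract header row
//   fetchRemainingData(rows);
// };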