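// Requires: npm install axios bottleneck csv-writer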
const axios = require('axios');
const Bottleneck = require('bottleneck');
const fs = require('fs');
const createCsvWriter = require('csv-writer').createObjectCsvWriter;
const API_ENDPOINT = 'https://data.ny.gov/resource/uhf3-t34z.json'; // NY data API (JSON, so each row parses as an object)
const MAX_RESULTS = 50000; // Maximum number of rows per request
const TOTAL_ROWS = 1500000; // Total number of rows to scrape
const MAX_REQUESTS_PER_HOUR = 1000; // Limit to 1000 requests per hour
const OUTPUT_CSV = 'ny_data_snapshot.csv'; // Snapshot CSV file for immediate use
const FULL_OUTPUT_CSV = 'ny_data_full.csv'; // Full output CSV for the complete run
// Set up a rate limiter (limit to 1000 requests/hour)
const limiter = new Bottleneck({
  minTime: 3600000 / MAX_REQUESTS_PER_HOUR // Bottleneck's minTime is in milliseconds: 3.6s between requests stays within the limit
});
let allData = [];
let totalRequests = 0; // Track number of requests
// Function to make an API request with offset
const fetchData = async (offset = 0) => {
  try {
    const response = await axios.get(API_ENDPOINT, {
      params: {
        $limit: MAX_RESULTS,
        $offset: offset
      }
    });
    return response.data;
  } catch (error) {
    console.error(`Error fetching data: ${error.message}`);
    return [];
  }
};
// Function to fetch a limited number of rows for the snapshot
const fetchSnapshot = async (offset = 0, maxRequests = MAX_REQUESTS_PER_HOUR) => {
  const data = await limiter.schedule(() => fetchData(offset));
  if (data.length > 0) {
    allData = allData.concat(data);
    totalRequests++;
    console.log(`Fetched ${data.length} records, total so far: ${allData.length}`);
  }
  // Fetch the next batch until we hit the request limit, reach the row target, or get an empty batch
  if (data.length > 0 && totalRequests < maxRequests && offset + MAX_RESULTS < TOTAL_ROWS) {
    await fetchSnapshot(offset + MAX_RESULTS, maxRequests);
  } else {
    console.log(`Snapshot complete, writing ${allData.length} records to CSV...`);
    writeToCsv(allData, OUTPUT_CSV);
  }
};
// Function to resume fetching the rest of the data
// (offset defaults to the number of rows already held in memory)
const fetchRemainingData = async (offset = allData.length, maxRequests = MAX_REQUESTS_PER_HOUR) => {
  const data = await limiter.schedule(() => fetchData(offset));
  if (data.length > 0) {
    allData = allData.concat(data);
    totalRequests++;
    console.log(`Fetched ${data.length} records, total so far: ${allData.length}`);
  }
  // Fetch next batch until the row target is reached or the API returns an empty batch
  if (data.length > 0 && offset + MAX_RESULTS < TOTAL_ROWS) {
    await fetchRemainingData(offset + MAX_RESULTS, maxRequests);
  } else {
    console.log(`All data fetched, writing full ${allData.length} records to CSV...`);
    writeToCsv(allData, FULL_OUTPUT_CSV);
  }
};
// Function to write data to CSV
const writeToCsv = (data, outputFile) => {
  if (data.length === 0) {
    console.warn('No records to write, skipping CSV output.');
    return; // Guard: Object.keys(data[0]) below would throw on an empty array
  }
  const csvWriter = createCsvWriter({
    path: outputFile,
    header: Object.keys(data[0]).map((key) => ({ id: key, title: key }))
  });
  csvWriter.writeRecords(data)
    .then(() => {
      console.log(`CSV file successfully written to ${outputFile}!`);
    })
    .catch((error) => {
      console.error(`Error writing CSV: ${error.message}`);
    });
};
// Start fetching a snapshot now
fetchSnapshot();
// You can later resume with `fetchRemainingData()` to pick up where the snapshot
// left off and scrape the rest of the data later tonight.
// fetchRemainingData();
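// A minimal sketch of resuming in a fresh process (hypothetical helper, not part of
// the original gist): `allData` lives only in memory, so a new run can derive its
// starting offset by counting the data rows already saved in the snapshot CSV.
// Assumes OUTPUT_CSV exists; note the full CSV would then contain only the newly
// fetched rows, to be appended to the snapshot afterwards.
const resumeFromSnapshot = () => {
  const savedRows = fs.readFileSync(OUTPUT_CSV, 'utf8').trim().split('\n').length - 1; // subtract the header row
  console.log(`Resuming from offset ${savedRows}`);
  return fetchRemainingData(savedRows);
};
// resumeFromSnapshot();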