Skip to content

Instantly share code, notes, and snippets.

@techsin
Created April 7, 2019 19:17
Show Gist options
  • Save techsin/3aab5e1c0b4d6fe6305411c79f4b0e32 to your computer and use it in GitHub Desktop.
Save techsin/3aab5e1c0b4d6fe6305411c79f4b0e32 to your computer and use it in GitHub Desktop.
scrape_companies_commerce_nyc
/**
 * Queue-backed scraper. Every `new Company(tr)` registers one table row, and
 * `Company.fetchAllCompanies()` walks the registered rows one at a time,
 * fetching the page linked from each row.
 *
 * All state is static, so there is exactly one shared queue per page load.
 */
class Company {
  /** Registered table rows, in the order they were constructed. */
  static fetchStack = [];
  /** Records returned by `fetchData`, in the order they were fetched. */
  static data = [];
  /** Number of rows picked up so far; doubles as the index of the next row. */
  static fetched = 0;
  /** Base pause between two requests, in milliseconds. */
  static interval = 300;
  /** Upper bound (ms) of the random extra pause added on top of `interval`. */
  static intervalVariance = 100;
  /** Rows that could not be scraped, as `{ index, error }` entries. */
  static errors = [];

  /**
   * @param {Element} tr - Table row; it must expose `querySelector('a')`
   *   yielding the link to the detail page.
   */
  constructor(tr) {
    Company.fetchStack.push(tr);
  }

  /**
   * Fetches every queued row sequentially, pausing a randomized
   * `interval .. interval + intervalVariance` ms after each one.
   *
   * A row that fails (no link, network error, parse error) is logged and
   * recorded in `Company.errors`; the crawl then continues with the next row
   * instead of stopping.
   *
   * @returns {Promise<void>} Resolves once the queue is exhausted and the
   *   collected data has been logged.
   */
  static async fetchAllCompanies() {
    while (this.fetched < this.fetchStack.length) {
      // Progress report every ten rows.
      if (this.fetched % 10 === 0) {
        console.log(this.fetched, this.fetchStack.length);
      }

      // Claim the index before awaiting so a concurrent call never takes the same row.
      const index = this.fetched++;
      try {
        const anchor = this.fetchStack[index].querySelector('a');
        if (!anchor) {
          throw new Error(`Row ${index} has no link to follow`);
        }
        this.data.push(await fetchData(anchor.href));
      } catch (error) {
        console.error(`Skipping row ${index}:`, error);
        this.errors.push({ index, error });
      }

      // Randomized pause so requests are not fired at a fixed rhythm.
      const pause = this.interval + Math.round(this.intervalVariance * Math.random());
      await new Promise((resolve) => setTimeout(resolve, pause));
    }

    console.log(this.data);
  }
}
/**
 * Downloads one detail page and extracts its fields.
 *
 * The page is expected to contain a table (under `#ux-page`) in which the
 * second cell of each row holds a value; values are read positionally in the
 * order: company name, address, city/zip, contact person, phone, fax, link,
 * industry, employees.
 *
 * @param {string} url - Address of the detail page.
 * @returns {Promise<object>} Record with the keys `company_name`, `address`
 *   (street and city/zip joined by a space), `person_name`, `phone`, `fax`,
 *   `link`, `industry` and `employees`. A field whose row is absent from the
 *   page is `undefined`; a row without a value cell yields `''`.
 * @throws {Error} When the server answers with a non-2xx status.
 */
async function fetchData(url) {
  const response = await fetch(url);
  if (!response.ok) {
    // Without this check an error page would be parsed and returned as an empty record.
    throw new Error(`Request for ${url} failed with status ${response.status}`);
  }

  const html = await response.text();
  const doc = new DOMParser().parseFromString(html, 'text/html');
  const rows = doc.querySelectorAll('#ux-page > div > div.grid_12 > table tbody tr');

  // The parsed document is never rendered, so innerText would fall back to
  // textContent anyway; read textContent directly and strip padding.
  // Rows lacking a second cell contribute '' rather than throwing.
  const values = Array.from(rows, (row) => row.children[1]?.textContent.trim() ?? '');

  const [
    company_name,
    address,
    city_zip,
    person_name,
    phone,
    fax,
    link,
    industry,
    employees,
  ] = values;

  return {
    company_name,
    // Join only the parts that exist so a missing piece never shows up as "undefined".
    address: [address, city_zip].filter(Boolean).join(' '),
    person_name,
    phone,
    fax,
    link,
    industry,
    employees,
  };
}
// Queue every table row currently on the page. `$$` is not defined in this
// snippet; presumably it is the browser DevTools console helper, so confirm
// before running this anywhere other than the console.
$$('tbody tr').forEach((row) => {
  new Company(row);
});

// Start the crawl and report an unexpected failure instead of leaving the
// returned promise unhandled.
Company.fetchAllCompanies().catch((error) => {
  console.error('Scrape aborted:', error);
});
// Author's usage note: disable JS on the page to show all rows, then enable it
// again so that setTimeout runs.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment