Last active
April 23, 2019 16:06
-
-
Save ozozozd/505422932c775bc4a95da777ac3da6cd to your computer and use it in GitHub Desktop.
zillow crawler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// NEIL's thing
// Crawler configuration and bootstrap. Meant to be pasted into the browser
// console on a listings results page.

// NOTE(review): pagestogo and pagesdone are declared but never read in the
// visible code — presumably an intended page limit; confirm before relying on it.
var pagestogo = 3;

// Inject jQuery into the current page; the crawler uses `$` for the
// ":contains" selector and for `$.ajax`.
var script_jQuery = document.createElement('script');
script_jQuery.setAttribute('src', '//code.jquery.com/jquery-latest.min.js');
document.body.appendChild(script_jQuery);

// SECURITY: this credential is hard-coded in source. Rotate it and load it
// from somewhere private before sharing this script.
var apiKey = "keylGUSqrw15bnk8T";
var hostUrl = "https://api.airtable.com/v0/appG3ImiHnRxIvJoo/Table%201";
var headers = {
  // Built from apiKey so the key is defined in exactly one place.
  "Authorization": "Bearer " + apiKey,
  "Content-Type": "application/json"
};
var pagesdone = 0;
/**
 * Settle with the outcome of whichever of the given promises settles first.
 *
 * Thin variadic wrapper around the built-in Promise.race, which (unlike a
 * hand-rolled `p.then(res).catch(rej)` loop) also accepts plain,
 * non-thenable values.
 *
 * @param {...*} promises - Promises (or plain values) to race.
 * @returns {Promise<*>} Promise mirroring the first input to settle; never
 *   settles when called with no arguments.
 */
var race = (...promises) => Promise.race(promises);
/**
 * Wait until every selector matches at least one element in `win`'s document.
 *
 * The document is re-checked once per animation frame of the calling window
 * until all selectors match.
 *
 * @param {Window} win - Window whose document is inspected.
 * @param {string[]} selectors - CSS selectors that must all be present.
 * @returns {Promise<boolean>} Resolves to `true` once every selector matches;
 *   rejects with an Error if `win` is closed before that happens, so polling
 *   does not continue forever against a dead window.
 */
function checkElements(win, selectors) {
  if (win.closed) {
    return Promise.reject(new Error("Window closed before elements appeared"));
  }

  var allPresent = selectors.every(function (selector) {
    return win.document.querySelector(selector) !== null;
  });
  if (allPresent) {
    return Promise.resolve(true);
  }

  // requestAnimationFrame is used as the polling tick (the original author
  // found it faster than setTimeout).
  return new Promise(function (resolve) {
    requestAnimationFrame(resolve);
  }).then(function () {
    return checkElements(win, selectors);
  });
}
/**
 * Open the next results page, collect its listing URLs and hand them to loop().
 *
 * The opener window never navigates, so its own "Next" link always points at
 * the same page. To actually advance, the "Next" href found in each opened
 * results page is remembered on `nextPage.upcomingHref` and used by the
 * following call; only the first call reads the link from the current document.
 *
 * Side effects: opens (and closes) a window, reassigns the file-level `urls`,
 * and calls `loop` when listings were found.
 */
function nextPage() {
  var nextHref = nextPage.upcomingHref;
  if (nextHref === undefined) {
    // First call: take the link from the page the script was started on.
    var firstNext = $("a:contains('Next')")[0];
    nextHref = firstNext ? firstNext.href : null;
  }
  if (!nextHref) {
    console.log("No next page link found");
    return;
  }

  let q = window.open(nextHref, "_blank");
  if (!q) {
    // window.open returns null when the popup is blocked.
    console.log("Couldn't open next page");
    return;
  }

  // Fixed delay to let the results page render before reading it.
  setTimeout(function() {
    console.log("starting the new page");

    // Remember where "Next" leads from THIS page (same text test as the
    // jQuery :contains selector) so the following call moves forward.
    var followingLink = Array.from(q.document.querySelectorAll("a")).find(function (anchor) {
      return anchor.textContent.indexOf("Next") !== -1;
    });
    nextPage.upcomingHref = followingLink ? followingLink.href : null;

    // Articles without an anchor are skipped instead of throwing.
    urls = Array.from(q.document.querySelectorAll("article"))
      .map(function (article) {
        return article.querySelector('a');
      })
      .filter(function (anchor) {
        return anchor !== null;
      })
      .map(function (anchor) {
        return anchor.href;
      });
    console.log("urls:" + urls);

    // Close the results window in both outcomes so it is not leaked.
    q.close();
    if (urls.length > 0) {
      loop(null, urls);
    } else {
      console.log("Couldn't get urls");
    }
  }, 4000);
}
// Listing URLs collected from the current results page.
var urls = [];

/**
 * Fill the file-level `urls` with the first link of every `article` element
 * on the current page.
 *
 * Relies on `$$`, the DevTools console helper that returns matching elements
 * as an array. Articles that contain no anchor are skipped rather than
 * raising a TypeError.
 */
function getLinks() {
  urls = $$("article")
    .map(function (article) {
      return $$("a", article)[0];
    })
    .filter(function (anchor) {
      return anchor != null;
    })
    .map(function (anchor) {
      return anchor.href;
    });
}
/**
 * Visit the listing URLs one at a time, each in its own window.
 *
 * Every step removes the first URL from `urls` (the array is consumed in
 * place), opens it, schedules a scrape of that window after 3 s, closes the
 * previously opened window, and schedules the next step after 5 s. When the
 * list is empty, control passes to nextPage().
 *
 * @param {?Window} prevWin - Window opened by the previous step, or null.
 * @param {string[]} urls - Remaining listing URLs.
 */
var loop = function (prevWin, urls) {
  var remaining = urls.length;
  if (remaining === 0) {
    console.log("going to next page");
    nextPage();
    return;
  }

  console.log("IN LOOP URLS IS: " + remaining);

  var target = urls.shift();
  var currWin = window.open(target, "_blank");

  // Give the listing time to load before scraping; synchronous failures are
  // only logged so the crawl keeps going.
  setTimeout(function () {
    try {
      scrapeNewWindow(currWin);
    } catch (err) {
      console.log(err);
    }
  }, 3000);

  if (prevWin) {
    prevWin.close();
  }

  window.setTimeout(function () {
    loop(currWin, urls);
  }, 5000);
};
/**
 * Return the first phone-number-looking substring of `text`, or null.
 * All newlines are turned into spaces first so a number split across lines
 * comes back on a single line.
 */
function extractPhoneNumber(text) {
  var regex = /(?:(?:\+?([1-9]|[0-9][0-9]|[0-9][0-9][0-9])\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([0-9][1-9]|[0-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?/;
  var regexMatch = text.replace(/\n/g, ' ').match(regex);
  return regexMatch && regexMatch[0];
}

/**
 * Scrape address, agent name and agent phone from a listing window and POST
 * them to `hostUrl`.
 *
 * Waits (via checkElements + race) for either of two page layouts to render:
 * one exposing `.cf-listing-agent-*` elements, the other `.isListingAgent`.
 * Nothing is sent when no phone number can be determined.
 *
 * @param {Window} q - The opened listing window.
 * @returns {Promise<void>} Settles once scraping has finished; failures are
 *   logged rather than propagated.
 */
function scrapeNewWindow(q) {
  console.log("Got to scrape for " + q.window.location.href);

  // Local declarations: these used to leak as implicit globals.
  var type1 = checkElements(q, [".ds-address-container", ".cf-listing-agent-display-name", ".cf-listing-agent-info"]);
  var type2 = checkElements(q, [".ds-address-container", ".isListingAgent"]);

  // innerText of the first match, or null when the element is absent.
  function textOf(selector) {
    var el = q.document.querySelector(selector);
    return el === null ? null : el.innerText;
  }

  return race(type1, type2).then(function () {
    console.log("Race won " + q.window.location.href);
    var address = textOf(".ds-address-container");
    var agentName = null;
    var agentPhoneNum = null;

    if (q.document.querySelector(".isListingAgent") != null) {
      agentPhoneNum = textOf(".phone");
      agentName = textOf(".cf-rpt-display-name");
      console.log("LISTING AGENT PATH FOR ");
    } else {
      console.log("Got into else");
      agentName = textOf(".cf-listing-agent-display-name");
      // The info element holds free-form text; pull the number out of it.
      var agentInfo = textOf(".cf-listing-agent-info");
      agentPhoneNum = agentInfo === null ? null : extractPhoneNumber(agentInfo);
    }

    if (agentPhoneNum == null) {
      console.log("NO PHONE NUMBER");
      return;
    }

    var data = {
      "fields": {
        "Address": address,
        "Agent Name": agentName,
        "Phone Number": agentPhoneNum
      }
    };
    console.log("SENT THE THING for !");
    $.ajax({
      url: hostUrl,
      type: "POST",
      data: JSON.stringify(data),
      dataType: "json",
      processData: false,
      headers: headers
    }).done(function() {
      console.log("DONE, son!");
    }).fail(function(jqXHR, textStatus, errorThrown) {
      console.log("POST failed: " + textStatus + " " + errorThrown);
    });
  }).catch(function(err) {
    // Same policy as the caller's try/catch: log and keep crawling.
    console.log(err);
  });
}
// Entry point: gather the listing links on the current page, then start the
// crawl two seconds later with no previous window.
getLinks();
window.setTimeout(loop, 2000, null, urls);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment