Created
December 15, 2020 20:31
-
-
Save Announcement/b95196a9572b940afd8d91e277c42e06 to your computer and use it in GitHub Desktop.
scrape
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Run this in the browser console, on a search-results page.

/**
 * Lazily walks the search results found under `#results-list` and yields one
 * plain record per `.result` element.
 *
 * Every text value is read from `textContent` and trimmed. Pieces that are
 * missing from a result do not abort the scrape: a missing name or age is
 * reported as `null`, and a missing list is reported as `[]`. If the page has
 * no `#results-list` element at all, nothing is yielded.
 *
 * @yields {{name: ?string, aliases: string[], locations: string[], relatives: string[], age: ?string}}
 */
function * getPersonsResults () {
  const resultsList = document.querySelector('#results-list');
  if (resultsList === null) {
    return;
  }

  // Trimmed text of a single element, or null when the element was not found.
  const textOf = (node) => (node === null ? null : node.textContent.trim());
  // Trimmed texts of every element matching `selector` below `root`.
  const textsOf = (root, selector) =>
    (root === null ? [] : [...root.querySelectorAll(selector)].map((node) => textOf(node)));

  for (const result of resultsList.querySelectorAll('.result')) {
    const nameContainer = result.querySelector('.name-container');
    yield {
      name: nameContainer === null ? null : textOf(nameContainer.querySelector('h3')),
      aliases: textsOf(nameContainer, '.aliases .alias'),
      locations: textsOf(result, '.locations-container .person-location'),
      relatives: textsOf(result, '.relatives-container .person-relative'),
      age: textOf(result.querySelector('.age-container h2')),
    };
  }
}
// Collect every yielded record into an array (the console prints the value).
[...getPersonsResults()];
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Run in your shell:
//   node '.\webscraper test list.js' '.\webscraper test list.txt'

// Stand-in for the browser's `window.location`. submitPersonSearch (further
// down) reads `window.location.protocol` and `window.location.host` to build
// absolute URLs, and Node.js has no `window` of its own.
const window = {
  location: {
    protocol: 'https:',
    host: 'www.instantcheckmate.com',
  },
};
// Node.js built-in libraries. `fs` reads from the file system; `https` is only
// referenced by the commented-out download step near the end of the script.
const fs = require('fs');
const https = require('https');
// Read the file provided on the command line (the first argument after the
// script name). Fail early with a usage hint: without this check a missing
// argument surfaces as an opaque TypeError from fs.readFileSync.
const inputPath = process.argv[2];
if (inputPath === undefined) {
  throw new Error('Usage: node <this script> <tab-delimited .txt file>');
}
// Passing the encoding makes readFileSync return a string directly.
const contents = fs.readFileSync(inputPath, 'utf8');
// How are the rows delimited? In this case, by lines.
const expressionRow = createExpression(/^.+$/gm);
// * If you're interested in the regular expression /^.+$/gm:
//   regular expressions just match text;
//   they're very helpful for what you're trying to do.
//   gm are the flags:
//     g = global (so more than one match; in this case, every line)
//     m = multiline (so ^ and $ apply to each line, not only to the whole text)
//   /^.+$/ is a little more daunting if you don't know what it is, so we'll go
//   one symbol at a time...
//     the first / just means start a regular expression
//     the ^ means at the start of a line
//     the . means any character (except a line break)
//     the + means repeat the last thing as much as we can
//       (at least once, so completely empty lines are skipped)
//     the $ means until the end of the line
//     the last / means we're done.
// How are the columns delimited? In this case, by tabs.
const expressionColumn = createExpression(/[^\t]+/gm);
// [^\t]+ is the only part that changed here, so we'll focus on that.
//   when things are in between [ ] it means: match anything inside of this set
//   when a [ ] set starts with ^ it means: anything except what is in the set
//   \t just means a tab, which is what Excel uses to separate columns in .txt files
//   again, + just means repeat that last thing
//   so it really just means: match any character except a tab, and repeat.
// NOTE: the table-building loop below splits each row on tabs instead of
// using this expression; [^\t]+ cannot match an empty cell, so it would
// silently skip blank columns.
// The best way to think of an array is as a
// list of things, like item #1, item #2...
// A table is just a list of lists:
// for example a list of columns,
// where the columns are inside of the list of rows.
// So let's start the list of rows; we'll fill it up below.
const table = [];
// Basically, let's get all of the things that match the expressionRow pattern,
// and call each of these matches a 'row'.
for (const row of expressionRow(contents)) {
  // * The reason we use row[0] instead of just row is that regular
  //   expressions have many features, and their results carry extra details
  //   such as groups and indexes. We only want the text that was matched by
  //   the pattern, which is stored in [0].
  // Splitting that text on tabs gives the list of columns for this row
  // (a blank cell between two tabs is kept as an empty string),
  // and push adds that list to the end of the table.
  table.push(row[0].split('\t'));
}
// console.log(table)
// In your file the first row was not data but headers,
// so we can use that information to label our data.
const headers = table[0];
// And this will be the list where we hold our labeled data.
const data = [];
// In my notation, I use the lowercase letter as the starting point,
// and a capital letter as the ending point.
// A for loop just does things for as long as its condition is met.
// It is defined in three parts:
//   the beginning, `let k = 1, K = table.length;`, is where you initialize;
//   the second part is the condition,
//     so whenever this condition is no longer met, the loop is done;
//   the third part is the progression.
// So, lowercase k is a number that = 1 (row 0 holds the headers, so we skip it),
// and uppercase K is a number that = the number of items in the array (list)
// called table.
// For as long as lowercase k is less than uppercase K,
// we'll do some stuff, and then k++, which means add 1 to k.
for (let k = 1, K = table.length; k < K; k++) {
  // {} means declare an object.
  // Objects are like arrays, but instead of being numbered, entries have names.
  // For example, getting data out of an array would look like:
  //   array[0], array[1]
  // and getting data out of an object would look like:
  //   object.city, object.state
  const entry = {};
  const cells = table[k];
  // Anyway, let's basically do the same thing we did with the previous loop,
  // but instead of going through the whole table, we'll just go through one row.
  // H is the number of cells in the row, capped at the number of headers:
  // a cell without a header has no name to be stored under (it used to land
  // under the key "undefined").
  for (let h = 0, H = Math.min(cells.length, headers.length); h < H; h++) {
    const property = headers[h];
    const value = cells[h];
    entry[property] = value;
  }
  data.push(entry);
}
/* the following is poorly documented */
// Generate the URLs: one person search per data row, printed to the console.
for (const entry of data) {
  const url = submitPersonSearch({
    firstName: entry['Owner 1 First Name'],
    lastName: entry['Owner 1 Last Name'],
    city: entry['City'],
    state: entry['State'],
  });
  console.log(url);
  // Downloading each page is switched off for now; kept for reference:
  // try {
  //   https.get(url, response => {
  //     let data = '';
  //     console.log(response.statusCode);
  //     response.setEncoding('utf8');
  //     response.on('data', $data => {
  //       data += $data;
  //     });
  //     response.on('end', () => {
  //       console.log(data);
  //     })
  //   })
  // } catch (e){console.log(e)}
}
// This code is taken straight from the website and modified a little:
// instead of navigating the browser, it returns the search URL.
/**
 * Builds the person-search URL for one person.
 *
 * `first`, `last` and `state` are always part of the query; `middle`, `city`
 * and `age` are only added when they have a value. A value counts as missing
 * when it is `''`, `undefined` or `null`; a missing `first`/`last`/`state` is
 * sent as an empty value rather than as the literal text "undefined".
 * Every value is percent-encoded, so names such as "Mary Ann" or "Smith & Sons"
 * cannot break the query string.
 *
 * Reads `window.location.protocol` and `window.location.host` for the origin.
 *
 * @param {object} person
 * @param {string} [person.firstName]
 * @param {string} [person.middleName]
 * @param {string} [person.lastName]
 * @param {string} [person.city]
 * @param {string} [person.state]
 * @param {string|number} [person.age]
 * @returns {string} the absolute search URL
 */
function submitPersonSearch({ firstName, middleName, lastName, city, state, age }) {
  const isMissing = (value) => value === '' || value === undefined || value === null;

  const query = [
    { key: 'first', value: firstName, required: true },
    { key: 'middle', value: middleName, required: false },
    { key: 'last', value: lastName, required: true },
    { key: 'city', value: city, required: false },
    { key: 'state', value: state, required: true },
    { key: 'age', value: age, required: false },
  ]
    .filter(({ value, required }) => required || !isMissing(value))
    .map(({ key, value }) => `${key}=${isMissing(value) ? '' : encodeURIComponent(value)}`)
    .join('&');

  return `${window.location.protocol}//${window.location.host}/dashboard/search/person/?${query}`;
}
/* documentation should be a *little* better from here on out, */
/**
 * Turns a regular expression into a function that lists every match in a text.
 *
 * The returned function is a generator: call it with a text and it yields one
 * match result after another (the same objects RegExp.prototype.exec returns,
 * so the matched text is in [0]).
 *
 * @param {RegExp|{source: string, flags?: string}} expression
 *   a regular expression, or anything with the same `source`/`flags` properties
 * @returns {function(string): Generator<RegExpExecArray>}
 * @throws {SyntaxError} if the pattern or its flags are invalid
 */
function createExpression ({ source, flags = '' }) {
  // Make sure 'g' (global) is among the flags, without listing any flag twice.
  // [...] spreads the characters into a list and a Set keeps only the unique ones.
  // Without 'g', exec() would return the first match forever and the loop
  // below would never finish.
  const uniqueFlags = [...new Set([...flags, 'g'])].join('');
  // Build the expression right away so a broken pattern fails here.
  const template = new RegExp(source, uniqueFlags);
  return function * matches (it) {
    // A global expression remembers where its last search ended (lastIndex).
    // Each run gets its own copy, so runs that overlap or stop early
    // cannot disturb one another.
    const pattern = new RegExp(template);
    let match;
    while ((match = pattern.exec(it)) !== null) {
      // An empty match does not move lastIndex; step forward by hand,
      // otherwise the same empty match would be found again and again.
      if (match[0] === '') {
        pattern.lastIndex++;
      }
      yield match;
    }
  };
}
/**
 * Flattens a collection of collections by one level: goes through every group
 * in `collection` and hands out the items inside each group, one at a time.
 * A string counts as a collection of single-character strings.
 *
 * Only the first argument is used; any further arguments are ignored.
 *
 * @param {Iterable<Iterable<*>>} collection
 * @yields {*} each item of each group, in order
 */
function * eachItem (collection) {
  // just read it man.
  // (ignore the word const, that is just declaring your variables)
  for (const those of collection) {
    for (const each of those) {
      // The yield part might be a little confusing.
      // Generators work a little like this:
      // they don't make the whole list up front,
      // they give you a piece of it every time you ask.
      // yield just means: after you ask,
      // the following is the next piece to provide (or yield).
      yield each;
    }
  }
}
/**
 * This is a function to make things easier:
 * it tells you whether something exists, as you can tell by the name.
 *
 * @param {*} it any value
 * @returns {boolean} false for null and undefined, true for everything else
 *   (including 0, '', false and NaN)
 */
function isExistant (it) {
  // null is a value in JavaScript that means "deliberately nothing".
  // undefined is what you get when nothing has been assigned at all.
  // So if it is neither of those, then it exists!
  return it !== null && it !== undefined;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment