Last active
February 6, 2024 20:45
-
-
Save azanli/face0c38163dc6c4273325bbdd6783e3 to your computer and use it in GitHub Desktop.
An email scraper for websites - extracts names and emails from a page & inserts them into your Google Spreadsheet for Mail Merge
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Scrapes e-mail addresses (and, optionally, matching display names) from the
 * page's DOM and posts them one row at a time to a Google Apps Script endpoint.
 *
 * Flow: traverseDOM() collects addresses from leaf nodes; when that pass ends
 * and `names` is enabled, traverseAgainForNames() looks for leaf text that
 * resembles each address's local part; when that pass ends and a spreadsheet
 * URL is configured, the collected rows are submitted sequentially.
 */
class Email_Scraper {
  /**
   * @param {boolean} [log=false] - Print progress and results to the console.
   * @param {boolean} [names=true] - Run the second pass that searches for display names.
   * @param {string} [scriptURL=''] - URL every row is POSTed to.
   * @param {string} [spreadsheetURL=''] - Sent as the 'Spreadsheet URL' field; nothing is submitted when empty.
   */
  constructor(log = false, names = true, scriptURL = '', spreadsheetURL = '') {
    this.currIndex = 0; // next entry of `sources` to submit
    this.log = log;
    this.names = names;
    this.pendingRecursive = 1; // outstanding traverse calls; 0 means the pass is finished
    this.scriptURL = scriptURL;
    this.spreadsheetURL = spreadsheetURL;
    this.sourceIndex = 0; // number of entries stored in `sources`
    this.sources = {}; // index -> { name, email, score, alphaMap }
  }

  /**
   * Serialises a childless node by cloning it into a detached <div> and
   * reading that wrapper's innerHTML.
   * @param {Node} node
   * @returns {string}
   */
  serializeLeaf(node) {
    const wrapper = document.createElement('div');
    wrapper.appendChild(node.cloneNode(false));
    return wrapper.innerHTML;
  }

  /**
   * Stores `value` as a source when it is short, contains a dot and contains
   * the requested e-mail (or, when none is requested, an '@').
   * @param {string|undefined} email - Specific address to look for, if any.
   * @param {string} value - Serialised leaf content.
   */
  collectEmail(email, value) {
    if (value.length > 30 || !value.includes('.')) return;
    const indexAt = email ? value.indexOf(email) : value.indexOf('@');
    if (indexAt === -1) return;

    // Everything before the match is the provisional name; its characters
    // are the reference set used later to score name candidates.
    const alphaMap = {};
    for (let i = 0; i < indexAt; i += 1) {
      alphaMap[value[i]] = true;
    }
    this.sources[this.sourceIndex] = {
      name: value.slice(0, indexAt),
      email: value,
      score: 0,
      alphaMap,
    };
    this.sourceIndex += 1;
  }

  /**
   * First pass: walks the tree below `dom` and records e-mail addresses found
   * in leaf nodes. Once the whole walk has finished it either logs the
   * addresses (log && !names) or starts the name pass over document.body.
   * @param {string} [email] - Restrict the search to this address.
   * @param {Node} [dom=document.body] - Root of the walk.
   */
  traverseDOM(email, dom = document.body) {
    const children = dom.childNodes;
    if (children.length === 0) {
      this.collectEmail(email, this.serializeLeaf(dom));
    } else {
      for (let i = 0; i < children.length; i += 1) {
        this.pendingRecursive += 1;
        this.traverseDOM(email, children[i]);
      }
    }

    this.pendingRecursive -= 1;
    if (this.pendingRecursive !== 0) return;

    // The outermost call has unwound: reset the counter for the next pass.
    this.pendingRecursive = 1;
    if (this.log && !this.names) {
      for (const key of Object.keys(this.sources)) {
        console.log(this.sources[key].email);
      }
    }
    if (this.names) this.traverseAgainForNames(document.body);
  }

  /**
   * Compares one leaf's text against every stored source and, when it looks
   * like a better display name for one of them, replaces that source's name.
   * Stops at the first source that gets updated.
   * @param {string} value - Serialised leaf content.
   */
  matchNameCandidate(value) {
    const firstCode = value.charCodeAt(0);
    // Must start with A-Z; 30+ characters is more likely a sentence than a name.
    const looksLikeName = firstCode >= 65 && firstCode <= 90 && value.length >= 5 && value.length < 30;
    if (!looksLikeName) return;

    const lowerValue = value.toLowerCase();
    const end = value.length - 1;

    Object.keys(this.sources).some((index) => {
      const source = this.sources[index];
      if (source.score > 100) return false;

      const name = source.name;
      // Ignore digits appended to the name (e.g. "jsmith42").
      let lastIndex = name.length - 1;
      while (lastIndex >= 0 && !Number.isNaN(Number.parseInt(name[lastIndex], 10))) {
        lastIndex -= 1;
      }
      // Three trailing characters are compared below; shorter names cannot match.
      if (lastIndex < 2) return false;

      const lowerName = name.toLowerCase();
      const tailMatches =
        lowerName[lastIndex] === lowerValue[end] &&
        lowerName[lastIndex - 1] === lowerValue[end - 1] &&
        (lowerName[lastIndex - 2] === lowerValue[end - 2] || lowerName[lastIndex - 2] === lowerValue[0]);
      if (!tailMatches || lowerName[0] !== lowerValue[0]) return false;

      // innerHTML yields entities, so non-breaking spaces show up as text;
      // turn every "&nbsp;" variant into a plain space in a single pass.
      let parsedValue = value.replace(/&?nbsp;?/g, ' ');

      // Drop a leading honorific so it does not end up as the first name.
      const lowerParsed = parsedValue.toLowerCase();
      if (['mrs ', 'mr ', 'ms '].some((title) => lowerParsed.startsWith(title))) {
        parsedValue = parsedValue.slice(parsedValue.indexOf(' ') + 1);
      }

      const referenceSize = Object.keys(source.alphaMap).length;
      if (referenceSize === 0) return false;

      // Score: candidate characters found in the reference set, relative to
      // the number of distinct reference characters.
      let count = 0;
      for (let i = 0; i < parsedValue.length; i += 1) {
        if (source.alphaMap[parsedValue[i]]) count += 1;
      }
      const newScore = (count / referenceSize) * 100;
      if (newScore <= source.score) return false;

      if (this.log) console.log('New score:', newScore, 'Old score:', source.score);
      if (this.log) console.log('Changing name from', name, 'to', parsedValue);
      source.name = parsedValue;
      source.score = newScore;
      return true;
    });
  }

  /**
   * Second pass: walks the tree below `dom` looking for display names. Once
   * the whole walk has finished it logs the result and, when a spreadsheet
   * URL is configured, starts submitting the rows.
   * @param {Node} dom - Root of the walk.
   */
  traverseAgainForNames(dom) {
    if (this.sourceIndex === 0) return;

    const children = dom.childNodes;
    if (children.length === 0) {
      this.matchNameCandidate(this.serializeLeaf(dom));
    } else {
      for (let i = 0; i < children.length; i += 1) {
        this.pendingRecursive += 1;
        this.traverseAgainForNames(children[i]);
      }
    }

    this.pendingRecursive -= 1;
    if (this.pendingRecursive !== 0) return;

    this.pendingRecursive = 1;
    if (this.log) console.log('Names & Emails:', this.sources);
    if (this.spreadsheetURL && this.sourceIndex > 0) {
      this.submitDataToSpreadsheet();
    }
  }

  /**
   * Builds the row for the next unsent source and hands it to
   * handleSpreadsheetSubmit(). Does nothing once every source has been sent.
   */
  submitDataToSpreadsheet() {
    if (this.currIndex >= this.sourceIndex) return;
    const source = this.sources[this.currIndex];

    let name = source.name;
    if (this.includesSpecialChars(name)) {
      name = this.replaceSpecialChars(name);
    }
    // Replacing special characters can leave stray or repeated spaces.
    name = name.replace(/\s+/g, ' ').trim();
    if (name.charCodeAt(0) >= 97 && name.charCodeAt(0) <= 122) {
      name = this.fixLowerCaseName(name);
    }

    // First token is the first name, last token the last name; middle names are dropped.
    const parts = name.split(' ');
    const data = {
      'First Name': parts[0],
      'Last Name': parts.length > 1 ? parts[parts.length - 1] : '',
      'Email Address': source.email,
      'Spreadsheet URL': this.spreadsheetURL,
    };
    this.currIndex += 1;
    this.handleSpreadsheetSubmit(data);
  }

  /**
   * POSTs one row, form-encoded, to `scriptURL` and submits the next row once
   * the request has completed.
   * @param {Object<string, string>} [data={}] - Row fields; submitDataToSpreadsheet()
   *   passes 'First Name', 'Last Name', 'Email Address' and 'Spreadsheet URL'.
   */
  handleSpreadsheetSubmit(data = {}) {
    const xhr = new XMLHttpRequest();
    xhr.open('POST', this.scriptURL);
    xhr.setRequestHeader('Content-Type', 'application/x-www-form-urlencoded');
    xhr.onreadystatechange = () => {
      // The handler fires on every state change; only advance the queue once
      // the request has actually finished.
      if (xhr.readyState !== XMLHttpRequest.DONE) return;
      if (this.log) console.log(xhr.status, xhr.statusText);
      if (this.log) console.log(xhr.responseText);
      this.submitDataToSpreadsheet();
    };
    const encoded = Object.keys(data)
      .map((key) => `${encodeURIComponent(key)}=${encodeURIComponent(data[key])}`)
      .join('&');
    xhr.send(encoded);
  }

  /**
   * @param {string} str
   * @returns {boolean} True when `str` contains anything other than ASCII letters and whitespace.
   */
  includesSpecialChars(str) {
    return /[^A-Za-z\s]/.test(str);
  }

  /**
   * @param {string} str
   * @returns {string} `str` with every character that is not an ASCII letter or whitespace replaced by a space.
   */
  replaceSpecialChars(str) {
    return str.replace(/[^A-Za-z\s]/g, ' ');
  }

  /**
   * Upper-cases the first character of `str` and every character that
   * directly follows a space.
   * @param {string} str
   * @returns {string}
   */
  fixLowerCaseName(str) {
    let fixedName = '';
    for (let i = 0; i < str.length; i += 1) {
      const startsWord = i === 0 || str[i - 1] === ' ';
      fixedName += startsWord ? str[i].toUpperCase() : str[i];
    }
    return fixedName;
  }
}
// Entry point. Arguments are (log, names, scriptURL, spreadsheetURL).
// Replace both placeholder strings with real URLs before running: the
// spreadsheet placeholder is a non-empty string, so it does not count as
// "no spreadsheet configured".
const email_scraper = new Email_Scraper(false, true, '/* Paste your Google Script URL */', '/* Paste your Google Spreadsheet URL */');
// Start scanning from document.body (the default root).
email_scraper.traverseDOM();
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Web-app entry point for POST requests: logs the event and delegates to
 * record_data().
 *
 * @param {Object} e - The POST event object.
 * @return {*} Whatever record_data() returns, or a JSON text output of the
 *   form {"result":"error","error":"..."} when record_data() throws.
 */
function doPost(e) {
  try {
    Logger.log(e); // the Google Script version of console.log see: Class Logger
    // Hand the response back to the caller; without this return the
    // success path produced no output at all.
    return record_data(e);
  } catch (error) { // if error return this
    Logger.log(error);
    // An Error object serialises to "{}", so send its string form instead.
    return ContentService
      .createTextOutput(JSON.stringify({"result": "error", "error": String(error)}))
      .setMimeType(ContentService.MimeType.JSON);
  }
}
/**
 * record_data appends the data received from the POST as a new row of the
 * active sheet, unless a row with the same e-mail address already exists.
 *
 * Reads 'Spreadsheet URL' and 'Email Address' from e.parameters, then fills
 * one cell per header column of row 1 using getFieldFromData().
 *
 * @param {Object} e - The POST event; e.parameters maps field names to arrays of values.
 * @return {*} JSON text output whose "result" is "success", "already exists"
 *   or "error" (the latter with an "error" message).
 */
function record_data(e) {
  // Wraps a payload in a JSON text output.
  function respond(payload) {
    return ContentService
      .createTextOutput(JSON.stringify(payload))
      .setMimeType(ContentService.MimeType.JSON);
  }

  try {
    Logger.log(JSON.stringify(e)); // log the POST data in case we need to debug it
    var params = e.parameters;
    var url = params['Spreadsheet URL'][0];
    var email = params['Email Address'][0];
    var doc = SpreadsheetApp.openByUrl(url);
    Logger.log(doc);
    var sheet = doc.getActiveSheet();

    var header = sheet.getRange(1, 1, 1, sheet.getLastColumn()).getValues()[0];

    // Locate the e-mail column by its header; fall back to column 3, the
    // position this script has always assumed.
    var headerIndex = header.indexOf('Email Address');
    var column = headerIndex > -1 ? headerIndex + 1 : 3;

    // Data rows start at row 2; a header-only sheet has none to read.
    var lastRow = sheet.getLastRow();
    var columnValues = lastRow > 1
      ? sheet.getRange(2, column, lastRow - 1).getValues()
      : [];

    // An empty address is never stored; it is reported like a duplicate.
    var alreadyStored = email === '';
    for (var r = 0; r < columnValues.length && !alreadyStored; r++) {
      alreadyStored = String(columnValues[r][0]) === email;
    }
    if (alreadyStored) {
      Logger.log('Email already exists. Exiting script.');
      return respond({"result": "already exists",
                      "data": JSON.stringify(params)});
    }

    // loop through the header columns
    var row = [];
    for (var i = 0; i < header.length; i++) {
      row.push(getFieldFromData(header[i], params));
    }

    // more efficient to set values as [][] array than individually
    sheet.getRange(lastRow + 1, 1, 1, row.length).setValues([row]);

    return respond({"result": "success",
                    "data": JSON.stringify(params)});
  } catch (error) {
    Logger.log(error);
    // Report the failure instead of claiming success.
    return respond({"result": "error", "error": String(error)});
  }
}
/**
 * Lists the field names present in the submitted data.
 *
 * @param {Object} data - Map of field name to value(s).
 * @return {string[]} The object's own enumerable keys that are strings.
 */
function getDataColumns(data) {
  var keys = Object.keys(data);
  var columns = [];
  for (var i = 0; i < keys.length; i++) {
    if (typeof keys[i] === 'string') {
      columns.push(keys[i]);
    }
  }
  return columns;
}
/**
 * Returns the cell value for one header column.
 *
 * @param {string} field - Header name to look up in `data`.
 * @param {Object} data - Map of field name to value(s).
 * @return {*} '' when the field is missing or falsy; the values joined with
 *   ', ' when the stored value has a join() method (e.g. an array); otherwise
 *   the stored value itself.
 */
function getFieldFromData(field, data) {
  var values = data[field] || '';
  if (typeof values.join === 'function') {
    return values.join(', ');
  }
  return values;
}
/**
 * Array.prototype.findIndex with two modes:
 *
 *  - Called with a function, it behaves like the standard method: returns the
 *    index of the first element for which the predicate is truthy, else -1.
 *    The optional second argument is used as `this` for the predicate.
 *  - Called with any other value (legacy mode used by this script), it returns
 *    the index of the first element loosely equal (==) to that value, else -1.
 *    Loose equality is what lets a one-cell row such as ['a'] match 'a'.
 *    A search value loosely equal to "" returns false rather than an index.
 *
 * Defined as non-enumerable so it never appears in for...in loops over arrays.
 */
Object.defineProperty(Array.prototype, 'findIndex', {
  configurable: true,
  writable: true,
  enumerable: false,
  value: function(search) {
    var i;
    if (typeof search === 'function') {
      var thisArg = arguments[1];
      for (i = 0; i < this.length; i++) {
        if (search.call(thisArg, this[i], i, this)) return i;
      }
      return -1;
    }
    if (search == "") return false;
    for (i = 0; i < this.length; i++) {
      if (this[i] == search) return i;
    }
    return -1;
  }
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment