Skip to content

Instantly share code, notes, and snippets.

@azanli
Last active February 6, 2024 20:45
Show Gist options
  • Save azanli/face0c38163dc6c4273325bbdd6783e3 to your computer and use it in GitHub Desktop.
Save azanli/face0c38163dc6c4273325bbdd6783e3 to your computer and use it in GitHub Desktop.
An email scraper for websites - extracts names and emails from a page & inserts them into your Google Spreadsheet for Mail Merge
/**
 * Scrapes e-mail addresses (and, optionally, the matching display names) from
 * the current page and can forward them, one row at a time, to a Google Apps
 * Script endpoint that appends them to a spreadsheet.
 *
 * Workflow: `traverseDOM()` collects every short leaf whose markup looks like
 * an e-mail address, then (when `names` is enabled) `traverseAgainForNames()`
 * walks the page a second time trying to pair each address with a nearby
 * human-readable name, and finally `submitDataToSpreadsheet()` uploads the
 * results sequentially.
 */
class Email_Scraper {
  /**
   * @param {boolean} [log=false] - Print progress and results to the console.
   * @param {boolean} [names=true] - Run the second pass that looks for names.
   * @param {string} [scriptURL=''] - URL of the Google Apps Script web app.
   * @param {string} [spreadsheetURL=''] - URL of the target spreadsheet; when
   *   empty nothing is uploaded.
   */
  constructor(log = false, names = true, scriptURL = '', spreadsheetURL = '') {
    this.currIndex = 0; // Next entry of `sources` to upload.
    this.log = log;
    this.names = names;
    // Counts traversal calls still running; the walk is complete when it hits 0.
    this.pendingRecursive = 1;
    this.scriptURL = scriptURL;
    this.spreadsheetURL = spreadsheetURL;
    this.sourceIndex = 0; // Number of addresses found so far.
    this.sources = {}; // index -> { name, email, score, alphaMap }
  }

  /**
   * Serialises a childless node to markup. This is the text both traversal
   * passes inspect (text nodes yield their escaped text, empty elements yield
   * their tag with attributes).
   *
   * @param {Node} node - A node without children.
   * @returns {string} The node's markup.
   */
  serializeLeaf(node) {
    const holder = document.createElement('div');
    holder.appendChild(node.cloneNode(false));
    return holder.innerHTML;
  }

  /**
   * First pass: records every leaf of at most 30 characters that contains a
   * '.' and the `email` marker (or '@' when no marker is given).
   *
   * @param {string} [email] - Substring that identifies an address, e.g. a
   *   domain such as '@example.com'. Defaults to matching on '@'.
   * @param {Node} [dom=document.body] - Root of the subtree to scan.
   */
  traverseDOM(email, dom = document.body) {
    if (dom.childNodes.length === 0) {
      const value = this.serializeLeaf(dom);
      if (value.length <= 30 && value.includes('.')) {
        const indexAt = email ? value.indexOf(email) : value.indexOf('@');
        if (indexAt > -1) {
          // Everything before the marker is the provisional name; its
          // characters are remembered so candidate names can be scored later.
          const name = value.slice(0, indexAt);
          const alphaMap = {};
          for (let i = 0; i < indexAt; i++) {
            alphaMap[value[i]] = true;
          }
          this.sources[this.sourceIndex] = {
            name,
            email: value,
            score: 0,
            alphaMap,
          };
          this.sourceIndex++;
        }
      }
    } else {
      for (let i = 0; i < dom.childNodes.length; i += 1) {
        this.pendingRecursive++;
        this.traverseDOM(email, dom.childNodes[i]);
      }
    }
    if (--this.pendingRecursive === 0) {
      // Back in the outermost call: the whole subtree has been visited.
      this.pendingRecursive = 1;
      if (this.log && !this.names) {
        for (const key in this.sources) {
          console.log(this.sources[key]['email']);
        }
      }
      if (this.names) this.traverseAgainForNames(document.body);
    }
  }

  /**
   * Second pass: looks for leaves that read like a person's name and tries to
   * pair them with the addresses found by `traverseDOM`. When the walk is
   * complete the results are logged and, if a spreadsheet URL is configured,
   * the upload is started.
   *
   * @param {Node} dom - Root of the subtree to scan.
   */
  traverseAgainForNames(dom) {
    if (this.sourceIndex === 0) return;
    if (dom.childNodes.length === 0) {
      const value = this.serializeLeaf(dom);
      const firstCode = value.charCodeAt(0);
      // Candidates start with A-Z. Greater than 30 characters may be a
      // sentence rather than a name.
      if (firstCode >= 65 && firstCode <= 90 && value.length < 30 && value.length >= 5) {
        this.matchCandidateName(value);
      }
    } else {
      for (let i = 0; i < dom.childNodes.length; i += 1) {
        this.pendingRecursive++;
        this.traverseAgainForNames(dom.childNodes[i]);
      }
    }
    if (--this.pendingRecursive === 0) {
      this.pendingRecursive = 1;
      if (this.log) console.log('Names & Emails:', this.sources);
      if (this.spreadsheetURL && this.sourceIndex > 0) {
        this.submitDataToSpreadsheet();
      }
    }
  }

  /**
   * Compares one candidate name against every recorded address and adopts it
   * for the first address it improves the score of.
   *
   * A candidate qualifies when its first letter and its last two letters match
   * the current name, and the third-from-last letter of the current name
   * matches either the candidate's third-from-last or its first letter
   * (all case-insensitive, ignoring digits appended to the current name).
   *
   * @param {string} value - Leaf markup that looks like a name.
   */
  matchCandidateName(value) {
    const lowerCaseValue = value.toLowerCase();
    for (const index of Object.keys(this.sources)) {
      const source = this.sources[index];
      if (source['score'] > 100) continue;
      const name = source['name'];
      // In case the name has numbers appended to the last name.
      let lastIndex = name.length - 1;
      while (lastIndex >= 0 && name[lastIndex] >= '0' && name[lastIndex] <= '9') {
        lastIndex--;
      }
      // The comparison below needs three letters; shorter names cannot match.
      if (lastIndex < 2) continue;
      const lowerCaseName = name.toLowerCase();
      const tailMatches =
        lowerCaseName[lastIndex] === lowerCaseValue[value.length - 1] &&
        lowerCaseName[lastIndex - 1] === lowerCaseValue[value.length - 2] &&
        (lowerCaseName[lastIndex - 2] === lowerCaseValue[value.length - 3] ||
          lowerCaseName[lastIndex - 2] === lowerCaseValue[0]);
      if (!tailMatches || lowerCaseName[0] !== lowerCaseValue[0]) continue;

      // Non-breaking spaces arrive as (possibly truncated) entities.
      let parsedValue = value.replace(/&?nbsp;?/g, ' ');
      // Drop a leading honorific so "Mr Smith" is stored as "Smith".
      parsedValue = parsedValue.replace(/^(?:mrs|mr|ms)\.?\s+/i, '');

      // Score = share of the address's characters that occur in the candidate.
      let count = 0;
      for (let i = 0; i < parsedValue.length; i++) {
        if (source['alphaMap'][parsedValue[i]]) count += 1;
      }
      const newScore = (count / Object.keys(source['alphaMap']).length) * 100;
      if (newScore > source['score']) {
        if (this.log) console.log('New score:', newScore, 'Old score:', source['score']);
        if (this.log) console.log('Changing name from', name, 'to', parsedValue);
        source['name'] = parsedValue;
        source['score'] = newScore;
        break; // A candidate is assigned to at most one address.
      }
    }
  }

  /**
   * Uploads the next pending entry. Each completed request calls this method
   * again, so entries are sent one after another until none are left.
   */
  submitDataToSpreadsheet() {
    if (this.currIndex >= this.sourceIndex) return;
    let name = this.sources[this.currIndex]['name'];
    if (this.includesSpecialChars(name)) {
      name = this.replaceSpecialChars(name);
    }
    // Replaced characters leave stray blanks that would break the split below.
    name = name.replace(/\s+/g, ' ').trim();
    if (name.charCodeAt(0) >= 97 && name.charCodeAt(0) <= 122) {
      name = this.fixLowerCaseName(name);
    }
    // First word is the first name, last word the last name; middle names are dropped.
    const firstSpace = name.indexOf(' ');
    let firstName = name;
    let lastName = '';
    if (firstSpace > 0) {
      firstName = name.slice(0, firstSpace);
      lastName = name.slice(name.lastIndexOf(' ') + 1);
    }
    const data = {
      'First Name': firstName,
      'Last Name': lastName,
      'Email Address': this.sources[this.currIndex]['email'],
      'Spreadsheet URL': this.spreadsheetURL,
    };
    this.currIndex++;
    this.handleSpreadsheetSubmit(data);
  }

  /**
   * POSTs one row to the Apps Script endpoint as a URL-encoded form and, once
   * the request has finished (successfully or not), continues with the next
   * entry.
   *
   * @param {Object<string, string>} [data={}] - Column name to value, e.g.
   *   { 'First Name': 'Test', 'Last Name': 'Testing',
   *     'Email Address': 'test@example.com',
   *     'Spreadsheet URL': 'https://docs.google.com/spreadsheets/d/test_example' }
   */
  handleSpreadsheetSubmit(data = {}) {
    const xhr = new XMLHttpRequest();
    xhr.open('POST', this.scriptURL);
    xhr.setRequestHeader("Content-Type", "application/x-www-form-urlencoded");
    xhr.onreadystatechange = () => {
      // The event fires for every state change; only 4 (DONE) means the
      // request is over. Reacting earlier would start several uploads at once.
      if (xhr.readyState !== 4) return;
      if (this.log) console.log(xhr.status, xhr.statusText);
      if (this.log) console.log(xhr.responseText);
      this.submitDataToSpreadsheet();
    };
    const encoded = Object.keys(data)
      .map((key) => encodeURIComponent(key) + "=" + encodeURIComponent(data[key]))
      .join('&');
    xhr.send(encoded);
  }

  /**
   * @param {string} str
   * @returns {boolean} True when `str` contains anything other than ASCII
   *   letters and whitespace.
   */
  includesSpecialChars(str) {
    return /[^A-Za-z\s]/.test(str);
  }

  /**
   * @param {string} str
   * @returns {string} `str` with every character that is not an ASCII letter
   *   or whitespace replaced by a space.
   */
  replaceSpecialChars(str) {
    return str.replace(/[^A-Za-z\s]/g, ' ');
  }

  /**
   * Capitalises the first character of the string and every character that
   * directly follows a space; all other characters are left untouched.
   *
   * @param {string} str - e.g. 'john doe'
   * @returns {string} e.g. 'John Doe'
   */
  fixLowerCaseName(str) {
    let fixedName = '';
    let capitalizeNext = true;
    for (let i = 0; i < str.length; i++) {
      fixedName += capitalizeNext ? str[i].toUpperCase() : str[i];
      capitalizeNext = str[i] === ' ';
    }
    return fixedName;
  }
}
// Entry point for the browser console: configure the scraper and start the scan.
const email_scraper = new Email_Scraper(
  false, // log: keep the console quiet
  true, // names: also try to pair each address with a name
  '/* Paste your Google Script URL */',
  '/* Paste your Google Spreadsheet URL */'
);
// No arguments: match on '@' and scan the whole document body.
email_scraper.traverseDOM();
/**
 * Web-app entry point: Google Apps Script calls this for every HTTP POST.
 *
 * @param {Object} e - The POST event; the submitted form fields are read
 *   from `e.parameters` by `record_data`.
 * @return {Object} The JSON text output produced by `record_data`, or a JSON
 *   error description if it throws.
 */
function doPost(e) {
  try {
    Logger.log(e); // the Google Script version of console.log see: Class Logger
    // The response has to be returned from here, otherwise the caller
    // receives no content at all.
    return record_data(e);
  } catch (error) { // if error return this
    Logger.log(error);
    // Error objects serialise to "{}", so send the message text instead.
    var message = error && error.message ? error.message : String(error);
    return ContentService
      .createTextOutput(JSON.stringify({"result": "error", "error": message}))
      .setMimeType(ContentService.MimeType.JSON);
  }
}
/**
 * record_data inserts the data received from the html form submission
 * into the spreadsheet named by the 'Spreadsheet URL' field, unless the
 * submitted 'Email Address' is already present in the e-mail column.
 *
 * @param {Object} e - The data received from the POST; every field in
 *   `e.parameters` is an array of values.
 * @return {Object} JSON text output whose `result` is "success",
 *   "already exists" or "error".
 */
function record_data(e) {
  // Builds the JSON response sent back to the caller.
  function respond(payload) {
    return ContentService
      .createTextOutput(JSON.stringify(payload))
      .setMimeType(ContentService.MimeType.JSON);
  }

  try {
    Logger.log(JSON.stringify(e)); // log the POST data in case we need to debug it
    var url = e.parameters['Spreadsheet URL'][0];
    var email = (e.parameters['Email Address'] || [])[0];
    if (!email) {
      Logger.log('No email address submitted. Exiting script.');
      return respond({"result": "error", "error": "Missing 'Email Address'"});
    }

    var doc = SpreadsheetApp.openByUrl(url);
    Logger.log(doc);
    var sheet = doc.getActiveSheet();
    var column = 3; // Index for 'Email Address' column
    var lastRow = sheet.getLastRow();
    // Row 1 is the header, so data occupies rows 2..lastRow. A range must
    // contain at least one row, hence the guard for header-only sheets.
    var columnValues = lastRow > 1
      ? sheet.getRange(2, column, lastRow - 1).getValues()
      : [];
    var alreadyExists = columnValues.some(function(cells) {
      return String(cells[0]) === String(email);
    });
    if (alreadyExists) {
      Logger.log('Email already exists. Exiting script.');
      return respond({"result": "already exists",
                      "data": JSON.stringify(e.parameters)});
    }

    var header = sheet.getRange(1, 1, 1, sheet.getLastColumn()).getValues()[0];
    var row = [];
    // loop through the header columns so each value lands under its heading
    for (var i = 0; i < header.length; i++) {
      row.push(getFieldFromData(header[i], e.parameters));
    }
    // more efficient to set values as [][] array than individually
    var nextRow = lastRow + 1; // get next row
    sheet.getRange(nextRow, 1, 1, row.length).setValues([row]);

    return respond({"result": "success",
                    "data": JSON.stringify(e.parameters)});
  } catch (error) {
    // Report the failure instead of claiming success.
    Logger.log(error);
    var message = error && error.message ? error.message : String(error);
    return respond({"result": "error", "error": message});
  }
}
/**
 * Lists the field names present in the submitted form data.
 *
 * @param {Object} data - The form parameters, keyed by field name.
 * @return {string[]} The object's own enumerable keys that are strings.
 */
function getDataColumns(data) {
  var names = Object.keys(data);
  var columns = [];
  for (var i = 0; i < names.length; i++) {
    if (typeof names[i] === 'string') {
      columns.push(names[i]);
    }
  }
  return columns;
}
/**
 * Reads one field from the form data as a single cell value.
 *
 * @param {string} field - Name of the field (a spreadsheet column heading).
 * @param {Object} data - The form parameters, keyed by field name.
 * @return {*} The values joined with ', ' when the entry has a `join` method,
 *   the entry itself otherwise, or '' when the entry is missing or falsy.
 */
function getFieldFromData(field, data) {
  var entry = data[field];
  if (!entry) {
    return '';
  }
  if (entry.join) {
    return entry.join(', ');
  }
  return entry;
}
/**
 * Extends Array.prototype.findIndex with a search-by-value mode.
 *
 * - Called with a function, it behaves like the built-in method (when the
 *   runtime provides one), so code that passes a predicate keeps working.
 * - Called with any other value, it returns the index of the first element
 *   loosely equal (==) to `search`, or -1 when there is none. Loose equality
 *   is deliberate: it lets a one-cell row such as ['a@b.c'] match the string
 *   'a@b.c'.
 * - Legacy quirk kept for existing callers: a `search` loosely equal to ""
 *   returns false rather than an index.
 */
(function() {
  var nativeFindIndex = Array.prototype.findIndex;
  // defineProperty keeps the method out of for...in loops over arrays.
  Object.defineProperty(Array.prototype, 'findIndex', {
    configurable: true,
    writable: true,
    enumerable: false,
    value: function(search) {
      if (typeof search === 'function' && nativeFindIndex) {
        return nativeFindIndex.apply(this, arguments);
      }
      if (search == "") return false;
      for (var i = 0; i < this.length; i++) {
        if (this[i] == search) return i;
      }
      return -1;
    }
  });
})();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment