Last active
March 21, 2017 03:43
-
-
Save C-Rodg/4c6e739b27bcaca0c1c82779334be302 to your computer and use it in GitHub Desktop.
A quick script that, once you have navigated to a Google search results page, downloads the scraped content to a CSV file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Minimal CSV serializer that can also trigger a browser download of the
 * generated file.
 *
 * @param {string} [del=','] - Column delimiter placed between fields.
 * @param {string} [enc='"'] - Enclosure character wrapped around text fields;
 *   occurrences inside a field are escaped by doubling them.
 */
function CsvWriter(del, enc) {
  this.del = del || ',';
  this.enc = enc || '"';

  /**
   * Escapes a single cell value.
   *
   * Numbers and numeric-looking strings are returned untouched, empty-ish
   * values become '', and every other value is stringified and enclosed.
   *
   * @param {*} col - Raw cell value.
   * @returns {*} The value ready to be joined into a row.
   */
  this.escapeCol = (col) => {
    // A string holding the enclosure, the delimiter or a line break must be
    // enclosed no matter what: e.g. "\n" coerces to the number 0 and would
    // otherwise be emitted raw, splitting the row in two.
    const needsQuoting =
      typeof col === 'string' &&
      (col.includes(this.enc) || col.includes(this.del) || /[\r\n]/.test(col));

    // The coercing global isNaN is deliberate here: it is what lets numeric
    // strings such as "42" pass through without an enclosure.
    if (!isNaN(col) && !needsQuoting) {
      return col;
    }
    if (!col) {
      return '';
    }

    const text = String(col);
    if (text.length === 0) {
      return text;
    }
    // Double every embedded enclosure character, then wrap the whole field.
    return this.enc + text.split(this.enc).join(this.enc + this.enc) + this.enc;
  };

  /**
   * Serializes one row. The input array is not modified.
   *
   * @param {Array} arr - Cell values of the row.
   * @returns {string} Escaped cells joined with the delimiter.
   */
  this.arrayToRow = (arr) => {
    const cells = arr.slice(0);
    for (let i = 0; i < cells.length; i++) {
      cells[i] = this.escapeCol(cells[i]);
    }
    return cells.join(this.del);
  };

  /**
   * Serializes a table, prepending the fixed header row. The input array is
   * not modified.
   *
   * @param {Array<Array>} arr - Data rows.
   * @returns {string} CSV text with rows separated by CRLF.
   */
  this.arrayToCSVString = (arr) => {
    const rows = arr.slice(0);
    rows.unshift(["TITLE", "DESCRIPTION", "LINK", "DOMAIN"]); // COLUMN TITLES
    return rows.map((row) => this.arrayToRow(row)).join("\r\n");
  };

  /**
   * Builds the CSV and starts a download through a temporary <a download>
   * element.
   *
   * @param {Array<Array>} arr - Data rows.
   * @param {string} [filename] - Name of the downloaded file. Defaults to the
   *   global `searchTerm` followed by '.csv'; the global is only read when no
   *   file name is passed.
   */
  this.downloadCSV = (arr, filename = searchTerm + '.csv') => {
    const csvContent = this.arrayToCSVString(arr);
    // encodeURIComponent (not encodeURI) so that '#', '&', '+', ... in the
    // scraped text are escaped; encodeURI leaves '#' as-is, which starts a
    // URL fragment and truncates the downloaded file at that character.
    const href = "data:text/csv;charset=utf-8," + encodeURIComponent(csvContent);

    const link = document.createElement('a');
    link.setAttribute('href', href);
    link.setAttribute('download', filename);
    document.body.appendChild(link);
    link.click();
    // The anchor only exists to trigger the download; do not leave it behind.
    document.body.removeChild(link);
  };
}
// Scrape the search results on the current page and download them as a CSV
// file with one row per result: title, description, link text, short domain.

// The title attribute is locale-dependent, so the first selector can miss.
// NOTE(review): the fallback assumes the query box is named "q" — confirm
// against the page markup you are targeting.
var inp = document.querySelector('input[title="Search"]') || document.querySelector('[name="q"]'),
  // Used as the download file name; falls back to a fixed name instead of
  // throwing when the search box is missing or empty.
  searchTerm = (inp && inp.value) || 'search-results',
  sheetArray = [],
  boxes = document.querySelectorAll('div.g');

Array.prototype.forEach.call(boxes, (box) => {
  const title = box.querySelector('h3.r>a');
  const site = box.querySelector('cite');
  const para = box.querySelector('span.st');

  // Any of the three elements may be absent from a result box; export an
  // empty cell in that case.
  const titleText = title ? title.textContent : "";
  const siteText = site ? site.textContent : "";
  const paraText = para ? para.textContent : "";

  // Capture group 1 is the host: optional scheme, optional "user@" part and
  // optional "www." are skipped, and matching stops at ':' , '/' or newline.
  const domainEx = /^(?:https?:\/\/)?(?:[^@\/\n]+@)?(?:www\.)?([^:\/\n]+)/i;
  const domainMatches = siteText.match(domainEx);
  const domainShort = domainMatches ? domainMatches[1] : "";

  sheetArray.push([titleText, paraText, siteText, domainShort]);
});

let csv = new CsvWriter();
csv.downloadCSV(sheetArray);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment