-
-
Save peterherrmann/5691884 to your computer and use it in GitHub Desktop.
//load configuration details and start logging - creates and sets up sheets the first time they are run | |
var CONFIG_SPREADSHEET_KEY = '<ssid_goes_here>'; | |
var Config = SettingsManager.load(CONFIG_SPREADSHEET_KEY); //Add Mafviu9bMfg9xVu21LGfpWnHAGDwXQ1CH in Resources > Libraries | |
Logger = BetterLog.useSpreadsheet(Config['logSpreadsheetId'].value);//Add MYB7yzedMbnJaMKECt6Sm7FLDhaBgl_dE in Resources > Libraries | |
// trigger this function | |
function outerLoop() { | |
try { | |
// to calc elapsed time | |
var isOverMaxRuntime = false, | |
startTime = new Date(); | |
// Deletes all occurrences of the Repeating trigger we don't end up with undeleted time based triggers all over the place | |
ScriptApp.getProjectTriggers().forEach(function(i) { | |
if (i.getHandlerFunction()==='outerLoopRepeating') {ScriptApp.deleteTrigger(i);} | |
}); | |
//Logger.finer('Entering the "%s" function', arguments.callee.name); | |
// Handle max execution times in our outer loop | |
// Get start index if we hit max execution time last run | |
var start = parseInt(PropertiesService.getScriptProperties().getProperty(arguments.callee.name + "-start")) || 0; | |
var thingies = ['stuff to process', 'in an Array',,,,]; // | |
for (var i = start ; i < thingies.length; i++) { | |
if (Math.round((new Date() - startTime)/1000) > 300) { //360 seconds is Google Apps Script max run time | |
//We've hit max runtime. | |
isOverMaxRuntime = true; | |
break; | |
} | |
//do our work here | |
Logger.finest('Inside the for loop that does the xyz work. i is currently: %d', i); | |
var processingMessage = Utilities.formatString('%d of %d thingies: %s <%s>', i+1, thingies.length, thingyName, thingyId); | |
//do our work above here | |
} | |
if (isOverMaxRuntime) { | |
//save state in user/project prop if required | |
PropertiesService.getScriptProperties().setProperty(arguments.callee.name + '-start', i); | |
//create another trigger | |
ScriptApp.newTrigger('outerLoopRepeating').timeBased().everyMinutes(10).create(); | |
Logger.info('Hit max run time - last iteration completed was i=%d', i-1); | |
} else { | |
Logger.fine('Done all the work and all iterations'); | |
PropertiesService.getScriptProperties().deleteProperty(arguments.callee.name + '-start'); | |
Logger.info('Completed processing all %d things with the "%s" function', thingies.length, arguments.callee.name); | |
} | |
} catch (e) { | |
Logger.severe('%s. While processing %s', JSON.stringify(e, null, 2), processingMessage); | |
throw e; | |
} | |
} | |
//automatically invoked from outerLoop()'s creation of a new trigger if required to get work done | |
function outerLoopRepeating() { | |
outerLoop(); | |
} |
Job - Did you ever hear back on that question?
Hello Peter,
How we add random waits between requests.
If you can set it to run async so every line will get a randomized time when it should be executed will be even better
//Add menu buttons
function onOpen(){
var ui = SpreadsheetApp.getUi();
ui.createMenu("Update list")
.addItem("Update", "updateList")
.addItem("Sort responses", "sortResponses")
.addToUi();
}
//Sort rows in response sheets by ASIN order
function sortResponses(){
//Get sheets and values
var ss = SpreadsheetApp.getActiveSpreadsheet();
var requestSheet = ss.getSheetByName("Request");
var responseSheet = ss.getSheetByName("Response");
var asinValues = requestSheet.getRange("A2:A").getDisplayValues();
var asinOrder = [];
//Consolidate asin values and order
for(var i=0;i<asinValues.length;i++){
if(asinValues[i][0]!=""){
asinOrder.push(asinValues[i][0]);
}
}
var responseRange = responseSheet.getRange("A2:F");
var responseValues = responseRange.getValues();
//Sort response values by asin order
responseValues.sort(function(a, b){
if(a[0]===""){
return 1;
}else if(b[0]===""){
return -1;
}else if(a[0]===""&&b[0]===""){
return 0;
}else if(a[0]===b[0]){
return 0;
}
if(asinOrder.indexOf(a[0])<asinOrder.indexOf(b[0])){
return -1;
}else{
return 1;
}
});
//Set new response values in correct order
responseRange.setValues(responseValues);
}
//Scrap and update request sheets
function updateList(){
//Get spreadsheets and values
var ss = SpreadsheetApp.getActiveSpreadsheet();
var sheet = ss.getSheetByName("Request");
var responseSheet = ss.getSheetByName("Response");
var sheetRange = sheet.getRange("A2:D");
var sheetValues = sheetRange.getValues();
var responseRange = responseSheet.getRange("A2:F");
var responseValues = responseRange.getValues();
var reviewers = [];
var newResponses = [];
//Get a list of reviewers and existing data
for(var i=0;i<responseValues.length;i++){
if(responseValues[i][2]!=""){
newResponses.push(responseValues[i]);
reviewers.push(responseValues[i][2]);
}
}
//Check if rerun is necessary
var failed = false;
//Loop through URLs
for(var i=0;i<sheetValues.length;i++){
//Check if URL is valid and whether row already has data
if((sheetValues[i][1]+"").indexOf("http")>-1&&sheetValues[i][2]===""){
try{
//Random wait between 0 and 10 seconds
// var min = 0;
// var max = 10000;
// var rand = Math.random() * (+max - +min) + +min;
// Utilities.sleep(Math.round(rand));
//Utilities.sleep((Math.pow(2,n)*1000) + (Math.round(Math.random() * 1000)));
//Utilities.sleep(Math.random() * 1000);
//Fetch URL page data
var response = UrlFetchApp.fetch(sheetValues[i][1]).getContentText();
//Get stars and number of reviews info
var asin = sheetValues[i][0];
var stars = response.split('<span id="acrPopover" class="reviewCountTextLinkedHistogram noUnderline" title="')[1].split(" ")[0];
var numReviews = response.split('<span id="acrCustomerReviewText" class="a-size-base">')[1].split(" ")[0];
//Set rating and reviews for sheet
sheetValues[i][2] = stars;
sheetValues[i][3] = numReviews;
//Get list of reviews on first page
var reviews = response.split('<span class="a-profile-name">');
//Loop through reviews
for(var i=1;i<reviews.length;i++){
//Get reviewer name
var reviewer = reviews[i].split("</span>")[0];
//Check if row has been scraped before
if(reviewers.indexOf(reviewer)==-1){
reviewers.push(reviewer);
}else{
continue;
}
//Get current review info
var reviewRating = reviews[i].split('<span class="a-icon-alt">')[1].split('</span>')[0];
var reviewTitle = reviews[i].split('<span class="">')[1].split('</span>')[0];
var reviewDate = reviews[i].split('class="a-size-base a-color-secondary review-date">')[1].split('</span>')[0];
var reviewBody = reviews[i].split('class="a-expander-content reviewText review-text-content a-expander-partial-collapse-content"><span class="">')[1].split('</span>')[0];
//Clean up review text html
reviewBody = reviewBody.replace(/<br\s*\/?>/mg,"\n");
if(reviewBody.indexOf(" ")>-1){
reviewBody = reviewBody.split(" ")[1];
}
//Push newly scrapped review row data
newResponses.push([asin, reviewRating, reviewer, reviewDate, reviewTitle, reviewBody]);
}
//Update spreadsheet with new row data
sheetRange.setValues(sheetValues);
if(newResponses.length>0){
responseSheet.getRange(2, 1, newResponses.length, newResponses[0].length).setValues(newResponses);
}
}catch(e){
//Check if failed to reach server for current URL
var test = e;
failed = true;
//Utilities.sleep(1000);
var n = 0;
if (n == 5) {
throw e;
}
Utilities.sleep((Math.pow(2,n)*1000) + (Math.round(Math.random() * 1000)));
continue;
}
}
}
//Set last review data in google sheet
sheetRange.setValues(sheetValues);
responseRange.clearContent();
if(newResponses.length>0){
responseSheet.getRange(2, 1, newResponses.length, newResponses[0].length).setValues(newResponses);
}
//Pause for 10s before re-running script if there are unsuccessful rows
if(failed){
SpreadsheetApp.flush();
Utilities.sleep(10000);
updateList();
}
}
Hello Peter,
First, congrats. You did a good work to deal with time limits to run long and serial work that depends on previous data.
Sorry to ask, but whats the reason of SettingsManager ? Why not just use direct CONFIG_SPREADSHEET_KEY . ?
Thanks
Job Diogenes