Skip to content

Instantly share code, notes, and snippets.

@pollend
Last active March 15, 2018 20:10
Show Gist options
  • Select an option

  • Save pollend/fd23dba030e20eb0dc21a2c4728af4a2 to your computer and use it in GitHub Desktop.

Select an option

Save pollend/fd23dba030e20eb0dc21a2c4728af4a2 to your computer and use it in GitHub Desktop.
{
"name": "wos_scraper_nightmare",
"version": "1.0.0",
"main": "index.js",
"license": "MIT",
"dependencies": {
"nightmare": "^2.10.0",
"vo": "^4.0.2"
}
}
const Nightmare = require('nightmare')
const vo = require('vo')
const fs = require('fs')
// Base URL of the site; opened directly for the search and prefixed to every record link.
const URI = "https://apps.webofknowledge.com"
// Query string that is typed into the search form.
const SEARCH = "climate*"
// Results page to jump to right after the search (no jump when 0).
// Reassigned while paging, so a later call of run() starts from the updated value.
let SKIP_PAGE = 2538
// NOTE(review): presumably further query terms that were meant to extend SEARCH
// but are not used anywhere: OR meteorology OR earth science OR global warming
/**
 * Scrape the full-record page behind every link of one search-results page.
 *
 * A separate Nightmare instance is created per link: the record page is
 * loaded, its fields are read out of the DOM with the page's own jQuery (`$`)
 * and the instance is ended again. All links are processed concurrently
 * through `Promise.all`.
 *
 * Declared as a generator so that it can be invoked through `vo(query_page)`.
 *
 * @param {string[]} links Record hrefs; each one is appended to `URI`.
 * @returns {Promise<Object[]>} One result object per link, in input order.
 */
const query_page = function* (links) {
  return Promise.all(links.map((link) => {
    return Nightmare().useragent('chrome')
      .goto(URI + link)
      // The callback below is handed to Nightmare's `evaluate`, i.e. it is meant
      // to run inside the loaded page. It therefore only uses `$` and the DOM and
      // deliberately sticks to ES5 syntax and to no variables of this file.
      .evaluate(function () {
        var result = {};
        var authors = [];

        // "By:" field: entries look like "Short, N (Full Name)[ 1,2 ]" separated by ";".
        $(".l-content .block-record-info:eq(0)>.FR_field").each(function () {
          if ($(this).find(".FR_label").text() === "By:") {
            $(this).text().trim().substring(3).replace(/(\n | \s)+/g, " ").split(";").forEach(function (e) {
              // Bracketed numbers are the address keys of this author.
              var keys = [];
              var key_match = e.match(/\[(.*)\]/);
              if (key_match !== null) {
                key_match[1].split(',').forEach(function (key) {
                  keys.push(parseInt(key.trim(), 10));
                });
              }
              // The parenthesised part is the full author name; entries without it are skipped.
              var author = e.match(/\((.*)\)/);
              if (author !== null) {
                authors.push({
                  "name": author[1].trim(),
                  "keys": keys
                });
              }
            });
          }
        });

        // Researcher-ID table: attach the identifiers to the author with the same name.
        $(".l-content .block-record-info:eq(0)>#show_resc_blurb tr").each(function () {
          var _this = this;
          authors.forEach(function (t) {
            if (t["name"] === $(_this).find("td:eq(0)").text().trim()) {
              t["Researcher_ID"] = $(_this).find("td:eq(1)").text().trim();
              t["ORCID"] = $(_this).find("td:eq(2)").text().trim();
            }
          });
        });

        result["Authors"] = authors;
        result["Article_Title"] = $(".l-content .title>value").text().trim();

        // Every titled info block is dispatched on its heading text.
        $(".l-content .block-record-info").each(function () {
          var title = $(this).find(".title3");
          if (title.length === 0)
            return;
          switch (title.text().trim()) {
            case "Abstract":
              // With several fields the last one wins.
              $(this).find(".FR_field").each(function () {
                result["abstract"] = $(this).text().trim();
              });
              break;
            case "Funding":
              result["GrantName_Concatenated"] = [];
              $(this).find(".FR_table_borders>tbody>.fr_data_row").each(function () {
                var grant_entry = {};
                grant_entry["number"] = [];
                grant_entry["agency"] = $(this).find("td:eq(0)").text().trim();
                $(this).find("td:eq(1)>div").each(function () {
                  grant_entry["number"].push($(this).text().trim());
                });
                result["GrantName_Concatenated"].push(grant_entry);
              });
              break;
            case "Document Information":
              $(this).find(".FR_field").each(function () {
                switch ($(this).find(".FR_label").text()) {
                  case "PubMed ID:":
                    result["Pub_ID"] = $(this).find("value").text().trim();
                    break;
                  case "ISSN:":
                    result["ISSN"] = $(this).find("value").text().trim();
                    break;
                  case "eISSN:":
                    result["eISSN"] = $(this).find("value").text().trim();
                    break;
                }
              });
              break;
            case "Author Information":
              // The address table is the sibling that follows the "Addresses:" label.
              var e = $(this).children();
              for (var j = 0; j < e.length; j++) {
                if (e.eq(j).find(".FR_label").text().trim() === "Addresses:") {
                  var addresses = {};
                  e.eq(j + 1).find("tbody>tr>.fr_address_row2>a").each(function () {
                    var location = $(this).text().match(/\[([0-9 ]+)\](.*)/);
                    // Rows that do not follow the "[ n ] address" pattern are ignored
                    // instead of aborting the whole record.
                    if (location !== null) {
                      addresses[parseInt(location[1], 10)] = location[2];
                    }
                  });
                  result["location"] = addresses;
                }
              }
              break;
            case "Categories / Classification":
              $(this).find(".FR_field").each(function () {
                switch ($(this).find(".FR_label").text()) {
                  case "Research Areas:":
                    // nodeType 3 = text node: keep only the bare text of the field.
                    result["Research_Area"] = $(this).contents().filter(function () {
                      return this.nodeType === 3;
                    }).text().trim();
                    break;
                  case "Web of Science Categories:":
                    result["Wos_Category"] = $(this).contents().filter(function () {
                      return this.nodeType === 3;
                    }).text().trim();
                    break;
                }
              });
              break;
            case "Keywords":
              $(this).find(".FR_field").each(function () {
                switch ($(this).find(".FR_label").text()) {
                  case "Author Keywords:":
                    result["Author_Keywords"] = [];
                    $(this).find("a").each(function () {
                      result["Author_Keywords"].push($(this).text().trim());
                    });
                    break;
                  case "KeyWords Plus:":
                    result["KeyWords_Plus"] = [];
                    $(this).find("a").each(function () {
                      result["KeyWords_Plus"].push($(this).text().trim());
                    });
                    break;
                }
              });
              break;
          }
        });

        // Source (journal) block.
        result["Journal_Title"] = $(".l-content .block-record-info.block-record-info-source>.sourceTitle").text().trim();
        $(".l-content .block-record-info.block-record-info-source .FR_field").each(function () {
          switch ($(this).find(".FR_label").text()) {
            case "Published:":
              // Two parts are read as "<month> <year>", anything else as "<month> <day> ... <year>".
              var date = $(this).find("value").text().split(" ");
              if (date.length === 2) {
                result["Month"] = date[0];
                result["Year"] = parseInt(date[date.length - 1], 10);
              } else {
                result["Month"] = date[0];
                result["Day"] = parseInt(date[1], 10);
                result["Year"] = parseInt(date[date.length - 1], 10);
              }
              break;
            case "DOI:":
              result["DOI"] = $(this).find("value").text().trim();
              break;
          }
        });
        $(".l-content .block-record-info.block-record-info-source .block-record-info-source-values .FR_field").each(function () {
          switch ($(this).find(".FR_label").text()) {
            case "Volume:":
              result["Volume"] = $(this).find("value").text().trim();
              break;
            case "Pages:":
              result["Pages"] = $(this).find("value").text().trim();
              break;
          }
        });

        // Record key: PubMed id if present, otherwise ISSN, otherwise eISSN (may be undefined).
        result["key"] = ("Pub_ID" in result) ? result["Pub_ID"] : (("ISSN" in result) ? result["ISSN"] : result["eISSN"]);

        // Resolve the address keys of every author. Records without an address
        // section, or keys without a matching address, yield no affiliation
        // instead of throwing.
        var locations = result["location"] || {};
        result["Authors"].forEach(function (author) {
          author["affiliation"] = [];
          author["keys"].forEach(function (key) {
            var address = locations[key];
            if (typeof address === "string") {
              author["affiliation"].push(address.trim());
            }
          });
        });
        return result;
      })
      .end();
  }));
};
// Shared browser instance (created with `show: true`) that run() uses for the
// search form and for walking the result pages.
const nightmare = Nightmare({ show: true }).useragent('chrome')
/**
 * Drive the scrape with the shared `nightmare` instance.
 *
 * Steps: open `URI`, submit the search for `SEARCH`, jump to page `SKIP_PAGE`
 * when it is greater than 0, then loop over the result pages: collect the
 * record links, scrape them through `query_page`, append one JSON line per
 * record to "wos_result.json" and follow the "next" pagination link.
 *
 * Before each navigation `SKIP_PAGE` is set to the page number found in the
 * next link minus one, so a later call of `run` jumps to that page.
 *
 * Meant to be executed through `vo(run)`.
 *
 * @returns {string} "finished_search" (currently never reached, the loop only
 *   ends by throwing).
 * @throws {Error} When the next link is missing or carries no `page=<n>` part.
 */
var run = function*() {
  yield nightmare.goto(URI)
    .evaluate(function (search) {
      $("#value\\(input1\\)").val(search);
      $(".searchInputForm").submit();
      return true;
    }, SEARCH)
    .wait("#summaryRecordsTable").url()

  if (SKIP_PAGE > 0) {
    // Yield the jump explicitly so it is executed (and can fail) at this step
    // rather than riding along with the next yielded chain.
    yield nightmare.evaluate(function (page) {
      $("#summary_navigation .goToPageNumber-input").attr("value", page)
      $("#summary_navigation").submit()
    }, SKIP_PAGE)
      .wait("#summaryRecordsTable")
  }

  while (true) {
    let pages = yield nightmare
      .evaluate(function () {
        var items = []
        var elements = document.querySelectorAll("#summaryRecordsTable .search-results>div")
        for (var i = 0; i < elements.length; i++) {
          var href = $(elements[i]).find(".search-results-content .smallV110").first().attr('href')
          // Rows without a record link would otherwise be opened as URI + "undefined"/"null".
          if (href) {
            items.push(href)
          }
        }
        return items
      })

    console.log("process pages")
    let final = yield vo(query_page)(pages)
    console.log("finished processing pages")

    // One synchronous append per page: keeps the records in order and makes
    // sure they are on disk before "finished saving" is logged.
    const lines = final.map((record) => JSON.stringify(record) + "\n")
    if (lines.length > 0) {
      try {
        fs.appendFileSync("wos_result.json", lines.join(""))
      } catch (error) {
        // Best effort, as before: report the failure and keep scraping.
        console.log(error)
      }
    }
    console.log("finished saving")

    let next_href = yield nightmare
      .evaluate(function () {
        return $(".searchPagination .paginationNext:eq(0)").attr("href")
      })
    // `.attr()` gives undefined for a missing element, so test for "not a
    // non-empty string" instead of only '' and null.
    if (typeof next_href !== "string" || next_href === '')
      throw new Error("can't find next href");
    console.log("next page:" + next_href);

    const page_match = next_href.match(/page=([0-9]+)/)
    if (page_match === null)
      throw new Error("can't find page number in next href: " + next_href);
    SKIP_PAGE = parseInt(page_match[1], 10) - 1
    console.log("skip page:" + SKIP_PAGE)

    yield nightmare
      .goto(next_href)
      .wait("#summaryRecordsTable")
  }
  // Not reached: the loop above only exits by throwing.
  return "finished_search"
}
// Start the scrape. If the first attempt fails, the error is logged and run is
// started exactly once more; run reads SKIP_PAGE, which it keeps updating, so
// the second attempt jumps to the last recorded page. A result is only logged
// for an attempt that did not fail (no stray "undefined" after an error).
// NOTE(review): the shared nightmare instance is never ended here — confirm
// whether the window is meant to stay open after the final failure.
vo(run)(function (err, result) {
  if (!err) {
    console.log(result);
    return;
  }
  console.error('an error occurred: ' + err);
  vo(run)(function (retryErr, retryResult) {
    if (retryErr) {
      console.error('an error occurred: ' + retryErr);
      return;
    }
    console.log(retryResult);
  });
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment