Last active
March 15, 2018 20:10
-
-
Save pollend/fd23dba030e20eb0dc21a2c4728af4a2 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "name": "wos_scraper_nightmare", | |
| "version": "1.0.0", | |
| "main": "index.js", | |
| "license": "MIT", | |
| "dependencies": { | |
| "nightmare": "^2.10.0", | |
| "vo": "^4.0.2" | |
| } | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| const Nightmare = require('nightmare') | |
| const vo = require('vo') | |
| const fs = require('fs') | |
// Base address of the site: the search run loads it first, and every record link is appended to it.
| const URI = "https://apps.webofknowledge.com" | |
// Query text written into the site's search input before the form is submitted.
| const SEARCH = "climate*" | |
// Results page to jump to right after the search (only when greater than 0).
// The run overwrites it while paging forward, so a restarted run continues from there.
| let SKIP_PAGE = 2538 | |
| // Possible additional search terms (not currently used): OR meteorology OR earth science OR global warming | |
/**
 * Scrapes one record page into a plain object.
 *
 * This function is handed to Nightmare's `evaluate`, so it runs inside the
 * loaded page rather than in Node. It must therefore stay self-contained (no
 * references to anything declared outside its own body) and it depends on the
 * jQuery-style `$` global being available in that page.
 *
 * @returns {Object} The scraped fields. Keys belonging to sections that are
 *   missing from the page are left unset.
 */
function scrape_record() {
  const result = {};
  const authors = [];

  // Trimmed text of the <value> element(s) inside a field.
  function value_text(field) {
    return $(field).find("value").text().trim();
  }

  // Trimmed text of the field's own text nodes (nodeType 3), skipping child elements.
  function own_text(field) {
    return $(field).contents().filter(function () {
      return this.nodeType === 3;
    }).text().trim();
  }

  // Trimmed text of every link inside a field.
  function link_texts(field) {
    const texts = [];
    $(field).find("a").each(function () {
      texts.push($(this).text().trim());
    });
    return texts;
  }

  // Authors: the "By:" field is a ';'-separated list. For each entry the text
  // in parentheses becomes the name and the numbers in square brackets become
  // the keys that are later looked up in the address table.
  $(".l-content .block-record-info:eq(0)>.FR_field").each(function () {
    if ($(this).find(".FR_label").text() !== "By:") {
      return;
    }
    // substring(3) drops the leading "By:" label text. Every whitespace run
    // (newlines included) is collapsed so the single-line regexes below can
    // match an entry that was wrapped across lines in the markup.
    const entries = $(this).text().trim().substring(3).replace(/\s+/g, " ").split(";");
    entries.forEach(function (entry) {
      const keys = [];
      const key_match = entry.match(/\[(.*)\]/);
      if (key_match !== null) {
        key_match[1].split(",").forEach(function (key) {
          keys.push(parseInt(key.trim(), 10));
        });
      }
      const author = entry.match(/\((.*)\)/);
      if (author !== null) {
        authors.push({
          "name": author[1].trim(),
          "keys": keys
        });
      }
    });
  });

  // Researcher identifiers: rows are matched to authors by exact name.
  $(".l-content .block-record-info:eq(0)>#show_resc_blurb tr").each(function () {
    const row = $(this);
    const row_name = row.find("td:eq(0)").text().trim();
    authors.forEach(function (author) {
      if (author["name"] === row_name) {
        author["Researcher_ID"] = row.find("td:eq(1)").text().trim();
        author["ORCID"] = row.find("td:eq(2)").text().trim();
      }
    });
  });

  result["Authors"] = authors;
  result["Article_Title"] = $(".l-content .title>value").text().trim();

  // Titled sections: dispatch on the section heading text.
  $(".l-content .block-record-info").each(function () {
    const section = $(this);
    const title = section.find(".title3");
    if (title.length === 0) {
      return;
    }
    switch (title.text().trim()) {
      case "Abstract":
        // When several fields are present the last one wins.
        section.find(".FR_field").each(function () {
          result["abstract"] = $(this).text().trim();
        });
        break;
      case "Funding":
        result["GrantName_Concatenated"] = [];
        section.find(".FR_table_borders>tbody>.fr_data_row").each(function () {
          const grant_entry = {};
          grant_entry["number"] = [];
          grant_entry["agency"] = $(this).find("td:eq(0)").text().trim();
          $(this).find("td:eq(1)>div").each(function () {
            grant_entry["number"].push($(this).text().trim());
          });
          result["GrantName_Concatenated"].push(grant_entry);
        });
        break;
      case "Document Information":
        section.find(".FR_field").each(function () {
          switch ($(this).find(".FR_label").text()) {
            case "PubMed ID:":
              result["Pub_ID"] = value_text(this);
              break;
            case "ISSN:":
              result["ISSN"] = value_text(this);
              break;
            case "eISSN:":
              result["eISSN"] = value_text(this);
              break;
          }
        });
        break;
      case "Author Information": {
        // The address table is the sibling that follows the "Addresses:" label.
        const children = section.children();
        for (let j = 0; j < children.length; j++) {
          if (children.eq(j).find(".FR_label").text().trim() !== "Addresses:") {
            continue;
          }
          const addresses = {};
          children.eq(j + 1).find("tbody>tr>.fr_address_row2>a").each(function () {
            const address_match = $(this).text().match(/\[([0-9 ]+)\](.*)/);
            // Links without a leading "[n]" marker cannot be keyed; skip them
            // instead of failing on a null match.
            if (address_match !== null) {
              addresses[parseInt(address_match[1], 10)] = address_match[2];
            }
          });
          result["location"] = addresses;
        }
        break;
      }
      case "Categories / Classification":
        section.find(".FR_field").each(function () {
          switch ($(this).find(".FR_label").text()) {
            case "Research Areas:":
              result["Research_Area"] = own_text(this);
              break;
            case "Web of Science Categories:":
              result["Wos_Category"] = own_text(this);
              break;
          }
        });
        break;
      case "Keywords":
        section.find(".FR_field").each(function () {
          switch ($(this).find(".FR_label").text()) {
            case "Author Keywords:":
              result["Author_Keywords"] = link_texts(this);
              break;
            case "KeyWords Plus:":
              result["KeyWords_Plus"] = link_texts(this);
              break;
          }
        });
        break;
    }
  });

  // Source (journal) block.
  result["Journal_Title"] = $(".l-content .block-record-info.block-record-info-source>.sourceTitle").text().trim();
  $(".l-content .block-record-info.block-record-info-source .FR_field").each(function () {
    switch ($(this).find(".FR_label").text()) {
      case "Published:": {
        // The last token is always taken as the year. A month is recorded only
        // when at least two tokens exist and a day only when at least three,
        // so a year-only value no longer yields Month = year and Day = NaN.
        const published = $(this).find("value").text().trim();
        if (published !== "") {
          const parts = published.split(/\s+/);
          if (parts.length >= 2) {
            result["Month"] = parts[0];
          }
          if (parts.length >= 3) {
            result["Day"] = parseInt(parts[1], 10);
          }
          result["Year"] = parseInt(parts[parts.length - 1], 10);
        }
        break;
      }
      case "DOI:":
        result["DOI"] = value_text(this);
        break;
    }
  });
  $(".l-content .block-record-info.block-record-info-source .block-record-info-source-values .FR_field").each(function () {
    switch ($(this).find(".FR_label").text()) {
      case "Volume:":
        result["Volume"] = value_text(this);
        break;
      case "Pages:":
        result["Pages"] = value_text(this);
        break;
    }
  });

  // Record key: PubMed ID when present, otherwise ISSN, otherwise eISSN.
  result["key"] = ("Pub_ID" in result) ? result["Pub_ID"] : (("ISSN" in result) ? result["ISSN"] : result["eISSN"]);

  // Resolve each author's address keys. A record may have no address table,
  // and a key may have no matching row; those are skipped so that one
  // incomplete record cannot fail the whole batch.
  const locations = result["location"] || {};
  result["Authors"].forEach(function (author) {
    author["affiliation"] = [];
    author["keys"].forEach(function (key) {
      const address = locations[key];
      if (typeof address === "string") {
        author["affiliation"].push(address.trim());
      }
    });
  });

  return result;
}

/**
 * Scrapes every record link of one results page.
 *
 * Each link is opened in its own Nightmare instance, all of them in parallel,
 * and each instance is ended once its page has been scraped. Written as a
 * generator so it can be driven through `vo`.
 *
 * @param {string[]} links - Record hrefs; each one is appended to `URI`.
 * @returns {Promise<Object[]>} The scraped records, in the same order as
 *   `links`. Rejects as soon as any single page fails.
 */
const query_page = function* (links) {
  return Promise.all(links.map((link) => {
    return Nightmare().useragent('chrome')
      .goto(URI + link)
      .evaluate(scrape_record)
      .end();
  }));
};
// Browser session shared by the whole search run (visible window, 'chrome' user agent).
const nightmare = Nightmare({ show: true }).useragent('chrome');

/**
 * Generator (driven through `vo`) that performs the search and walks the
 * result pages one after another.
 *
 * For every results page it collects the record links, scrapes them with
 * `query_page`, appends one JSON document per line to "wos_result.json" and
 * then follows the "next" pagination link. `SKIP_PAGE` is updated on every
 * page, so a run restarted after a failure jumps back to where it stopped.
 *
 * @returns {string} "finished_search" once no "next" link is left.
 * @throws {Error} When the "next" link carries no page number. Navigation or
 *   scraping failures surface through the yielded Nightmare/vo calls.
 */
const run = function* () {
  // Fill in and submit the search form, then wait for the results table.
  yield nightmare.goto(URI)
    .evaluate(function (search) {
      $("#value\\(input1\\)").val(search);
      $(".searchInputForm").submit();
      return true;
    }, SEARCH)
    .wait("#summaryRecordsTable").url();

  if (SKIP_PAGE > 0) {
    // Jump straight to the requested page. Yielded explicitly so the jump is
    // executed (and can fail) here rather than riding along with the next
    // queued action.
    yield nightmare.evaluate(function (page) {
      $("#summary_navigation .goToPageNumber-input").attr("value", page);
      $("#summary_navigation").submit();
    }, SKIP_PAGE)
      .wait("#summaryRecordsTable");
  }

  while (true) {
    const hrefs = yield nightmare
      .evaluate(function () {
        var items = [];
        var elements = document.querySelectorAll("#summaryRecordsTable .search-results>div");
        for (var i = 0; i < elements.length; i++) {
          items.push($(elements[i]).find(".search-results-content .smallV110").first().attr('href'));
        }
        return items;
      });
    // Result rows without a record link yield no href; drop them so they are
    // not turned into bogus URLs further down.
    const pages = hrefs.filter((href) => typeof href === 'string' && href !== '');

    console.log("process pages");
    const records = yield vo(query_page)(pages);
    console.log("finished processing pages");

    if (records.length > 0) {
      // A single synchronous append keeps the records in page order and makes
      // sure they are on disk before "finished saving" is reported.
      const lines = records.map((record) => JSON.stringify(record) + "\n").join("");
      try {
        fs.appendFileSync("wos_result.json", lines);
      } catch (error) {
        // Best effort, as before: report the failure and keep scraping.
        console.log(error);
      }
    }
    console.log("finished saving");

    const next_href = yield nightmare
      .evaluate(function () {
        return $(".searchPagination .paginationNext:eq(0)").attr("href");
      });
    if (!next_href) {
      // NOTE(review): a missing "next" link is taken to mean the last page
      // was reached; confirm against the site's markup for the final page.
      console.log("no next page");
      break;
    }
    console.log("next page:" + next_href);

    const page_match = next_href.match(/page=([0-9]+)/);
    if (page_match === null) {
      throw new Error("can't find page number in next href: " + next_href);
    }
    SKIP_PAGE = parseInt(page_match[1], 10) - 1;
    console.log("skip page:" + SKIP_PAGE);

    yield nightmare
      .goto(next_href)
      .wait("#summaryRecordsTable");
  }
  return "finished_search";
};
/**
 * Starts the search run and restarts it after a failure.
 *
 * A failed run is logged and started again while retries remain. The result
 * is logged only for a run that succeeded. Once the last run has finished,
 * successfully or not, the shared browser session is closed so its process
 * does not outlive the script.
 *
 * @param {number} retries_left - How many more times a failed run may be restarted.
 */
function start_scrape(retries_left) {
  vo(run)(function (err, result) {
    if (err) {
      console.error('an error occurred: ' + err);
      if (retries_left > 0) {
        start_scrape(retries_left - 1);
        return;
      }
    } else {
      console.log(result);
    }
    // `end()` only queues the shutdown; `then` is what actually executes it.
    nightmare.end().then(function () {}, function (end_err) {
      console.error('failed to close the browser: ' + end_err);
    });
  });
}

// One retry, matching the single nested re-run this replaces.
start_scrape(1);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment