Last active
January 17, 2024 10:46
-
-
Save Rmanaf/18e0e4730c68dd46b2562851357384d2 to your computer and use it in GitHub Desktop.
This JavaScript code is designed to scrape data from a Google Scholar profile page and calculate various research indices for a researcher.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* This JavaScript code is designed to scrape data from a Google Scholar profile page and calculate | |
* various research indices for a researcher. It extracts information such as publication years, | |
* citations, and the h-index, and then computes several research metrics, including the R-index, | |
* A-index, G-index, M-index, E-index, AR-index, and HG-index. The code also provides statistics | |
* about the researcher's publications, total citations, h-core citations, and more. | |
* | |
* | |
* ** Usage: ** | |
* | |
* 1. First, locate the Google Scholar profile page of the researcher you want to analyze. | |
* 2. Ensure that you load and expand the list of articles all the way down on the profile page. | |
* 3. Copy the code provided here. | |
* 4. Open your browser's developer console: | |
* - On Windows (Google Chrome), you can press F12 or Ctrl + Shift + I to access the console. | |
* - On macOS (Google Chrome), you can press Cmd + Option + I to access the console. | |
* 5. Paste the code into the console, and press Enter to run it. | |
* | |
* | |
* ** Considerations: ** | |
* | |
* 1. To ensure accurate results, make sure you have expanded the list of articles on the profile page | |
* to access all the researcher's publications. The code relies on having access to all articles to | |
* work correctly. | |
* 2. The code may not consider articles with undeclared years or citations in the calculations. | |
* 3. In some cases, the code may encounter profiles with h-index values that are less than the | |
* actual h-index. This could be due to variations in Google Scholar data presentation. | |
* | |
* | |
* ** References: ** | |
* | |
* 1. [The R- and AR-indices: Complementing the h-index](https://documentserver.uhasselt.be/bitstream/1942/1787/1/complementing%201.pdf) | |
* 2. [Scientometrics: An overview of concepts, applications and indicators](https://irje.tums.ac.ir/article-1-5292-fa.pdf) | |
* | |
* | |
* For questions, bug reports, suggestions for improvement, or any other inquiries, please feel free | |
* to contact me at <[email protected]> or reach out to the code gist at <https://gist.github.com/Rmanaf/18e0e4730c68dd46b2562851357384d2> and leave a comment. | |
* | |
* Happy Analyzing! | |
* | |
*/ | |
/**
 * Reads the text of a DOM node as a base-10 integer.
 *
 * @param {?Element} node - The element to read; null when the selector matched nothing.
 * @returns {number} - The parsed integer, or NaN when the node is missing or its text is not numeric.
 */
function read_int(node) {
  // A row without the expected child element is treated like a row with an empty cell.
  return node ? Number.parseInt(node.innerText, 10) : NaN;
}

// Retrieve the table containing the publications from the Google Scholar profile page
const pubs_table = document.getElementById('gsc_a_t');

// Extract the numeric cells of the statistics table, in document order.
// Below, index 0 is reported as Google's total citation count and index 2 is used as the h-index.
const stats = Array.from(document.getElementById('gsc_rsb_st').querySelectorAll('.gsc_rsb_std'))
  .filter(item => is_number(item.innerText))
  .map(item => Number.parseInt(item.innerText, 10));

// Extract data for each article row: publication year and citation count.
// Either value is NaN when the corresponding cell is empty or absent.
const articles = Array.from(pubs_table.querySelectorAll('.gsc_a_tr')).map(item => ({
  year: read_int(item.querySelector('.gsc_a_y > span')),
  citations: read_int(item.querySelector('.gsc_a_c > a')),
}));

const h_index = stats[2];

// Only articles with a valid year take part in the first/last year computation.
// With no valid years, Math.min/Math.max yield Infinity/-Infinity respectively.
const years = get_years();
const since = Math.min(...years);
const last = Math.max(...years);
const this_year = new Date().getFullYear();
/**
 * Tells whether a value consists solely of decimal digits.
 *
 * The check is done on the textual form of the value (non-string arguments are
 * converted to a string by the regular-expression test), so parsed values such as
 * NaN are rejected while non-negative integers are accepted. Signs, decimal points
 * and surrounding whitespace all cause the check to fail.
 *
 * @param {string|number} value - The value to be checked.
 * @returns {boolean} - True when the value is one or more digits and nothing else.
 */
function is_number(value) {
  return /^\d+$/.test(value);
}
/**
 * Rounds a number to at most two decimal places.
 *
 * The value is formatted with two fixed decimals and parsed back, so trailing
 * zeros disappear (e.g. 2 stays 2, not 2.00).
 *
 * @param {number} value - The number to round.
 * @returns {number} - The value rounded to two decimal places.
 */
function to_float(value) {
  const fixed = value.toFixed(2);
  return parseFloat(fixed);
}
/**
 * Collects the citation counts of all scraped articles.
 *
 * Articles whose citation count does not pass 'is_number' (for example NaN from
 * an empty cell) are left out. The order of 'articles' is preserved.
 *
 * @returns {number[]} - The valid citation counts, one per article.
 */
function get_citations() {
  const valid_citations = [];
  for (const { citations } of articles) {
    if (is_number(citations)) {
      valid_citations.push(citations);
    }
  }
  return valid_citations;
}
/**
 * Collects the publication years of all scraped articles.
 *
 * Articles whose year does not pass 'is_number' (for example NaN from an empty
 * cell) are left out. The order of 'articles' is preserved.
 *
 * @returns {number[]} - The valid publication years, one per article.
 */
function get_years() {
  const valid_years = [];
  for (const { year } of articles) {
    if (is_number(year)) {
      valid_years.push(year);
    }
  }
  return valid_years;
}
/**
 * Adds up the citation counts returned by 'get_citations'.
 *
 * @returns {number} - The total number of citations (0 when there are none).
 */
function get_total_citations() {
  let total = 0;
  for (const count of get_citations()) {
    total += count;
  }
  return total;
}
/**
 * Returns the articles that form the H-core: the 'h_index' most cited articles.
 *
 * Articles without a valid citation count are ignored. The remaining articles are
 * ranked by citations (highest first) before the first 'h_index' of them are taken,
 * so the result does not depend on how the profile page happens to be sorted.
 * When several articles have the same number of citations, the more recent one is
 * ranked first (articles without a valid year are ranked last among their ties);
 * this follows the tie-breaking rule that the AR-index calculation in this file
 * attributes to its references.
 *
 * The h-index itself is taken from the profile statistics rather than recomputed,
 * because the scraped list may not agree with it (see consideration 3 in the
 * introduction of this file).
 *
 * The 'articles' array is not modified.
 *
 * @returns {Object[]} - The H-core articles, ordered by citations descending.
 */
function get_h_core() {
  // Missing years sort after every real year within a group of equally cited articles.
  const recency = (item) => (Number.isFinite(item.year) ? item.year : -Infinity);

  return articles
    .filter((item) => is_number(item.citations)) // filter() returns a new array, so sorting it is safe
    .sort((a, b) => {
      if (a.citations !== b.citations) {
        return b.citations - a.citations;
      }
      const recency_a = recency(a);
      const recency_b = recency(b);
      if (recency_a === recency_b) {
        return 0;
      }
      return recency_a > recency_b ? -1 : 1;
    })
    .slice(0, h_index);
}
/**
 * Adds up the citation counts of the articles returned by 'get_h_core'.
 *
 * @returns {number} - The total citations of the H-core articles (0 when the H-core is empty).
 */
function get_sum_h_core() {
  let sum = 0;
  for (const article of get_h_core()) {
    sum += article.citations;
  }
  return sum;
}
/**
 * Computes the M-index: the h-index divided by the length of the publication span.
 *
 * The span is the difference between the last and the first publication year found
 * on the page, with a lower bound of 1 so the division is always defined.
 *
 * @returns {number} - The M-index value.
 */
function get_m_index() {
  const span_in_years = Math.max(1, last - since);
  return h_index / span_in_years;
}
/**
 * Calculates and returns the G-index value.
 *
 * The G-index is the largest rank g such that the g most cited articles together
 * have at least g * g citations. Citations are ranked from highest to lowest here,
 * so the result does not depend on how the profile page is sorted. The value can
 * never exceed the number of articles with a valid citation count.
 *
 * @returns {number} - The G-index value (0 when there are no cited articles).
 */
function get_g_index() {
  // Work on a sorted copy so the array returned by 'get_citations' is left untouched.
  const citations = [...get_citations()].sort((a, b) => b - a);

  let g_index = 0;
  let total = 0;

  for (let rank = 1; rank <= citations.length; rank++) {
    // 'total' is the cumulative citation count of the top 'rank' articles.
    total += citations[rank - 1];

    // With citations in descending order the average of the top 'rank' articles can
    // only go down as 'rank' grows, so the first failing rank ends the search.
    if (total < rank * rank) {
      break;
    }

    g_index = rank;
  }

  return g_index;
}
/**
 * Computes the HG-index: the geometric mean of the G-index and the h-index.
 *
 * @returns {number} - The HG-index value.
 */
function get_hg_index() {
  const g_index = get_g_index();
  const product = g_index * h_index;
  return Math.sqrt(product);
}
/**
 * Calculates and returns the A-index value.
 *
 * The A-index is the average number of citations of the H-core articles, i.e. the
 * sum of H-core citations divided by the h-index.
 *
 * @returns {number} - The A-index value; 0 when the h-index is 0.
 */
function get_a_index() {
  // An h-index of 0 means an empty H-core; report 0 instead of dividing by zero.
  if (h_index === 0) {
    return 0;
  }
  return get_sum_h_core() / h_index;
}
/**
 * Calculates and returns the R-index value.
 *
 * The R-index is the square root of the total citations of the H-core articles.
 * This equals sqrt(A-index * h-index); it is computed from the citation sum
 * directly so that an h-index of 0 yields 0 rather than an undefined result.
 *
 * @returns {number} - The R-index value.
 */
function get_r_index() {
  return Math.sqrt(get_sum_h_core());
}
/**
 * Calculates and returns the AR-index value.
 *
 * The AR-index is the square root of the sum, over all H-core articles, of each
 * article's citations divided by its age in years. Every article returned by
 * 'get_h_core' contributes, including those with exactly h citations; deciding
 * which articles belong to the H-core is left entirely to 'get_h_core'.
 *
 * Articles without a valid publication year are skipped because their age cannot
 * be determined.
 *
 * @returns {number} - The AR-index value (0 when no H-core article has a known age).
 */
function get_ar_index() {
  let sum = 0;

  for (const item of get_h_core()) {
    if (!Number.isFinite(item.year)) {
      continue;
    }

    // An article published in the current year (or dated later) is given an
    // average age of 0.5 so that the division is defined.
    const age = Math.max(this_year - item.year, 0.5);

    sum += item.citations / age;
  }

  return Math.sqrt(sum);
}
/**
 * Computes the E-index: the square root of the H-core citations that exceed
 * the h-index squared.
 *
 * @returns {number} - The E-index value.
 */
function get_e_index() {
  const h_squared = h_index * h_index;
  const excess_citations = get_sum_h_core() - h_squared;
  return Math.sqrt(excess_citations);
}
// Gather every reported figure into one record, then print it as a table in the console.
// Fractional indices are rounded to two decimals; the first citation figure comes from
// the profile statistics and the second one is summed from the scraped articles.
const report = {
  "Articles": articles.length,
  "Total citations (Google / Real)": `${stats[0]} / ${get_total_citations()}`,
  "Total h-core citations": get_sum_h_core(),
  "Since": since,
  "Last": last,
  "H-Index": h_index,
  "R-index": to_float(get_r_index()),
  "A-index": to_float(get_a_index()),
  "G-index": get_g_index(),
  "M-index": to_float(get_m_index()),
  "E-index": to_float(get_e_index()),
  "AR-index": to_float(get_ar_index()),
  "HG-index": to_float(get_hg_index()),
};

console.table(report);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment