Last active
July 2, 2019 23:56
-
-
Save nkthiebaut/55c64dcad2165f3c7933fab9b4bb59ce to your computer and use it in GitHub Desktop.
Get and plot the volume of Google Scholar search results for different keywords, using the XKCD plot style. Results come from direct queries to Google Scholar.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Based on https://github.com/Pold87/academic-keyword-occurrence""" | |
import re
import urllib
from functools import partial
from typing import Iterable
from urllib.parse import urlencode
from urllib.request import Request, build_opener

import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from bs4 import BeautifulSoup
from matplotlib.ticker import FuncFormatter
# Google Scholar search endpoint with the fixed query parameters already
# attached; it ends with "&" so that per-search parameters can be appended
# directly after it.
# NOTE(review): hl=en presumably requests the English interface, which the
# comma-grouped result-count parsing relies on -- confirm.
GOOGLE_SCHOLAR_BASE_URL = (
    "https://scholar.google.com/scholar"
    "?as_vis=1&hl=en&as_sdt=1,5&"
)

# User-Agent header value sent with every request (a desktop Chrome string).
USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/48.0.2564.109 Safari/537.36"
)
def get_num_results(search_term: str, year: int) -> int:
    """Return the number of Google Scholar results for a term in a given year.

    Requests the Google Scholar results page for ``search_term`` with ``year``
    used as both the lower and the upper year bound, then parses the result
    count out of the page's summary line.

    Args:
        search_term: Query string, sent as the ``q`` parameter.
        year: Year sent as both the ``as_ylo`` and ``as_yhi`` parameters.

    Returns:
        The parsed result count, or 0 when the page carries no parsable count.

    Raises:
        ValueError: If the response has no ``gs_ab_md`` element at all.
        urllib.error.URLError: Propagated unchanged when the request fails.
    """
    query_params = {"q": search_term, "as_ylo": year, "as_yhi": year}
    url = GOOGLE_SCHOLAR_BASE_URL + urlencode(query_params)
    request = Request(url=url, headers={"User-Agent": USER_AGENT})
    # Close the HTTP response deterministically instead of leaking it.
    with build_opener().open(request) as response:
        html = response.read()

    soup = BeautifulSoup(html, "html.parser")
    stats_container = soup.find("div", {"id": "gs_ab_md"})
    if stats_container is None:
        # Fail with an explicit message rather than an opaque AttributeError
        # on None when the expected element is absent.
        raise ValueError(
            f"Google Scholar response for {search_term!r} ({year}) has no "
            "result-count element (div#gs_ab_md); the request may have been "
            "blocked or the page layout may have changed."
        )

    # div content like: 'About x results (y sec)'
    stats_div = stats_container.find("div")
    if stats_div is None:
        # The container holds no summary line, so there is no count to parse;
        # treat it like a summary line without digits.
        return 0

    # First run of digits, optionally grouped with commas (e.g. "1,234,567").
    match = re.search(r"\d+(,\d+)*", stats_div.text)
    if match is None:
        return 0
    return int(match.group().replace(",", ""))
def get_keywords_histories(
    search_terms: Iterable[str], start_year: int, end_year: int
) -> pd.DataFrame:
    """Collect yearly Google Scholar result counts for several search terms.

    Args:
        search_terms: Terms to query; each one becomes a column of the result.
        start_year: First year to query (inclusive).
        end_year: Last year to query (inclusive).

    Returns:
        A DataFrame with a ``year`` column covering ``start_year`` through
        ``end_year`` and one column per search term holding the value of
        ``get_num_results(term, year)`` for each year.

    Note:
        One HTTP request is issued per (term, year) pair, so this can take a
        while; the original author notes the service is deliberately slow.
    """
    df = pd.DataFrame({"year": list(range(start_year, end_year + 1))})
    for term in search_terms:
        # The queries go to Google Scholar, so the progress message says so.
        print(f"Starting Google Scholar queries with search term: {term}")
        # Bind the term so the per-year callable only needs the year.
        df[term] = df["year"].apply(partial(get_num_results, term))
    return df
if __name__ == "__main__":
    # Script entry point: fetch yearly counts for a couple of keywords and
    # save the resulting line chart as a PNG in the current directory.
    keywords = ["deep learning", "explainable ar"]
    search_history = get_keywords_histories(keywords, 2010, 2018)

    # Enable the XKCD sketch style before the figure is created.
    plt.xkcd()
    ax = search_history.plot(x="year")
    ax.set(xlabel="Year", ylabel="Google scholar searches")
    # Render y ticks as integers with thousands separators. FuncFormatter is
    # imported explicitly rather than reached through `matplotlib.ticker`,
    # which is only available as an attribute if some other import loaded it.
    ax.get_yaxis().set_major_formatter(
        FuncFormatter(lambda value, _pos: f"{int(value):,}")
    )
    ax.figure.savefig("xkcd-trend.png", bbox_inches="tight")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment