Skip to content

Instantly share code, notes, and snippets.

@nkthiebaut
Last active July 2, 2019 23:56
Show Gist options
  • Save nkthiebaut/55c64dcad2165f3c7933fab9b4bb59ce to your computer and use it in GitHub Desktop.
Save nkthiebaut/55c64dcad2165f3c7933fab9b4bb59ce to your computer and use it in GitHub Desktop.
Get and plot the number of Google Scholar search results per year for different keywords, using the XKCD plot style. Counts are obtained by querying Google Scholar directly.
"""Based on https://github.com/Pold87/academic-keyword-occurrence"""
import re
import urllib
from functools import partial
from typing import Iterable
from urllib.parse import urlencode
from urllib.request import Request, build_opener
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from bs4 import BeautifulSoup
# Base search URL. It ends with "&" so that the URL-encoded per-query
# parameters (q, as_ylo, as_yhi) can be appended to it directly.
# NOTE(review): as_vis=1 and as_sdt=1,5 presumably exclude citations/patents
# from the counts, and hl=en presumably keeps the result count in the English
# "1,234" format that the parsing regex expects — confirm against Scholar.
GOOGLE_SCHOLAR_BASE_URL = "https://scholar.google.com/scholar?as_vis=1&hl=en&as_sdt=1,5&"
# Sent as the User-Agent header of every request (a desktop Chrome string).
USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.109 Safari/537.36'
def get_num_results(search_term: str, year: int) -> int:
    """Return the number of Google Scholar results for a term in one year.

    Args:
        search_term: Query string sent as the ``q`` parameter.
        year: Year used as both the lower (``as_ylo``) and upper (``as_yhi``)
            bound of the search.

    Returns:
        The first integer found in the result-statistics text, with thousands
        separators removed. Returns 0 when the statistics element is empty or
        its text contains no number.

    Raises:
        ValueError: If the page has no ``gs_ab_md`` element at all, i.e. the
            response does not look like a results page.

    Network and HTTP errors raised by ``urllib`` propagate unchanged.
    """
    query_params = {"q": search_term, "as_ylo": year, "as_yhi": year}
    url = GOOGLE_SCHOLAR_BASE_URL + urlencode(query_params)
    request = Request(url=url, headers={"User-Agent": USER_AGENT})
    # Close the HTTP response deterministically instead of leaking the socket.
    with build_opener().open(request) as response:
        html = response.read()

    soup = BeautifulSoup(html, "html.parser")
    stats_container = soup.find("div", {"id": "gs_ab_md"})
    if stats_container is None:
        # Fail with a clear message rather than an AttributeError on None.
        raise ValueError(
            f"No result statistics found for {search_term!r} in {year}; "
            "the response does not look like a Google Scholar results page"
        )

    # Inner div content looks like: 'About x results (y sec)'.
    stats_div = stats_container.find("div")
    if stats_div is None:
        # NOTE(review): presumably what Scholar returns when there are no
        # results for the query — confirm against a live page.
        return 0

    count_match = re.search(r"\d+(,\d+)*", stats_div.text)
    if count_match is None:
        return 0
    return int(count_match.group().replace(",", ""))
def get_keywords_histories(
    search_terms: Iterable[str], start_year: int, end_year: int
) -> pd.DataFrame:
    """Collect yearly Google Scholar result counts for several search terms.

    One HTTP request is issued per (term, year) pair, so this can take a
    while for many terms or long year ranges.

    Args:
        search_terms: Keywords to look up; each becomes a column of the result.
        start_year: First year of the range (inclusive).
        end_year: Last year of the range (inclusive).

    Returns:
        A DataFrame with a ``"year"`` column covering ``start_year`` to
        ``end_year`` and one column per search term holding the yearly counts.
        A term literally named ``"year"`` would overwrite the year column.
    """
    df = pd.DataFrame({"year": list(range(start_year, end_year + 1))})
    for term in search_terms:
        # The counts come from Google Scholar, not Google Trends.
        print(f"Starting Google Scholar queries with search term: {term}")
        get_yearly = partial(get_num_results, term)
        df[term] = df["year"].apply(get_yearly)
    return df
if __name__ == "__main__":
    # Fetch yearly result counts for each keyword over 2010-2018 inclusive.
    search_history = get_keywords_histories(
        ["deep learning", "explainable ar"], 2010, 2018
    )

    # Switch matplotlib to the hand-drawn XKCD style before plotting.
    plt.xkcd()
    ax = search_history.plot(x="year")
    ax.set_xlabel("Year")
    ax.set_ylabel("Google scholar searches")

    # Render y tick values as integers with thousands separators.
    thousands_formatter = matplotlib.ticker.FuncFormatter(
        lambda value, _position: f"{int(value):,}"
    )
    ax.yaxis.set_major_formatter(thousands_formatter)

    ax.figure.savefig("xkcd-trend.png", bbox_inches="tight")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment