Skip to content

Instantly share code, notes, and snippets.

@parttimenerd
Last active May 29, 2017 11:44
Show Gist options
  • Save parttimenerd/f1b66b0a9063da4e15f67a1ffbe331fa to your computer and use it in GitHub Desktop.
Save parttimenerd/f1b66b0a9063da4e15f67a1ffbe331fa to your computer and use it in GitHub Desktop.
Crawler to get a rough estimate of the number of academic articles in the field of microRNA research
"""
Copyright Johannes Bechberger (2017)
Licensed under the MIT license
Requirements:
- python3 (>= 3.4)
- curl (the crawler shells out to the curl command line tool)
- requests
- matplotlib
- seaborn
- pandas
- numpy
- bs4
Crawling Google Scholar repeatedly is not advised, as Google will block the requests with captchas.
"""
import subprocess
from pprint import pprint
from time import sleep
import math
from bs4 import BeautifulSoup
from numpy.matlib import rand, random
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
def get_number_of_entries(curl_command):
    """
    Run ``curl_command`` in a shell and return the hit count found in the page.

    The command's output is parsed as HTML and the text of the
    ``<div id="gs_ab_md">`` element is split on spaces.  The count is taken
    from the second word or, failing that, from the first word.  Dots are
    dropped before the conversion (presumably they are thousands separators
    in the localized page -- confirm against a real response).

    :param curl_command: complete shell command line; every ``--2.0``
        occurrence is removed before the command is executed.  The string is
        passed to the shell verbatim, so only trusted commands may be given.
    :return: the parsed count, or 0 if neither word is an integer
    :raises subprocess.CalledProcessError: if the command exits non-zero
    :raises AttributeError: if the page has no ``<body>`` or no
        ``gs_ab_md`` div
    """
    print(curl_command)
    html = subprocess.check_output(curl_command.replace("--2.0", ""),
                                   stderr=subprocess.STDOUT,
                                   shell=True)
    # Name the parser explicitly: without it bs4 warns and picks whichever
    # parser happens to be installed, so results could differ per machine.
    parsed_html = BeautifulSoup(html, "html.parser")
    div_text = parsed_html.body.find('div', attrs={'id': 'gs_ab_md'}).text
    words = div_text.strip().split(" ")
    # Try the second word first, then the first one.  The original stored the
    # fallback result in a misspelled variable, so it was always lost and 0
    # was returned instead.
    for word in words[1:2] + words[:1]:
        try:
            return int(word.replace(".", ""))
        except ValueError:
            continue
    # Best effort, as before: an unparsable header counts as zero results.
    return 0
def get_numbers_per_year(curl, start=1950, end=2017):
    """
    Query the hit count for every single year from ``start`` to ``end``.

    For each year the ``' --2.0`` marker in ``curl`` is replaced by a
    ``&as_ylo=<year>&as_yhi=<year>'`` suffix, any ``&oq=+m`` fragment is
    removed, and the resulting command is passed to
    ``get_number_of_entries``.  A random pause precedes every request.

    :param curl: curl command template containing the ``' --2.0`` marker
    :param start: first year to query (inclusive)
    :param end: last year to query (inclusive)
    :return: tuple ``(counts, counts_by_year)``: the list of counts in year
        order and a dict mapping each year to its count
    """
    counts = []
    counts_by_year = {}
    for year in range(start, end + 1):
        # Pause between requests.  NOTE(review): ``random`` is imported from
        # numpy.matlib, presumably numpy.random, whose randint excludes the
        # upper bound -- confirm.
        pause = random.randint(1, 10)
        sleep(pause)
        year_filter = "&as_ylo={}&as_yhi={}'".format(year, year)
        year_curl = curl.replace("' --2.0", year_filter)
        year_curl = year_curl.replace("&oq=+m", "")
        found = get_number_of_entries(year_curl)
        print("{}: {:5d}".format(year, found))
        counts.append(found)
        counts_by_year[year] = found
    return (counts, counts_by_year)
"""as_sdt=1,5
q=microrna
hl=de
as_ylo=2017
as_yhi=2017
as_vis=1"""
# Shell command template for one Google Scholar search request.
# - The "--2.0" token is a marker, not a request parameter:
#   get_numbers_per_year() replaces "' --2.0" with the per-year
#   "&as_ylo=...&as_yhi=...'" suffix, and get_number_of_entries() strips any
#   remaining "--2.0" before running the command.
# - The query asks for titles matching "~microrna" while excluding "review"
#   (q=allintitle:+~microrna+-review) with hl=de.
# NOTE(review): the Cookie header embeds what look like personal session
# cookies -- these should not live in published source; confirm they are
# expired and load them from the environment instead.
CURL = """curl 'https://scholar.google.com/scholar?q=allintitle:+~microrna+-review&btnG=&hl=de&as_vis=1&as_sdt=1,5' --2.0 -H 'Host: scholar.google.com' -H 'User-Agent: Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:53.0) Gecko/20100101 Firefox/53.0' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' -H 'Accept-Language: en-US,en;q=0.5' --compressed -H 'Cookie: CONSENT=YES+DE.de+V9; NID=103=psYAFvMJWDuUIrZOT3wfW31TJI3vY1rIAsOOJ27fQBo5l1TU3f1-QKgi5YgOdRMNoNjeL3qgL_-6MDoRzasGOe3C-DAgMXUflf9ILXl8RzbdxH49BbEQY3SzKezHQ-FUduq6w7EiidNfSdGJNOswVVXZtWwkmwSJcW-qnu-hJvKDWRTnD3S86nCtJ6Vf9nB0aICTHqodjN7orwP8pPTH4YxYbAEYj2uiT3uH_A; GSP=A=g9IqMg:CPTS=1494569370:LM=1494569370:S=jpvYzADMgygojoqI; SID=sgRo3gKcC_BLzqu0ttgutggIA0UZh-dYA9I5jIik0nsRBJARVDpr3JWsXQFmctJDa4CPRQ.; HSID=ADLfgSnNigMCMwa-A; SSID=Axqut4qnUCH8dPsie; APISID=LraoZIGH26G2dhF2/Ak-y0s0Rgy8EMcR1S; SAPISID=dLoQPLaAp-JIBSbN/A_6Chz9ryjET-oqam; OGPC=5061821-3:' -H 'DNT: 1' -H 'Connection: keep-alive' -H 'Upgrade-Insecure-Requests: 1' """
# First year on the x axis of the first plot; also the start year of the
# (commented-out) crawl call.
START_YEAR = 1950
# Hard-coded result tuple in the (counts, counts_by_year) shape returned by
# get_numbers_per_year(); presumably the saved output of this call:
#ret = get_numbers_per_year(CURL, start=START_YEAR)
# microrna
ret = (
    [
        # 1950-1959
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        # 1960-1969
        0, 0, 0, 0, 0, 16, 0, 19, 0, 18,
        # 1970-1979
        13, 20, 21, 16, 0, 30, 25, 0, 0, 0,
        # 1980-1989
        39, 0, 0, 0, 44, 0, 55, 0, 0, 756,
        # 1990-1999
        899, 746, 763, 753, 1230, 1630, 2140, 2580, 2730, 3490,
        # 2000-2009
        3770, 4340, 4930, 5570, 5940, 6820, 8060, 9590, 14300, 17700,
        # 2010-2017
        25000, 27900, 38200, 46400, 49500, 46500, 36600, 23300,
    ],
    {},
)
# The dict half repeats the list keyed by year; the first entry is for 1950.
ret[1].update(enumerate(ret[0], 1950))
# microrna+cancer
# Hard-coded result tuple in the same (counts, counts_by_year) shape as the
# return value of get_numbers_per_year().
ret2 = (
    [
        # 1950-1959
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        # 1960-1969
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        # 1970-1979
        0, 0, 0, 0, 0, 17, 0, 0, 16, 0,
        # 1980-1989
        15, 17, 22, 21, 23, 25, 26, 43, 48, 61,
        # 1990-1999
        67, 80, 88, 77, 107, 104, 152, 263, 283, 359,
        # 2000-2009
        461, 518, 598, 839, 1130, 1480, 2420, 3830, 6950, 8340,
        # 2010-2017
        11500, 14800, 20200, 25300, 27700, 34300, 31400, 12000,
    ],
    {},
)
# The dict half repeats the list keyed by year; the first entry is for 1950.
ret2[1].update(enumerate(ret2[0], 1950))
# Dump the first data series as a single "[[a, b, ...]]" string.
# NOTE(review): pprint() on a str prints its repr (quoted, possibly wrapped);
# kept unchanged -- confirm whether a plain print() was intended.
pprint("[[" + ", ".join(map(str, ret[0])) + "]]")

# The count lists hold one entry per year beginning at START_YEAR, so the
# exclusive end of the year axis follows from their length.  The original
# hard-coded 2018, which breaks the DataFrame construction as soon as the
# data covers a different range.
END_YEAR = START_YEAR + len(ret[0])
ALL_YEARS = np.arange(START_YEAR, END_YEAR)

# Figure 1: bar chart of the first series over the full year range.
# catplot/height are the current names of the former factorplot/size API.
g = sns.catplot(x="year", y="papers", kind="bar",
                data=pd.DataFrame({"papers": ret[0], "year": ALL_YEARS}),
                order=ALL_YEARS, height=6)
axes = plt.gca()
# Leave 10% head room above the tallest bar.
axes.set_ylim([0, max(ret[0]) * 1.1])
#plt.yscale('log')
# Label only every fifth year to keep the x axis readable.
g.set_xticklabels(step=5)

# Figure 2: bar chart of the second series, restricted to the last SUB years.
SUB = 25
Y_LABEL = "search results" #"google scholar search results for \"microrna cancer\""
RECENT_YEARS = np.arange(END_YEAR - SUB, END_YEAR)
g = sns.catplot(x="year", y=Y_LABEL, kind="bar",
                data=pd.DataFrame({Y_LABEL: ret2[0][-SUB:], "year": RECENT_YEARS}),
                order=RECENT_YEARS, height=6)
axes = plt.gca()
axes.set_ylim([0, max(ret2[0]) * 1.1])
g.set_xticklabels(step=5)
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment