Last active
May 29, 2017 11:44
-
-
Save parttimenerd/f1b66b0a9063da4e15f67a1ffbe331fa to your computer and use it in GitHub Desktop.
Crawler to get a rough estimate on the academic articles in the field of microRNA research
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Copyright Johannes Bechberger (2017) | |
Licensed under the MIT license | |
Requirements:
- python3 (>= 3.4)
- curl (the crawler runs it through the shell)
- requests
- numpy
- pandas
- matplotlib
- seaborn
- bs4
Crawling Google Scholar repeatedly is not advised: Google blocks frequent automated requests with captchas.
""" | |
import subprocess | |
from pprint import pprint | |
from time import sleep | |
import math | |
from bs4 import BeautifulSoup | |
from numpy.matlib import rand, random | |
import numpy as np | |
import seaborn as sns | |
import pandas as pd | |
import matplotlib.pyplot as plt | |
def get_number_of_entries(curl_command):
    """
    Run a curl command and return the hit count shown on the result page.

    The command is echoed, stripped of the ``--2.0`` placeholder and executed
    through the shell. The output (stdout and stderr merged) is parsed as HTML
    and the text of the ``<div id="gs_ab_md">`` element is split on spaces.
    The second word is tried first and the first word second; dots are removed
    before the conversion so that "23.300" is read as 23300.

    :param curl_command: complete curl command line as a single string
    :return: the parsed number of results, or 0 if neither word is a number
    :raises subprocess.CalledProcessError: if the command exits non-zero
    :raises AttributeError: if the page has no body or no ``gs_ab_md`` div
        (NOTE(review): presumably what happens on a captcha page - confirm)
    """
    print(curl_command)
    # NOTE(review): the command string is handed to the shell unmodified, so
    # it must only ever be built from trusted input.
    html = subprocess.check_output(curl_command.replace("--2.0", ""),
                                   stderr=subprocess.STDOUT,
                                   shell=True)
    # Name the parser explicitly: the result then no longer depends on which
    # optional parser libraries happen to be installed.
    parsed_html = BeautifulSoup(html, "html.parser")
    div_text = parsed_html.body.find('div', attrs={'id': 'gs_ab_md'}).text
    words = div_text.strip().split(" ")
    for index in (1, 0):
        try:
            return int(words[index].replace(".", ""))
        except (ValueError, IndexError):
            # Not a number (or no such word): try the next candidate.
            continue
    return 0
def get_numbers_per_year(curl, start=1950, end=2017):
    """
    Query the number of search results for every year from start to end.

    For each year the ``' --2.0`` marker in the curl command is replaced by
    ``as_ylo``/``as_yhi`` parameters that are both set to that year, and any
    ``&oq=+m`` fragment is removed. A random pause precedes every request.

    :param curl: curl command template containing the ``' --2.0`` marker
    :param start: first year to query (inclusive)
    :param end: last year to query (inclusive)
    :return: tuple of (list of counts in year order, dict year -> count)
    """
    counts = []
    count_by_year = {}
    for year in range(start, end + 1):
        # Pause a random amount of time between two requests.
        sleep(random.randint(1, 10))
        year_filter = "&as_ylo={}&as_yhi={}'".format(year, year)
        yearly_curl = curl.replace("' --2.0", year_filter)
        yearly_curl = yearly_curl.replace("&oq=+m", "")
        hits = get_number_of_entries(yearly_curl)
        print("{}: {:5d}".format(year, hits))
        counts.append(hits)
        count_by_year[year] = hits
    return (counts, count_by_year)
# Query parameters of a search URL, kept as a reference:
#   as_sdt=1,5
#   q=microrna
#   hl=de
#   as_ylo=2017
#   as_yhi=2017
#   as_vis=1

# Template of the curl command that is sent for every year.
# The "' --2.0" marker directly after the URL is what get_numbers_per_year()
# replaces with the year filter; get_number_of_entries() strips "--2.0".
# NOTE(review): the Cookie header embeds what look like browser session
# cookies - consider loading them from the environment instead of the source.
CURL = (
    "curl 'https://scholar.google.com/scholar?q=allintitle:+~microrna+-review&btnG=&hl=de&as_vis=1&as_sdt=1,5' --2.0 "
    "-H 'Host: scholar.google.com' "
    "-H 'User-Agent: Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:53.0) Gecko/20100101 Firefox/53.0' "
    "-H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' "
    "-H 'Accept-Language: en-US,en;q=0.5' "
    "--compressed "
    "-H 'Cookie: CONSENT=YES+DE.de+V9; "
    "NID=103=psYAFvMJWDuUIrZOT3wfW31TJI3vY1rIAsOOJ27fQBo5l1TU3f1-QKgi5YgOdRMNoNjeL3qgL_-6MDoRzasGOe3C-DAgMXUflf9ILXl8RzbdxH49BbEQY3SzKezHQ-FUduq6w7EiidNfSdGJNOswVVXZtWwkmwSJcW-qnu-hJvKDWRTnD3S86nCtJ6Vf9nB0aICTHqodjN7orwP8pPTH4YxYbAEYj2uiT3uH_A; "
    "GSP=A=g9IqMg:CPTS=1494569370:LM=1494569370:S=jpvYzADMgygojoqI; "
    "SID=sgRo3gKcC_BLzqu0ttgutggIA0UZh-dYA9I5jIik0nsRBJARVDpr3JWsXQFmctJDa4CPRQ.; "
    "HSID=ADLfgSnNigMCMwa-A; "
    "SSID=Axqut4qnUCH8dPsie; "
    "APISID=LraoZIGH26G2dhF2/Ak-y0s0Rgy8EMcR1S; "
    "SAPISID=dLoQPLaAp-JIBSbN/A_6Chz9ryjET-oqam; "
    "OGPC=5061821-3:' "
    "-H 'DNT: 1' "
    "-H 'Connection: keep-alive' "
    "-H 'Upgrade-Insecure-Requests: 1' "
)

# First year covered by the per-year data.
START_YEAR = 1950
# Uncomment to crawl again instead of using the stored numbers below:
#ret = get_numbers_per_year(CURL, start=START_YEAR)

# microrna
# Stored result in the shape returned by get_numbers_per_year():
# (list of counts in year order, dict year -> count) for 1950 - 2017.
_MICRORNA_COUNTS = [
    # 1950 - 1959
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    # 1960 - 1969
    0, 0, 0, 0, 0, 16, 0, 19, 0, 18,
    # 1970 - 1979
    13, 20, 21, 16, 0, 30, 25, 0, 0, 0,
    # 1980 - 1989
    39, 0, 0, 0, 44, 0, 55, 0, 0, 756,
    # 1990 - 1999
    899, 746, 763, 753, 1230, 1630, 2140, 2580, 2730, 3490,
    # 2000 - 2009
    3770, 4340, 4930, 5570, 5940, 6820, 8060, 9590, 14300, 17700,
    # 2010 - 2017
    25000, 27900, 38200, 46400, 49500, 46500, 36600, 23300,
]
ret = (_MICRORNA_COUNTS, dict(zip(range(1950, 2018), _MICRORNA_COUNTS)))
# microrna+cancer
# Stored per-year numbers in the same shape as `ret`:
# (list of counts in year order, dict year -> count) for 1950 - 2017.
_MICRORNA_CANCER_COUNTS = [
    # 1950 - 1959
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    # 1960 - 1969
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    # 1970 - 1979
    0, 0, 0, 0, 0, 17, 0, 0, 16, 0,
    # 1980 - 1989
    15, 17, 22, 21, 23, 25, 26, 43, 48, 61,
    # 1990 - 1999
    67, 80, 88, 77, 107, 104, 152, 263, 283, 359,
    # 2000 - 2009
    461, 518, 598, 839, 1130, 1480, 2420, 3830, 6950, 8340,
    # 2010 - 2017
    11500, 14800, 20200, 25300, 27700, 34300, 31400, 12000,
]
ret2 = (_MICRORNA_CANCER_COUNTS,
        dict(zip(range(1950, 2018), _MICRORNA_CANCER_COUNTS)))
# Dump the "microrna" counts as a single "[[a, b, ...]]" string.
pprint("[[" + ", ".join(map(str, ret[0])) + "]]")

# seaborn 0.9 renamed factorplot() to catplot() and its `size` argument to
# `height`, and the old names were removed later on. Prefer the new API and
# fall back to the old one so that older installations keep working.
if hasattr(sns, "catplot"):
    _draw_bars, _HEIGHT_KW = sns.catplot, {"height": 6}
else:
    _draw_bars, _HEIGHT_KW = sns.factorplot, {"size": 6}

# Plot 1: "microrna" results for every year. The years are taken from the
# keys of the year -> count dict, so they always match the stored data.
all_years = np.array(sorted(ret[1]))
g = _draw_bars(x="year", y="papers", kind="bar",
               data=pd.DataFrame({"papers": ret[0], "year": all_years}),
               order=all_years, **_HEIGHT_KW)
axes = plt.gca()
# Leave 10% headroom above the highest bar.
axes.set_ylim([0, max(ret[0]) * 1.1])
#plt.yscale('log')
g.set_xticklabels(step=5)

# Plot 2: "microrna+cancer" results, restricted to the last SUB years.
SUB = 25
Y_LABEL = "search results" #"google scholar search results for \"microrna cancer\""
recent_years = np.array(sorted(ret2[1])[-SUB:])
g = _draw_bars(x="year", y=Y_LABEL, kind="bar",
               data=pd.DataFrame({Y_LABEL: ret2[0][-SUB:], "year": recent_years}),
               order=recent_years, **_HEIGHT_KW)
axes = plt.gca()
axes.set_ylim([0, max(ret2[0]) * 1.1])
g.set_xticklabels(step=5)

plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment