Skip to content

Instantly share code, notes, and snippets.

@itherunder
Created December 20, 2021 03:47
Show Gist options
  • Save itherunder/15e93e60e2e2b11d08d634e1bd71d0a9 to your computer and use it in GitHub Desktop.
Save itherunder/15e93e60e2e2b11d08d634e1bd71d0a9 to your computer and use it in GitHub Desktop.
通过 search_word 在谷歌学术上搜索论文，并获取论文标题、bib 引用、下载链接以及摘要；目前只实现了论文标题和下载链接的获取。
from genericpath import getctime
from urllib import parse
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.utils import download_file
# Free-text query to look up on Google Scholar.
search_word = 'ethereum smart contract'
# URL-encode the query: spaces become '+', and reserved characters such as
# '&', '#' or '"' are percent-escaped so they cannot corrupt the query string.
query = parse.quote_plus(search_word)
# Result-page template; filled with (start offset, encoded query) via `%`.
url = 'https://scholar.google.com/scholar?hl=en&start=%d&q=%s'
# Single browser session shared by every page fetch. The value returned by
# ChromeDriverManager().install() is handed straight to Chrome.
# NOTE(review): presumably that value is the chromedriver path, passed
# positionally as older Selenium releases expect - confirm against the
# installed Selenium version.
driver = webdriver.Chrome(ChromeDriverManager().install())
def get_title(paper_element):
    """Return the visible title text of one search-result entry.

    Args:
        paper_element: Selenium element for a single result; it must contain
            a descendant carrying the CSS class ``gs_rt``.

    Returns:
        The ``text`` of that descendant.

    Raises:
        Whatever the element lookup raises when no such descendant exists.
    """
    # 'class name' is the locator-strategy string behind By.CLASS_NAME. The
    # generic find_element() is available on old and new Selenium releases,
    # whereas the find_element_by_* shortcuts were removed in Selenium 4.3.
    return paper_element.find_element('class name', 'gs_rt').text
def get_cite(paper_element):
    """Placeholder for citation extraction; not implemented yet.

    Args:
        paper_element: Selenium element for a single result (currently unused).

    Returns:
        Always ``None``.
    """
    return None
def get_download_url(paper_element):
    """Return the PDF link of one search-result entry, or a sentinel string.

    Args:
        paper_element: Selenium element for a single result.

    Returns:
        * the ``href`` of the first ``<a>`` inside the ``gs_or_ggsm`` element
          when that anchor's text contains ``'[PDF]'``;
        * ``'no_download_url'`` when the entry has no ``gs_or_ggsm`` element;
        * ``'no_pdf_download_url'`` when the anchor is not labelled ``[PDF]``.

    Raises:
        Whatever the anchor lookup raises if the ``gs_or_ggsm`` element
        exists but contains no ``<a>``.
    """
    # Locator strategies are given as plain strings ('class name' and
    # 'tag name' are the values of By.CLASS_NAME / By.TAG_NAME) through the
    # generic find_element(), which works on old and new Selenium releases;
    # the find_element_by_* shortcuts were removed in Selenium 4.3.
    try:
        link_box = paper_element.find_element('class name', 'gs_or_ggsm')
    except Exception:
        # Deliberately best-effort: any failure to locate the link container
        # is reported as "no download link" instead of aborting the crawl.
        return 'no_download_url'

    link = link_box.find_element('tag name', 'a')
    if '[PDF]' in link.text:
        return link.get_attribute('href')
    return 'no_pdf_download_url'
def get_abstract(paper_element):
    """Placeholder for abstract extraction; not implemented yet.

    Args:
        paper_element: Selenium element for a single result (currently unused).

    Returns:
        Always ``None``.
    """
    return None
def get_info(start):
    """Load one result page and append its entries to ``papers.txt``.

    Each matched result is written as one ``title#download_url`` line.
    Citation and abstract extraction are still stubs in this file, so they
    are not collected here.

    Args:
        start: Integer result offset substituted into the ``start`` query
            parameter of the module-level ``url`` template.
    """
    driver.get(url % (start, query))
    # Generic find_elements() with the 'xpath' strategy string (the value of
    # By.XPATH) works on old and new Selenium releases; the
    # find_elements_by_* shortcuts were removed in Selenium 4.3.
    papers = driver.find_elements('xpath', "//div[@class='gs_r gs_or gs_scl']")
    if not papers:
        # Nothing matched on this page: leave papers.txt untouched.
        return

    # Open the output file once per page instead of once per paper.
    with open('papers.txt', 'a', encoding='utf-8') as out:
        for paper in papers:
            title = get_title(paper)
            download_url = get_download_url(paper)
            out.write('%s#%s\n' % (title, download_url))
# Crawl 100 result pages of 10 entries each (offsets 0, 10, ..., 990).
try:
    # Start at offset 0 so the first ten results are included as well.
    for page in range(100):
        print('[INFO] ===========================', page + 1)
        get_info(page * 10)
finally:
    # Always end the browser session, even when a page fetch fails, so no
    # Chrome / chromedriver process is left running.
    driver.quit()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment