Created
December 20, 2021 03:47
-
-
Save itherunder/15e93e60e2e2b11d08d634e1bd71d0a9 to your computer and use it in GitHub Desktop.
通过 search_word 在谷歌学术上搜索论文并获取论文标题、bib 引用、下载链接以及摘要;目前只实现了论文标题和下载链接。
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from genericpath import getctime
from urllib import parse

from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.utils import download_file
# Free-text query sent to Google Scholar.
search_word = 'ethereum smart contract'
# URL-encode the query: spaces still become '+', and reserved characters such
# as '&' or '#' are percent-escaped so they cannot corrupt the query string.
query = parse.quote_plus(search_word)
# Result-page template: %d receives the `start` offset, %s the encoded query.
url = 'https://scholar.google.com/scholar?hl=en&start=%d&q=%s'
# Single browser session shared by every function below; the chromedriver
# path is supplied by ChromeDriverManager().install().
driver = webdriver.Chrome(ChromeDriverManager().install())
def get_title(paper_element):
    """Return the visible title text of one search-result entry.

    Args:
        paper_element: Selenium element for a single result container.

    Returns:
        The ``text`` of the first descendant carrying the class ``gs_rt``.
    """
    # The generic find_element(by, value) locator is used instead of
    # find_element_by_class_name, which Selenium 4.3 removed. The string
    # 'class name' is the value of By.CLASS_NAME.
    return paper_element.find_element('class name', 'gs_rt').text
def get_cite(paper_element):
    """Placeholder for extracting the citation (bib reference) of an entry.

    Not implemented yet: the argument is ignored and ``None`` is returned.
    """
    return None
def get_download_url(paper_element):
    """Return the direct PDF link of one search-result entry, if present.

    Args:
        paper_element: Selenium element for a single result container.

    Returns:
        The ``href`` of the entry's side link when its text contains
        ``[PDF]``; ``'no_download_url'`` when the entry has no
        ``gs_or_ggsm`` side box at all; ``'no_pdf_download_url'`` when the
        box exists but holds no link or the link is not marked ``[PDF]``.
    """
    # find_elements (plural) returns an empty list when nothing matches, so a
    # missing box or anchor is handled without a catch-all exception handler.
    # It also replaces the find_element_by_* helpers removed in Selenium 4.3.
    # 'class name' / 'tag name' are the values of By.CLASS_NAME / By.TAG_NAME.
    boxes = paper_element.find_elements('class name', 'gs_or_ggsm')
    if not boxes:
        return 'no_download_url'

    anchors = boxes[0].find_elements('tag name', 'a')
    if anchors and '[PDF]' in anchors[0].text:
        return anchors[0].get_attribute('href')
    return 'no_pdf_download_url'
def get_abstract(paper_element):
    """Placeholder for extracting the abstract snippet of a result entry.

    Not implemented yet: the argument is ignored and ``None`` is returned.
    """
    return None
def get_info(start):
    """Load one result page and append its papers to ``papers.txt``.

    Each paper found on the page is written as one ``title#download_url``
    line (UTF-8, append mode).

    Args:
        start: Integer substituted into the ``start`` parameter of the
            search URL to select the result page.
    """
    driver.get(url % (start, query))
    # find_elements(by, value) replaces find_elements_by_xpath, which
    # Selenium 4.3 removed; 'xpath' is the value of By.XPATH.
    papers = driver.find_elements('xpath', "//div[@class='gs_r gs_or gs_scl']")
    if not papers:
        # Nothing matched on this page: leave the output file untouched.
        return

    # Open the output once per page instead of once per paper.
    # get_cite() is still a stub, so it is not called here.
    with open('papers.txt', 'a', encoding='utf-8') as out:
        for paper in papers:
            out.write('%s#%s\n' % (get_title(paper), get_download_url(paper)))
try:
    # Offsets 0, 10, ..., 990. Starting at 0 includes the first result page,
    # which the previous range(1, 100) skipped.
    for i in range(100):
        print('[INFO] ===========================', i)
        get_info(i * 10)
finally:
    # Always end the browser session, even when a page fails mid-run.
    driver.quit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment