Fetch the citation counts of the papers from a given conference.
# coding: utf-8
"""
Fetch the citation counts of the papers from a top conference and print the
top 10% of papers ranked by citation count.

Since Google Scholar has anti-scraping measures, this script assumes that all
HTML source pages have already been downloaded to local files by hand.
"""
import re
import os

from bs4 import BeautifulSoup

####################### Parse google scholar pages ############################

def _parse_article(article):
    """Parse one article entry in a Google Scholar result page."""
    title = article.find_all("h3", "gs_rt")
    assert len(title) == 1, title
    title = title[0].find_all("a", href=re.compile("^http"))
    assert len(title) == 1, title
    ### May be None when the title contains special (non-ASCII) characters.
    title = title[0].string
    cite = article.find_all("a", href=re.compile("cites"))
    ### Some articles carry no citation information.
    assert len(cite) <= 1, str(cite)
    cite = int(cite[0].string.split()[-1]) if len(cite) > 0 else -1
    return title, cite

def _parse_html(html):
    """Parse one (locally saved) Google Scholar result page."""
    with open(html, "r", encoding="utf-8") as srcfile:
        content = srcfile.read()
    soup = BeautifulSoup(content, features="html.parser")
    articles = []
    for article in soup.find_all("div", "gs_ri"):
        title, cite = _parse_article(article)
        if title is not None:
            articles.append((title.lower(), cite))
    return articles

def get_all_articles(html_dir):
    """Parse a directory of Google Scholar result pages saved to disk by hand."""
    articles = []
    for name in os.listdir(html_dir):
        path = os.path.join(html_dir, name)
        articles.extend(_parse_html(path))
    return articles

######################## Parse openaccess page ################################

def get_all_titles(openaccess_file):
    """Parse the openaccess page, which is also saved to disk by hand."""
    with open(openaccess_file, "r", encoding="utf-8") as srcfile:
        content = srcfile.read()
    soup = BeautifulSoup(content, features="html.parser")
    all_titles = []
    pattern = re.compile("^title = {(.+)},$")
    for article in soup.find_all("div", "bibref"):
        title_line = article.get_text().split("\n")[3]
        match = pattern.match(title_line)
        assert match is not None
        all_titles.append(match[1].lower())
    return all_titles

############################## main part ######################################

def main(html_dir, openaccess_file):
    articles = get_all_articles(html_dir)
    titles = get_all_titles(openaccess_file)
    ### Normalize article titles: keep letters only, so that punctuation and
    ### spacing differences do not break the match.
    regex = re.compile('[^a-zA-Z]')
    normed_cite_map = {}
    for title, cite in articles:
        normed = regex.sub("", title)
        normed_cite_map[normed] = cite
    normed_titles = [regex.sub("", title) for title in titles]
    normed_title_map = dict(zip(normed_titles, titles))
    ### Count how many papers were not found on Google Scholar.
    outside = set(normed_titles) - set(normed_cite_map.keys())
    found = len(normed_titles) - len(outside)
    print("titles: {}, found: {}, outside: {}".format(
        len(titles), found, len(outside)))
    ### Print the papers sorted by citation count (top 10%).
    merged_map = {normed: -100 for normed in normed_titles}
    for normed, cite in normed_cite_map.items():
        if normed in merged_map:
            merged_map[normed] = max(merged_map[normed], cite)
    final_results = sorted(merged_map.items(), key=lambda x: -x[1])
    top10p = len(final_results) // 10
    for idx, (normed, cite) in enumerate(final_results[:top10p]):
        print("{:<140} {}".format(normed_title_map[normed], cite))
    print("Done!")


if __name__ == "__main__":
    main("./cvpr2017", "./cvpr2017.html")