Skip to content

Instantly share code, notes, and snippets.

@manuzhang
Last active March 1, 2021 18:20
Show Gist options
  • Save manuzhang/c1b4ccd4605aeded24c246fe2a958c99 to your computer and use it in GitHub Desktop.
Save manuzhang/c1b4ccd4605aeded24c246fe2a958c99 to your computer and use it in GitHub Desktop.
import base64
from bs4 import BeautifulSoup
import json
import mistune
import re
def load_results(fileName):
    """Load and return the JSON document stored in the file ``fileName``.

    Args:
        fileName: Path of the JSON file to read.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces for the
        file's contents).

    Raises:
        OSError: If the file cannot be opened for reading.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Read-only mode is sufficient (the file is never written), and the
    # context manager guarantees the handle is closed even if parsing fails.
    # JSON text is UTF-8 by specification, so do not rely on the platform
    # default encoding.
    with open(fileName, "r", encoding="utf-8") as f:
        return json.load(f)
# Everything that is not treated as abstract text: runs of non-word
# characters, characters in the range U+4E00-U+9FA5 (CJK ideographs),
# digits and underscores.
_NON_TEXT_RE = re.compile(r'\W+|[\u4E00-\u9FA5]|[0-9_]')


def _collect_text(paragraphs, max_paragraphs):
    """Join the cleaned text of the first ``max_paragraphs`` non-blank paragraphs."""
    kept = []
    for p in paragraphs:
        if len(kept) >= max_paragraphs:
            break
        # Every non-text span becomes a single space, so what remains is
        # only letters separated by spaces.
        text = _NON_TEXT_RE.sub(' ', p.get_text())
        # Skip paragraphs with no letters left (blank or empty) so they do
        # not use up the paragraph budget.
        if text.strip():
            kept.append(text)
    # Each kept paragraph is followed by one space, including the last one.
    return ''.join(text + ' ' for text in kept)


def get_abstracts(ps, max_paragraphs=5):
    """Extract a short plain-text abstract from a parsed HTML document.

    Args:
        ps: The parsed document. It must be callable with a tag name or a
            list of tag names and return the matching elements; each element
            must provide ``extract()`` and ``get_text()``. In this file it is
            called with a ``BeautifulSoup`` object.
        max_paragraphs: Heuristic upper bound on how many non-blank ``<p>``
            elements contribute to the abstract.

    Returns:
        The cleaned text of the selected paragraphs, each followed by a
        single space; an empty string if no paragraph contains text.

    Note:
        ``ps`` is modified in place: every ``img``, ``blockquote``, ``pre``,
        ``a`` and ``table`` element is extracted from it.
    """
    # Get rid of non-text contents before reading the paragraphs.
    for non_text in ps(['img', 'blockquote', 'pre', 'a', 'table']):
        non_text.extract()
    return _collect_text(ps('p'), max_paragraphs)
def parseMarkdown(md):
    """Render the Markdown source ``md`` with mistune and return the HTML.

    Rendering is done with ``escape=False``.
    """
    return mistune.markdown(md, escape=False)
# Results of the run, both keyed by repository name:
#   abstracts - the text abstract extracted from the repository's document
#   parsers   - the parsed HTML document the abstract was extracted from
abstracts = {}
parsers = {}
languages = ["Scala", "Java", "Python", "Javascript", "Go", "C++", "HTML", "shell", "Jupyter Notebook", "C"]
blacklist = []  # repository names to skip
for lang in languages:
    # One result file per language, e.g. "result-Scala.json".
    repos = load_results("result-" + lang + ".json")
    for repo in repos:
        name = repo['name']
        if name in blacklist:
            continue
        # 'content' is base64-encoded UTF-8 Markdown; decode it, render it to
        # HTML and parse the HTML.
        s = base64.b64decode(repo['content']).decode('utf-8')
        # Name the parser explicitly so results do not depend on which
        # optional parser libraries happen to be installed.
        parser = BeautifulSoup(parseMarkdown(s), "html.parser")
        parsers[name] = parser
        abstracts[name] = get_abstracts(parser)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment