Created
November 21, 2013 07:52
-
-
Save ofan/7577535 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import logging | |
import os | |
import errno | |
import shutil | |
from urlparse import urlparse, urlunparse | |
from bs4 import BeautifulSoup | |
from . import types | |
from .base import _BaseParser | |
import re | |
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Module captions already handled by _fix_links; a caption is added the
# first time its page is patched so the same module is not processed twice.
FileCache = {}
def _remove_anchor(url): | |
res = '' | |
try: | |
res = url[:url.index('#')] | |
except: | |
res = url | |
return res.strip() | |
def _guess_type(url):
    """Classify a link by the marker found in its URL fragment.

    A fragment containing ``t:`` maps to ``types.CLASS``, one containing
    ``v:`` maps to ``types.FUNCTION`` (``t:`` is checked first).  ``None``
    is returned when neither marker is present.
    """
    fragment = urlparse(url).fragment
    if "t:" in fragment:
        return types.CLASS
    if "v:" in fragment:
        return types.FUNCTION
    return None
def _ignore_files(f): | |
return (".html" not in f) and ("/" not in f) | |
def _link2dest(path, docpath, copy=False):
    """Make the page behind an absolute ``file:///`` link available in *docpath*.

    Links that do not start with ``file:///`` are returned untouched.  For
    the others the target page is symlinked (or copied when *copy* is true)
    to ``<docpath>/<module dir>/<file name>``.  When the module directory
    has to be created, every non-directory entry of *docpath* accepted by
    ``_ignore_files`` is symlinked into it, and so is the ``src`` directory
    next to the source page if it exists.

    :param path: link target as found in the index, possibly carrying a
        ``#anchor`` suffix.
    :param docpath: directory that receives the links or copies.
    :param copy: copy the page instead of symlinking it.
    :returns: the link relative to *docpath* (anchor kept), or *path*
        itself when it is not a ``file:///`` link.
    :raises OSError: when creating a directory or a symlink fails.
    """
    scheme = 'file://'
    module_match = r'.*/((\w|\.|-|\d)+\d)/.*'
    log.debug("docpath: %s" % docpath)
    log.debug("path: %s" % path)

    if not path.startswith('file:///'):
        return path

    # Strip only the leading scheme; the remainder keeps its leading "/".
    path = path[len(scheme):]
    # Filesystem location of the page: the anchor is not part of the name.
    source = _remove_anchor(path)

    if r"doc/html" in path:
        # The page's parent directory is not the module directory here, so
        # take the last path component that ends in a digit instead.
        match = re.match(module_match, source)
        if match is not None:
            module_name = match.group(1)
            log.debug("moduleName: " + module_name)
        else:
            # No such component: fall back to a flat layout.  Binding the
            # name here keeps the debug output below from raising NameError.
            module_name = ''
    else:
        # The page lives directly inside its module directory.
        module_name = os.path.basename(os.path.dirname(path))
    rel = os.path.join(module_name, os.path.basename(path))

    dest = os.path.join(docpath, _remove_anchor(rel))
    dest_dir = os.path.dirname(dest)
    log.debug("p: " + dest)

    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)
        aux_files = [name for name in os.listdir(docpath)
                     if _ignore_files(name)]
        log.debug("auxFiles: " + str(aux_files) + "\nmodule: " + module_name)
        # link essential haddock template files(e.g. css,js etc)
        for name in aux_files:
            aux_src = os.path.abspath(os.path.join(docpath, name))
            if os.path.isdir(aux_src):
                continue
            aux_dst = os.path.join(dest_dir, name)
            log.debug("ln " + aux_src + "\nto " + aux_dst)
            os.symlink(aux_src, aux_dst)
        # Link src dir if exists
        src_dir = os.path.join(os.path.dirname(path), 'src')
        if os.path.exists(src_dir):
            os.symlink(src_dir, os.path.join(dest_dir, 'src'))

    if not os.path.exists(dest):
        if copy:
            try:
                # Copy from the anchor-free path; a name that still carries
                # "#anchor" does not exist on disk.
                shutil.copyfile(source, dest)
            except (IOError, OSError, shutil.Error):
                log.debug("Cannot copy file %s to %s" % (source, dest))
        else:
            if os.path.lexists(dest):
                # exists() is False but lexists() is True: dangling symlink.
                log.debug("Broken symlink '%s'" % dest)
                log.debug("Remove it")
                try:
                    os.remove(dest)
                except OSError:
                    log.debug("Failed to remove.")
            os.symlink(source, dest)
    return rel
def _fix_infix(n, t): | |
init = n[0] | |
if (t == types.FUNCTION and | |
not init.isalpha() and | |
init not in '_('): | |
n = u'(%s)' % n | |
return n | |
def _fix_links(soup):
    """Reduce the hrefs inside a module page to bare file names.

    The module caption is read from the ``module-header`` div.  The first
    time a caption is seen it is recorded in ``FileCache`` and every
    ``<a href>`` inside the ``content`` div, except those whose text is
    exactly ``Source``, gets its href replaced by its base name.  Calls for
    a caption that is already recorded leave *soup* untouched.
    """
    header = soup.find('div', id='module-header')
    caption = header.find('p', attrs={'class': 'caption'})
    module = caption.string.strip()
    if module in FileCache:
        return
    FileCache[module] = True

    content = soup.find('div', id='content')
    for link in content.find_all('a', href=True):
        # Source links are left as they are.
        if link.string != 'Source':
            link['href'] = os.path.basename(link['href'])
class HaddockParser(_BaseParser):
    """ Parser for Haskell haddock documentation. """

    name = "haddock"

    DETECT_FILE = "haddock-util.js"
    DETECT_PATTERN = '''Haddock'''
    # Index pages tried in order; the first one that can be opened is parsed.
    INDEX_FILES = ['doc-index-All.html', 'doc-index.html']

    def parse(self):
        """Parse haddock docs at *docpath*.

        yields tuples of symbol name, type and path

        Each module name is additionally yielded once, with type
        ``types.PACKAGE``, the first time it is encountered.

        :raises IOError: with ``errno.ENOENT`` when none of ``INDEX_FILES``
            can be opened.
        """
        # Cache added module names to prevent duplicates
        modCache = {}

        for indexFile in HaddockParser.INDEX_FILES:
            try:
                # The context manager closes the index file once it has
                # been parsed instead of leaking the handle.
                with open(os.path.join(self.docpath, indexFile)) as fh:
                    soup = BeautifulSoup(fh, 'xml')
                log.debug("Detected " + indexFile)
                break
            except IOError:
                pass
        else:
            # Loop finished without ``break``: no index file was readable.
            raise IOError(errno.ENOENT, "Essential index file not found.")

        log.info('Creating database...')
        # Name from the most recent "src" cell; it stays in effect for the
        # "module" cells that follow, including those in later rows.
        symName = ''
        trs = soup.body.find_all('tr')
        log.debug("Found %d <tr>" % len(trs))
        for tr in trs:
            for td in tr.find_all('td'):
                if 'class' not in td.attrs:
                    # Empty td entry, omit it
                    continue
                cl = td['class']
                if 'src' in cl:
                    # Reached a new symbol name
                    symName = td.string.strip()
                elif 'module' in cl:
                    for m in tr.find_all('a'):
                        log.debug('----------')
                        mName = m.string.strip()
                        symPath = m['href'].strip()
                        log.debug('symPath: %s' % symPath)
                        if mName not in modCache:
                            modCache[mName] = True
                            mPath = _link2dest(_remove_anchor(symPath),
                                               self.docpath, copy=False)
                            log.debug("Adding module: %s (%s) in '%s'"
                                      % (mName, types.PACKAGE, mPath))
                            yield mName, types.PACKAGE, mPath
                        if symName:
                            symType = _guess_type(symPath)
                            # Enclose infix operators by parentheses
                            symName = _fix_infix(symName, symType)
                            # Copy or link target file to Documents/
                            symPath = _link2dest(symPath, self.docpath,
                                                 copy=False)
                            log.debug("Adding symbol: %s (%s) in '%s'"
                                      % (symName, symType, symPath))
                            yield symName, symType, symPath

    def find_and_patch_entry(self, soup, entry):
        """ Verify whether the anchor is actually in the target file.

        The links in *soup* are first rewritten by ``_fix_links``.

        :returns: whatever ``soup.find`` yields for an ``<a>`` tag whose
            ``name`` attribute equals ``entry.anchor``.
        """
        _fix_links(soup)
        return soup.find('a', attrs={'name': entry.anchor})
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment