|
# generates keys to be used in dictionary - tag name or object type |
|
def getElemKey(pEl):
    """Return the dictionary key that represents *pEl*.

    The key is chosen as follows:

    * ``'string'`` for any ``str`` instance (subclasses of ``str`` included);
    * the stripped ``name`` attribute when the object has one and it is a
      string (presumably the tag name of a parsed HTML element);
    * otherwise the bare class name of the object, e.g. ``'int'``.
    """
    if isinstance(pEl, str):
        return 'string'

    pName = getattr(pEl, 'name', None)
    if isinstance(pName, str):
        return str(pName).strip()

    # Ask the type for its name directly instead of slicing it out of
    # "<class 'pkg.mod.Name'>", which breaks as soon as the repr differs.
    return type(pEl).__name__
|
|
|
# converts list of tuples to dictionary; adds _# to avoid duplicate keys |
|
def dict_enumKeys(tupList, noCount=('textContent', 'contents')):
    """Convert a list of ``(key, value)`` tuples into a (nested) dictionary.

    Duplicate keys are kept apart by appending ``_<n>`` (1-based occurrence
    count) to every key.  Keys listed in *noCount* keep their bare name the
    first time they occur; later occurrences are still suffixed.

    Args:
        tupList: Ideally a list whose items are all 2-tuples.  Anything else
            (non-list, or a list containing an item that is not a 2-tuple)
            is returned unchanged.
        noCount: Keys that are not numbered on first occurrence.  When
            *tupList* holds exactly one pair and its key is in *noCount*,
            the bare value is returned instead of a dictionary.

    Returns:
        A dictionary built from the pairs (values converted recursively),
        the single bare value described above, or *tupList* itself when it
        is not a list of 2-tuples.
    """
    # Only a list made up solely of 2-tuples can be converted.
    if not isinstance(tupList, list):
        return tupList
    if not all(isinstance(t, tuple) and len(t) == 2 for t in tupList):
        return tupList

    if len(tupList) == 1 and tupList[0][0] in noCount:
        return tupList[0][1]

    keyCt, toRet = {}, {}
    for k, v in tupList:
        # Skip 'string' entries whose only content is a blank string.
        if k == 'string' and isinstance(v, list) and len(v) == 1:
            only = v[0]
            if isinstance(only, tuple) and len(only) == 2:
                if isinstance(only[1], str) and not only[1].strip():
                    continue

        kCt = keyCt[str(k)] = keyCt.get(str(k), 0) + 1
        if not (k in noCount and kCt == 1):
            k = f'{k}_{kCt}'

        try:
            # Forward noCount so nested levels follow the caller's setting.
            toRet[k] = dict_enumKeys(v, noCount)
        except RecursionError:
            # Too deep to convert: keep the raw value at this level.
            toRet[k] = v
    return toRet
|
|
|
|
|
|
|
def nestHtmlChildren(pTag, chRef={}, asDict='no_str'):
    """Collect the configured descendants of *pTag*, or fall back to its text.

    Args:
        pTag: Element to inspect.  It must offer ``select`` when *chRef* has
            an entry for its key (presumably a BeautifulSoup tag - confirm
            against callers).
        chRef: Maps an element key (see ``getElemKey``) to one selector or a
            list of selectors.  Each selector ``s`` is queried as
            ``s:not(s s)`` and the parts are combined inside ``:where(...)``.
        asDict: Controls the return shape.  ``'no_str'`` (default) and other
            truthy values give ``dict_enumKeys`` output when children were
            found; a falsy value gives the raw list of ``(key, value)``
            tuples.

    Returns:
        When children were found: the converted dictionary or the tuple list.
        Otherwise the text of *pTag* as ``{'textContent': text}`` for
        ``'no_str'``, the bare text for other truthy values, or
        ``[('textContent', text)]`` for falsy values.
    """
    selectors = chRef.get(getElemKey(pTag), None)
    children = []
    if selectors is not None:
        if not isinstance(selectors, list):
            selectors = [selectors]
        parts = []
        for one in selectors:
            parts.append(f'{one}:not({one} {one})')
        combined = ', '.join(parts)
        for found in pTag.select(f':where({combined})'):
            # Drop matches that are empty strings.
            if isinstance(found, str) and not str(found):
                continue
            children.append(found)

    if children:
        try:
            pairs = []
            for childEl in children:
                childKey = getElemKey(childEl)
                nested = nestHtmlChildren(childEl, chRef=chRef, asDict=True)
                pairs.append((childKey, nested))
            if asDict:
                return dict_enumKeys(pairs)
            return pairs
        except RecursionError:
            # Nesting too deep: fall through to the plain-text result.
            pass

    if callable(getattr(pTag, 'get_text', None)):
        text = pTag.get_text(' ').strip()
    else:
        text = str(pTag)

    if asDict == 'no_str':
        return {'textContent': text}
    if asDict:
        return text
    return [('textContent', text)]
|
|
|
|
|
def nestHtmlSiblings(hSoup, levelsRef, cNestRef={}, recursive=False):
    """Group a flat run of siblings into nested sections by level.

    Args:
        hSoup: An element exposing a ``contents`` list, or a list of
            siblings.  List items may be elements/strings or ``(key, value)``
            tuples (longer tuples are cut to their first two items).
        levelsRef: Maps element keys (see ``getElemKey``) to a comparable
            level.  A sibling whose key has a level less than or equal to the
            level of the currently open section starts a new section; all
            other siblings are nested inside the open one.  Must not be
            empty for non-empty input (``max`` is taken over its values).
        cNestRef: Passed through to ``nestHtmlChildren`` as its ``chRef``.
        recursive: When true, a section-opening element that has a callable
            ``find`` and for which ``find(<active keys>)`` is truthy is
            expanded through its own ``contents`` instead of being handed to
            ``nestHtmlChildren``.

    Returns:
        * *hSoup* itself when it is falsy;
        * ``{key: nestHtmlChildren(...)}`` when *hSoup* is not a list and has
          no non-empty ``contents`` list;
        * ``{key: dict}`` when *hSoup* is an element with a truthy key;
        * the raw list of ``(key, [pairs])`` tuples when *hSoup* was a list
          consisting only of tuples;
        * otherwise the ``dict_enumKeys`` conversion of that list.
    """
    if not hSoup:
        return hSoup

    if isinstance(hSoup, list):
        sibList, isRoot = hSoup, False
    else:
        sibList, isRoot = getattr(hSoup, 'contents', None), getElemKey(hSoup)
        if not (sibList and isinstance(sibList, list)):
            hDict = nestHtmlChildren(hSoup, cNestRef, 'no_str')
            return {getElemKey(hSoup): hDict}

    # A list made up solely of tuples means "already keyed": give back a list.
    retList = all(isinstance(s, tuple) and len(s) > 1 for s in sibList)
    # Normalise every entry to exactly (key, element) so unpacking is safe.
    sibList = [
        s[:2] if isinstance(s, tuple) and len(s) > 1 else (getElemKey(s), s)
        for s in sibList
    ]

    nestSibs, curContainer, sibContainer, curKey = [], [], [], None
    pKeys, maxLevel = list(levelsRef.keys()), max(levelsRef.values())
    # The trailing (None, None) sentinel flushes the last open section.
    for k, el in sibList + [(None, None)]:
        isLast = k is None and el is None
        invCur = curKey is None or curKey not in pKeys
        if not (k in pKeys or isLast or invCur):
            # Belongs inside the currently open section.
            sibContainer.append((k, el))
            continue

        if curKey is not None:
            # Close the open section: nest what was collected for it.
            try:
                sibContainer = [
                    s for s in sibContainer
                    if not (s[0] == 'string' and not str(s[1]).strip())
                ]
                for_cc = nestHtmlSiblings(
                    sibContainer, levelsRef, cNestRef, recursive)
            except RecursionError as err:
                # Bind the exception so the message can actually be built.
                for_cc = [('error', f'{type(err)} {err}'),
                          ('curEl', str(el))]
            nestSibs += [(curKey, curContainer + (for_cc if for_cc else []))]

        curKey, curContainer, sibContainer = k, [], []
        # Keys able to close the new section: level <= the section's level
        # (keys missing from levelsRef are treated as the maximum level).
        curLevel = levelsRef.get(k, maxLevel)
        pKeys = [lk for lk, lvl in levelsRef.items() if curLevel >= lvl]
        if isLast:
            continue

        try:
            if recursive and callable(getattr(el, 'find', None)):
                if not isinstance(el, str) and el.find(pKeys):
                    curContainer.append(('contents', nestHtmlSiblings(
                        el.contents, levelsRef, cNestRef, recursive)))
                    continue
            curContainer += nestHtmlChildren(el, cNestRef, asDict=False)
        except RecursionError as err:
            curContainer += [('error', f'{type(err)} {err}'),
                             ('curEl', str(el))]

    if isRoot:
        return {isRoot: dict_enumKeys(nestSibs)}
    if retList:
        return nestSibs
    return dict_enumKeys(nestSibs)