Created
February 7, 2019 19:39
-
-
Save drrobotnik/2fb14929428ba30deff79f30cba14fe7 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import Counter | |
import argparse | |
import fnmatch | |
import os | |
import sys | |
from bs4 import BeautifulSoup | |
# Module-level state shared by the rest of the script:
#   matches  - paths of every *.html file found under --path
#   elements - tag name -> list of parsed nodes, filled in by walker()
matches = []
elements = {}

parser = argparse.ArgumentParser(description='scrape files and parse doms')
# --path is mandatory: os.walk() cannot walk None, so a missing value used to
# surface as a TypeError traceback instead of a usage message.
parser.add_argument('--path', required=True, help='path of files')
parser.add_argument('--element', help='dom element type')
args = parser.parse_args()

# Recursively gather every HTML file below the requested directory.
for root, _dirnames, filenames in os.walk(args.path):
    matches.extend(
        os.path.join(root, filename)
        for filename in fnmatch.filter(filenames, '*.html')
    )
def walker(soup, store=None):
    """Recursively collect every named node below *soup*, grouped by tag name.

    Args:
        soup: A node exposing ``name`` and ``children`` (the script passes a
            parsed BeautifulSoup document). A node whose ``name`` is ``None``
            is not descended into.
        store: Optional dict mapping tag name -> list of nodes to fill.
            Defaults to the module-level ``elements`` dict.

    Returns:
        None. Matching nodes are appended to *store* in traversal order.
    """
    if store is None:
        store = elements
    if soup.name is None:
        return
    for child in soup.children:
        # Children without a name carry no tag to group under; skip them.
        if child.name is None:
            continue
        # Always group by tag name. The previous no-filter branch called
        # .append() on the dict itself, which raised AttributeError.
        store.setdefault(str(child.name), []).append(child)
        walker(child, store)
# Parse every matched HTML file and let walker() record its elements.
for html_path in matches:
    # The context manager closes each handle; previously they were leaked.
    with open(html_path) as handle:
        soup = BeautifulSoup(handle, 'lxml')
    walker(soup)

if args.element:
    # A tag that never occurred yields an empty report, not a KeyError.
    elementsByType = elements.get(args.element, [])
else:
    # No filter requested: count every collected element, whatever its tag.
    elementsByType = [node for group in elements.values() for node in group]

# Report each distinct element with its occurrence count, most frequent first.
mostCommon = Counter(elementsByType).most_common()
for commonElement in mostCommon:
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print("==============================")
    print("Element occurs: %s  times" % commonElement[1])
    print("==============================")
    print(commonElement[0])
    print("==============================")
Sign up for free to join this conversation on GitHub.
Already have an account?
Sign in to comment
# Work inside a directory named after today's date (YYYYMMDD). The date is
# captured once so that mkdir and cd always agree, even across midnight.
today=$(date +%Y%m%d)
mkdir "$today" && cd "$today"
# Run the scraper over the spidered HTML files, reporting on <div> elements.
python soup.py --path="../../spider/20180709/domain.to.parse.com/" --element="div"