drrobotnik · February 7, 2019 19:39 · drrobotnik · Feb 7, 2019
diff --git a/soup.py b/soup.py
 from collections import Counter
 import argparse
 import fnmatch
 import os
 import sys
 from bs4 import BeautifulSoup

 # Paths of every ``*.html`` file discovered below ``--path``.
 matches = []
 # Filled in by walker() while the parsed documents are traversed.
 elements = {}

 parser = argparse.ArgumentParser(description='scrape files and parse doms')

 # Without a path os.walk() would be handed None and fail with a bare
 # TypeError; let argparse report the missing option as a usage error instead.
 parser.add_argument('--path', required=True, help='path of files')
 parser.add_argument('--element', help='dom element type')

 args = parser.parse_args()

 # Walk the whole tree under --path and remember every HTML file in it.
 for root, _dirnames, filenames in os.walk(args.path):
    matches.extend(
        os.path.join(root, filename)
        for filename in fnmatch.filter(filenames, '*.html')
    )

 def walker(soup):
    """Record every tag nested below ``soup`` in the module-level ``elements``.

    Children whose ``name`` is ``None`` are skipped; every other child is
    recorded and then walked recursively.

    What is stored depends on the ``--element`` option (``args.element``):

    * given: ``elements[tag_name]`` is the list of tags carrying that name;
    * omitted: ``elements[tag]`` is the number of times an equal tag was
      seen.  The script later hands ``elements`` to ``Counter``, which takes
      the values of a mapping as counts, so the tally carries through as-is.

    Args:
        soup: node to descend into; nothing happens when ``soup.name`` is
            ``None``.
    """
    if soup.name is None:
        return
    for child in soup.children:
        if child.name is None:
            # Nameless children (presumably text nodes and comments) are
            # neither recorded nor descended into.
            continue
        if args.element:
            elements.setdefault(str(child.name), []).append(child)
        else:
            # ``elements`` is a dict, so the former ``elements.append(child)``
            # raised AttributeError on the first tag.  Tally the tag instead.
            elements[child] = elements.get(child, 0) + 1
        walker(child)

 # Parse every collected HTML file and feed its DOM to walker().
 for html_path in matches:
    # The handle is only needed while BeautifulSoup builds the tree; the
    # ``with`` block closes it instead of leaking one descriptor per file.
    with open(html_path) as data:
        soup = BeautifulSoup(data, 'lxml')
    walker(soup)


 elementsByType = elements

 if args.element:
    # A tag name that never occurred has no key in ``elements``; report
    # nothing for it rather than dying with a KeyError.
    elementsByType = elements.get(args.element, [])

 # Tally the selected entries, ordered from most to least frequent.
 mostCommon = Counter(elementsByType).most_common()

 # Single-argument print(...) calls produce the same output whether print is
 # a statement (Python 2) or a function (Python 3).
 for element, occurrences in mostCommon:
    print("==============================")
    print("Element occurs: %s  times" % occurrences)
    print("==============================")
    print(element)
    print("==============================")
	from collections import Counter
	import argparse
	import fnmatch
	import os
	import sys
	from bs4 import BeautifulSoup

	# Paths of every ``*.html`` file discovered below ``--path``.
	matches = []
	# Filled in by walker() while the parsed documents are traversed.
	elements = {}

	parser = argparse.ArgumentParser(description='scrape files and parse doms')

	# Without a path os.walk() would be handed None and fail with a bare
	# TypeError; let argparse report the missing option as a usage error instead.
	parser.add_argument('--path', required=True, help='path of files')
	parser.add_argument('--element', help='dom element type')

	args = parser.parse_args()

	# Walk the whole tree under --path and remember every HTML file in it.
	# (The nested loop bodies had lost their indentation; structure restored.)
	for root, _dirnames, filenames in os.walk(args.path):
	    matches.extend(
	        os.path.join(root, filename)
	        for filename in fnmatch.filter(filenames, '*.html')
	    )

	def walker(soup):
	    """Record every tag nested below ``soup`` in the module-level ``elements``.

	    Children whose ``name`` is ``None`` are skipped; every other child is
	    recorded and then walked recursively.

	    What is stored depends on the ``--element`` option (``args.element``):

	    * given: ``elements[tag_name]`` is the list of tags carrying that name;
	    * omitted: ``elements[tag]`` is the number of times an equal tag was
	      seen.  The script later hands ``elements`` to ``Counter``, which takes
	      the values of a mapping as counts, so the tally carries through as-is.

	    Args:
	        soup: node to descend into; nothing happens when ``soup.name`` is
	            ``None``.
	    """
	    if soup.name is None:
	        return
	    for child in soup.children:
	        if child.name is None:
	            # Nameless children (presumably text nodes and comments) are
	            # neither recorded nor descended into.
	            continue
	        if args.element:
	            elements.setdefault(str(child.name), []).append(child)
	        else:
	            # ``elements`` is a dict, so the former ``elements.append(child)``
	            # raised AttributeError on the first tag.  Tally the tag instead.
	            elements[child] = elements.get(child, 0) + 1
	        walker(child)

	# Parse every collected HTML file and feed its DOM to walker().
	# (The loop and branch bodies had lost their indentation; structure restored.)
	for html_path in matches:
	    # The handle is only needed while BeautifulSoup builds the tree; the
	    # ``with`` block closes it instead of leaking one descriptor per file.
	    with open(html_path) as data:
	        soup = BeautifulSoup(data, 'lxml')
	    walker(soup)


	elementsByType = elements

	if args.element:
	    # A tag name that never occurred has no key in ``elements``; report
	    # nothing for it rather than dying with a KeyError.
	    elementsByType = elements.get(args.element, [])

	# Tally the selected entries, ordered from most to least frequent.
	mostCommon = Counter(elementsByType).most_common()

	# Single-argument print(...) calls produce the same output whether print is
	# a statement (Python 2) or a function (Python 3).
	for element, occurrences in mostCommon:
	    print("==============================")
	    print("Element occurs: %s  times" % occurrences)
	    print("==============================")
	    print(element)
	    print("==============================")