lrnselfreliance · November 15, 2022 17:26
diff --git a/htmlfind.py b/htmlfind.py
 #!/usr/bin/env python3
 """
 Searches through stdin/file for HTML elements matching the class/attribute filters.

 # echo "<html>
 <body>
 <p>Some links</p>
 <a href="https://example.com/1" class="video">Click me</a>
 <a href="https://example.com/2" class="video">Click me</a>
 <a href="https://example.com/3" class="video">Click me</a>
 </body>
 </html>" | htmlfind.py --element a --clss video --attr href
 https://example.com/1
 https://example.com/2
 https://example.com/3

 # htmlfind.py --element a --clss video --attr href some-file.html
 """
 import argparse
 import pathlib
 import sys

 try:
    from BeautifulSoup import BeautifulSoup
 except ImportError:
    from bs4 import BeautifulSoup


 def find_elements(content: str, element: str, clss: str, attribute: str, pretty: bool = False):
    """Yield a string for every element in `content` matching the filters.

    :param content: HTML text, parsed with the "html.parser" backend.
    :param element: Tag name to search for; passed straight to the search call.
    :param clss: When truthy, only elements carrying this class are matched.
    :param attribute: When truthy, yield only this attribute's value for each
        match. Matches that do not have the attribute are skipped.
    :param pretty: When True (and `attribute` is falsy), yield the prettified
        markup of each match instead of its plain string form.
    :return: Generator of whitespace-stripped strings, one per yielded match.
    """
    # An empty dict is the search call's own default for "no attribute filter".
    attrs = {'class': clss} if clss else {}

    soup = BeautifulSoup(content, features="html.parser")
    for result in soup.findAll(element, attrs=attrs):
        # result: bs4.Tag
        if attribute:
            try:
                value = result[attribute]
            except KeyError:
                # Found a matching element, but it does not have the attribute.
                continue
            # Multi-valued attributes (class, rel, ...) come back as a list of
            # tokens; join them so the caller sees the attribute text rather
            # than a Python list repr such as "['video']".
            if isinstance(value, (list, tuple)):
                value = ' '.join(str(token) for token in value)
            yield str(value).strip()
        elif pretty:
            yield str(result.prettify()).strip()
        else:
            yield str(result).strip()


 def main():
    """Parse the command line, load the HTML and print every match.

    The HTML comes from STDIN when the positional `path` is '-' (its default),
    otherwise from the named file.

    :return: 1 when the file is missing or the input is empty (a message is
        written to stderr), otherwise 0 after printing one result per line.
    """
    cli = argparse.ArgumentParser()
    # Registered in this exact order so the generated --help text is unchanged.
    option_specs = (
        (('-c', '--clss'),
         {'type': str, 'help': 'Search for any element with a class of this value'}),
        (('-e', '--element'),
         {'type': str, 'help': 'Search for all of these elements (div/a)'}),
        (('-a', '--attr'),
         {'type': str, 'help': 'Return only this attribute of the element (href)'}),
        (('-p', '--pretty'),
         {'default': False, 'action': 'store_true', 'help': 'Prettify output'}),
        (('path',),
         {'default': '-', 'nargs': '?', 'help': 'The file to search, default is STDIN.'}),
    )
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)
    options = cli.parse_args()

    if options.path != '-':
        source = pathlib.Path(options.path)
        if not source.is_file():
            print(f'File does not exist {source}', file=sys.stderr)
            return 1
        html = source.read_text()
        empty_message = f'No content in {options.path}'
    else:
        html = sys.stdin.read()
        empty_message = 'No content in STDIN!'

    # Both input sources treat empty content as an error.
    if not html:
        print(empty_message, file=sys.stderr)
        return 1

    matches = find_elements(
        content=html,
        element=options.element,
        clss=options.clss,
        attribute=options.attr,
        pretty=options.pretty,
    )
    for match in matches:
        print(match)

    return 0


 if __name__ == '__main__':
    # Run the CLI and hand its return value to the shell as the exit status.
    sys.exit(main())
	#!/usr/bin/env python3
	"""
	Searches through stdin/file for HTML elements matching the class/attribute filters.

	# echo "<html>
	<body>
	<p>Some links</p>
	<a href="https://example.com/1" class="video">Click me</a>
	<a href="https://example.com/2" class="video">Click me</a>
	<a href="https://example.com/3" class="video">Click me</a>
	</body>
	</html>" | htmlfind.py --element a --clss video --attr href
	https://example.com/1
	https://example.com/2
	https://example.com/3

	# htmlfind.py --element a --clss video --attr href some-file.html
	"""
	import argparse
	import pathlib
	import sys

	try:
	from BeautifulSoup import BeautifulSoup
	except ImportError:
	from bs4 import BeautifulSoup


	def find_elements(content: str, element: str, clss: str, attribute: str, pretty: bool = False):
	    """Yield a string for every element in `content` matching the filters.

	    :param content: HTML text, parsed with the "html.parser" backend.
	    :param element: Tag name to search for; passed straight to the search call.
	    :param clss: When truthy, only elements carrying this class are matched.
	    :param attribute: When truthy, yield only this attribute's value for each
	        match. Matches that do not have the attribute are skipped.
	    :param pretty: When True (and `attribute` is falsy), yield the prettified
	        markup of each match instead of its plain string form.
	    :return: Generator of whitespace-stripped strings, one per yielded match.
	    """
	    # An empty dict is the search call's own default for "no attribute filter".
	    attrs = {'class': clss} if clss else {}

	    soup = BeautifulSoup(content, features="html.parser")
	    results = soup.findAll(element, attrs=attrs)
	    for result in results:
	        # result: bs4.Tag
	        if attribute:
	            try:
	                value = result[attribute]
	            except KeyError:
	                # Found a matching element, but it does not have the attribute.
	                continue
	            # Multi-valued attributes (class, rel, ...) come back as a list
	            # of tokens; join them so the output is the attribute text, not
	            # a Python list repr such as "['video']".
	            if isinstance(value, (list, tuple)):
	                value = ' '.join(str(token) for token in value)
	            yield str(value).strip()
	        elif pretty:
	            yield str(result.prettify()).strip()
	        else:
	            yield str(result).strip()


	def main():
	    """Command-line entry point: read HTML, search it, print the matches.

	    The HTML is read from STDIN when the positional `path` argument is '-'
	    (its default), otherwise from the named file.

	    :return: 1 if the file does not exist or the input is empty (a message
	        is written to stderr), otherwise 0 after printing each result.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument('-c', '--clss', type=str,
	                        help='Search for any element with a class of this value')
	    parser.add_argument('-e', '--element', type=str,
	                        help='Search for all of these elements (div/a)')
	    parser.add_argument('-a', '--attr', type=str,
	                        help='Return only this attribute of the element (href)')
	    parser.add_argument('-p', '--pretty', default=False, action='store_true',
	                        help='Prettify output')
	    parser.add_argument('path', default='-', nargs='?',
	                        help='The file to search, default is STDIN.')
	    args = parser.parse_args()

	    if args.path == '-':
	        content = sys.stdin.read()
	        if not content:
	            print('No content in STDIN!', file=sys.stderr)
	            return 1
	    else:
	        path = pathlib.Path(args.path)
	        if not path.is_file():
	            print(f'File does not exist {path}', file=sys.stderr)
	            return 1
	        content = path.read_text()
	        if not content:
	            print(f'No content in {args.path}', file=sys.stderr)
	            return 1

	    results = find_elements(
	        content=content,
	        element=args.element,
	        clss=args.clss,
	        attribute=args.attr,
	        pretty=args.pretty,
	    )
	    for result in results:
	        print(result)

	    return 0


	if __name__ == '__main__':
	    # Run the CLI and use its return value as the process exit status.
	    status_code = main()
	    sys.exit(status_code)
No results found