Skip to content

Instantly share code, notes, and snippets.

@lrnselfreliance
Created November 15, 2022 17:26
Show Gist options
  • Select an option

  • Save lrnselfreliance/fbe944833670ef2c057fbe3f84ce55a7 to your computer and use it in GitHub Desktop.

Select an option

Save lrnselfreliance/fbe944833670ef2c057fbe3f84ce55a7 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""
Searches through stdin/file for HTML elements matching the class/attribute filters.
# echo "<html>
<body>
<p>Some links</p>
<a href="https://example.com/1" class="video">Click me</a>
<a href="https://example.com/2" class="video">Click me</a>
<a href="https://example.com/3" class="video">Click me</a>
</body>
</html>" | htmlfind.py --element a --clss video --attr href
https://example.com/1
https://example.com/2
https://example.com/3
# htmlfind.py --element a --clss video --attr href some-file.html
"""
import argparse
import pathlib
import sys
try:
from BeautifulSoup import BeautifulSoup
except ImportError:
from bs4 import BeautifulSoup
def find_elements(content: str, element: str, clss: str, attribute: str, pretty: bool = False):
    """Yield every element in `content` that matches the tag name and class filters.

    Args:
        content: The HTML markup to search.
        element: Tag name to search for (e.g. ``a`` or ``div``).  ``None`` leaves
            the tag name unrestricted.
        clss: Only match elements carrying this class.  A falsy value disables
            the class filter.
        attribute: When given, yield only the value of this attribute for each
            match; matching elements that lack the attribute are skipped.
        pretty: When true (and no `attribute` is requested), yield the
            prettified markup of each match instead of its compact form.

    Yields:
        One whitespace-stripped string per match: the attribute value, the
        prettified markup, or the element's markup, in that order of precedence.
    """
    # An empty dict is find_all's own default and means "no attribute filter".
    attrs = {'class': clss} if clss else {}
    soup = BeautifulSoup(content, features="html.parser")
    for result in soup.findAll(element, attrs=attrs):
        # result: bs4.Tag
        if attribute:
            try:
                value = result[attribute]
            except KeyError:
                # Found a matching element, but it does not have the attribute.
                continue
            # Multi-valued attributes (class, rel, ...) are parsed into a list;
            # join them back so the output is the attribute text rather than
            # a Python list repr such as "['video', 'big']".
            if isinstance(value, (list, tuple)):
                value = ' '.join(value)
            yield str(value).strip()
        elif pretty:
            yield str(result.prettify()).strip()
        else:
            yield str(result).strip()
def main(argv=None):
    """Run the command-line interface and return a process exit status.

    Parses the options, reads the markup from the file named on the command
    line (or from STDIN when the path is ``-`` or omitted), and prints each
    result produced by ``find_elements`` on its own line.

    Args:
        argv: Optional list of argument strings to parse.  ``None`` (the
            default) makes argparse read ``sys.argv[1:]``, which is what the
            script entry point relies on.

    Returns:
        0 when the search ran; 1 when the input file is missing, cannot be
        read, or the input is empty (an error is printed to STDERR).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--clss', type=str,
                        help='Search for any element with a class of this value')
    parser.add_argument('-e', '--element', type=str,
                        help='Search for all of these elements (div/a)')
    parser.add_argument('-a', '--attr', type=str,
                        help='Return only this attribute of the element (href)')
    parser.add_argument('-p', '--pretty', default=False, action='store_true',
                        help='Prettify output')
    parser.add_argument('path', default='-', nargs='?',
                        help='The file to search, default is STDIN.')
    args = parser.parse_args(argv)

    if args.path == '-':
        content = sys.stdin.read()
        empty_message = 'No content in STDIN!'
    else:
        path = pathlib.Path(args.path)
        if not path.is_file():
            print(f'File does not exist {path}', file=sys.stderr)
            return 1
        try:
            content = path.read_text()
        except (OSError, UnicodeDecodeError) as error:
            # Report unreadable/undecodable files like the other input errors
            # instead of letting a traceback escape.
            print(f'Could not read {path}: {error}', file=sys.stderr)
            return 1
        empty_message = f'No content in {args.path}'

    if not content:
        print(empty_message, file=sys.stderr)
        return 1

    results = find_elements(
        content=content,
        element=args.element,
        clss=args.clss,
        attribute=args.attr,
        pretty=args.pretty,
    )
    for result in results:
        print(result)
    return 0
if __name__ == '__main__':
    # Use main()'s return value as the process exit status.
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment