Created
November 15, 2022 17:26
-
-
Save lrnselfreliance/fbe944833670ef2c057fbe3f84ce55a7 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Searches through stdin/file for HTML elements matching the class/attribute filters. | |
| # echo "<html> | |
| <body> | |
| <p>Some links</p> | |
| <a href="https://example.com/1" class="video">Click me</a> | |
| <a href="https://example.com/2" class="video">Click me</a> | |
| <a href="https://example.com/3" class="video">Click me</a> | |
| </body> | |
| </html>" | htmlfind.py --element a --clss video --attr href | |
| https://example.com/1 | |
| https://example.com/2 | |
| https://example.com/3 | |
| # htmlfind.py --element a --clss video --attr href some-file.html | |
| """ | |
| import argparse | |
| import pathlib | |
| import sys | |
| try: | |
| from BeautifulSoup import BeautifulSoup | |
| except ImportError: | |
| from bs4 import BeautifulSoup | |
def find_elements(content: str, element: str, clss: str, attribute: str, pretty: bool = False):
    """Yield every element in `content` that matches the tag name and class filters.

    This is a generator: `content` is parsed with the "html.parser" backend
    only once iteration starts.

    Args:
        content: HTML markup to search.
        element: Tag name to match (e.g. ``"a"``).  When ``None`` no tag-name
            filter is passed to ``find_all``.
        clss: CSS class the element must carry.  When falsy, no class filter
            is applied.
        attribute: When truthy, yield only this attribute's value for each
            match; matches that lack the attribute are skipped.  Takes
            precedence over `pretty`.
        pretty: When true (and `attribute` is falsy), yield the element as
            formatted by ``Tag.prettify()`` instead of its compact markup.

    Yields:
        str: The attribute value, prettified markup, or raw markup of each
        match, with surrounding whitespace stripped.
    """
    attrs = {'class': clss} if clss else {}
    soup = BeautifulSoup(content, features="html.parser")
    for tag in soup.find_all(element, attrs=attrs):
        # tag: bs4.Tag
        if attribute:
            try:
                value = tag[attribute]
            except KeyError:
                # Found a matching element, but it does not have the attribute.
                continue
            # bs4 exposes multi-valued attributes (class, rel, ...) as a list;
            # join them so the caller gets the attribute text rather than a
            # Python list repr such as "['video', 'big']".
            if isinstance(value, (list, tuple)):
                value = ' '.join(value)
            yield str(value).strip()
        elif pretty:
            yield str(tag.prettify()).strip()
        else:
            yield str(tag).strip()
def main():
    """Parse the command line, load the HTML, and print each match on its own line.

    The HTML is read from STDIN when the positional `path` is ``-`` (the
    default), otherwise from the named file.

    Returns:
        int: 0 on success; 1 when the file is missing or the input is empty
        (a message is written to stderr in those cases).
    """
    parser = argparse.ArgumentParser()
    # The three string-valued filters share the same shape, so register them
    # from a table; order is kept so the generated --help text is unchanged.
    string_options = (
        (('-c', '--clss'), 'Search for any element with a class of this value'),
        (('-e', '--element'), 'Search for all of these elements (div/a)'),
        (('-a', '--attr'), 'Return only this attribute of the element (href)'),
    )
    for flags, help_text in string_options:
        parser.add_argument(*flags, type=str, help=help_text)
    parser.add_argument('-p', '--pretty', default=False, action='store_true',
                        help='Prettify output')
    parser.add_argument('path', default='-', nargs='?',
                        help='The file to search, default is STDIN.')
    args = parser.parse_args()

    # Load the markup and remember which complaint applies if it is empty.
    if args.path == '-':
        content = sys.stdin.read()
        empty_notice = 'No content in STDIN!'
    else:
        path = pathlib.Path(args.path)
        if not path.is_file():
            print(f'File does not exist {path}', file=sys.stderr)
            return 1
        content = path.read_text()
        empty_notice = f'No content in {args.path}'

    if not content:
        print(empty_notice, file=sys.stderr)
        return 1

    matches = find_elements(
        content=content,
        element=args.element,
        clss=args.clss,
        attribute=args.attr,
        pretty=args.pretty,
    )
    for match in matches:
        print(match)
    return 0
if __name__ == '__main__':
    # Run the CLI and hand main()'s return value to the shell as the exit status.
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment