Created
December 13, 2021 21:04
-
-
Save nhoffman/626a536efa21b984c7e401504eb1ae3a to your computer and use it in GitHub Desktop.
scrape urls from an html file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
"""Scrape all urls from an html document | |
""" | |
import os | |
import sys | |
import argparse | |
# pip install beautifulsoup4 | |
from bs4 import BeautifulSoup | |
def main(arguments):
    """Print the href of every anchor in an HTML document, one per line.

    Args:
        arguments: Command-line arguments excluding the program name
            (e.g. ``sys.argv[1:]``). Expects a positional input file and
            an optional ``-o/--outfile``; output goes to stdout when the
            option is omitted.

    Returns:
        None, which ``sys.exit`` treats as a successful exit status.

    Raises:
        SystemExit: Raised by argparse when the arguments are invalid or
            a file cannot be opened.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('infile', help="Input file",
                        type=argparse.FileType('r'))
    parser.add_argument('-o', '--outfile', help="Output file",
                        default=sys.stdout, type=argparse.FileType('w'))
    args = parser.parse_args(arguments)

    try:
        soup = BeautifulSoup(args.infile.read(), 'html.parser')
        # href=True keeps only anchors that actually carry an href, so
        # named anchors (<a name="...">) no longer emit the text "None".
        for link in soup.find_all('a', href=True):
            # Write to the requested destination; previously --outfile was
            # accepted but ignored and everything went to stdout.
            print(link['href'], file=args.outfile)
    finally:
        # argparse.FileType maps '-' to the standard streams, and the
        # --outfile default is sys.stdout; leave those open for the caller.
        for stream in (args.infile, args.outfile):
            if stream not in (sys.stdin, sys.stdout):
                stream.close()
if __name__ == '__main__':
    # Run the CLI with the user's arguments (program name stripped) and
    # hand main()'s return value to the interpreter as the exit status.
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment