Skip to content

Instantly share code, notes, and snippets.

@nhoffman
Created December 13, 2021 21:04
Show Gist options
  • Save nhoffman/626a536efa21b984c7e401504eb1ae3a to your computer and use it in GitHub Desktop.
Save nhoffman/626a536efa21b984c7e401504eb1ae3a to your computer and use it in GitHub Desktop.
scrape urls from an html file
#!/usr/bin/env python3
"""Scrape all urls from an html document
"""
import os
import sys
import argparse
# pip install beautifulsoup4
from bs4 import BeautifulSoup
def main(arguments):
    """Write the ``href`` of every ``<a>`` element in an HTML file.

    Args:
        arguments: Command-line arguments without the program name
            (for example ``sys.argv[1:]``): a positional input file and
            an optional ``-o/--outfile`` destination (defaults to stdout).

    Returns:
        None, so ``sys.exit(main(...))`` exits with status 0.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('infile', help="Input file", type=argparse.FileType('r'))
    parser.add_argument('-o', '--outfile', help="Output file",
                        default=sys.stdout, type=argparse.FileType('w'))
    args = parser.parse_args(arguments)

    try:
        soup = BeautifulSoup(args.infile.read(), 'html.parser')
        for link in soup.find_all('a'):
            href = link.get('href')
            # Anchors without an href (e.g. <a name="top">) have no URL to
            # report; skip them instead of printing the literal "None".
            if href is None:
                continue
            # Honor -o/--outfile rather than always writing to stdout.
            print(href, file=args.outfile)
    finally:
        # argparse.FileType opened these handles; close them so the output
        # is flushed, but leave the process-wide standard streams alone
        # (FileType maps "-" to sys.stdin / sys.stdout).
        for handle in (args.infile, args.outfile):
            if handle is not sys.stdin and handle is not sys.stdout:
                handle.close()
# Script entry point: run main() on the command-line arguments (program name
# dropped) and use its return value as the process exit status.
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    sys.exit(main(cli_args))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment