adrianshort · September 6, 2012 17:30
diff --git a/extract-urls.py b/extract-urls.py
 # Extract URLs from a web page to a CSV file
 # $ python extract-urls.py http://mysite.com/mypage.html myfile.csv
 # By Adrian Short 6 Sep 2012
  
 import sys
 import urllib
 import csv
 from bs4 import BeautifulSoup

 # Both positional arguments are required; exit with a usage line instead
 # of an IndexError traceback when either one is missing.
 if len(sys.argv) < 3:
     sys.exit('usage: python extract-urls.py URL OUTPUT.csv')

 url = sys.argv.pop(1)
 out_fn = sys.argv.pop(1) # output filename for CSV file

 # urlopen() lives in urllib.request on Python 3 and in urllib on Python 2.
 try:
     from urllib.request import urlopen
 except ImportError:
     from urllib import urlopen

 PY2 = sys.version_info[0] == 2

 # Fetch the page, closing the connection even if the read fails.
 infile = urlopen(url)
 try:
     html = infile.read()
 finally:
     infile.close()

 # NOTE(review): no parser is named, so bs4 picks the best one installed;
 # pass one explicitly if the output must be identical across machines.
 soup = BeautifulSoup(html)

 # The csv module needs a binary file on Python 2 and a text file opened
 # with newline='' on Python 3.
 if PY2:
     outfile = open(out_fn, 'wb')
 else:
     outfile = open(out_fn, 'w', newline='', encoding='utf-8')

 with outfile:
     writer = csv.writer(outfile)

     # Calling the soup object directly is a shortcut for find_all()
     for link in soup('a'):
         row = [link.string, link.get('href')]
         if PY2:
             # Python 2's csv writer cannot encode non-ASCII unicode
             # itself, so hand it UTF-8 byte strings.
             row = [v.encode('utf-8') if isinstance(v, type(u'')) else v
                    for v in row]
         writer.writerow(row)
	# Extract URLs from a web page to a CSV file
	# $ python extract-urls.py http://mysite.com/mypage.html myfile.csv
	# By Adrian Short 6 Sep 2012

	import sys
	import urllib
	import csv
	from bs4 import BeautifulSoup

	# Both positional arguments are required; exit with a usage line instead
	# of an IndexError traceback when either one is missing.
	if len(sys.argv) < 3:
		sys.exit('usage: python extract-urls.py URL OUTPUT.csv')

	url = sys.argv.pop(1)
	out_fn = sys.argv.pop(1) # output filename for CSV file

	# urlopen() lives in urllib.request on Python 3 and in urllib on Python 2.
	try:
		from urllib.request import urlopen
	except ImportError:
		from urllib import urlopen

	PY2 = sys.version_info[0] == 2

	# Fetch the page, closing the connection even if the read fails.
	infile = urlopen(url)
	try:
		html = infile.read()
	finally:
		infile.close()

	# NOTE(review): no parser is named, so bs4 picks the best one installed;
	# pass one explicitly if the output must be identical across machines.
	soup = BeautifulSoup(html)

	# The csv module needs a binary file on Python 2 and a text file opened
	# with newline='' on Python 3.
	if PY2:
		outfile = open(out_fn, 'wb')
	else:
		outfile = open(out_fn, 'w', newline='', encoding='utf-8')

	# The body of the with/for statements is nested one tab deeper; without
	# that nesting the script does not parse.
	with outfile:
		writer = csv.writer(outfile)

		# Calling the soup object directly is a shortcut for find_all()
		for link in soup('a'):
			row = [link.string, link.get('href')]
			if PY2:
				# Python 2's csv writer cannot encode non-ASCII unicode
				# itself, so hand it UTF-8 byte strings.
				row = [v.encode('utf-8') if isinstance(v, type(u'')) else v
					for v in row]
			writer.writerow(row)
No results found