Last active
February 9, 2021 08:25
-
-
Save NateWeiler/11af609047eece8f90396af5942b9c28 to your computer and use it in GitHub Desktop.
Extract href attribute values (hyperlinks) from the <a> tags of a webpage.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# Print the href attribute of every <a> tag found in an HTML document.
# Usage: ./script.py < page.html
import sys

try:
    from html.parser import HTMLParser  # Python 3
except ImportError:
    from HTMLParser import HTMLParser  # Python 2


class MyHTMLParser(HTMLParser):
    """HTML parser that prints the href attribute of every <a> start tag."""

    def handle_starttag(self, tag, attrs):
        """Print ``href = <value>`` for each href attribute of an <a> tag.

        tag   -- name of the start tag being handled.
        attrs -- iterable of (name, value) pairs, one per attribute.
        """
        if tag != "a":
            return
        for name, value in attrs:
            if name == "href":
                # A single pre-formatted argument prints identically under
                # the Python 2 print statement and the Python 3 function.
                print("%s = %s" % (name, value))


def main():
    """Feed the HTML read from standard input through MyHTMLParser."""
    parser = MyHTMLParser()
    parser.feed(sys.stdin.read())
    parser.close()


# The original fed an undefined `your_html_string` at import time; the HTML
# now comes from stdin and only when the file is run as a script.
if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# Print the href attribute of every <a> tag found in an HTML document.
# Usage: ./script.py < page.html
import sys

try:
    from html.parser import HTMLParser  # Python 3
except ImportError:
    from HTMLParser import HTMLParser  # Python 2


class MyHTMLParser(HTMLParser):
    """HTML parser that prints the href attribute of every <a> start tag."""

    def handle_starttag(self, tag, attrs):
        """Print ``href = <value>`` for each href attribute of an <a> tag.

        tag   -- name of the start tag being handled.
        attrs -- iterable of (name, value) pairs, one per attribute.
        """
        if tag != "a":
            return
        for name, value in attrs:
            if name == "href":
                # A single pre-formatted argument prints identically under
                # the Python 2 print statement and the Python 3 function.
                print("%s = %s" % (name, value))


def main():
    """Feed the HTML read from standard input through MyHTMLParser."""
    parser = MyHTMLParser()
    parser.feed(sys.stdin.read())
    parser.close()


# The original fed an undefined `your_html_string` at import time; the HTML
# now comes from stdin and only when the file is run as a script.
if __name__ == "__main__":
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# Fetch a web page and print the href of every <a> tag on it (Python 2,
# BeautifulSoup 3).
from contextlib import closing

from BeautifulSoup import BeautifulSoup
import urllib2
import re

# closing() releases the HTTP response even if parsing raises; the original
# never closed it. Parsing happens inside the block, while the response is
# still readable.
with closing(urllib2.urlopen("http://example.com/example.html")) as html_page:
    soup = BeautifulSoup(html_page)

for link in soup.findAll('a'):
    # Single-argument call form: same output as the Python 2 print statement.
    print(link.get('href'))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# -*- coding: utf-8 -*-
# usage ./find_hyperlinks.py "https://example.com/example.txt"
#
# Download the document at the given URL and print the href of every <a> tag
# that has one, one per line, encoded as UTF-8.
import os
import sys
import wget
from BeautifulSoup import BeautifulSoup


def main():
    """Download the URL given in sys.argv[1] and print each anchor's href.

    Exits with a usage message when the URL argument is missing.
    """
    if len(sys.argv) != 2:
        sys.exit('usage: %s "https://example.com/example.txt"'
                 % os.path.basename(sys.argv[0]))
    url = sys.argv[1]

    # The original parsed the URL's base name as if it were markup and never
    # downloaded anything. Fetch the document first (bar=None keeps wget's
    # progress bar out of the output), then parse the saved file's contents.
    filename = wget.download(url, bar=None)
    with open(filename, 'rb') as page:
        soup = BeautifulSoup(page)

    for tag in soup.findAll('a', href=True):
        # Encode explicitly rather than calling sys.setdefaultencoding(),
        # which is not available once the interpreter has started and made
        # the original script fail with AttributeError.
        print(tag['href'].encode('utf-8'))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment