Created
November 1, 2014 10:05
-
-
Save Jack2/93e49409cd5f470c9b93 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#-*- coding: utf-8 -*- | |
import urllib | |
from bs4 import BeautifulSoup | |
#---------------------------------------------------------------------- | |
## Print the value of a chosen attribute for every matching tag in an .html file
def print_type_from_site(fpath, tag1, tag2):
    """Print one attribute of every matching tag in an HTML file.

    Parses the file at ``fpath`` and, for each ``tag1`` element that carries
    a ``tag2`` attribute, prints the attribute value lower-cased and with
    non-ASCII characters dropped, one value per line, in document order.
    Elements without the attribute are skipped.

    Args:
        fpath: Path of the HTML file to read.
        tag1: Tag name to search for, e.g. ``'a'`` or ``'img'``.
        tag2: Attribute to print for each tag, e.g. ``'href'`` or ``'src'``.

    Raises:
        IOError/OSError: If ``fpath`` cannot be opened.
    """
    # Parse the whole document in one pass: feeding the parser one line at a
    # time misses every tag whose markup is split across lines.  The file is
    # read as bytes so BeautifulSoup performs the encoding detection, and the
    # ``with`` block closes the handle even if parsing fails.
    with open(fpath, 'rb') as f:
        soup = BeautifulSoup(f, 'html.parser')

    for u in soup.find_all(tag1):
        # .get() returns None for a missing attribute, so one tag without it
        # no longer aborts the remaining tags via KeyError.
        type_url = u.get(tag2)
        if type_url is None:
            continue
        # Multi-valued attributes (e.g. ``class``) are returned as a list.
        if isinstance(type_url, (list, tuple)):
            type_url = ' '.join(type_url)
        # Drop non-ASCII characters, then decode back to text so the output
        # is the same on Python 2 and Python 3 (no b'...' repr).
        l = type_url.lower().encode('ascii', 'ignore').decode('ascii')
        print(l)
#---------------------------------------------------------------------- | |
if __name__ == "__main__":
    # EDIT: path of the HTML file to scan.
    fpath = 'index_real_.html'
    # (tag, attribute) pairs to report, processed in this order:
    # image sources first, then link targets.
    for tag_name, attr_name in (('img', 'src'), ('a', 'href')):
        print_type_from_site(fpath, tag_name, attr_name)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment