Skip to content

Instantly share code, notes, and snippets.

@Jack2
Created November 22, 2014 09:29
Show Gist options
  • Save Jack2/02b347f82dc52b3c1b77 to your computer and use it in GitHub Desktop.
Save Jack2/02b347f82dc52b3c1b77 to your computer and use it in GitHub Desktop.
#-*- coding: utf-8 -*-
import urllib
import csv
from bs4 import BeautifulSoup
#----------------------------------------------------------------------
## save all links from URL
def grap_type_from_url(url,tag1,tag2):
    """Append the `tag2` attribute of every `tag1` element at `url` to a CSV.

    The page is fetched, parsed with BeautifulSoup, and each attribute value
    is lowercased, stripped of non-ASCII characters and appended as one line
    to 'test_grap_url.csv' in the current working directory.

    Args:
        url: Address of the page to download.
        tag1: Name of the elements to look for (e.g. 'a').
        tag2: Name of the attribute to extract from them (e.g. 'href').

    Raises:
        Whatever urlopen / BeautifulSoup raise when the page cannot be
        fetched or parsed; as in the original, only per-link and file-write
        problems are reported with print instead of being raised.
    """
    # Function-scope import so the block runs on both Python 2 and Python 3.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    response = urlopen(url)
    try:
        soup = BeautifulSoup(response)
    finally:
        # Release the connection even when parsing fails.
        response.close()

    rows = []
    for element in soup.findAll(tag1):
        value = element.get(tag2)
        if value is None:
            # Element has no such attribute: skip it rather than letting a
            # KeyError discard every link that follows.
            continue
        try:
            # NOTE(review): lower() also lowercases case-sensitive URL paths;
            # kept from the original - confirm this is intended.
            cleaned = value.lower().encode('ascii', 'ignore').decode('ascii')
        except AttributeError as e:
            # Value is not a plain string (presumably a bs4 multi-valued
            # attribute such as 'class', which comes back as a list).
            print(e)
            continue
        rows.append(cleaned + "\n")

    if not rows:
        # Nothing found: do not create an empty output file.
        return
    try:
        with open('test_grap_url.csv', 'a') as out:
            out.writelines(rows)
    except (IOError, OSError) as e:
        # Best-effort like the original: report the failure, do not raise.
        print(e)
## save all links from a .html
def save_type_from_file(fpath,tag1,tag2):
    """Extract `tag2` attributes of `tag1` elements from a file and crawl them.

    The file is parsed one line at a time. Every attribute value found is
    lowercased, stripped of non-ASCII characters, written as one line to
    `fpath + '_convert.csv'`, and then passed as a URL to
    grap_type_from_url() with the same tag/attribute pair.

    Args:
        fpath: Path of the HTML file to read.
        tag1: Name of the elements to look for (e.g. 'a').
        tag2: Name of the attribute to extract from them (e.g. 'href').

    Raises:
        IOError/OSError: if either file cannot be opened.
    """
    # Context managers close both files even if something below raises.
    with open(fpath, 'r') as source:
        with open(fpath + '_convert.csv', 'w') as converted:
            # NOTE(review): parsing line by line (as the original did) cannot
            # see an element whose markup spans several lines.
            for line in source:
                for element in BeautifulSoup(line).findAll(tag1):
                    value = element.get(tag2)
                    if value is None:
                        # No such attribute on this element: skip it instead
                        # of aborting the rest of the line with a KeyError.
                        continue
                    try:
                        link = value.lower().encode('ascii', 'ignore').decode('ascii')
                    except AttributeError as e:
                        # Value is not a plain string (presumably a bs4
                        # multi-valued attribute returned as a list).
                        print(e)
                        continue
                    converted.write(link + "\n")
                    try:
                        grap_type_from_url(link, tag1, tag2)
                    except Exception as e:
                        # Deliberately broad: any fetch/parse failure for one
                        # link (relative URL, dead host, ...) is reported and
                        # must not stop the remaining links.
                        print(e)
#----------------------------------------------------------------------
if __name__ == "__main__":
    # Script entry point: collect every <a href> from the local HTML dump.
    # To crawl a live page instead, call
    # grap_type_from_url("http://www.skt-lte.co.kr/", 'a', 'href').
    fpath = 'index_real.html'
    save_type_from_file(fpath, tag1='a', tag2='href')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment