Last active
August 29, 2015 14:09
-
-
Save yurihan/148f05dbe3e3213893d0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
#-*- coding: utf-8 -*- | |
import urllib2 | |
import urllib | |
from BeautifulSoup import BeautifulSoup | |
import re | |
# Let's keep this quick and dirty.
def _fetchSoup(url, timeout=5):
    """Download *url* and return its body parsed as a BeautifulSoup tree.

    The response body is decoded as cp949 and re-encoded as UTF-8 before it
    is handed to BeautifulSoup. The connection is closed even when reading
    the body fails.

    Raises:
        IOError: network failure or timeout (urllib2.URLError and
            socket.timeout are both IOError subclasses in Python 2).
        UnicodeDecodeError: the body is not valid cp949.
    """
    response = urllib2.urlopen(urllib2.Request(url), timeout=timeout)
    try:
        raw = response.read()
    finally:
        # Release the socket whether or not the read succeeded.
        response.close()
    return BeautifulSoup(raw.decode('cp949').encode('utf-8'))


def crawlMuseum(outputPath='museum.csv', timeout=5):
    """Crawl museum names and addresses from museum.or.kr into a file.

    The crawl runs in three phases:
      1. Walk the list pages of each operator kind and collect the detail
         page link found in every ``td.CommuniPd02`` cell.
      2. Fetch each detail page and read the name (first ``td.DataFont01``)
         and the address (fifth ``td.PdTop5``).
      3. Write one ``M|name|addr`` line per museum, UTF-8 encoded.

    A page that cannot be fetched, or that lacks the expected cells, is
    reported on stdout and skipped so that one bad page does not discard
    the results already collected.

    Args:
        outputPath: file to write; defaults to ``museum.csv`` in the
            current directory.
        timeout: per-request timeout in seconds.
    """
    baseUrl = "http://www.museum.or.kr/organ/"
    listUrl = (baseUrl + "museums01.php?cmdProc=listform&page=%d"
               "&orderByField=MuseumIntroduce_idx&orderBySort=DESC"
               "&MuseumIntroduce_kind=%d&sub_menu=%d")
    # (label, number of list pages) per operator kind. The site numbers the
    # kinds 1..4 in this order; the page counts are hard-coded.
    kinds = [('국립', 3), ('공립', 14), ('사립', 35), ('대학', 12)]

    museumList = []
    print('목록 긁기')
    for kindNo, (label, pageCount) in enumerate(kinds, 1):
        print(label)
        for page in range(pageCount):
            print('페이지 %d' % page)
            # The site's page parameter is 1-based.
            url = listUrl % (page + 1, kindNo, kindNo)
            try:
                bs = _fetchSoup(url, timeout)
            except IOError as err:
                print('list page failed, skipping %r: %r' % (url, err))
                continue
            for cell in bs('td', {'class': 'CommuniPd02'}):
                anchors = cell('a')
                # Ignore cells that carry no link instead of crashing.
                if anchors:
                    museumList.append(anchors[0]['href'])

    result = []
    total = len(museumList)
    print('정보 긁기')
    for cnt, url in enumerate(museumList):
        try:
            bs = _fetchSoup(baseUrl + url, timeout)
        except IOError as err:
            print('%5d/%-5d\tfetch failed, skipping %r: %r'
                  % (cnt + 1, total, url, err))
            continue
        try:
            name = bs('td', {'class': 'DataFont01'})[0].string.strip()
            addr = bs('td', {'class': 'PdTop5'})[4].string.strip()
        except (IndexError, AttributeError):
            # IndexError: an expected cell is absent.
            # AttributeError: .string is None because the cell has nested
            # markup rather than a single text node.
            print('%5d/%-5d\tunexpected layout, skipping %r'
                  % (cnt + 1, total, url))
            continue
        # name/addr are unicode; encode explicitly so the line also prints
        # when stdout is a pipe, matching the UTF-8 bytes of other messages.
        print(('%5d/%-5d\t%s,%s' % (cnt + 1, total, name, addr)).encode('utf-8'))
        result.append([name, addr])

    print('파일 쓰기')
    with open(outputPath, 'w') as f:
        for name, addr in result:
            f.write(('M|%s|%s\n' % (name, addr)).encode('utf-8'))


# Script entry: run the crawl with the default output file and timeout.
crawlMuseum()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment