Skip to content

Instantly share code, notes, and snippets.

@search5
Created December 20, 2020 13:03
Show Gist options
  • Save search5/247dab7e25dd23c031e3a385bc5314e1 to your computer and use it in GitHub Desktop.
Save search5/247dab7e25dd23c031e3a385bc5314e1 to your computer and use it in GitHub Desktop.
aldin_crawl.py
# Fields of interest: title, author, publisher, publication date, ISBN, picture
# Product page URL passed to parseAladin() when this file is run as a script.
url1 = "https://www.aladin.co.kr/shop/wproduct.aspx?ItemId=248169181"
# Alternative sample pages, kept commented out (not used by the code):
# url = "https://www.aladin.co.kr/shop/wproduct.aspx?ItemId=256383383"
# url = "https://www.aladin.co.kr/shop/wproduct.aspx?ItemId=220179301"
# url = "https://www.aladin.co.kr/shop/wproduct.aspx?ItemId=200010445"
# url = "https://www.aladin.co.kr/shop/UsedShop/wuseditemall.aspx?ItemId=243636605"
# url = "https://www.aladin.co.kr/shop/wproduct.aspx?ItemId=165668006"
import requests
from bs4 import BeautifulSoup, Tag, NavigableString
def parseAladin(url, timeout=10):
    """Scrape bibliographic details from an Aladin product page.

    The page is downloaded with ``requests`` and parsed with BeautifulSoup
    (``lxml`` parser).  Any element that cannot be located yields an empty
    value in the result instead of raising.

    Args:
        url: Address of the product page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        A dict with the keys:

        * ``title`` -- the ``.Ere_bo_title`` element, or ``None``.
        * ``authors`` / ``translator`` / ``pitctures`` -- lists of the nodes
          found in front of the "(지은이)" / "(옮긴이)" / "(그림)" labels of
          the credits row (empty when the label is absent).  The misspelled
          key ``pitctures`` is kept so existing callers keep working.
        * ``press`` -- the publisher ``<a>`` element, or ``None``.
        * ``press_date`` -- the first plain text node after the publisher
          link, or ``None``.
        * ``original_name`` -- the first element after the publisher link
          whose text contains "원제", or ``''``.
        * ``isbn`` -- text after ``" : "`` in the first info-list item that
          starts with "ISBN", or ``""``.
        * ``cover_image`` -- ``src`` of ``#CoverMainImage``, or ``None``.

        Apart from ``isbn`` and ``cover_image`` the values are BeautifulSoup
        nodes (or plain strings cut from them), not cleaned-up text.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Surface HTTP errors instead of silently parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    # Title
    title = soup.select_one(".Ere_bo_title")

    # Flatten the credits row into a list of nodes: <span> children are
    # dropped and a trailing comma is cut off text nodes so that role labels
    # such as "\xa0(지은이)," compare equal to the bare label.
    credit_row = soup.select_one("li.Ere_sub2_title")
    entries = []
    if credit_row is not None:
        for node in credit_row.children:
            if isinstance(node, Tag) and node.name == 'span':
                continue
            # Exact type check on purpose: NavigableString subclasses
            # (e.g. comments) are kept untouched, as before.
            if type(node) is NavigableString and node.endswith(","):
                node = node[:-1]
            entries.append(node)

    # Split the row at the role labels.  Everything collected since the
    # previous label belongs to the label that closes the run, so the result
    # does not depend on the order in which the roles appear on the page.
    role_keys = {
        "\xa0(지은이)": "authors",
        "\xa0(옮긴이)": "translator",
        "\xa0(그림)": "pitctures",
    }
    people_by_role = {key: [] for key in role_keys.values()}
    assigned = set()
    pending = []
    for node in entries:
        role = role_keys.get(node) if isinstance(node, str) else None
        # Only the first occurrence of a label splits the row; a repeated
        # label is treated like any other node.
        if role is None or role in assigned:
            pending.append(node)
            continue
        people_by_role[role] = pending
        assigned.add(role)
        pending = []
    leftover = pending  # whatever follows the last recognised label

    # Publisher: first link in the remainder that points at PublisherSearch.
    press = None
    for node in leftover:
        if (
            isinstance(node, Tag)
            and node.name == 'a'
            and 'PublisherSearch' in node.get('href', '')
        ):
            press = node
            break

    # The publication date and the original title both follow the publisher
    # link; without a publisher there is nothing to look at.
    trailing = list(press.next_siblings) if press is not None else []

    # Publication date: first plain text node after the publisher.
    press_date = next(
        (node for node in trailing if type(node) is NavigableString),
        None,
    )

    # Original title: first element after the publisher mentioning "원제".
    original_name = next(
        (node for node in trailing
         if isinstance(node, Tag) and '원제' in node.text),
        '',
    )

    # ISBN
    isbn = ""
    for item in soup.select(".conts_info_list1 ul li"):
        item_text = item.text
        if item_text.startswith("ISBN"):
            isbn = item_text.split(" : ")[-1]
            break

    # Cover image
    cover_tag = soup.select_one("#CoverMainImage")
    cover_image = cover_tag.get("src") if cover_tag is not None else None

    return dict(
        title=title,
        authors=people_by_role["authors"],
        translator=people_by_role["translator"],
        pitctures=people_by_role["pitctures"],
        press=press,
        press_date=press_date,
        original_name=original_name,
        isbn=isbn,
        cover_image=cover_image,
    )
if __name__ == '__main__':
    # Script entry point: scrape the sample page and show the parsed fields.
    book_info = parseAladin(url1)
    print(book_info)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment