Created
December 20, 2020 13:03
-
-
Save search5/247dab7e25dd23c031e3a385bc5314e1 to your computer and use it in GitHub Desktop.
aldin_crawl.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Fields this script extracts from a product page:
# title, author, publisher, publication date, ISBN, cover image.

# Sample Aladin product page used when the script is run directly.
url1 = "https://www.aladin.co.kr/shop/wproduct.aspx?ItemId=248169181"

# Alternative sample pages kept for manual testing (note they bind `url`,
# not `url1`, so un-commenting one does not change what the script crawls):
# url = "https://www.aladin.co.kr/shop/wproduct.aspx?ItemId=256383383"
# url = "https://www.aladin.co.kr/shop/wproduct.aspx?ItemId=220179301"
# url = "https://www.aladin.co.kr/shop/wproduct.aspx?ItemId=200010445"
# url = "https://www.aladin.co.kr/shop/UsedShop/wuseditemall.aspx?ItemId=243636605"
# url = "https://www.aladin.co.kr/shop/wproduct.aspx?ItemId=165668006"
import requests | |
from bs4 import BeautifulSoup, Tag, NavigableString | |
def _split_at_marker(marker, entries):
    """Split *entries* at the first element equal to *marker*.

    Returns a ``(before, after)`` pair: the elements preceding the marker
    and the elements following it (the marker itself is dropped). When the
    marker is absent the result is ``([], entries)`` so the caller can keep
    scanning the untouched list for the next marker.
    """
    try:
        position = entries.index(marker)
    except ValueError:
        return [], entries
    return entries[:position], entries[position + 1:]


def parseAladin(url, timeout=10):
    """Download an Aladin product page and extract its bibliographic fields.

    Args:
        url: Address of the product page to fetch.
        timeout: Seconds passed to ``requests.get``; without it a stalled
            server would block the call forever.

    Returns:
        A dict with these keys (values are parsed nodes, not plain text,
        unless noted):

        * ``title`` - element matching ``.Ere_bo_title`` or ``None``.
        * ``authors`` / ``translator`` / ``pitctures`` - lists of the nodes
          found before the "(지은이)", "(옮긴이)" and "(그림)" markers
          respectively; empty when the marker is absent. The misspelled
          ``pitctures`` key is kept as-is so existing callers keep working.
        * ``press`` - the ``<a>`` whose href contains ``PublisherSearch``,
          or ``None``.
        * ``press_date`` - first plain text node after ``press``, or ``None``.
        * ``original_name`` - first tag after ``press`` whose text contains
          "원제", or ``''``.
        * ``isbn`` - text after ``" : "`` of the first list item starting
          with ``ISBN``, or ``""``.
        * ``cover_image`` - ``src`` of ``#CoverMainImage``, or ``None``.

    Raises:
        requests.RequestException: on connection failure, timeout, or a
            4xx/5xx response (via ``raise_for_status``).
    """
    r = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of scraping an error page.
    r.raise_for_status()
    b = BeautifulSoup(r.text, 'lxml')

    # Title
    title = b.select_one(".Ere_bo_title")

    # Credit line: flatten the children of the first matching <li>, dropping
    # <span> elements and the trailing comma of text nodes so the role
    # markers can be located by equality below.
    writer_area = b.select("li.Ere_sub2_title")
    credit_root = writer_area[0] if writer_area else ()
    credit_nodes = []
    for entry in credit_root:
        if isinstance(entry, Tag) and entry.name == 'span':
            continue
        # Exact type check on purpose: Comment/CData subclass
        # NavigableString and must not be treated as visible text.
        if type(entry) is NavigableString:
            credit_nodes.append(entry[:-1] if entry.endswith(",") else entry)
        else:
            credit_nodes.append(entry)

    # Peel off each role in page order; whatever is left holds the publisher.
    # NOTE(review): assumes the roles appear in this order on the page.
    tmp_authors, credit_nodes = _split_at_marker("\xa0(지은이)", credit_nodes)
    tmp_translator, credit_nodes = _split_at_marker("\xa0(옮긴이)", credit_nodes)
    tmp_picture, credit_nodes = _split_at_marker("\xa0(그림)", credit_nodes)

    # Publisher: first remaining link that points at the publisher search.
    press = next(
        (
            node for node in credit_nodes
            if isinstance(node, Tag)
            and node.name == 'a'
            and 'PublisherSearch' in node.get('href', '')
        ),
        None,
    )

    # Nodes following the publisher link carry the publication date and the
    # original title; with no publisher link there is nothing to scan.
    following = list(press.next_siblings) if press is not None else []

    # Publication date
    press_date = next(
        (node for node in following if type(node) is NavigableString),
        None,
    )

    # Original title
    original_name = next(
        (
            node for node in following
            if isinstance(node, Tag) and '원제' in node.text
        ),
        '',
    )

    # ISBN
    isbn = ""
    for item in b.select(".conts_info_list1 ul li"):
        if item.text.startswith("ISBN"):
            isbn = item.text.split(" : ")[-1]
            break

    # Cover image
    cover_tag = b.select_one("#CoverMainImage")
    cover_image = cover_tag.get("src") if cover_tag is not None else None

    return dict(
        title=title,
        authors=tmp_authors,
        translator=tmp_translator,
        pitctures=tmp_picture,
        press=press,
        press_date=press_date,
        original_name=original_name,
        isbn=isbn,
        cover_image=cover_image,
    )
if __name__ == "__main__":
    # Script entry point: crawl the sample product page and dump the result.
    parsed_book = parseAladin(url1)
    print(parsed_book)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment