search5 · December 20, 2020 13:03
diff --git a/aldin_crawl.py b/aldin_crawl.py
 # Title, author, publisher, publication date, ISBN, picture
 # (the fields parseAladin below pulls out of a product page).
 url1 = "https://www.aladin.co.kr/shop/wproduct.aspx?ItemId=248169181"
 # Other product URLs, currently commented out and not used by the script:
 # url = "https://www.aladin.co.kr/shop/wproduct.aspx?ItemId=256383383"
 # url = "https://www.aladin.co.kr/shop/wproduct.aspx?ItemId=220179301"
 # url = "https://www.aladin.co.kr/shop/wproduct.aspx?ItemId=200010445"
 # url = "https://www.aladin.co.kr/shop/UsedShop/wuseditemall.aspx?ItemId=243636605"
 # url = "https://www.aladin.co.kr/shop/wproduct.aspx?ItemId=165668006"
 import requests
 from bs4 import BeautifulSoup, Tag, NavigableString


 def _split_at_marker(marker, nodes):
    """Split ``nodes`` around the first element equal to ``marker``.

    Returns ``(before, after)`` with the marker itself dropped. When the
    marker is absent, returns ``([], nodes)`` so the caller can keep scanning
    the untouched list for the next role marker.
    """
    try:
        position = nodes.index(marker)
    except ValueError:
        return [], nodes
    return nodes[:position], nodes[position + 1:]


 def parseAladin(url, *, timeout=10):
    """Download an Aladin product page and extract its bibliographic fields.

    Args:
        url: Address of the product page to fetch.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block forever.

    Returns:
        A dict with these keys:

        * ``title`` - element matching ``.Ere_bo_title`` (bs4 ``Tag``) or None.
        * ``authors`` / ``translator`` / ``pitctures`` - lists of the credit
          line nodes that precede the "(지은이)" (author), "(옮긴이)"
          (translator) and "(그림)" (pictures) markers; empty when the marker
          is absent. The misspelt ``pitctures`` key is kept as-is because
          callers may already read it.
        * ``press`` - the ``<a>`` whose href contains ``PublisherSearch``,
          or None.
        * ``press_date`` - first plain text node following ``press``, or None.
        * ``original_name`` - first tag following ``press`` whose text
          contains "원제" (original title), or ``''``.
        * ``isbn`` - text after ``" : "`` in the first info-list item that
          starts with "ISBN", or ``""``.
        * ``cover_image`` - ``src`` of ``#CoverMainImage``, or None.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page into an
    # all-empty result.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    # Title
    title = soup.select_one(".Ere_bo_title")

    # Flatten the credit line into a list of nodes: <span> children are
    # skipped, and a trailing comma is trimmed from plain text so the role
    # markers below can be matched by equality.
    credit_line = soup.select_one("li.Ere_sub2_title")
    credit_nodes = []
    for node in (credit_line.contents if credit_line is not None else []):
        if isinstance(node, Tag):
            if node.name != 'span':
                credit_nodes.append(node)
        elif type(node) is NavigableString:
            # Exact type check on purpose: subclasses such as Comment are
            # kept untouched rather than treated as visible text.
            credit_nodes.append(node[:-1] if node.endswith(",") else node)
        else:
            credit_nodes.append(node)

    # Peel off each role in turn; every split continues on what is left
    # after the previous marker.
    authors, credit_nodes = _split_at_marker("\xa0(지은이)", credit_nodes)
    translators, credit_nodes = _split_at_marker("\xa0(옮긴이)", credit_nodes)
    illustrators, credit_nodes = _split_at_marker("\xa0(그림)", credit_nodes)

    # Publisher: first remaining link that points at the publisher search.
    press = next(
        (
            node for node in credit_nodes
            if isinstance(node, Tag) and node.name == 'a'
            and 'PublisherSearch' in node.get('href', '')
        ),
        None,
    )

    # Without a publisher link there is nothing to anchor the date and
    # original-title lookups on, so both fall back to their defaults.
    siblings = list(press.next_siblings) if press is not None else []

    # Publication date: first plain text node after the publisher link.
    press_date = next(
        (node for node in siblings if type(node) is NavigableString),
        None,
    )

    # Original title: first following tag mentioning "원제".
    original_name = next(
        (node for node in siblings
         if isinstance(node, Tag) and '원제' in node.text),
        '',
    )

    # ISBN
    isbn = next(
        (
            item.text.split(" : ")[-1]
            for item in soup.select(".conts_info_list1 ul li")
            if item.text.startswith("ISBN")
        ),
        "",
    )

    # Cover picture
    cover = soup.select_one("#CoverMainImage")
    cover_image = cover.get("src") if cover is not None else None

    return dict(
        title=title,
        authors=authors,
        translator=translators,
        pitctures=illustrators,
        press=press,
        press_date=press_date,
        original_name=original_name,
        isbn=isbn,
        cover_image=cover_image,
    )


 if __name__ == '__main__':
    # Manual run: parse the page at url1 and print the resulting dict.
    parsed = parseAladin(url1)
    print(parsed)
	# Title, author, publisher, publication date, ISBN, picture
	# (the fields parseAladin below pulls out of a product page).
	url1 = "https://www.aladin.co.kr/shop/wproduct.aspx?ItemId=248169181"
	# Other product URLs, currently commented out and not used by the script:
	# url = "https://www.aladin.co.kr/shop/wproduct.aspx?ItemId=256383383"
	# url = "https://www.aladin.co.kr/shop/wproduct.aspx?ItemId=220179301"
	# url = "https://www.aladin.co.kr/shop/wproduct.aspx?ItemId=200010445"
	# url = "https://www.aladin.co.kr/shop/UsedShop/wuseditemall.aspx?ItemId=243636605"
	# url = "https://www.aladin.co.kr/shop/wproduct.aspx?ItemId=165668006"
	import requests
	from bs4 import BeautifulSoup, Tag, NavigableString


	def _split_at_marker(marker, nodes):
	    """Split ``nodes`` around the first element equal to ``marker``.

	    Returns ``(before, after)`` with the marker itself dropped. When the
	    marker is absent, returns ``([], nodes)`` so the caller can keep scanning
	    the untouched list for the next role marker.
	    """
	    try:
	        position = nodes.index(marker)
	    except ValueError:
	        return [], nodes
	    return nodes[:position], nodes[position + 1:]


	def parseAladin(url, *, timeout=10):
	    """Download an Aladin product page and extract its bibliographic fields.

	    Args:
	        url: Address of the product page to fetch.
	        timeout: Seconds passed to ``requests.get`` so a stalled connection
	            cannot block forever.

	    Returns:
	        A dict with these keys:

	        * ``title`` - element matching ``.Ere_bo_title`` (bs4 ``Tag``) or None.
	        * ``authors`` / ``translator`` / ``pitctures`` - lists of the credit
	          line nodes that precede the "(지은이)" (author), "(옮긴이)"
	          (translator) and "(그림)" (pictures) markers; empty when the marker
	          is absent. The misspelt ``pitctures`` key is kept as-is because
	          callers may already read it.
	        * ``press`` - the ``<a>`` whose href contains ``PublisherSearch``,
	          or None.
	        * ``press_date`` - first plain text node following ``press``, or None.
	        * ``original_name`` - first tag following ``press`` whose text
	          contains "원제" (original title), or ``''``.
	        * ``isbn`` - text after ``" : "`` in the first info-list item that
	          starts with "ISBN", or ``""``.
	        * ``cover_image`` - ``src`` of ``#CoverMainImage``, or None.

	    Raises:
	        requests.RequestException: On connection failure, timeout, or a
	            non-success HTTP status.
	    """
	    response = requests.get(url, timeout=timeout)
	    # Fail loudly on 4xx/5xx instead of parsing an error page into an
	    # all-empty result.
	    response.raise_for_status()
	    soup = BeautifulSoup(response.text, 'lxml')

	    # Title
	    title = soup.select_one(".Ere_bo_title")

	    # Flatten the credit line into a list of nodes: <span> children are
	    # skipped, and a trailing comma is trimmed from plain text so the role
	    # markers below can be matched by equality.
	    credit_line = soup.select_one("li.Ere_sub2_title")
	    credit_nodes = []
	    for node in (credit_line.contents if credit_line is not None else []):
	        if isinstance(node, Tag):
	            if node.name != 'span':
	                credit_nodes.append(node)
	        elif type(node) is NavigableString:
	            # Exact type check on purpose: subclasses such as Comment are
	            # kept untouched rather than treated as visible text.
	            credit_nodes.append(node[:-1] if node.endswith(",") else node)
	        else:
	            credit_nodes.append(node)

	    # Peel off each role in turn; every split continues on what is left
	    # after the previous marker.
	    authors, credit_nodes = _split_at_marker("\xa0(지은이)", credit_nodes)
	    translators, credit_nodes = _split_at_marker("\xa0(옮긴이)", credit_nodes)
	    illustrators, credit_nodes = _split_at_marker("\xa0(그림)", credit_nodes)

	    # Publisher: first remaining link that points at the publisher search.
	    press = next(
	        (
	            node for node in credit_nodes
	            if isinstance(node, Tag) and node.name == 'a'
	            and 'PublisherSearch' in node.get('href', '')
	        ),
	        None,
	    )

	    # Without a publisher link there is nothing to anchor the date and
	    # original-title lookups on, so both fall back to their defaults.
	    siblings = list(press.next_siblings) if press is not None else []

	    # Publication date: first plain text node after the publisher link.
	    press_date = next(
	        (node for node in siblings if type(node) is NavigableString),
	        None,
	    )

	    # Original title: first following tag mentioning "원제".
	    original_name = next(
	        (node for node in siblings
	         if isinstance(node, Tag) and '원제' in node.text),
	        '',
	    )

	    # ISBN
	    isbn = next(
	        (
	            item.text.split(" : ")[-1]
	            for item in soup.select(".conts_info_list1 ul li")
	            if item.text.startswith("ISBN")
	        ),
	        "",
	    )

	    # Cover picture
	    cover = soup.select_one("#CoverMainImage")
	    cover_image = cover.get("src") if cover is not None else None

	    return dict(
	        title=title,
	        authors=authors,
	        translator=translators,
	        pitctures=illustrators,
	        press=press,
	        press_date=press_date,
	        original_name=original_name,
	        isbn=isbn,
	        cover_image=cover_image,
	    )


	if __name__ == '__main__':
	    # Manual run: parse the page at url1 and print the resulting dict.
	    parsed = parseAladin(url1)
	    print(parsed)