workze · April 23, 2020 05:57
diff --git a/python-bs4.py b/python-bs4.py
 from bs4 import BeautifulSoup
 import re


 # Sample markup shared by the examples in this file.  <pp> is not a
 # standard HTML element; cookbook() looks it up via soup.pp.
 html_doc = """
 <html>
 <head><title>The Dormouse's story</title></head>
 <body>

 <p class="title">
    <b>The Dormouse's story</b>
 </p>
 <pp class="story">
    <a href="/elsie" id="link1">Elsie</a>  
    <a href="/lacie" id="link2">Lacie</a>  
 </pp>

 </body>
 </html>
 """
 # Parse the markup once with the lxml parser.
 soup = BeautifulSoup(html_doc, 'lxml')
 # Print the raw (unparsed) markup, followed by a separator line.
 print(html_doc, '\n===================\n')

 def cookbook():
    """Walk through common BeautifulSoup operations on the module-level soup.

    Prints the results of navigating, searching and reading tags, then
    modifies the tree in place (the title text and the <b> tag) and prints
    the serialized result.

    NOTE: the shared ``soup`` object is mutated; after the <b> tag has been
    renamed, ``soup.b`` no longer refers to it, so this function is not
    meant to be called more than once on the same tree.
    """
    # Pretty-print the parsed tree.
    print(soup.prettify())

    # =============== Finding tags ==========
    print(soup.title)
    print(soup.head.title)
    # Parent: <head><title>The Dormouse's story</title></head>
    print(soup.title.parent)
    # All matching tags: [<a href="/elsie" id="link1">Elsie</a>, <a href="/lacie" id="link2">Lacie</a>]
    print(soup.find_all('a'))
    # Child nodes.  .children is an iterator, so materialize it in order to
    # print the nodes themselves rather than the iterator object.
    print(list(soup.pp.children))
    # See also .next_sibling / .previous_sibling, and .next_siblings /
    # .previous_siblings for iterating over a node's siblings.

    # find() returns only the first match.
    print(soup.find('a'))
    # Kinds of filter accepted by find_all():
    # a tag name
    print(soup.find_all('a'))
    # a compiled regular expression
    print(soup.find_all(re.compile('a')))
    # a list of tag names
    print(soup.find_all(['a', 'p']))

    # a function that receives a tag and returns True for a match
    def has_class_but_no_id(tag):
        return tag.has_attr('class') and not tag.has_attr('id')

    print(soup.find_all(has_class_but_no_id))

    # What can be filtered on:
    # name -- the default (first positional argument)

    # attributes as keywords: id=True, id='xxx', id=re.compile('xx')
    # attributes as a dict: attrs={'id': 'xxx'}

    # text content: string=True, a str, a regex or a function
    # (string= is the current name of the older text= argument)
    print(soup.find_all(string="Elsie"))
    # ['Elsie']

    # ============== Reading tag values ============
    # Text content: The Dormouse's story
    # When a tag has a single child, .string cascades:
    # soup.head.string == soup.head.title.string
    print(soup.title.string)
    # Attribute lookup: ['title']
    # Multi-valued attributes: HTML distinguishes single- and multi-valued
    # attributes; in XML every attribute is single-valued.
    print(soup.p.get('class'))

    # {'class': ['title']}
    print(soup.p.attrs)

    # Attribute lookup by subscript: ['title']
    print(soup.p['class'])


    # =============== Modifying tags ==========
    # Replace the title's text node.
    soup.title.string.replace_with("Title2")

    # Change a tag's name and attributes.
    tag = soup.b

    tag.name = "blockquote"
    tag['class'] = 'verybold'
    tag['id'] = 1

    del tag['class']
    del tag['id']

    # Assign .string to replace the tag's contents.
    tag.string = "New link text."

    # Serialize the modified tree and show it.
    print(str(soup))
	from bs4 import BeautifulSoup
	import re


	# Sample markup shared by the examples in this file.  <pp> is not a
	# standard HTML element; cookbook() looks it up via soup.pp.
	html_doc = """
	<html>
	<head><title>The Dormouse's story</title></head>
	<body>

	<p class="title">
	<b>The Dormouse's story</b>
	</p>
	<pp class="story">
	<a href="/elsie" id="link1">Elsie</a>
	<a href="/lacie" id="link2">Lacie</a>
	</pp>

	</body>
	</html>
	"""
	# Parse the markup once with the lxml parser.
	soup = BeautifulSoup(html_doc, 'lxml')
	# Print the raw (unparsed) markup, followed by a separator line.
	print(html_doc, '\n===================\n')

	def cookbook():
	    """Walk through common BeautifulSoup operations on the module-level soup.

	    Prints the results of navigating, searching and reading tags, then
	    modifies the tree in place (the title text and the <b> tag) and prints
	    the serialized result.

	    NOTE: the shared ``soup`` object is mutated; after the <b> tag has been
	    renamed, ``soup.b`` no longer refers to it, so this function is not
	    meant to be called more than once on the same tree.
	    """
	    # Pretty-print the parsed tree.
	    print(soup.prettify())

	    # =============== Finding tags ==========
	    print(soup.title)
	    print(soup.head.title)
	    # Parent: <head><title>The Dormouse's story</title></head>
	    print(soup.title.parent)
	    # All matching tags: [<a href="/elsie" id="link1">Elsie</a>, <a href="/lacie" id="link2">Lacie</a>]
	    print(soup.find_all('a'))
	    # Child nodes.  .children is an iterator, so materialize it in order to
	    # print the nodes themselves rather than the iterator object.
	    print(list(soup.pp.children))
	    # See also .next_sibling / .previous_sibling, and .next_siblings /
	    # .previous_siblings for iterating over a node's siblings.

	    # find() returns only the first match.
	    print(soup.find('a'))
	    # Kinds of filter accepted by find_all():
	    # a tag name
	    print(soup.find_all('a'))
	    # a compiled regular expression
	    print(soup.find_all(re.compile('a')))
	    # a list of tag names
	    print(soup.find_all(['a', 'p']))

	    # a function that receives a tag and returns True for a match
	    def has_class_but_no_id(tag):
	        return tag.has_attr('class') and not tag.has_attr('id')

	    print(soup.find_all(has_class_but_no_id))

	    # What can be filtered on:
	    # name -- the default (first positional argument)

	    # attributes as keywords: id=True, id='xxx', id=re.compile('xx')
	    # attributes as a dict: attrs={'id': 'xxx'}

	    # text content: string=True, a str, a regex or a function
	    # (string= is the current name of the older text= argument)
	    print(soup.find_all(string="Elsie"))
	    # ['Elsie']

	    # ============== Reading tag values ============
	    # Text content: The Dormouse's story
	    # When a tag has a single child, .string cascades:
	    # soup.head.string == soup.head.title.string
	    print(soup.title.string)
	    # Attribute lookup: ['title']
	    # Multi-valued attributes: HTML distinguishes single- and multi-valued
	    # attributes; in XML every attribute is single-valued.
	    print(soup.p.get('class'))

	    # {'class': ['title']}
	    print(soup.p.attrs)

	    # Attribute lookup by subscript: ['title']
	    print(soup.p['class'])


	    # =============== Modifying tags ==========
	    # Replace the title's text node.
	    soup.title.string.replace_with("Title2")

	    # Change a tag's name and attributes.
	    tag = soup.b

	    tag.name = "blockquote"
	    tag['class'] = 'verybold'
	    tag['id'] = 1

	    del tag['class']
	    del tag['id']

	    # Assign .string to replace the tag's contents.
	    tag.string = "New link text."

	    # Serialize the modified tree and show it.
	    print(str(soup))