Skip to content

Instantly share code, notes, and snippets.

@workze
Created April 23, 2020 05:57
Show Gist options
  • Save workze/f30b9a76cf663acacc8511d02cee0926 to your computer and use it in GitHub Desktop.
Save workze/f30b9a76cf663acacc8511d02cee0926 to your computer and use it in GitHub Desktop.
python bs4 xml html
from bs4 import BeautifulSoup
import re
# Sample document used by every example in this file. The non-standard <pp>
# tag is deliberate: the examples look it up by name via ``soup.pp``.
html_doc = """
<html>
<head><title>The Dormouse's story</title></head>
<body>
<p class="title">
<b>The Dormouse's story</b>
</p>
<pp class="story">
<a href="/elsie" id="link1">Elsie</a>
<a href="/lacie" id="link2">Lacie</a>
</pp>
</body>
</html>
"""
# Parse the document once with the 'lxml' parser; the examples share this tree.
soup = BeautifulSoup(html_doc, 'lxml')
# Print the raw (unparsed) source followed by a separator line.
print(html_doc, '\n===================\n')
def cookbook():
    """Demonstrate common BeautifulSoup lookups and edits on the shared ``soup``.

    Each lookup prints its result so the call can be compared with its
    output. Takes no arguments and returns ``None``.

    The last section edits the module-level ``soup`` in place (title text and
    the ``<b>`` tag), so repeated calls do not start from the original
    document.
    """
    # NOTE(review): the source lost its indentation; everything after the
    # ``def`` line is assumed to belong to this function -- confirm.

    # Pretty-print the parsed tree.
    print(soup.prettify())

    # =============== Finding tags ===============
    print(soup.title)
    print(soup.head.title)
    # Parent: <head><title>The Dormouse's story</title></head>
    print(soup.title.parent)
    # All matching tags:
    # [<a href="/elsie" id="link1">Elsie</a>, <a href="/lacie" id="link2">Lacie</a>]
    print(soup.find_all('a'))
    # Direct children. ``.children`` is an iterator, so materialize it;
    # printing the iterator itself would only show its repr.
    print(list(soup.pp.children))
    # .next_sibling and .previous_sibling step to the adjacent node;
    # .next_siblings and .previous_siblings iterate over all siblings of the
    # current node.
    # First match only.
    print(soup.find('a'))

    # Kinds of filter accepted by find_all():
    # a tag name
    print(soup.find_all('a'))
    # a regular expression, matched against tag names
    print(soup.find_all(re.compile('a')))
    # a list of names
    print(soup.find_all(['a', 'p']))

    # a function that receives each tag and returns a truthy value to keep it
    def has_class_but_no_id(tag):
        return tag.has_attr('class') and not tag.has_attr('id')

    print(soup.find_all(has_class_but_no_id))

    # What a filter can target:
    #   name   - the default (first positional) argument
    #   attrs  - id=True, id='xxx', id=re.compile('xx')
    #   attrs  - attrs={'id': 'xxx'}
    #   string - True, a str, a regex, or a function
    # ``string=`` is the current spelling of the older ``text=`` keyword.
    print(soup.find_all(string="Elsie"))
    # ['Elsie']

    # =============== Reading tag values ===============
    # Text content: The Dormouse's story
    # When a tag has a single child, .string cascades:
    # soup.head.string == soup.head.title.string
    print(soup.title.string)
    # Attribute lookup: ['title']
    # Multi-valued attributes: HTML distinguishes single- and multi-valued
    # attributes, whereas in XML every attribute is single-valued.
    print(soup.p.get('class'))
    # {'class': ['title']}
    print(soup.p.attrs)
    # Attribute lookup by subscript: ['title']
    print(soup.p['class'])

    # =============== Modifying tags ===============
    # Replace the text of a tag.
    soup.title.string.replace_with("Title2")
    # Rename a tag, then set and delete attributes on it.
    tag = soup.b
    tag.name = "blockquote"
    tag['class'] = 'verybold'
    tag['id'] = 1
    del tag['class']
    del tag['id']
    # Assigning to .string replaces the tag's contents.
    tag.string = "New link text."
    # Serialize the modified tree and show it.
    print(str(soup))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment