Getting Started with Python Scraping: Beautiful Soup Basics
Basic Usage
html = """
<html><head><title>This is the title</title></head>
<body>
<p class="title" name="title_name"><b>This is the article title</b></p>
<a href="www.csdn.net" class="href" id="href01">csdn</a>
"""
from bs4 import BeautifulSoup
# Besides lxml, other parsers include html.parser, xml and html5lib
soup = BeautifulSoup(html, 'lxml')
# prettify() prints the parsed document with neat indentation
print(soup.prettify())
# Access the first matching tag by name, here the <title> tag
print(soup.title)
# Get the element's text content
print(soup.title.string)
# Get all attributes of the element as a dict
print(soup.p.attrs)
# Get a specific attribute, either via attrs or by indexing the tag
print(soup.p.attrs["name"])
print(soup.p["name"])
Node Selection
Children and Descendants
- contents: all direct children, returned as a list
- children: all direct children, returned as a generator
- descendants: all descendants (children, grandchildren, and so on), returned as a generator
html = """
<html><head><title>This is the title</title></head>
<body>
<p class="title" name="title_name"><b>This is the article title</b></p>
<a href="www.csdn.net" class="href" id="href01">csdn</a>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# Direct children as a list
print(soup.body.contents)
for i, element in enumerate(soup.body.contents):
    print(i, ":", element)
print('*'*50+'\n')
# Direct children as a generator
print(soup.body.children)
for i, element in enumerate(soup.body.children):
    print(i, ":", element)
print('*'*50+'\n')
# All descendants as a generator, traversed level by level
print(soup.body.descendants)
for i, element in enumerate(soup.body.descendants):
    print(i, ":", element)
Parent and Ancestor Nodes
- parent: the direct parent node
- parents: all ancestor nodes, returned as a generator
html = """
<html><head><title>This is the title</title></head>
<body>
<p class="title" name="title_name"><b>This is the article title</b></p>
<a href="www.csdn.net" class="href" id="href01">csdn</a>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# Direct parent node
print(soup.a.parent)
print('\n'+'*'*50+'\n')
# All ancestor nodes (a generator)
print(soup.a.parents)
for i, element in enumerate(soup.a.parents):
    print(i, ':', element)
Sibling Nodes
html = """
<html><head><title>This is the title</title></head>
<body>
<div>
<p>first</p>
<p class="title" name="title_name"><b>This is the article title</b></p>previous one
<a href="www.csdn.net" class="href" id="href01">csdn</a>next one
<a href="www.baidu.com">Baidu</a>
<p>end</p>
</div>
</body>
</html>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# The immediately following / preceding siblings (text nodes here)
print(soup.a.next_sibling.strip())
print(soup.a.previous_sibling.strip())
print('*'*50+'\n')
# All following / preceding siblings, returned as generators
print(soup.a.next_siblings)
print(list(soup.a.next_siblings))
print('\n'+'*'*50+'\n')
print(soup.a.previous_siblings)
print(list(soup.a.previous_siblings))
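Note that next_sibling returned the bare text node "next one" above rather than the next <a> tag. A small sketch, reusing the soup object from this example, contrasting it with find_next_sibling() from the next section, which skips over plain text nodes:

# next_sibling is the raw text node that follows the first <a> tag
print(repr(soup.a.next_sibling))    # 'next one\n'
# find_next_sibling() (a method selector, covered below) skips text
# nodes and returns the next sibling tag instead
print(soup.a.find_next_sibling())   # <a href="www.baidu.com">Baidu</a>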
Method Selectors
find_all() and find()
find_all(name, attrs, recursive, text, **kwargs)
- name is the tag name, e.g. name="ul"
- attrs is a dict of attributes to match, e.g. attrs={"id": "list-1"}
- text matches text content and can be a string or a regular expression, e.g. text=re.compile('link')
- find() works like find_all() but returns only the first match (see the short sketch after the example below)
Methods similar to find_all()
- find_parents()和find_parent()
- find_next_siblings()和find_next_sibling()
- find_previous_siblings()和find_previous_sibling()
- find_all_next()和find_next()
- find_all_previous()和find_previous()
html = """
<html><head><title>This is the title</title></head>
<body>
<div>
<p>first</p>
<p class="title" name="title_name"><b>This is the article title</b></p>previous one
<a href="www.csdn.net" class="href" id="href01">csdn</a>next one
<a href="www.baidu.com">Baidu</a>
<p>end</p>
</div>
</body>
</html>
"""
import re
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
# Match by tag name
print(soup.find_all(name="a"))
# Match by attribute
print(soup.find_all(attrs={"id":"href01"}))
# Match text nodes against a regular expression
print(soup.find_all(text=re.compile("title")))
# Navigate from a found element to related nodes
print(soup.find_all(attrs={"id":"href01"})[0].find_parent())
print(soup.find_all(attrs={"id":"href01"})[0].find_next())
print(soup.find_all(attrs={"id":"href01"})[0].find_next_sibling())
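find() itself does not appear above, so here is a minimal sketch, reusing the soup object from this example, showing that it returns only the first match (or None when nothing matches):

# find() returns the first matching element rather than a list
print(soup.find("a"))      # only the csdn link
# When nothing matches, find() returns None, so check before chaining
print(soup.find("span"))   # None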
select("CSS selector")
soup.select(".panel .panel-heading")
soup.select("ul li")
soup.select("#list-2 .element")