[Python Tutorial] Automation Basics: Web Scraping, Part 1
1. Using the requests library to scrape cute-girl photo sets from 绝对领域 (jdlingyu.com)
import os
import requests
from lxml import etree
from threading import Thread
# Request headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/91.0.4472.164 Safari/537.36"
}
# Fetch a page and return its source as a string
def get_html(url):
    try:
        # Pass headers as a keyword argument; the second positional
        # argument of requests.get() is params, not headers
        resp = requests.get(url, headers=headers)
        resp.encoding = 'utf-8'
        if resp.status_code == 200:
            return resp.text
        else:
            return None
    except Exception as e:
        print(f"Failed to fetch [{url}]:", e)
# Fetch an image and return its binary content
def get_content(url):
    try:
        resp = requests.get(url, headers=headers)
        if resp.status_code == 200:
            return resp.content
        else:
            return None
    except Exception as e:
        print(f"Failed to fetch [{url}]:", e)
# Parse page source into an lxml element tree
def parse_html(text):
    try:
        return etree.HTML(text)
    except Exception as e:
        print("Failed to parse page source:", e)
# Download and save every image in one gallery
def save_picture(url, title, page):
    try:
        resp = get_html(url)
        e = parse_html(resp)
        hrefs = e.xpath("//div[@class='entry-content']/img/@src")
        # Create the output folder if it does not exist yet
        base_path = rf".\软妹子\{page}\{title}"
        os.makedirs(base_path, exist_ok=True)
        # Enumerate instead of reusing the page counter as the loop variable
        for i, href in enumerate(hrefs):
            content = get_content(href)
            with open(base_path + rf"\{i}.jpg", "wb") as f:
                f.write(content)
    except Exception as e:
        print("save_picture failed:", e)
# Entry point
def main(num):
    # Outer loop walks through the listing pages
    for i in range(1, num + 1):
        print(f"Starting page [{i}]")
        base_url = f"https://www.jdlingyu.com/tuji/mzitu/page/{i}"
        resp = get_html(base_url)
        e = parse_html(resp)
        wide_titles = e.xpath("//div/h2/a/text()")
        wide_hrefs = e.xpath("//div/h2/a/@href")
        # Inner loop visits every gallery linked from the current page
        for wide_title, wide_href in zip(wide_titles, wide_hrefs):
            # One thread per gallery to speed up the downloads
            s1 = Thread(target=save_picture, args=(wide_href, wide_title, i))
            s1.start()
            print(f"Page [{i}] [{wide_title}] download thread started")
        print("-" * 30)
if __name__ == "__main__":
    main(3)
    # Worker threads may still be running at this point
    print("Main thread finished")
2. Using the requests library to scrape 龙虎榜 (leaderboard) novels from 笔趣阁
import os
import requests
from lxml import etree
base_url = "https://www.quge6.com"
target_url = "https://www.quge6.com/bqglhb.html"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'}
def chapter_downloader(url, title):
    """Download one chapter's title and body and append them to the novel's file."""
    # Fetch the chapter page
    response = requests.get(url, headers=headers)
    # Set the encoding before reading the text
    response.encoding = 'utf-8'
    # Parse into an element tree
    selector = etree.HTML(response.text)
    name = selector.xpath('//h1/text()')[0]
    print(f'Downloading chapter {name}...')
    # Chapter body text
    content = selector.xpath('//div[@id="content"]/text()')
    content = ''.join(content)
    # base_path is the module-level output folder defined below
    with open(r'{0}\{1}.txt'.format(base_path, title), 'a', encoding='utf-8') as file:
        file.write(name + '\n' + content + '\n')
base_path = r".\txtfile"
resp = requests.get(targeturl, headers)
resp.encoding = "utf-8"
e = etree.HTML(resp.text)
# Titles and links of every novel on the leaderboard
titles = e.xpath("//div[@class='topbooks']/ul/li/a/@title")
hrefs = e.xpath("//div[@class='topbooks']/ul/li/a/@href")
hrefs = [base_url + href for href in hrefs]
# Visit each novel page and download its opening chapters
for title, href in zip(titles, hrefs):
    response = requests.get(href, headers=headers)
    response.encoding = 'utf-8'
    selector = etree.HTML(response.text)
    name = selector.xpath('//h1/text()')[0]
    urls = selector.xpath('//div[@id="list"]/dl/dd/a/@href')
    urls = [base_url + url for url in urls]
    # Only the first two chapters of each novel, as a demo
    for url in urls[:2]:
        chapter_downloader(url, title)
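Since this script hits the same host dozens of times, it can reuse one TCP connection instead of opening a fresh one per request. A small sketch using requests.Session (the fetch_tree helper and the 10-second timeout are illustrative assumptions, not part of the original):
import requests
from lxml import etree

session = requests.Session()
session.headers.update(headers)  # reuse the User-Agent for every request

def fetch_tree(url):
    # One connection pool for the whole crawl, plus a timeout so a
    # stalled chapter page cannot hang the script forever
    response = session.get(url, timeout=10)
    response.encoding = 'utf-8'
    response.raise_for_status()
    return etree.HTML(response.text)
chapter_downloader and the loop above could then call fetch_tree(url) instead of repeating the get/encoding/parse boilerplate.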
3. Rewriting the 笔趣阁 leaderboard scraper with the selenium library
import os
from selenium import webdriver
from threading import Thread
# Create a browser instance, load the URL, and return the driver
def get_html(url):
    try:
        phantomjs = webdriver.PhantomJS(executable_path=r"D:\Tools\PythonInstall\phantomjs-2.0.0-windows\bin\phantomjs",
                                        service_args=['--ignore-ssl-errors=true', '--ssl-protocol=TLSv1'])
        phantomjs.get(url)
        return phantomjs
    except Exception as e:
        print(f"get_html failed: {e}")
# Visit one chapter page and append its text to the book file
def save_books(url, book_name, base_path):
    try:
        # Note: several threads append to the same file, so chapters
        # may land out of order
        with open(base_path + rf"\{book_name}.txt", "a", encoding="utf-8") as file:
            # Browser instance for the chapter content
            content_phantomjs = get_html(url)
            # Chapter title
            chapter_title = content_phantomjs.find_element_by_xpath("//div/h1").text
            # Chapter body
            chapter_content = content_phantomjs.find_element_by_id("content").text
            print(f"Downloading {chapter_title}")
            file.write(str(chapter_title) + "\n")
            file.write("\n" * 3)
            file.write(str(chapter_content))
            print(f"{chapter_title} finished")
            # Done; close the browser
            content_phantomjs.close()
    except Exception as e:
        print(f"save_books failed: {e}")
# Main entry point
def main(url, books, number):
    try:
        # Browser instance for the leaderboard page
        book_phantomjs = get_html(url)
        title_elements = book_phantomjs.find_elements_by_xpath("//div[@class='topbooks']/ul/li/a")
        # Book names and the links to their chapter listings
        book_names = [title_element.text for title_element in title_elements]
        books_hrefs = [title_element.get_attribute("href") for title_element in title_elements]
        # Slicing to the first `books` novels avoids the off-by-one of
        # a separately maintained counter
        for book_name, book_href in zip(book_names[:books], books_hrefs[:books]):
            # Browser instance for the chapter listing
            chapter_phantomjs = get_html(book_href)
            # Book category, taken from the breadcrumb line
            book_type = chapter_phantomjs.find_element_by_class_name("con_top").text
            book_type = book_type.split('>')[1].strip()
            # Chapter links; the original skips the first 12 entries
            # (the site's "latest chapters" block)
            chapter_hrefs_elements = chapter_phantomjs.find_elements_by_xpath("//div[@id='list']/dl/dd/a")
            chapter_hrefs = [element.get_attribute("href") for element in chapter_hrefs_elements][12:]
            # Listing browser is no longer needed
            chapter_phantomjs.close()
            base_path = rf"E:\47期课程笔记\第二阶段 python\上课练习\selenium_1\小说下载目录\{book_type}"
            # makedirs creates missing parent folders as well
            os.makedirs(base_path, exist_ok=True)
            # Visit the first `number` chapter links, one thread each
            for chapter_href in chapter_hrefs[:number]:
                s1 = Thread(target=save_books, args=(chapter_href, book_name, base_path))
                s1.start()
    except Exception as e:
        print(f"main failed: {e}")
if __name__ == '__main__':
    # Entry URL of the leaderboard
    base_url = "https://www.quge6.com/bqglhb.html"
    main(base_url, 5, 3)
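Note that PhantomJS has been unmaintained since 2018, and the Selenium 4 line removed both its driver and the find_element_by_* helpers used above. If the script no longer starts, the browser factory can be rewritten against headless Chrome; a sketch assuming Selenium 4.6+ (which fetches a matching chromedriver automatically via Selenium Manager):
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

def get_html(url):
    # Headless Chrome replaces the abandoned PhantomJS backend
    options = Options()
    options.add_argument("--headless=new")
    driver = webdriver.Chrome(options=options)
    driver.get(url)
    return driver

# The old-style lookups then become, for example:
# driver.find_element(By.XPATH, "//div/h1").text
# driver.find_element(By.ID, "content").text
# driver.find_element(By.CLASS_NAME, "con_top").text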