[Python Tutorial] Automation Basics: Web Scraping, Part 1

Author: Guo Ran · 5,127 words, estimated reading time 13 minutes · Published 2021-07-30 · 177 reads

1. Using the requests library to scrape image galleries from 绝对领域 (jdlingyu.com)

import os

import requests
from lxml import etree
from threading import Thread

# Request headers
headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                                  "Chrome/91.0.4472.164 Safari/537.36 "
}

# Fetch the page source and return it as a string
def get_html(url):
        try:
                resp = requests.get(url, headers=headers)
                resp.encoding = 'utf-8'
                if resp.status_code == 200:
                        return resp.text
                else:
                        return None
        except Exception as e:
                print(f"获取当前【{url}】错误的原因是:", e)

# Fetch an image and return its binary content
def get_content(url):
        try:
                resp = requests.get(url, headers=headers)
                if resp.status_code == 200:
                        return resp.content
                else:
                        return None
        except Exception as e:
                print(f"获取当前【{url}】错误的原因是:", e)

# Parse the page source and return an lxml element tree
def parse_html(text):
        try:
                e = etree.HTML(text)
                return e
        except Exception as e:
                print("解析源代码出错", e)

# Download and save all images of one gallery
def save_picture(url, title, i):
        try:
                resp = get_html(url)
                e = parse_html(resp)
                hrefs = e.xpath("//div[@class='entry-content']/img/@src")
                # Build the folder path for this gallery
                base_path = rf".\软妹子\{i}\{title}"
                # Create the folder if it does not exist yet
                os.makedirs(base_path, exist_ok=True)
                for index, href in enumerate(hrefs):
                        resp = get_content(href)
                        with open(base_path + rf"\{index}.jpg", "wb") as f:
                                f.write(resp)
        except Exception as e:
                print(f"sacePic方法调用出错原因:", e)

# Entry point
def main(num):
        # Outer loop iterates over the pages
        for i in range(1, num + 1):
                print(f"第【{i}】页下载开始")
                base_url = f"https://www.jdlingyu.com/tuji/mzitu/page/{i}"
                resp = get_html(base_url)
                e = parse_html(resp)
                wide_titles = e.xpath("//div/h2/a/text()")
                wide_hrefs = e.xpath("//div/h2/a/@href")
                # Inner loop walks every gallery title/link on this page
                for wide_title, wide_href in zip(wide_titles, wide_hrefs):
                        # Start one thread per gallery to speed up the downloads
                        s1 = Thread(target=save_picture, args=(wide_href, wide_title, i))
                        s1.start()
                        print(f"第【{i}】页【{wide_title}】下载完成")

                print("-" * 30)

if __name__ == "__main__":
        main(3)
        print("主线程结束")

2. Using the requests library to scrape the 龙虎榜 ranking novels from 笔趣阁 (quge6.com)

 

import os

import requests
from lxml import etree
base_url = "https://www.quge6.com"
targeturl = "https://www.quge6.com/bqglhb.html"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'}

def chapter_downloader(url, title):
    """ Download one chapter (title and body) and append it to the novel's txt file """
    # Fetch the chapter page
    response = requests.get(url, headers=headers)
    # Set the encoding
    response.encoding = 'utf-8'
    # Parse into an element tree
    selector = etree.HTML(response.text)
    name = selector.xpath('//h1/text()')[0]
    print(f'Downloading chapter {name} ...')
    # Chapter body text
    content = selector.xpath('//div[@id="content"]/text()')
    content = ''.join(content)

    with open(r'{0}\{1}.txt'.format(base_path, title),'a',encoding='utf-8') as file:
        file.write(name+'\n'+content+'\n')

base_path = r".\txtfile"
resp = requests.get(targeturl, headers)
resp.encoding = "utf-8"

e = etree.HTML(resp.text)
# Grab the title and link of every novel on the 龙虎榜 ranking
titles = e.xpath("//div[@class='topbooks']/ul/li/a/@title")
hrefs = e.xpath("//div[@class='topbooks']/ul/li/a/@href")
hrefs = [base_url + href for href in hrefs]

# Visit every novel link in hrefs
for title, href in zip(titles, hrefs):
    response = requests.get(href,headers=headers)
    response.encoding = 'utf-8'
    selector = etree.HTML(response.text)
    name = selector.xpath('//h1/text()')[0]

    urls = selector.xpath('//div[@id="list"]/dl/dd/a/@href')
    urls = ['https://www.quge6.com'+url for url in urls]
    # Only download the first two chapters of each novel as a demo
    for url in urls[:2]:
        chapter_downloader(url, title)
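
Each chapter request above opens a fresh connection and repeats the headers. A small refinement is to route everything through a requests.Session, which reuses TCP connections and carries the headers automatically, and also gives one place to add a timeout. The sketch below shows the same chapter download built around a shared session; the helper names (fetch_tree, download_chapter) are illustrative and assume the same page structure as the script above.

import os

import requests
from lxml import etree

session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'})

def fetch_tree(url):
    """Fetch a page through the shared session and return an lxml element tree."""
    response = session.get(url, timeout=10)
    response.encoding = 'utf-8'
    return etree.HTML(response.text)

def download_chapter(url, title, base_path=r".\txtfile"):
    """Download one chapter and append it to the novel's txt file under base_path."""
    os.makedirs(base_path, exist_ok=True)
    selector = fetch_tree(url)
    name = selector.xpath('//h1/text()')[0]
    content = ''.join(selector.xpath('//div[@id="content"]/text()'))
    with open(rf"{base_path}\{title}.txt", 'a', encoding='utf-8') as file:
        file.write(name + '\n' + content + '\n')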

3. Rewriting the 笔趣阁 龙虎榜 scraper with the selenium library

 

import os

from selenium import webdriver
from threading import Thread

# Create a PhantomJS browser instance, load the URL and return the driver
def get_html(url):
    try:
        phantomjs = webdriver.PhantomJS(executable_path=r"D:\Tools\PythonInstall\phantomjs-2.0.0-windows\bin\phantomjs",
                                        service_args=['--ignore-ssl-errors=true', '--ssl-protocol=TLSv1'])
        phantomjs.get(url)
        return phantomjs
    except Exception as e:
        print(f"【get_html】调用错误原因:{e}")

# Visit one chapter page and append its title and content to the book's txt file
def save_books(url, book_name, base_path):
    try:
        with open(base_path + rf"\{book_name}.txt", "a", encoding="utf-8") as file:
            # Browser instance for the chapter content page
            content_phantomjs = get_html(url)
            # Chapter title
            chapter_title = content_phantomjs.find_element_by_xpath("//div/h1").text
            # Chapter body
            chapter_content = content_phantomjs.find_element_by_id("content").text
            print(f"Started downloading {chapter_title}")
            file.write(str(chapter_title) + "\n")
            file.write("\n" * 3)
            file.write(str(chapter_content))
            print(f"{chapter_title} downloaded")
            # Done saving; quit the browser so the headless process is released
            content_phantomjs.quit()
    except Exception as e:
        print(f"【save_books】调用错误原因:{e}")

# Main entry point
def main(url, books, number):
    try:
        # Browser instance for the ranking page (book list)
        book_phantomjs = get_html(url)
        title_elements = book_phantomjs.find_elements_by_xpath("//div[@class='topbooks']/ul/li/a")
        # Collect each book's name and the link to its chapter list
        book_names = [title_element.text for title_element in title_elements]
        books_hrefs = [title_element.get_attribute("href") for title_element in title_elements]
        # Counter that limits how many books are downloaded
        count = 0
        # Walk books_hrefs to get each book's category and chapter links
        for book_name, book_href in zip(book_names, books_hrefs):
            # Browser instance for this book's chapter list page
            chapter_phantomjs = get_html(book_href)
            # Book category, parsed from the breadcrumb text
            book_type = chapter_phantomjs.find_element_by_class_name("con_top").text
            book_type = book_type.split('>')[1].strip()
            # Chapter links (the first 12 entries are skipped)
            chapter_hrefs_elements = chapter_phantomjs.find_elements_by_xpath("//div[@id='list']/dl/dd/a")
            chapter_hrefs = [element.get_attribute("href") for element in chapter_hrefs_elements][12:]
            base_path = rf"E:\47期课程笔记\第二阶段 python\上课练习\selenium_1\小说下载目录\{book_type}"
            # Create the category folder if it does not exist yet
            os.makedirs(base_path, exist_ok=True)
            # Visit the chapter links
            for index, chapter_href in enumerate(chapter_hrefs):
                # One thread per chapter to speed things up
                s1 = Thread(target=save_books, args=(chapter_href, book_name, base_path))
                s1.start()
                if index + 1 >= number:
                    # Stop condition: only start the first `number` chapters of each book
                    break
            count += 1
            if count >= books:
                break
    except Exception as e:
        print(f"【main】调用错误原因:{e}")

if __name__ == '__main__':
    # Entry URL of the 龙虎榜 ranking page
    base_url = "https://www.quge6.com/bqglhb.html"
    main(base_url, 5, 3)
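
A note on the browser choice: PhantomJS has been abandoned, and newer Selenium releases no longer ship support for it, so on a current environment the script above will not start. With a Selenium 3.x install, swapping get_html for a headless Chrome version is enough to keep the rest of the script (which uses the find_element_by_* style API) working; on Selenium 4 those calls would also need to move to find_element(By.XPATH, ...). The sketch below is one such replacement and assumes chromedriver is installed and available on PATH.

from selenium import webdriver

def get_html(url):
    try:
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")      # run Chrome without a visible window
        options.add_argument("--disable-gpu")
        driver = webdriver.Chrome(options=options)
        driver.get(url)
        return driver
    except Exception as e:
        print(f"[get_html] call failed: {e}")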

 
