网络爬虫写真网实战,请勿大量爬取仅供参考学习
需要再文件位置创建 meitu 文件夹用于存放图片

# 代码

# -*- coding: utf-8 -*-  
# @Time    : 2022/10/26 7:41  
# @Author  : Naraci  
# @File    : 写真优化重构版.py  
  
import parsel  
import requests  
  
headers = {  
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 '  
                  'Safari/537.36 '}  
  
  
def url_request(url):  
    request = requests.get(url=url, headers=headers)  
    # 设置编码格式防止乱码  
    request.encoding = 'utf-8'  
    html_text = request.text  
    # 转换数据类型  
    parse_forats = parsel.Selector(html_text)  
    parse_forats_list = parse_forats.xpath('//section//div//ul//span/text()').getall()[-1]  # 取该页面总页数  
    num_list = [i for i in parse_forats_list if str.isdigit(i)]  # Python isdigit () 方法检测字符串是否只由数字组成。返回 True or False.  
    add_list = ''  
    for i in num_list:  
        add_list = add_list + i  
  
    number_end = int(add_list)  # 转换成整数类型  
    # 页面地址 https://www.wxept.com/qqmn/rbmn/page/2    html_url = url  
    # 遍历每个页面地址  
    for page in range(1, number_end + 1):  
        if page == 1:  
            request_url = html_url  
        else:  
            request_url = html_url + '/page/' + str(page)  
        print('正在下载:' + request_url + '页面!')  
        # for page_list in range(1, number_end + 1):  
        request_url.encode('utf-8')  
        url_list = requests.get(url=request_url, headers=headers)  
        selector = url_list.text  
        selector = parsel.Selector(selector)  # 转换数据类型  
        content = selector.xpath('//div//div/article')  # 获取所有需要的标签  
        for article in content:  
            pic_title = article.xpath('./h2/a/text()').get()  # 套图标题 用于命名文件夹  返回为对象属性需要用 get 取出来  
            pic_url = article.xpath('./h2/a/@href').get()  
            print('已将相册:' + pic_title + '添加到队列!')  
    #         pic_url = 'https://www.wxept.com/8762.html'  
    # #         pic_url = 'https://www.wxept.com/8879.html'            # 发送相册页面地址  
            response_pic = requests.get(url=pic_url, headers=headers).text  
            # 获取相册页面所有页数  
            selector_2 = parsel.Selector(response_pic)  
            img_url = selector_2.xpath('//div/a/span').getall()[-2]  
            num_list = [i for i in img_url if str.isdigit(i)]  # Python isdigit () 方法检测字符串是否只由数字组成。返回 True or False.  
            add_list = ''  
            number_list = pic_url.split('.html')[0]  
            # # 遍历出所有页面  
            for i in num_list:  
                add_list = add_list + i  
            number_end = int(add_list)  
            for page in range(1, number_end + 1):  
                if page == 1:  
                    request_html_url = number_list + '.html'  
                else:  
                    request_html_url = number_list + '.html/' + str(page)  
                print(request_html_url)  
                #解析图片地址  
                response_pic = requests.get(url=request_html_url, headers=headers).text  
                selector_2 = parsel.Selector(response_pic)  
                img_url = selector_2.xpath('//section//article//a/img/@src').getall()  
                for pic_url in img_url:  
                    # 发送图片链接请求,获取图片数据以二进制数据保存  
                    img_data = requests.get(url=pic_url, headers=headers).content  
                    # 数据保存  
                    fil_name = pic_url.split('/')[-1]  # 获取图片名字  
                    with open('meitudownload\\' + fil_name, mode='wb') as fp:  # wb 二进制  
                        fp.write(img_data)  
                        print('下载完成:' + fil_name)  
  
  
if __name__ == '__main__':  
    print('请在文件目录手动新建meitudownload目录')  
    print('在下面网址复制url粘贴值下方:https://www.wxept.com/qqmn/')  
    put_url = input('请输入正确分类页面url:')  
    # url = 'https://www.wxept.com/qqmn/dlmn'  
    url = put_url  
    url_request(url)  
    print('全部下载成功!')  
      
    # 文件根目录手动创建 meitu 文件夹

# 文件下载

源代码和 exe 成品文件点击下载

更新于 阅读次数

请我喝[茶]~( ̄▽ ̄)~*

Naraci 微信支付

微信支付

Naraci 支付宝

支付宝

Naraci 贝宝

贝宝