php中文网 | cnphp.com

 找回密码
 立即注册

QQ登录

只需一步,快速开始

搜索
查看: 589|回复: 0

python3 图片爬虫

[复制链接]

3138

主题

3148

帖子

1万

积分

管理员

Rank: 9Rank: 9Rank: 9

UID
1
威望
0
积分
7946
贡献
0
注册时间
2021-4-14
最后登录
2024-11-21
在线时间
763 小时
QQ
发表于 2022-5-17 16:44:03 | 显示全部楼层 |阅读模式
[mw_shl_code=python,true]# -*- coding: utf-8 -*-
import requests
import random
import time
import os
import json
from lxml import etree
import threading
from pynput.keyboard import Key, Listener


def getpage(url, refer='https://www.mzitu.com/', host='www.mzitu.com', timeout=15):
    """Fetch ``url`` with browser-like headers and return the response.

    Args:
        url: Address to request. Blank or whitespace-only values are rejected.
        refer: Value sent in the ``Referer`` header.
        host: Value sent in the ``Host`` header.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Without a timeout a stalled connection would block the calling
            thread indefinitely.

    Returns:
        The ``requests`` response object when the status code is 200,
        otherwise the integer ``0`` (also returned for a blank ``url``).

    Raises:
        requests.exceptions.RequestException: Propagated from
            ``requests.get`` (connection errors, timeouts, ...).
    """
    if not url.strip():
        return 0
    AgentList = [
        "Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    ]
    # A different User-Agent is picked at random for every request.
    rdAgent = random.choice(AgentList)

    oHeader = {
        'Host': host,
        'Referer': refer,
        'User-Agent': rdAgent,
        'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Connection': 'keep-alive',
    }
    page = requests.get(url, headers=oHeader, timeout=timeout)
    if page.status_code != 200:
        print('the page status is :', page.status_code)
        return 0
    return page

def getPicCoverlist(HomePage):
    """Collect the albums shown on a listing page.

    Args:
        HomePage: Response object whose ``encoding`` attribute is set to
            UTF-8 here and whose ``text`` is parsed as HTML.

    Returns:
        A list of ``(album_url, album_title)`` tuples built by pairing the
        link ``href`` values with the ``alt`` text of the cover images.
    """
    HomePage.encoding = 'utf-8'
    tree = etree.HTML(HomePage.text)
    links = tree.xpath('//div[@class="postlist"]/ul[@id="pins"]/li/a/@href')
    titles = tree.xpath('//div[@class="postlist"]/ul[@id="pins"]/li/a/img/@alt')
    return [(link, title) for link, title in zip(links, titles)]


def getPageCount(headpageurl):
    """Read the pagination bar of an album and return its page range.

    Args:
        headpageurl: URL of the first page of the album.

    Returns:
        A list ``[headpageurl, HeadNum, EndNum]`` where ``HeadNum`` is the
        numeric tail of the last pagination link and ``EndNum`` the numeric
        tail of the second-to-last one.

    Raises:
        RuntimeError: If the album page could not be fetched.
        IndexError: If the page has fewer than two pagination links.
        ValueError: If a pagination link does not end in a number.
    """
    page = getpage(headpageurl)
    if page == 0:
        # getpage signals failure with 0; fail with a clear message instead
        # of an AttributeError on the integer.
        raise RuntimeError('failed to fetch album page: ' + str(headpageurl))
    page.encoding = 'utf-8'  # was misspelled 'utf=8'
    html = etree.HTML(page.text)
    picurllist = html.xpath('//div[@class="pagenavi"]/a/@href')
    # NOTE(review): presumably the last link is the "next page" link and the
    # one before it is the final page number -- confirm against the site.
    HeadNum = int(picurllist[-1].split('/')[-1])
    EndNum = int(picurllist[-2].split('/')[-1])
    urlrange = [headpageurl, HeadNum, EndNum]
    return urlrange

def MakepageUrllist(pageurlNum):
    """Expand ``[base_url, first, last]`` into the full list of page URLs.

    The base URL itself comes first, followed by ``base_url/<n>`` for every
    ``n`` from ``first`` to ``last`` inclusive.
    """
    base_url = pageurlNum[0]
    first_no = pageurlNum[1]
    last_no = pageurlNum[2]
    numbered = [base_url + '/' + str(page_no) for page_no in range(first_no, last_no + 1)]
    return [base_url] + numbered

def getPicdown(piclasspageurl, savepath):
    """Download the main image of one album page into ``savepath``.

    Args:
        piclasspageurl: URL of the album page; its last path segment must be
            an integer.
        savepath: Directory the image file is written to.

    Returns:
        The integer tail of ``piclasspageurl`` when the image was downloaded
        and written successfully; ``0`` when the file already exists, the
        image request failed, or the file could not be written.

    Raises:
        ValueError: If the URL does not end in an integer.
        RuntimeError: If the album page cannot be fetched or contains no
            main image.
    """
    pos = piclasspageurl.rfind('/')
    last = int(piclasspageurl[pos + 1:])
    # Use the previous page of the album as Referer, or the page itself
    # when the numeric tail is 1 or less.
    if last > 1:
        lasturl = piclasspageurl[:pos + 1] + str(last - 1)
    else:
        lasturl = piclasspageurl

    page = getpage(piclasspageurl, lasturl)
    if page == 0:
        # getpage signals failure with 0; do not dereference it.
        raise RuntimeError('failed to fetch picture page: ' + piclasspageurl)
    page.encoding = 'utf-8'
    html = etree.HTML(page.text)
    imgurl = html.xpath('//div[@class="main-image"]/p/a/img/@src')
    if not imgurl:
        raise RuntimeError('no main image found on page: ' + piclasspageurl)

    # os.path.join yields the same path as the former hard-coded '\\' on
    # Windows and additionally works on other platforms.
    imgname = os.path.join(savepath, imgurl[0].split('/')[-1])
    if os.path.exists(imgname):
        print('\n已下载:', imgurl)
        return 0

    imghtml = getpage(imgurl[0], piclasspageurl, 'imgpc.iimzt.com')
    if imghtml == 0:
        print('下载图片错误', piclasspageurl, imgurl[0])
        return 0

    saved = True
    try:
        with open(imgname, 'wb') as f:
            f.write(imghtml.content)
    except OSError:
        print('写入错误:' + imgurl[0])
        # A failed write must not be reported to the caller as a success.
        saved = False
    # Random pause after every image request, whether or not the write worked.
    time.sleep(random.randint(3, 8))
    return last if saved else 0

def checkdownfolder(savefolder, covername):
    """Ensure the album folder ``savefolder/covername`` exists.

    Args:
        savefolder: Parent directory of the album folder.
        covername: Name of the album folder.

    Returns:
        The path of the album folder.

    Side effects:
        When the folder did not exist it is created (including missing
        parents) and the process working directory is changed to it.
    """
    # os.path.join instead of a hard-coded '\\' keeps the path valid on
    # every platform; on Windows the result is unchanged.
    tfolder = os.path.join(savefolder, covername)
    if not os.path.exists(tfolder):
        # exist_ok guards against the folder appearing between the check
        # above and the creation here.
        os.makedirs(tfolder, exist_ok=True)
        # NOTE(review): the chdir only happens for newly created folders and
        # nothing visible here relies on the working directory -- kept for
        # backward compatibility, confirm whether it is still needed.
        os.chdir(tfolder)
    return tfolder

   
def getPicClasslist(homepage):
    """Read the navigation menu of the home page.

    Args:
        homepage: Response object whose ``encoding`` attribute is set to
            UTF-8 here and whose ``text`` is parsed as HTML.

    Returns:
        A list of ``(title, url)`` tuples, one per menu link, pairing the
        ``title`` attribute of each link with its ``href``.
    """
    homepage.encoding = 'utf-8'
    tree = etree.HTML(homepage.text)
    urls = tree.xpath('//div[@class="mainnav"]/ul[@id="menu-nav"]/li/a/@href')
    titles = tree.xpath('//div[@class="mainnav"]/ul[@id="menu-nav"]/li/a/@title')
    return [(title, url) for title, url in zip(titles, urls)]


def choosePicClass():
    """Show the category menu and return the category the user picks.

    The menu lists the entries at index ``1`` up to (but excluding)
    ``len(pclass) - 3`` of the navigation list, and only those indices are
    accepted as input.

    Returns:
        The chosen ``(title, url)`` tuple from ``getPicClasslist``, or an
        empty list when the input is not one of the listed numbers.
    """
    homeURL = 'https://www.mzitu.com'
    pclass = getPicClasslist(getpage(homeURL))
    # Upper bound (exclusive) shared by the menu and the validation below;
    # the check used to be hard-coded to "< 7" and could drift from the
    # menu actually displayed.
    upper = len(pclass) - 3
    print('选择类别'.center(45,'*'))
    for k in range(1, upper):
        print('序号' + str(k) + ' >> ' + pclass[k][1])
    print(' your choice? '.center(45,'*'))
    urlnumber = input('请选择数字:').strip()
    # isdecimal() only accepts characters int() can convert; isdigit() also
    # accepts e.g. superscript digits, which make int() raise ValueError.
    if urlnumber.isdecimal() and 0 < int(urlnumber) < upper:
        return pclass[int(urlnumber)]
    return []
def getClasspagelist(classhomeurl):
    """Build the list of listing-page URLs of one category.

    Args:
        classhomeurl: URL of the first listing page of the category; the
            further pages are formed as ``classhomeurl + 'page/<n>/'``.

    Returns:
        A list starting with ``classhomeurl`` followed by the URLs of pages
        2 up to the number found in the last pagination link. When the page
        has no pagination links only ``classhomeurl`` is returned.

    Raises:
        RuntimeError: If the category page could not be fetched.
        ValueError: If the last pagination link does not end in a number.
    """
    page = getpage(classhomeurl)
    if page == 0:
        # getpage signals failure with 0; fail with a clear message instead
        # of an AttributeError on the integer.
        raise RuntimeError('failed to fetch category page: ' + str(classhomeurl))
    page.encoding = 'utf-8'  # was misspelled 'utf=8'
    html = etree.HTML(page.text)
    tmppagelist = html.xpath('//div[@class="nav-links"]/a[@class="page-numbers"]/@href')
    classpagelist = [classhomeurl]
    if not tmppagelist:
        # No pagination links: indexing [-1] would raise IndexError, so
        # treat the category as a single page.
        return classpagelist
    # The page count is the last path segment of the last pagination link.
    lastpage = tmppagelist[-1].rstrip('/').rsplit('/', 1)[-1]
    classpagelist.extend(
        classhomeurl + 'page/' + str(p) + '/' for p in range(2, int(lastpage) + 1)
    )
    return classpagelist

def getclassCode(ClassUrl):
    """Return the last path segment of ``ClassUrl``, ignoring trailing slashes."""
    trimmed = ClassUrl.rstrip('/')
    # rpartition gives ('', '', trimmed) when no slash is left, so the whole
    # string is returned in that case.
    _, _, code = trimmed.rpartition('/')
    return code
#def delspicalchar(ostring):
    #ochar=r'[\/:*?"<>|]*'
    #return re.sub(ochar,'',ostring)
def formatname(img_title):
    """Turn an album title into a string usable as a folder name.

    The characters ``\\ / : * ? " < > ! |`` are removed and surrounding
    whitespace is stripped from the result.

    Args:
        img_title: Raw album title.

    Returns:
        The cleaned title.
    """
    forbidden = '\\/:*?"<>!|'
    # One translate pass removes every forbidden character. Stripping comes
    # last so whitespace exposed by the removal is dropped too, and so that
    # titles without forbidden characters are stripped as well.
    return img_title.translate({ord(ch): None for ch in forbidden}).strip()
   

def MainDownThread():
    """Download every album of the category chosen by the user, resumably.

    The function asks for a category, loads saved progress for it from the
    log file at ``logpath`` (if present) and then walks listing pages,
    albums and album pages from the saved position onward. Progress is
    written back through ``SaveProgress`` when a picture fails, when an
    unexpected error aborts the run, and when the loops end (including an
    exit requested through the global ``ExitKey`` flag).

    Reads the module-level names ``logpath``, ``mainurl``, ``downfolder``
    and ``ExitKey``.
    """
    picClass = choosePicClass()
    if not picClass:
        # choosePicClass returns an empty list for an invalid selection;
        # indexing it below would raise IndexError.
        print('\n 无效的选择,下载进程退出')
        return
    picClasstitle = picClass[0]
    picClassURL = picClass[1]
    picClassCode = getclassCode(picClassURL)
    progressDict = {'classname': picClassCode, 'classpage': 0, 'covernumber': 0, 'picpage': 0}
    progress = []

    # Load the saved download progress from the log file.
    if os.path.exists(logpath):
        with open(logpath, 'r') as f:
            progress = json.load(f)

    # Continue from the saved entry of this category when there is one.
    for pDict in progress:
        if pDict['classname'] == picClassCode:
            progressDict = pDict
            picClassURL = mainurl + progressDict['classname'] + '/'
            picClassCode = progressDict['classname']
            picClasstitle = progressDict['classname']
            break
    classpagebegin = progressDict['classpage']
    coverbegin = progressDict['covernumber']
    picbegin = progressDict['picpage']
    classpagelist = getClasspagelist(picClassURL)
    classsavefolder = downfolder + '\\' + picClassCode

    print('开始下载'.center(50,'+'))

    try:
        for cp in range(classpagebegin, len(classpagelist)):
            if ExitKey:
                break
            print('\n 正在下载 : '+ picClasstitle + ' 第 ' + str(cp+1) + ' 页 ' )
            progressDict['classpage'] = cp
            classpageurl = classpagelist[cp]
            coverlist = getPicCoverlist(getpage(classpageurl))
            # The saved album offset only applies to the listing page the
            # previous run stopped on.
            if cp > classpagebegin:
                coverbegin = 0
            for i in range(coverbegin, len(coverlist)):
                if ExitKey:
                    break
                # Select the i-th (url, title) pair. The index was missing,
                # which made cover[0] a tuple instead of the album URL.
                cover = coverlist[i]
                coverUrl = cover[0]
                covertitle = formatname(cover[1])
                print('\n\n Cover ' + str(i+1) + ' : [' + covertitle + ']')
                progressDict['covernumber'] = i
                coverfolder = checkdownfolder(classsavefolder, covertitle)
                Pagelist = MakepageUrllist(getPageCount(coverUrl))
                print(' 套图共计 : ' + str(len(Pagelist)) + ' 张\n')
                # The saved picture offset only applies to the album the
                # previous run stopped on.
                if cp > classpagebegin or i > coverbegin:
                    picbegin = 0
                for j in range(picbegin, len(Pagelist)):
                    if ExitKey:
                        break
                    purl = Pagelist[j]
                    try:
                        getre = getPicdown(purl, coverfolder)
                        if getre > 0:
                            print('\r' + '---->完成下载第 ' + str(j + 1) + ' 张', end='', flush=True)
                            progressDict['picpage'] = j
                    except Exception as e:
                        # Record where the failure happened and move on to
                        # the next picture. The former "j = j - 1" had no
                        # effect because the for loop reassigns j.
                        progressDict['picpage'] = j
                        SaveProgress(progressDict, progress)
                        print('\n第' + str(j + 1) + '张, 下载错误。', purl, e)

    except Exception as e:
        SaveProgress(progressDict, progress)
        print('\n 出现错误,无法继续' , e)
    else:
        SaveProgress(progressDict, progress)
        print('\n下载进程退出~~~~')
   
   
def SaveProgress(pdict, progress):
    """Store ``pdict`` in ``progress`` and write the list to the log file.

    Every existing entry with the same ``'classname'`` as ``pdict`` is
    dropped, ``pdict`` is appended, and the whole list is written as JSON to
    the module-level ``logpath``.

    Args:
        pdict: Progress dictionary of one category; must have a
            ``'classname'`` key.
        progress: List of progress dictionaries; modified in place.
    """
    # Rebuild in place (slice assignment) rather than removing items while
    # iterating, which skips the element after each removal and could leave
    # stale duplicates behind.
    progress[:] = [pd for pd in progress if pd['classname'] != pdict['classname']]
    progress.append(pdict)
    # Serialize before opening so a serialization error cannot truncate the
    # existing log file.
    jsonstr = json.dumps(progress)
    with open(logpath, 'w') as f:
        f.write(jsonstr)
            
def presskey(key):
    """Key-press callback implementing the "left Ctrl, then F3" exit combo.

    A left-Ctrl press records the current time in the global ``TimeStamp``.
    An F3 press less than 0.5 seconds after that sets the global ``ExitKey``
    to ``True``, waits one second and returns ``False``; every other case
    returns ``None``.
    """
    global ExitKey, TimeStamp
    if key == Key.ctrl_l:
        TimeStamp = time.time()
    if key != Key.f3:
        return None
    elapsed = time.time() - TimeStamp
    if not elapsed < 0.5:
        return None
    ExitKey = True
    time.sleep(1)
    # NOTE(review): presumably a False return tells the pynput listener to
    # stop -- confirm against the pynput documentation.
    return False
        
        

def onrelease(key):
    """Key-release callback: return ``False`` for Esc, ``None`` otherwise."""
    return False if key == Key.esc else None

def startlisten():
    """Run a keyboard listener with ``presskey`` as press callback and block
    until the listener finishes."""
    # Only key presses are observed; no release callback is registered.
    with Listener(on_press=presskey) as key_listener:
        key_listener.join()
            
if __name__ == '__main__':

    # Shared state read by the download thread and the key listener.
    ExitKey = False      # set to True by presskey to request a stop
    TimeStamp = 0        # time of the last left-Ctrl press
    mainurl = 'https://www.mzitu.com/'
    downfolder = r'F:\Backup\privatebak\mypic'
    logpath = downfolder + '\\' + 'downinfo.log'

    # daemon=True replaces the deprecated Thread.setDaemon(True).
    t1 = threading.Thread(target=MainDownThread, daemon=True)
    t1.start()
    # Blocks until the key listener stops, then waits for the download
    # thread to finish.
    startlisten()
    t1.join()
    os.system('pause')
[/mw_shl_code]

回复

使用道具 举报

您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

QQ|php中文网 | cnphp.com ( 赣ICP备2021002321号-2 )

GMT+8, 2024-11-22 02:48 , Processed in 0.294819 second(s), 35 queries , Gzip On.

Powered by Discuz! X3.4 Licensed

Copyright © 2001-2020, Tencent Cloud.

申明:本站所有资源皆搜集自网络,相关版权归版权持有人所有,如有侵权,请电邮(fiorkn@foxmail.com)告之,本站会尽快删除。

快速回复 返回顶部 返回列表