admin 发表于 2022-5-17 16:44:03

python3 图片爬虫

# -*- coding: utf-8 -*-
import requests
import random
import time
import os
import json
from lxml import etree
import threading
from pynput.keyboard import Key, Listener


def getpage(url, refer='https://www.mzitu.com/', host='www.mzitu.com', timeout=15):
    """Fetch ``url`` with browser-like request headers.

    Args:
        url: Address to request. Empty or whitespace-only values are rejected
            without touching the network.
        refer: Value sent in the ``Referer`` header.
        host: Value sent in the ``Host`` header.
        timeout: Seconds ``requests`` waits on the server before giving up.
            Without a timeout a stalled connection would block the caller
            indefinitely.

    Returns:
        The ``requests`` response object when the status code is 200,
        otherwise the integer ``0``.

    Raises:
        requests.exceptions.RequestException: Connection problems and
            timeouts are not caught here and propagate to the caller.
    """
    if not url.strip():
        return 0

    # A User-Agent is picked at random for every request.
    AgentList = (
        "Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0",
        "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    )
    rdAgent = random.choice(AgentList)

    oHeader = {
        'Host': host,
        'Referer': refer,
        'User-Agent': rdAgent,
        'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Connection': 'keep-alive',
    }
    page = requests.get(url, headers=oHeader, timeout=timeout)
    if page.status_code != 200:
        print('the page status is :', page.status_code)
        return 0
    return page

def getPicCoverlist(HomePage):
    """Collect the album links and titles shown on a listing page.

    Args:
        HomePage: Response-like object with ``encoding`` and ``text``
            attributes. Its ``encoding`` is set to UTF-8 before ``text``
            is read.

    Returns:
        A list of ``(href, alt)`` tuples pairing each album link with the
        ``alt`` text of its cover image, in document order.
    """
    HomePage.encoding = 'utf-8'
    tree = etree.HTML(HomePage.text)
    links = tree.xpath('//div[@class="postlist"]/ul[@id="pins"]/li/a/@href')
    titles = tree.xpath('//div[@class="postlist"]/ul[@id="pins"]/li/a/img/@alt')
    return [(link, title) for link, title in zip(links, titles)]


def getPageCount(headpageurl):
    """Read an album's first page and work out which numbered pages follow it.

    Args:
        headpageurl: URL of the album's first page.

    Returns:
        A three-item list ``[headpageurl, HeadNum, EndNum]`` in the shape
        ``MakepageUrllist`` consumes. ``HeadNum`` is the number at the end of
        the last pager link and ``EndNum`` the number at the end of the
        second-to-last pager link (presumably the "next page" link and the
        final page number respectively -- confirm against the site markup).
        When the pager has fewer than two links an empty range
        ``[headpageurl, 2, 1]`` is returned so only the first page is used.

    Raises:
        AttributeError: If ``getpage`` returned ``0`` instead of a response.
        ValueError: If a pager link does not end in an integer.
    """
    page = getpage(headpageurl)
    page.encoding = 'utf-8'
    html = etree.HTML(page.text)
    picurllist = html.xpath('//div[@class="pagenavi"]/a/@href')
    if len(picurllist) < 2:
        # No usable pager: treat the album as a single page.
        return [headpageurl, 2, 1]
    HeadNum = int(picurllist[-1].rstrip('/').split('/')[-1])
    EndNum = int(picurllist[-2].rstrip('/').split('/')[-1])
    return [headpageurl, HeadNum, EndNum]


def MakepageUrllist(pageurlNum):
    """Expand ``[base_url, first, last]`` into the full list of page URLs.

    Args:
        pageurlNum: Sequence whose item 0 is the album's first-page URL and
            whose items 1 and 2 are the first and last page numbers to append.

    Returns:
        A list starting with ``base_url`` followed by ``base_url/<n>`` for
        every ``n`` from ``first`` to ``last`` inclusive. An empty range
        (``last < first``) yields just ``[base_url]``.
    """
    base_url, first, last = pageurlNum[0], pageurlNum[1], pageurlNum[2]
    pageurllist = [base_url]
    pageurllist.extend(base_url + '/' + str(i) for i in range(first, last + 1))
    return pageurllist

def getPicdown(piclasspageurl, savepath):
    """Download the image shown on a single album page.

    Args:
        piclasspageurl: URL of the album page. The text after its last ``/``
            must be an integer.
        savepath: Directory the image file is written into.

    Returns:
        The integer parsed from the end of ``piclasspageurl`` when a new
        image was saved. ``0`` when the page or image request failed, the
        page contains no image, the file already exists, or writing the
        file failed.

    Raises:
        ValueError: If ``piclasspageurl`` does not end in an integer.
    """
    pos = piclasspageurl.rfind('/')
    last = int(piclasspageurl[pos + 1:])
    # The URL ending in the preceding number is sent as the Referer.
    if last > 1:
        lasturl = piclasspageurl[:pos + 1] + str(last - 1)
    else:
        lasturl = piclasspageurl

    page = getpage(piclasspageurl, lasturl)
    if page == 0:
        print('下载图片错误', piclasspageurl)
        return 0
    page.encoding = 'utf-8'
    html = etree.HTML(page.text)
    # xpath returns a list; only the first match is the image to fetch.
    srcs = html.xpath('//div[@class="main-image"]/p/a/img/@src')
    if not srcs:
        print('下载图片错误', piclasspageurl, srcs)
        return 0
    imgurl = srcs[0]

    imgname = os.path.join(savepath, imgurl.split('/')[-1])
    if os.path.exists(imgname):
        print('\n已下载:', imgurl)
        return 0

    imghtml = getpage(imgurl, piclasspageurl, 'imgpc.iimzt.com')
    if imghtml == 0:
        print('下载图片错误', piclasspageurl, imgurl)
        return 0

    try:
        with open(imgname, 'wb') as f:
            f.write(imghtml.content)
    except OSError:
        # A failed write must not be reported to the caller as a success.
        print('写入错误:' + imgurl)
        saved = 0
    else:
        saved = last
    # Random pause after each image request to throttle the crawl.
    time.sleep(random.randint(3, 8))
    return saved

def checkdownfolder(savefolder, covername):
    """Return the album folder path, creating the folder when it is missing.

    Args:
        savefolder: Parent directory path.
        covername: Folder name appended to ``savefolder`` with a backslash.

    Returns:
        ``savefolder + '\\' + covername``.

    Side effects:
        When the folder did not exist it is created (including parents) and
        the process working directory is changed to it. An existing folder
        is left alone and the working directory is not changed.
    """
    target = '\\'.join((savefolder, covername))
    if os.path.exists(target):
        return target
    os.makedirs(target)
    os.chdir(target)
    return target

   
def getPicClasslist(homepage):
    """Read the navigation menu entries from the site's home page.

    Args:
        homepage: Response-like object with ``encoding`` and ``text``
            attributes. Its ``encoding`` is set to UTF-8 before ``text``
            is read.

    Returns:
        A list of ``(title, href)`` tuples, one per menu link, in document
        order.
    """
    homepage.encoding = 'utf-8'
    tree = etree.HTML(homepage.text)
    hrefs = tree.xpath('//div[@class="mainnav"]/ul[@id="menu-nav"]/li/a/@href')
    titles = tree.xpath('//div[@class="mainnav"]/ul[@id="menu-nav"]/li/a/@title')
    return [(title, href) for title, href in zip(titles, hrefs)]


def choosePicClass():
    """List the site's categories and return the one the user picks.

    Returns:
        The chosen ``(title, url)`` entry from ``getPicClasslist``, or an
        empty list when the input is not one of the numbers shown.
    """
    homeURL = 'https://www.mzitu.com'
    pclass = getPicClasslist(getpage(homeURL))
    print('选择类别'.center(45, '*'))
    # Entry 0 and the last three menu entries are not offered (presumably
    # they are not picture categories -- confirm against the site menu).
    selectable = range(1, len(pclass) - 3)
    for k in selectable:
        print('序号' + str(k) + ' >> ' + pclass[k][0])
    print(' your choice? '.center(45, '*'))
    urlnumber = input('请选择数字:')
    # Accept exactly the numbers that were displayed above.
    if urlnumber.isdecimal() and int(urlnumber) in selectable:
        return pclass[int(urlnumber)]
    return []
def getClasspagelist(classhomeurl):
    """List every listing-page URL of a category, first page included.

    Args:
        classhomeurl: URL of the category's first page. It must end with
            ``/`` because ``page/<n>/`` is appended to it directly.

    Returns:
        A list starting with ``classhomeurl`` followed by
        ``classhomeurl + 'page/<n>/'`` for ``n`` from 2 up to the number at
        the end of the last ``page-numbers`` link. When the page has no such
        links only ``[classhomeurl]`` is returned.

    Raises:
        AttributeError: If ``getpage`` returned ``0`` instead of a response.
        ValueError: If the last pager link does not end in an integer.
    """
    page = getpage(classhomeurl)
    page.encoding = 'utf-8'
    html = etree.HTML(page.text)
    tmppagelist = html.xpath('//div[@class="nav-links"]/a[@class="page-numbers"]/@href')
    classpagelist = [classhomeurl]
    if not tmppagelist:
        return classpagelist
    # The final path segment of the last pager link is the highest page number.
    lastlink = tmppagelist[-1].rstrip('/')
    lastpage = int(lastlink[lastlink.rfind('/') + 1:])
    classpagelist.extend(
        classhomeurl + 'page/' + str(p) + '/' for p in range(2, lastpage + 1)
    )
    return classpagelist

def getclassCode(ClassUrl):
    """Return the last path segment of a category URL.

    Args:
        ClassUrl: Category URL, with or without trailing slashes.

    Returns:
        The text after the final ``/`` once trailing slashes are removed;
        the whole trimmed string when it contains no ``/``.
    """
    trimmed = ClassUrl.rstrip('/')
    # rfind returns -1 when there is no slash, so the slice starts at 0.
    return trimmed[trimmed.rfind('/') + 1:]
#def delspicalchar(ostring):
    #ochar=r'[\/:*?"<>|]*'
    #return re.sub(ochar,'',ostring)
def formatname(img_title):
    """Remove characters that are not wanted in a folder name.

    Each of ``\\ / : * ? " < > ! |`` is handled in turn: when it occurs in
    the title, the title is stripped of surrounding whitespace and every
    occurrence of that character is deleted. A title containing none of
    these characters is returned unchanged, surrounding whitespace included.

    Args:
        img_title: Raw title text.

    Returns:
        The cleaned title.
    """
    for forbidden in '\\/:*?"<>!|':
        if forbidden in img_title:
            img_title = img_title.strip().replace(forbidden, '')
    return img_title
   

def MainDownThread():
    """Worker entry point: pick a category, then download it album by album.

    Reads the module globals ``logpath``, ``mainurl``, ``downfolder`` and
    ``ExitKey``. Progress for the chosen category (listing page, album index,
    picture index) is loaded from the JSON log at ``logpath`` when present so
    an earlier run can be resumed, and written back through ``SaveProgress``
    when the loops finish, when a picture fails, or when an unexpected error
    aborts the run. Every loop level checks ``ExitKey`` and stops once it is
    set.
    """
    picClass = choosePicClass()
    if not picClass:
        # choosePicClass returns an empty list for an invalid selection.
        print('\n 出现错误,无法继续', '未选择有效类别')
        return
    picClasstitle = picClass[0]
    picClassURL = picClass[1]
    picClassCode = getclassCode(picClassURL)
    progressDict = {'classname': picClassCode, 'classpage': 0, 'covernumber': 0, 'picpage': 0}
    progress = []

    # Load previously saved download progress from the log file.
    if os.path.exists(logpath):
        with open(logpath, 'r') as f:
            progress = json.load(f)

    # Resume from the saved entry for this category, if there is one.
    for pDict in progress:
        if pDict['classname'] == picClassCode:
            progressDict = pDict
            picClassURL = mainurl + progressDict['classname'] + '/'
            picClassCode = progressDict['classname']
            picClasstitle = progressDict['classname']
            break
    classpagebegin = progressDict['classpage']
    coverbegin = progressDict['covernumber']
    picbegin = progressDict['picpage']
    classpagelist = getClasspagelist(picClassURL)
    classsavefolder = downfolder + '\\' + picClassCode

    print('开始下载'.center(50, '+'))

    try:
        for cp in range(classpagebegin, len(classpagelist)):
            if ExitKey:
                break
            print('\n 正在下载 : ' + picClasstitle + ' 第 ' + str(cp + 1) + ' 页 ')
            progressDict['classpage'] = cp
            classpageurl = classpagelist[cp]
            coverlist = getPicCoverlist(getpage(classpageurl))
            # Saved album offset only applies to the listing page resumed from.
            if cp > classpagebegin:
                coverbegin = 0
            for i in range(coverbegin, len(coverlist)):
                if ExitKey:
                    break
                cover = coverlist[i]
                coverUrl = cover[0]
                covertitle = formatname(cover[1])
                print('\n\n Cover ' + str(i + 1) + ' : [' + covertitle + ']')
                progressDict['covernumber'] = i
                coverfolder = checkdownfolder(classsavefolder, covertitle)
                Pagelist = MakepageUrllist(getPageCount(coverUrl))
                print(' 套图共计 : ' + str(len(Pagelist)) + ' 张\n')
                # Saved picture offset only applies to the album resumed from.
                if cp > classpagebegin or i > coverbegin:
                    picbegin = 0
                for j in range(picbegin, len(Pagelist)):
                    if ExitKey:
                        break
                    purl = Pagelist[j]
                    try:
                        getre = getPicdown(purl, coverfolder)
                        if getre > 0:
                            print('\r' + '---->完成下载第 ' + str(j + 1) + ' 张', end='', flush=True)
                            progressDict['picpage'] = j
                    except Exception as e:
                        # Record where the failure happened, then move on to
                        # the next picture; the failed one is not retried.
                        progressDict['picpage'] = j
                        SaveProgress(progressDict, progress)
                        print('\n第' + str(j + 1) + '张, 下载错误。', purl, e)
                        continue

    except Exception as e:
        SaveProgress(progressDict, progress)
        print('\n 出现错误,无法继续', e)
    else:
        SaveProgress(progressDict, progress)
        print('\n下载进程退出~~~~')
   
   
def SaveProgress(pdict, progress, path=None):
    """Replace a category's entry in ``progress`` and write the list as JSON.

    Args:
        pdict: Progress record; must contain the key ``'classname'``.
        progress: List of progress records. It is updated in place: every
            record with the same ``'classname'`` as ``pdict`` is dropped and
            ``pdict`` is appended.
        path: File to write. Defaults to the module-level ``logpath``.
    """
    # Rebuild via slice assignment instead of calling remove() while
    # iterating, which skips the element following each removal.
    progress[:] = [pd for pd in progress if pd['classname'] != pdict['classname']]
    progress.append(pdict)
    target = logpath if path is None else path
    with open(target, 'w') as f:
        json.dump(progress, f)
            
def presskey(key):
    """Key-press callback that detects left Ctrl followed quickly by F3.

    Pressing left Ctrl stores the current time in the global ``TimeStamp``.
    Pressing F3 less than 0.5 seconds after that sets the global ``ExitKey``
    to ``True``, sleeps for one second and returns ``False`` (presumably the
    listener's signal to stop -- confirm against the pynput documentation).
    Every other case returns ``None``.

    Args:
        key: The key object passed in by the keyboard listener.
    """
    global ExitKey, TimeStamp
    if key == Key.ctrl_l:
        TimeStamp = time.time()
    if key != Key.f3:
        return None
    if time.time() - TimeStamp >= 0.5:
        return None
    ExitKey = True
    time.sleep(1)
    return False
      
      

def onrelease(key):
    """Key-release callback: return ``False`` for Esc, ``None`` otherwise.

    Args:
        key: The key object passed in by the keyboard listener.
    """
    return False if key == Key.esc else None

def startlisten():
    """Run a keyboard listener with ``presskey`` as its press handler.

    Blocks in ``join()`` until the listener finishes. No release handler is
    registered.
    """
    with Listener(on_press=presskey) as kb_listener:
        kb_listener.join()
            
if __name__ == '__main__':

    # Shared state read by the download thread and the key handler.
    ExitKey = False
    TimeStamp = 0
    mainurl = 'https://www.mzitu.com/'
    downfolder = r'F:\Backup\privatebak\mypic'
    logpath = downfolder + '\\' + 'downinfo.log'

    # daemon=True replaces the deprecated Thread.setDaemon(True).
    t1 = threading.Thread(target=MainDownThread, daemon=True)
    t1.start()
    # Blocks until the keyboard listener stops.
    startlisten()
    # Wait for the download thread to finish before exiting.
    t1.join()
    os.system('pause')
页: [1]
查看完整版本: python3 图片爬虫