Python实现年报爬取源码

admin 发表于 2022-10-22 12:26:34

import requests
import re
# The crawler function is defined below.
# Step 1: request a single listing page and read back its data (page 1 is the example).
# Directory into which the downloaded PDF files are written.
saving_path = 'F:\\张子萱\\project\\能源统计年鉴'
import requests
def get_and_download_pdf_flie(pageNum):
    """Download the annual-report PDFs listed on one page of a cninfo query.

    Posts the announcement query (category ``category_ndbg_szsh``, stock
    ``600023``) to cninfo's ``hisAnnouncement/query`` endpoint, then fetches
    the attachment of every returned announcement whose title does not
    contain "摘要" and writes it into the module-level ``saving_path``
    directory as ``<secCode><secName><title>.pdf``.

    Args:
        pageNum: Page of the result list to request; converted with ``int()``.

    Raises:
        requests.RequestException: A request timed out, failed, or returned
            a non-success HTTP status.
        OSError: The target directory or file could not be created/written.
    """
    import os  # local import so this block does not depend on the file header

    query_url = 'http://www.cninfo.com.cn/new/hisAnnouncement/query'
    # Without a timeout a stalled connection would block the script forever.
    timeout_seconds = 30

    data = {
        'pageNum': int(pageNum),
        'pageSize': 30,
        'column': 'sse',
        'tabName': 'fulltext',
        'plate': 'sh',
        'stock': '600023,9900027828',
        'searchkey': '',
        'secid': '',
        'category': 'category_ndbg_szsh',
        'trade': '',
        'seDate': '',
        'sortName': '',
        'sortType': '',
        'isHLtitle': 'true',
    }
    # No hard-coded Content-Length here: requests derives it from the
    # encoded form body, so a fixed value is at best redundant.
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Host': 'www.cninfo.com.cn',
        'Origin': 'http://www.cninfo.com.cn',
        'Referer': 'http://www.cninfo.com.cn/new/disclosure/stock?stockCode=600023&orgId=9900027828',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36 SLBrowser/7.0.0.12151 SLBChan/1',
        'X-Requested-With': 'XMLHttpRequest',
    }

    listing = requests.post(query_url, data=data, headers=headers,
                            timeout=timeout_seconds)
    listing.raise_for_status()
    # A missing or null 'announcements' value is treated as an empty page
    # instead of crashing the loop below.
    announcements = listing.json().get('announcements') or []

    os.makedirs(saving_path, exist_ok=True)

    for item in announcements:
        title = item['announcementTitle']
        # Skip report summaries; only the full reports are wanted.
        if '摘要' in title:
            continue

        # '*' is stripped from the security name, as in the original script,
        # so the progress message stays the same.
        secName = item['secName'].replace('*', '')
        secCode = item['secCode']
        down_url = 'http://static.cninfo.com.cn/' + item['adjunctUrl']

        # Remove every character Windows forbids in file names, not just '*',
        # and apply it to the title as well as the security name.
        filename = re.sub(r'[\\/:*?"<>|]', '', f'{secCode}{secName}{title}.pdf')
        filepath = os.path.join(saving_path, filename)

        pdf_response = requests.get(down_url, timeout=timeout_seconds)
        # Fail loudly rather than saving an HTTP error page as a .pdf file.
        pdf_response.raise_for_status()
        with open(filepath, 'wb') as f:
            f.write(pdf_response.content)
        print(f'{secCode}{secName}{title}下载完毕')  # progress message
# Step 3: loop over several listing pages and download the reports on each.
for pageNum in range(1, 3):  # demonstration only: pages 1 and 2
    get_and_download_pdf_flie(pageNum)

页: [1]

php中文网 | cnphp.com's Archiver

Python实现年报爬取源码