威望0
积分7946
贡献0
在线时间763 小时
UID1
注册时间2021-4-14
最后登录2024-11-21
管理员
- UID
- 1
- 威望
- 0
- 积分
- 7946
- 贡献
- 0
- 注册时间
- 2021-4-14
- 最后登录
- 2024-11-21
- 在线时间
- 763 小时
|
[mw_shl_code=python,true]import requests
import re
# Define the crawling function
# 1. Request a single result page and return its data -- the first page is used as the example
# Directory the downloaded PDF files are written into (read by get_and_download_pdf_flie).
saving_path='F:\\张子萱\\project\\能源统计年鉴'
import requests
def get_and_download_pdf_flie(pageNum):
    """Download the annual-report PDFs listed on one cninfo result page.

    Posts a query for stock 600023 (category ``category_ndbg_szsh``) to the
    cninfo announcement endpoint, then saves every returned announcement
    whose title does not contain '摘要' as a PDF inside the module-level
    ``saving_path`` directory. One progress line is printed per saved file.

    Args:
        pageNum: Page number of the result list to request; anything
            accepted by ``int()``. The driver loop in this file passes
            1 and 2.

    Raises:
        requests.HTTPError: If the query or a download answers with an
            HTTP error status.
        requests.Timeout: If the server stops responding.
    """
    import os  # local import so the file's import header stays untouched

    timeout = 30  # seconds; without it a stalled connection blocks forever
    url = 'http://www.cninfo.com.cn/new/hisAnnouncement/query'
    data = {
        'pageNum': int(pageNum),
        'pageSize': 30,
        'column': 'sse',
        'tabName': 'fulltext',
        'plate': 'sh',
        'stock': '600023,9900027828',
        'searchkey': '',
        'secid': '',
        'category': 'category_ndbg_szsh',
        'trade': '',
        'seDate': '',
        'sortName': '',
        'sortType': '',
        'isHLtitle': 'true',
    }
    # Content-Length is intentionally not set by hand: requests derives it
    # from the encoded form body, so a copied fixed value is only misleading.
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Host': 'www.cninfo.com.cn',
        'Origin': 'http://www.cninfo.com.cn',
        'Referer': 'http://www.cninfo.com.cn/new/disclosure/stock?stockCode=600023&orgId=9900027828',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36 SLBrowser/7.0.0.12151 SLBChan/1',
        'X-Requested-With': 'XMLHttpRequest',
    }
    query_resp = requests.post(url, data=data, headers=headers, timeout=timeout)
    query_resp.raise_for_status()
    # The response is JSON; the report entries live under 'announcements'.
    # Guard against the field being null or absent so that such a page
    # simply downloads nothing instead of crashing the loop below.
    announcements = query_resp.json().get('announcements') or []
    if not announcements:
        return

    os.makedirs(saving_path, exist_ok=True)

    # 2. Extract the fields of each entry and download its PDF
    for item in announcements:
        title = item['announcementTitle']
        if '摘要' in title:  # skip report summaries and similar unwanted files
            continue
        # '*' is removed from the security name, as before, so the progress
        # message keeps its original wording.
        secName = item['secName'].replace('*', '')
        secCode = item['secCode']
        label = f'{secCode}{secName}{title}'
        # Strip every character Windows forbids in file names, not just '*',
        # otherwise open() fails on titles containing e.g. ':' or '/'.
        filename = re.sub(r'[\\/:*?"<>|]', '', label) + '.pdf'
        filepath = os.path.join(saving_path, filename)

        down_url = 'http://static.cninfo.com.cn/' + item['adjunctUrl']
        pdf_resp = requests.get(down_url, timeout=timeout)
        # Fail loudly rather than saving an error page under a .pdf name.
        pdf_resp.raise_for_status()
        with open(filepath, 'wb') as f:
            f.write(pdf_resp.content)
        print(f'{label}下载完毕')  # progress output
#3. Loop over several result pages to download their annual reports
for pageNum in range(1,3):# for demonstration, download the reports on pages 1-2
get_and_download_pdf_flie(pageNum)[/mw_shl_code] |
|