威望0
积分7946
贡献0
在线时间763 小时
UID1
注册时间2021-4-14
最后登录2024-11-21
管理员
- UID
- 1
- 威望
- 0
- 积分
- 7946
- 贡献
- 0
- 注册时间
- 2021-4-14
- 最后登录
- 2024-11-21
- 在线时间
- 763 小时
|
[mw_shl_code=python,true]import re
import requests
# Request headers sent with every search request: a desktop-browser
# User-Agent plus a fixed Cookie string.
# NOTE(review): the cookie looks like a snapshot copied from a browser
# session (it embeds visit/trace ids) -- replace as required.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) '
        'AppleWebKit/535.11 (KHTML, like Gecko) '
        'Chrome/17.0.963.84 Safari/535.11 LBBROWSER'
    ),
    'Cookie': (
        '__permanent_id=20221203162355205379923102881427769; '
        '__rpm=|s_112100...1670079692842; '
        'search_passback=b6219fd996d8e8e9e9118c63fc010000de89c800e1118c63; '
        'ddscreen=2; '
        'dest_area=country_id=9000&province_id=111&city_id =0&district_id=0&town_id=0; '
        '__visit_id=20221204112010093339809160925577017; '
        '__out_refer=; '
        '__trace_id=20221204112010094534951493076142788; '
        'pos_9_end=1670124010185; '
        'pos_0_start=1670124010224; '
        'pos_0_end=1670124010229; '
        'ad_ids=2095975|#1'
    ),
}
def _first_group(pattern, text):
    """Return group 1 of the first match of *pattern* in *text*, or None."""
    match = re.search(pattern, text, re.S)
    return match.group(1) if match else None


def _parse_item(html, position):
    """Extract one book from a search-result page.

    Looks up the ``<li ddt-pit="position">`` element in *html* and returns
    ``(title, price, author, publisher)`` as strings, or ``None`` when the
    element, its title, its price or its author paragraph is missing.
    """
    item = re.search(
        r'<li ddt-pit="' + str(position) + r'"(.*?)>(.*?)</li>', html, re.S
    )
    if item is None:
        return None
    # Work on the raw markup of the item rather than the repr() of a list of
    # tuples, so extracted text is not polluted with backslash escapes.
    item_html = item.group(0)

    title = _first_group(r'<a title="(.*?)"', item_html)
    price = _first_group(r'_now_price">.*?(\d+\D\d+)', item_html)
    author_html = _first_group(
        r'<p class="search_book_author">(.*?)</p>', item_html
    )
    if title is None or price is None or author_html is None:
        return None

    author = _first_group(r'2=(.*?)&medium', author_html)
    publisher = _first_group(r'P_cbs.*?>(.*?)</a', author_html)
    if author is not None and publisher is not None:
        return title, price, author, publisher

    # Fallback layout: no author/publisher links. The author is recorded as
    # '佚名' and the publisher is taken from the second "dd_name" fragment.
    fragments = re.findall(r'dd_name.*?t.*?>.*?社?<', author_html, re.S)
    fallback_publisher = None
    if len(fragments) > 1:
        fallback_publisher = _first_group(r'>(.*?)<', fragments[1])
    # Keep the row even if the publisher cannot be located, instead of
    # aborting the whole crawl on an unexpected layout.
    return title, price, '佚名', fallback_publisher or ''


def spider(keyword, page):
    """Scrape Dangdang search results for *keyword* into a CSV file.

    For each page index in ``range(1, page)`` the search page is downloaded
    and result positions ``1..59`` are parsed.  Every book found is appended
    to ``dsj_2002_20200126057_.csv`` as ``title,price,author,publisher`` and
    echoed to stdout.  Positions that are absent or unparsable are skipped.

    Args:
        keyword: Search term; converted with ``str()`` and appended to the URL.
        page: Exclusive upper bound of the page index (``page=7`` fetches
            pages 1-6).

    Raises:
        requests.RequestException: Propagated from ``requests.get`` (for
            example when the 2-second timeout expires).

    NOTE(review): both loops stop one short of their bound (``page`` itself
    and position 60 are never visited) -- confirm whether that is intended.
    """
    import csv  # local import: the file's import header is left untouched

    # Single output path for every row; the original fallback branch wrote to
    # "dsj_2002_20200126057_csv" (missing dot), scattering rows over two files.
    out_path = "dsj_2002_20200126057_.csv"

    for i in range(1, page):
        url = 'http://search.dangdang.com/?key=' + str(keyword) + '&page_index=' + str(i)
        print(url)
        res = requests.get(url, headers=headers, timeout=2)
        html = res.text

        # newline='' is what the csv module expects; the writer quotes fields
        # containing commas or quotes so such titles no longer break columns.
        with open(out_path, "a", encoding='utf_8_sig', newline='') as f2:
            writer = csv.writer(f2)
            for a in range(1, 60):
                row = _parse_item(html, a)
                if row is None:
                    continue
                title, price, author, publisher = row
                writer.writerow([title, price, author, publisher])
                print('\n书籍:', title, '\n价格:', price, '\n作者:', author, '\n出版社:', publisher)
# Script entry point: runs one crawl with a fixed search term when the file
# is executed directly.
# NOTE(review): the indentation of this block was lost in extraction, and the
# trailing "[/mw_shl_code] |" on the last line is forum markup, not Python.
if __name__ == '__main__':
# Search term handed to spider().
keyword = 'python爬虫'
# spider() iterates range(1, page), so 7 covers result pages 1-6.
page = 7
spider(keyword, page)[/mw_shl_code] |
|