威望0
积分7976
贡献0
在线时间763 小时
UID1
注册时间2021-4-14
最后登录2024-11-24
管理员
- UID
- 1
- 威望
- 0
- 积分
- 7976
- 贡献
- 0
- 注册时间
- 2021-4-14
- 最后登录
- 2024-11-24
- 在线时间
- 763 小时
|
[mw_shl_code=python,true]#!/usr/bin/env python
# coding:utf-8
# @Time:2021/7/25 12:47
# @Author:李宏
# @File: 爬取哔哩哔哩视频.py
# @Software: PyCharm
# 3.1 准备工作
# 依赖的包
import json
import os
import sys
import re
import shutil
import ssl
import time
import requests
from concurrent.futures import ThreadPoolExecutor
from random import choice
from lxml import etree
import warnings
warnings.filterwarnings("ignore") # 忽略warning警告错误
'''
需要改变的具体数据项:
进入视频播放页面,按F12键进入,按F5键刷新,再暂停。
通过类似 BV1z4411N7iv?from=search&seid=9453069623894011451 页面
1、按 CTRL+F 查找 title 或自定义 title
title = 'Pandas数据分析_Python进阶'
2、点击 Response 及下面的 { } ,找到 window.__INITIAL_STATE__ = {
"bvid":"BV1z4411N7iv"
"aid":55166475,
"p":30, # 视频数量
3、点击 Header 获得:
a、 'referer': 'https://www.bilibili.com/video/BV1z4411N7iv?from=search&seid=9453069623894011451',
b、 url = 'https://api.bilibili.com/x/player/pagelist?bvid=BV1z4411N7iv&jsonp=jsonp'
c、 由 Remote Address 获得 proxy 地址
'''
''' 以下两行为需要更新的内容 '''
# Per-run settings: the display title (also used as the output folder name)
# and the page URL of the video series to download.
title = 'PyQt5教程'
main_address = 'https://www.bilibili.com/video/BV154411n79k?p=60'
# The BV id is the last path segment of the page URL, query string removed.
bvid = main_address.split('?')[0].split('/')[-1]
filepath = os.path.join(r'D:\software', title)
# Create the output directory (and any missing parents). An already existing
# directory is not an error; any other failure is reported instead of being
# mislabelled as "directory already exists".
try:
    os.makedirs(filepath, exist_ok=True)
except OSError as exc:
    print('无法创建目录 {}: {}'.format(filepath, exc))
# Proxy and request headers shared by the HTTP requests in this script.
proxy = {'https': 'http://10.22.96.29:8080', 'http': 'http://10.22.96.29:8080'}
headers = {
    'Accept': '*/*',
    # 'br' is intentionally not advertised: requests can only decode Brotli
    # when an optional brotli package is installed, and an undecodable body
    # would break the regex-based parsing of the player page.
    'accept-encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'referer': main_address,
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.164 Safari/537.36',
}
# Pool of browser User-Agent strings. Built once at import time; exact
# duplicates were removed so every entry is equally likely to be picked.
_USER_AGENTS = (
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
    "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
    "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1",
    "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.23 Mobile Safari/537.36",
    "Mozilla/5.0 (iPod; U; CPU iPhone OS 2_1 like Mac OS X; ja-jp) AppleWebKit/525.18.1 (KHTML, like Gecko) Version/3.1.1 Mobile/5F137 Safari/525.20",
    "Mozilla/5.0 (Linux;u;Android 4.2.2;zh-cn;) AppleWebKit/534.46 (KHTML,like Gecko) Version/5.1 Mobile Safari/10600.6.3 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
    "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
)


def get_user_agent():
    """Return one User-Agent string picked at random from the built-in pool.

    Returns:
        str: A browser User-Agent header value.
    """
    return choice(_USER_AGENTS)
# 3.2 编写下载代码
# 3.2 编写正则表达式
def re_video_info(text, pattern):
    """Extract a JSON document embedded in ``text`` and decode it.

    Args:
        text: Text to search (the caller passes the HTML of a player page).
        pattern: Regular expression whose first capture group encloses the
            JSON payload.

    Returns:
        The decoded JSON value from the first capture group.

    Raises:
        ValueError: If ``pattern`` does not match ``text``, or if the captured
            text is not valid JSON (``json.JSONDecodeError`` is a subclass of
            ``ValueError``).
    """
    match = re.search(pattern, text)
    if match is None:
        # Without this guard the failure surfaced as an opaque
        # "'NoneType' object has no attribute 'group'".
        raise ValueError('pattern {!r} not found in text'.format(pattern))
    return json.loads(match.group(1))
def single_download(url, acc_quality, page, filename):
    """Download one part of a multi-part video.

    Requests the player page for part ``page``, decodes the ``__playinfo__``
    JSON embedded in it, selects one DASH video stream and one DASH audio
    stream, and passes their URLs to ``download_video_single``.

    Args:
        url: Video page URL without the ``p`` query parameter.
        acc_quality: Index into the ``data.dash.video`` list of the play
            info. The same index is used for ``data.dash.audio``, clamped to
            that list's last entry.
        page: Part number placed in the ``p`` query parameter.
        filename: Output base name (no directory, no extension); it is joined
            onto the module-level ``filepath``.

    Raises:
        requests.HTTPError: If the player page request returns an error status.
        ValueError: If the play info cannot be located in the page.
    """
    origin_video_url = url + '?p={0}'.format(str(page).strip())
    print(origin_video_url)
    # NOTE(review): certificate verification is disabled here, presumably
    # because of the intercepting proxy -- confirm this is still required.
    res = requests.get(origin_video_url, headers=headers, timeout=6,
                       proxies=proxy, verify=False)
    # Fail early on an error page instead of failing later in the regex.
    res.raise_for_status()
    filename = os.path.join(filepath, filename)
    print('您当前正在下载:', title+': ' + filename)
    video_info_temp = re_video_info(res.text, '__playinfo__=(.*?)</script><script>')
    dash = video_info_temp['data']['dash']
    video_url = dash['video'][acc_quality]['baseUrl']
    # The audio list is independent of the video list, so clamp the index
    # rather than assume both lists have the same length.
    audio_streams = dash['audio']
    audio_index = min(acc_quality, len(audio_streams) - 1)
    audio_url = audio_streams[audio_index]['baseUrl']
    download_video_single(origin_video_url, video_url, audio_url, filename)
# 3.3 编写下载代码
def _stream_to_file(url, path, request_headers):
    """Stream the body of ``url`` into ``path``, replacing any existing file."""
    with requests.get(url, headers=request_headers, timeout=20,
                      proxies=proxy, verify=False, stream=True) as response:
        # Never save an HTTP error page as if it were media data.
        response.raise_for_status()
        # Open only after the request succeeded so a failed request does not
        # truncate a file left by an earlier run.
        with open(path, 'wb') as output:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                output.write(chunk)


def download_video_single(referer_url, video_url, audio_url, video_name):
    """Download the video and audio streams of one part, then merge them.

    The streams are written to ``<video_name>_video.mp4`` and
    ``<video_name>_audio.mp3`` and then handed to ``video_audio_merge_single``.
    Each file is written from scratch: the request always asks for the full
    byte range, so appending to a leftover partial file would corrupt it.

    Args:
        referer_url: Page URL sent as the ``Referer`` header.
        video_url: URL of the video stream.
        audio_url: URL of the audio stream.
        video_name: Output path without suffix or extension.

    Raises:
        requests.HTTPError: If either stream request returns an error status.
    """
    # Work on a copy so Referer/Range do not leak into the module-level
    # headers that later page requests reuse.
    request_headers = {
        key: value for key, value in headers.items()
        if key.lower() not in ('referer', 'range')
    }
    request_headers['Referer'] = referer_url
    request_headers['Range'] = 'bytes=0-'
    _stream_to_file(video_url, '%s_video.mp4' % video_name, request_headers)
    _stream_to_file(audio_url, '%s_audio.mp3' % video_name, request_headers)
    video_audio_merge_single(video_name)
# 3.4 将下载好的音频和视频合并
def video_audio_merge_single(video_name, ffmpeg_path=r'D:\software\ffmpeg.exe'):
    """Merge the downloaded audio and video parts into one file with ffmpeg.

    Reads ``<video_name>_audio.mp3`` and ``<video_name>_video.mp4`` and writes
    ``<video_name>.mp4`` with both streams copied (no re-encoding). The two
    part files are deleted only after ffmpeg has exited successfully; on any
    failure they are kept so the download is not lost.

    Args:
        video_name: Output path without suffix or extension.
        ffmpeg_path: Path of the ffmpeg executable.
    """
    import subprocess

    audio_part = video_name + '_audio.mp3'
    video_part = video_name + '_video.mp4'
    print("视频合成开始:%s" % video_name)
    # An argument list with no shell keeps paths containing spaces or shell
    # metacharacters intact.
    command = [
        ffmpeg_path,
        '-i', audio_part,
        '-i', video_part,
        '-acodec', 'copy',
        '-vcodec', 'copy',
        video_name + '.mp4',
    ]
    try:
        # run() blocks until ffmpeg exits, replacing the fixed 20-second
        # sleep that could delete the inputs while ffmpeg was still running.
        completed = subprocess.run(command)
    except OSError as exc:
        # Typically: the ffmpeg executable was not found.
        print("视频合成失败:%s (%s)" % (video_name, exc))
        return
    if completed.returncode != 0:
        print("视频合成失败:%s (ffmpeg exit code %d)" % (video_name, completed.returncode))
        return
    for leftover in (audio_part, video_part):
        if os.path.exists(leftover):
            os.remove(leftover)
    print("视频合成结束:%s" % video_name)
# Characters removed from part titles when building file names: whitespace,
# brackets and punctuation, plus the characters Windows forbids in file
# names. '.' is mapped to '_' so the title cannot introduce an extension.
_REMOVED_FILENAME_CHARS = ' /\n\t[]【】{}()、,:\\?*"<>|'
_FILENAME_TABLE = str.maketrans('.', '_', _REMOVED_FILENAME_CHARS)


def _safe_filename(part_title):
    """Return ``part_title`` with unsafe file-name characters removed."""
    return part_title.translate(_FILENAME_TABLE)


def main():
    """Download every part of the video identified by the module-level ``bvid``.

    Fetches the part list from the pagelist API, derives one file name per
    part (``NNN_<sanitised part title>``) and calls ``single_download`` for
    each part in order. Prints the elapsed time at the end.

    Raises:
        requests.HTTPError: If the pagelist request returns an error status.
    """
    start_time = time.time()
    url = r'https://api.bilibili.com/x/player/pagelist?bvid=' + bvid + r'&jsonp=jsonp'
    # A timeout keeps an unreachable proxy or API from hanging the script.
    res = requests.get(url, proxies=proxy, timeout=6)
    res.raise_for_status()
    res.encoding = 'utf-8'
    result = json.loads(res.text).get('data')
    if not result:
        # The response carried no part list, so there is nothing to download.
        print('未获取到视频分P列表:', res.text)
        return
    filenames = [_safe_filename(it['part']) for it in result]
    print(filenames)
    url = r'https://www.bilibili.com/video/' + bvid
    # Part numbers in the page URL start at 1.
    for page, part_name in enumerate(filenames, start=1):
        zf = str(page).zfill(3) + '_' + part_name
        single_download(url, 0, page, zf)
    # Elapsed time = current time - start time.
    print("程序运行耗时:{}".format(time.time() - start_time))


if __name__ == '__main__':
    main()
[/mw_shl_code] |
|