php中文网 | cnphp.com

 找回密码
 立即注册

QQ登录

只需一步,快速开始

搜索
查看: 618|回复: 0

python爬取哔哩视频

[复制链接]

3146

主题

3156

帖子

1万

积分

管理员

Rank: 9Rank: 9Rank: 9

UID
1
威望
0
积分
7966
贡献
0
注册时间
2021-4-14
最后登录
2024-11-23
在线时间
763 小时
QQ
发表于 2022-6-21 22:23:10 | 显示全部楼层 |阅读模式
[mw_shl_code=python,true]#!/usr/bin/env python
# coding:utf-8
# @Time:2021/7/25 12:47
# @Author:李宏
# @File: 爬取哔哩哔哩视频.py
# @Software: PyCharm

# 3.1 准备工作
# 依赖的包
import json
import os
import sys
import re
import shutil
import ssl
import time
import requests
from concurrent.futures import ThreadPoolExecutor
from random import choice
from lxml import etree
import warnings
warnings.filterwarnings("ignore")  # 忽略warning警告错误


'''
需要改变的具体数据项:
进入视频播放页面,按F12键进入,按F5键刷新,再暂停。
通过类似 BV1z4411N7iv?from=search&seid=9453069623894011451 页面
1、按 CTRL+F 查找 title  或自定义 title
    title = 'Pandas数据分析_Python进阶'
2、点击 Response 及下面的 { } ,找到 window.__INITIAL_STATE__ = {
    "bvid":"BV1z4411N7iv"
    "aid":55166475,
    "p":30,                   # 视频数量
3、点击 Headers 获得:
    a、 'referer': 'https://www.bilibili.com/video/BV1z4411N7iv?from=search&seid=9453069623894011451',
    b、 url = 'https://api.bilibili.com/x/player/pagelist?bvid=BV1XW411Q7VQ&jsonp=jsonp'
    c、 由 Remote Address 获得 proxy 地址
'''

# The two values below are the only ones that need editing for a new download job.
title = 'PyQt5教程'
main_address = 'https://www.bilibili.com/video/BV154411n79k?p=60'

# The BV id is the last path segment of the video URL, with the query string removed.
bvid = main_address.split('?')[0].split('/')[-1]
# Every downloaded part is written into a folder named after the title.
filepath = os.path.join(r'D:\software', title)
# exist_ok=True makes this a no-op when the folder is already present, and
# makedirs also creates missing parent folders. Only a genuine failure
# (permissions, missing drive, ...) reaches the except branch.
try:
    os.makedirs(filepath, exist_ok=True)
except OSError as exc:
    print('无法创建目录 {}: {}'.format(filepath, exc))

# Outbound proxy (same endpoint for both schemes) and the browser-like
# headers that accompany every request.
# NOTE(review): 'br' is advertised in accept-encoding — confirm a brotli
# decoder is installed, otherwise compressed responses may not be decoded.
proxy = {
    'https': 'http://10.22.96.29:8080',
    'http': 'http://10.22.96.29:8080',
}
headers = {
    'Accept': '*/*',
    'accept-encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'referer': main_address,
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.164 Safari/537.36'
    ),
}


def get_user_agent():
    """Return one User-Agent string chosen at random from a built-in pool."""
    # The pool is rebuilt on every call; a few entries appear twice, which
    # makes those agents slightly more likely to be picked.
    pool = [
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
        "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
        "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
        "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
        "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1",
        "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.23 Mobile Safari/537.36",
        "Mozilla/5.0 (iPod; U; CPU iPhone OS 2_1 like Mac OS X; ja-jp) AppleWebKit/525.18.1 (KHTML, like Gecko) Version/3.1.1 Mobile/5F137 Safari/525.20",
        "Mozilla/5.0 (Linux;u;Android 4.2.2;zh-cn;) AppleWebKit/534.46 (KHTML,like Gecko) Version/5.1 Mobile Safari/10600.6.3 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
        "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
    ]
    return choice(pool)


# 3.2 编写下载代码
# 3.2 编写正则表达式
def re_video_info(text, pattern):
    """Find ``pattern`` in ``text`` and decode its first capture group as JSON.

    Args:
        text: Text to search (the caller passes page source).
        pattern: Regular expression whose group 1 captures a JSON document.

    Returns:
        The decoded JSON value.

    Raises:
        ValueError: If ``pattern`` does not match ``text``, or the captured
            text is not valid JSON (``json.JSONDecodeError`` subclasses
            ``ValueError``).
    """
    match = re.search(pattern, text)
    if match is None:
        # Fail with a message that names the pattern instead of the opaque
        # "'NoneType' object has no attribute 'group'".
        raise ValueError('pattern {!r} not found in text'.format(pattern))
    return json.loads(match.group(1))


# def single_download(aid, acc_quality,page):
def single_download(url, acc_quality, page, filename):
    """Download one part (page) of a multi-part video.

    Fetches the player page for ``page``, reads the ``__playinfo__`` JSON
    embedded in it, picks the DASH video and audio streams at index
    ``acc_quality`` and passes their URLs to ``download_video_single``.

    Args:
        url: Video page URL without the ``?p=`` query string.
        acc_quality: Index used for both the ``dash.video`` and the
            ``dash.audio`` stream lists.
        page: Part number, sent as the ``p`` query parameter.
        filename: Base file name without extension; it is joined onto the
            module-level ``filepath`` folder.

    Raises:
        requests.HTTPError: If the page request returns an error status.
        KeyError, IndexError: If the playinfo JSON lacks the expected
            ``data.dash`` streams or ``acc_quality`` is out of range.
        Errors from ``re_video_info`` propagate when the page carries no
        playinfo payload.
    """
    origin_video_url = url + '?p={0}'.format(str(page).strip())
    print(origin_video_url)
    # NOTE(review): verify=False turns off TLS certificate checks; presumably
    # required by the configured proxy — confirm before reusing elsewhere.
    res = requests.get(origin_video_url, headers=headers, timeout=6, proxies=proxy, verify=False)
    # Stop on an HTTP error here rather than failing later while parsing an
    # error page.
    res.raise_for_status()

    filename = os.path.join(filepath, filename)
    print('您当前正在下载:', title+':  ' + filename)
    video_info_temp = re_video_info(res.text, '__playinfo__=(.*?)</script><script>')

    dash = video_info_temp['data']['dash']
    video_url = dash['video'][acc_quality]['baseUrl']
    audio_url = dash['audio'][acc_quality]['baseUrl']
    # The part's own page URL is passed on as the referer for the media requests.
    download_video_single(origin_video_url, video_url, audio_url, filename)


# 3.3 编写下载代码
def _stream_to_file(url, destination, request_headers):
    """Stream the response body of ``url`` into ``destination``, replacing any existing file."""
    with requests.get(url, headers=request_headers, timeout=20, proxies=proxy,
                      verify=False, stream=True) as response:
        response.raise_for_status()
        # The file is opened only after the status check, so a failed request
        # does not leave an empty file behind.
        with open(destination, 'wb') as output:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    output.write(chunk)


def download_video_single(referer_url, video_url, audio_url, video_name):
    """Download the video and audio streams of one part, then merge them.

    Writes ``<video_name>_video.mp4`` and ``<video_name>_audio.mp3`` and then
    calls ``video_audio_merge_single(video_name)``.

    Args:
        referer_url: Sent as the ``Referer`` header of both media requests.
        video_url: URL of the video stream.
        audio_url: URL of the audio stream.
        video_name: Output path prefix, without extension.

    Raises:
        requests.HTTPError: If either media request returns an error status.
    """
    # Work on a private copy so the Referer/Range values used for the media
    # requests do not leak into the module-level ``headers`` that page
    # requests share. Existing case variants of both keys are dropped first.
    request_headers = {
        key: value for key, value in headers.items()
        if key.lower() not in ('referer', 'range')
    }
    request_headers['Referer'] = referer_url
    # Always request the full stream; the files are rewritten from scratch.
    request_headers['Range'] = 'bytes=0-'

    _stream_to_file(video_url, '%s_video.mp4' % video_name, request_headers)
    _stream_to_file(audio_url, '%s_audio.mp3' % video_name, request_headers)
    video_audio_merge_single(video_name)


# 3.4 将下载好的音频和视频合并
def video_audio_merge_single(video_name, ffmpeg_path=r'D:\software\ffmpeg.exe'):
    """Merge the downloaded audio and video files of one part with ffmpeg.

    Reads ``<video_name>_audio.mp3`` and ``<video_name>_video.mp4``, writes
    ``<video_name>.mp4`` with both streams copied, and removes the two
    intermediate files once ffmpeg has finished successfully. When ffmpeg
    cannot be started or exits with a non-zero status, the intermediate files
    are kept.

    Args:
        video_name: Path prefix shared by the input and output files.
        ffmpeg_path: Path of the ffmpeg executable.
    """
    print("视频合成开始:%s" % video_name)
    import subprocess

    audio_file = video_name + '_audio.mp3'
    video_file = video_name + '_video.mp4'
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact. '-y' overwrites an existing output file so
    # ffmpeg never stops to ask for confirmation.
    command = [
        ffmpeg_path, '-y',
        '-i', audio_file,
        '-i', video_file,
        '-acodec', 'copy',
        '-vcodec', 'copy',
        video_name + '.mp4',
    ]
    try:
        # run() blocks until ffmpeg exits, so the inputs are never removed
        # while they are still being read.
        completed = subprocess.run(command)
    except OSError as exc:
        print("视频合成失败:%s (%s)" % (video_name, exc))
        return
    if completed.returncode != 0:
        print("视频合成失败:%s (ffmpeg exit code %s)" % (video_name, completed.returncode))
        return

    for leftover in (audio_file, video_file):
        if os.path.exists(leftover):
            os.remove(leftover)
    print("视频合成结束:%s" % video_name)


# Characters removed from part titles when building file names: whitespace,
# brackets and punctuation (ASCII and full-width), plus the characters that
# Windows does not allow in file names.
_FILENAME_DROP_CHARS = frozenset(' /\\\n\t[]【】{}()（）、,，:：?*"<>|')


def _clean_part_name(part):
    """Return ``part`` with unwanted characters removed and dots turned into underscores."""
    return ''.join(
        '_' if ch == '.' else ch
        for ch in part
        if ch not in _FILENAME_DROP_CHARS
    )


def main():
    """Download every part of the configured video, one after another.

    Requests the part list for the module-level ``bvid``, derives a file name
    ``NNN_<cleaned part title>`` for each part (``NNN`` is the 1-based part
    number, zero-padded to three digits) and calls ``single_download`` for
    each one with stream index 0. Prints the total elapsed time at the end.

    Raises:
        requests.HTTPError: If the part-list request returns an error status.
    """
    start_time = time.time()

    url = r'https://api.bilibili.com/x/player/pagelist?bvid=' + bvid + r'&jsonp=jsonp'
    # A timeout keeps a stalled connection from blocking the script forever.
    res = requests.get(url, proxies=proxy, timeout=6)
    res.raise_for_status()
    res.encoding = 'utf-8'
    # Presumably 'data' is missing or null when the API reports an error;
    # that case is treated as "no parts" instead of crashing.
    result = json.loads(res.text).get('data') or []
    filenames = [_clean_part_name(it['part']) for it in result]
    print(filenames)

    url = r'https://www.bilibili.com/video/' + bvid
    for page, part_name in enumerate(filenames, start=1):
        zf = str(page).zfill(3) + '_' + part_name
        single_download(url, 0, page, zf)
    print("程序运行耗时:{}".format(time.time() - start_time))  # elapsed = now - start


if __name__ == '__main__':
    main()




[/mw_shl_code]

回复

使用道具 举报

您需要登录后才可以回帖 登录 | 立即注册

本版积分规则

QQ|php中文网 | cnphp.com ( 赣ICP备2021002321号-2 )

GMT+8, 2024-11-23 20:59 , Processed in 0.273739 second(s), 36 queries , Gzip On.

Powered by Discuz! X3.4 Licensed

Copyright © 2001-2020, Tencent Cloud.

申明:本站所有资源皆搜集自网络,相关版权归版权持有人所有,如有侵权,请电邮(fiorkn@foxmail.com)告之,本站会尽快删除。

快速回复 返回顶部 返回列表