Python爬虫之爬取B站视频(哔哩哔哩)

2021/5/20 22:54:44

编程Tag： 爬虫 video url content python audio headers self 哔哩

本文主要介绍如何用Python爬虫爬取B站(哔哩哔哩)视频，对大家解决编程问题具有一定的参考价值，需要的程序猿们随着小编一起来学习吧！

代码如下

亲测有效

# encoding: utf-8

import requests  # 模拟发送请求
import json
import re
import os

# Module-level default request headers (browser-like User-Agent).
headers = {
    "Accept": "*/*",
    "Accept-Language": "en-US,en;q=0.5",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36",
}

class BilibiliVideoSpider(object):
    """Download a Bilibili video page as one merged MP4 file.

    The page at ``url`` embeds two JSON blobs (``__playinfo__`` and
    ``__INITIAL_STATE__``).  The spider reads the first DASH video and audio
    stream URLs from them, downloads both streams and merges them with the
    ``ffmpeg`` command-line tool.

    Args:
        url: Address of the video page.
        output_root: Prefix prepended verbatim to the temporary file names,
            so a directory has to end with a path separator (e.g. ``'./'``).
    """

    # Number of bytes written to disk per iteration while streaming.
    _CHUNK_SIZE = 1024 * 1024

    def __init__(self, url, output_root):
        self.url = url
        self.output_root = output_root
        # Request headers sent with every HTTP request of this spider.
        self.headers = {
            'Accept': '*/*',
            'Accept-Language': 'en-US,en;q=0.5',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
        }

    def _match(self, text, pattern):
        """Return the JSON value captured by group 1 of ``pattern`` in ``text``.

        Raises:
            ValueError: If ``pattern`` does not occur in ``text`` or the
                captured text is not valid JSON.
        """
        match = re.search(pattern, text)
        if match is None:
            # Fail with a clear error instead of dereferencing None.
            raise ValueError(f'this pattern was not matched ! ({pattern!r})')
        return json.loads(match.group(1))

    def getHtml(self):
        """Fetch the video page.

        Returns:
            The ``requests`` response when the server answers with status
            200, otherwise ``None`` (non-200 status or failed request).
        """
        try:
            response = requests.get(url=self.url, headers=self.headers)
        except requests.RequestException:
            print('html reques error !')
            return None
        print(f'status_code: {response.status_code}')
        if response.status_code == 200:
            return response
        return None

    def parseHtml(self, response):
        """Extract the stream URLs and the title from the page.

        Args:
            response: Object whose ``text`` attribute holds the page HTML.

        Returns:
            Tuple ``(video_url, audio_url, video_name)``.

        Raises:
            ValueError: If one of the embedded JSON blobs is not found.
        """
        playinfo = self._match(response.text, '__playinfo__=(.*?)</script><script>')           # playback details JSON
        initial_state = self._match(response.text, r'__INITIAL_STATE__=(.*?);\(function\(\)')  # page content JSON

        dash = playinfo['data']['dash']
        # The first entry of each list is used; presumably the best quality
        # is listed first -- TODO confirm against the API.
        video_url = dash['video'][0]['baseUrl']
        audio_url = dash['audio'][0]['baseUrl']
        video_name = initial_state['videoData']['title']
        return video_url, audio_url, video_name

    def _download(self, url, path, base_headers):
        """Stream ``url`` into ``path`` and return the number of bytes written.

        The body is requested with a ``Range`` header starting at the number
        of bytes already received and the request is repeated until the size
        announced by the first response has been written.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            IOError: If a response carries no data before the file is complete.
        """
        total = None
        received = 0
        # 'wb' rather than 'ab': never append to a stale file of an earlier run.
        with open(path, 'wb') as output:
            while total is None or received < total:
                # Fresh dict per request so the Range header never leaks into
                # other requests.
                range_headers = dict(base_headers, Range='bytes=%d-' % received)
                written = 0
                with requests.get(url, headers=range_headers, stream=True) as response:
                    response.raise_for_status()
                    if total is None:
                        # First request starts at byte 0, so its length is
                        # the size of the whole stream.
                        total = int(response.headers['content-length'])
                    for chunk in response.iter_content(chunk_size=self._CHUNK_SIZE):
                        output.write(chunk)
                        written += len(chunk)
                if written == 0 and received < total:
                    # Without this guard an empty answer would loop forever.
                    raise IOError('no data received for %s at offset %d' % (url, received))
                received += written
        return received

    def downloadVideo(self, video_url, audio_url, video_name):
        """Download both streams, merge them and delete the temporary files.

        The merged file is written as ``download.mp4`` next to this script
        (the script directory is truncated at ``'shippingSchedule'`` when
        that name occurs in its path).

        Args:
            video_url: URL of the video stream.
            audio_url: URL of the audio stream.
            video_name: Title of the video; only used in progress messages.
        """
        # Copy of the instance headers with the page URL as Referer; neither
        # self.headers nor the module-level headers are mutated.
        request_headers = dict(self.headers, Referer=self.url)

        # video_audio_merge() builds its input names from a prefix
        # ("<prefix>_video.mp4" / "<prefix>_audio.mp4"), so the temporary
        # files are named to match that convention.
        tmp_prefix = f'{self.output_root}download'
        video = f'{tmp_prefix}_video.mp4'
        audio = f'{tmp_prefix}_audio.mp4'

        print('开始下载视频: ')
        video_size = self._download(video_url, video, request_headers)
        print('%s视频大小：' % video_name, video_size)
        audio_size = self._download(audio_url, audio, request_headers)
        print('%s音频大小：' % video_name, audio_size)
        print('视频下载完成')

        root_path = os.path.abspath(os.path.dirname(__file__)).split('shippingSchedule')[0]
        dst_prefix = os.path.join(root_path, 'download')
        self.video_audio_merge(tmp_prefix, tmp_prefix, dst_prefix)
        video_dst = f'{dst_prefix}.mp4'
        print(f'下载的视频: {video_dst}')
        # Reached only after ffmpeg finished successfully, so the source
        # streams are never deleted before (or instead of) being merged.
        os.remove(video)
        os.remove(audio)

    def video_audio_merge(self, video_src, audio_src, video_dst):
        """Merge one video and one audio stream with ffmpeg (stream copy).

        All three arguments are path *prefixes*, not file names: the inputs
        read are ``<video_src>_video.mp4`` and ``<audio_src>_audio.mp4`` and
        the output written is ``<video_dst>.mp4`` (overwritten if present).
        The call blocks until ffmpeg has exited.

        Raises:
            FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
            subprocess.CalledProcessError: If ffmpeg exits with an error.
        """
        import subprocess
        # Argument list without a shell: paths containing spaces or shell
        # metacharacters are passed through unchanged.
        command = [
            'ffmpeg', '-y', '-loglevel', 'quiet',
            '-i', f'{video_src}_video.mp4',
            '-i', f'{audio_src}_audio.mp4',
            '-c', 'copy',
            f'{video_dst}.mp4',
        ]
        subprocess.run(command, check=True)

    def run(self):
        """Fetch the page, parse it and download the video."""
        response = self.getHtml()
        if response is None:
            # getHtml() already reported the status code or the request error.
            print('page could not be fetched, nothing to download')
            return
        video_url, audio_url, video_name = self.parseHtml(response)
        self.downloadVideo(video_url, audio_url, video_name)

def demo():
    """Download the sample video, using the current directory as output root."""
    spider = BilibiliVideoSpider(
        url='https://www.bilibili.com/video/BV1Q5411p7bz?from=search&seid=14643382716113842219',
        output_root='./',
    )
    spider.run()

# Run the demo download only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    demo()

这篇关于Python爬虫之爬取B站视频(哔哩哔哩)的文章就介绍到这儿，希望我们推荐的文章对大家有所帮助，也希望大家多多支持为之网！

Python爬虫之爬取B站视频(哔哩哔哩)

代码如下

相关编程文章