
Searching and Scraping Bilibili Videos with Python


At the moment the loop over the search results is not finished, so only the first result gets downloaded (a sketch of what the completed loop might look like follows the listing). Later I plan to turn this into an interface that can be called on its own. The script also merges the separate audio and video streams, which requires installing the ffmpeg environment first.
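Since the merge step at the end shells out to ffmpeg, it can help to check up front that the binary is actually reachable. A minimal pre-flight sketch (not part of the original script), assuming ffmpeg is either on PATH or at the hard-coded Windows path used in video_audio_merge_single below:

import os
import shutil

# Hypothetical helper: prefer ffmpeg on PATH, fall back to the install path
# that the script hard-codes further down.
FFMPEG_FALLBACK = r"D:\sofware\ffmpeg-4.3.1-2021-01-01-full_build\bin\ffmpeg.exe"

def find_ffmpeg():
    found = shutil.which("ffmpeg")
    if found:
        return found
    if os.path.exists(FFMPEG_FALLBACK):
        return FFMPEG_FALLBACK
    raise FileNotFoundError("ffmpeg not found; install it or adjust FFMPEG_FALLBACK")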

 

# -*- coding: utf-8 -*-

import requests
import urllib.parse
from bs4 import BeautifulSoup
import re
import subprocess
import time
import json
import sys
import io

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')


class BiliBili(object):
    def __init__(self, url):
        self.url = url


    def html(self, url):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.74 Safari/537.36 Edg/79.0.309.43',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Connection': 'keep-alive',
        }
        html = requests.get(url, headers=headers)
        html = html.text
        #print(html)
        return html

    def get_video_html(self, url):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.74 Safari/537.36 Edg/79.0.309.43',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Connection': 'keep-alive',
            # 'Cookie': cookie
        }
        response = requests.get(url, headers=headers)
        video_html = response.text
        return video_html

    def get_video_info(self, html):
        result = re.findall('<script>window.__playinfo__=(.*?)</script>', html, re.S)[0]
        html_data = json.loads(result)
        download_video_url = html_data['data']['dash']['video'][0]['backup_url'][0]
        return download_video_url

    def get_audio_info(self, html):
        result = re.findall('<script>window.__playinfo__=(.*?)</script>', html, re.S)[0]
        html_data = json.loads(result)
        download_audio_url = html_data['data']['dash']['audio'][0]['backup_url'][0]
        return download_audio_url

    def search_video_info(self, html):
        soup = BeautifulSoup(html, "html.parser")
        # The loop is not finished yet: it returns after the first search result.
        for tag in soup.find_all('div', class_='info'):
            title = tag.find('a', class_='title').get_text()
            people_num = tag.find('span', class_='so-icon watch-num').get_text()
            up_name = tag.find('a', class_='up-name').get_text()
            video_url = tag.find('a').get('href')
            video_url = video_url.replace('//', '')
            return title, video_url

    def search_video(self, html):
        title, video_url = self.search_video_info(html)
        print(title)
        print(video_url)
        print(self.url)
        self.run_video(title, video_url, self.url)

    def run_search(self):
        # Fetch the search results page, then pull the video link out of it
        html = self.html(self.url)
        self.search_video(html)


    def run_video(self, title, video_url, url):
        # Use the search result to get the actual video/audio download links
        video_size = 0
        audio_size = 0
        print("视频名称:" + title)
        print(url)
        print('https://' + video_url)
        get_video_html = self.get_video_html('https://' + video_url)
        download_video_url = self.get_video_info(get_video_html)
        download_audio_url = self.get_audio_info(get_video_html)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Referer': 'https://' + video_url,
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Connection': 'keep-alive',
            #'Cookie': cookie
        }
        video_content = requests.get(download_video_url, stream=True, headers=headers)
        mp4_file_size = int(video_content.headers['content-length'])
        if video_content.status_code == 200:
            print('[文件大小]:%0.2f MB' % (mp4_file_size / 1024 / 1024))
            with open(title + '.mp4', mode='wb') as mp4:
                for chunk in video_content.iter_content(chunk_size=1024):
                    if chunk:
                        mp4.write(chunk)
                        video_size += len(chunk)  # bytes downloaded so far

        audio_content = requests.get(download_audio_url, stream=True, headers=headers)
        mp3_file_size = int(audio_content.headers['content-length'])
        if audio_content.status_code == 200:
            print('[文件大小]:%0.2f MB' % (mp3_file_size / 1024 / 1024))
            with open(title + '.mp3', mode='wb') as mp3:
                for chunk in audio_content.iter_content(chunk_size=1024):
                    if chunk:
                        mp3.write(chunk)
                        audio_size += len(chunk)

        print('正在保存:', title)
        self.video_audio_merge_single(title)

    def video_audio_merge_single(self, video_name):
        # Merge the downloaded video and audio streams with ffmpeg
        print('视频合成开始:', video_name)
        ffm = r"D:\sofware\ffmpeg-4.3.1-2021-01-01-full_build\bin\ffmpeg.exe "
        command = ffm + ' -i "{}.mp4" -i "{}.mp3" -vcodec copy -acodec copy "{}.mp4"'.format(
            video_name, video_name, video_name + '(合)')
        subprocess.Popen(command, shell=True)
        print(command)
        time.sleep(10)  # crude fixed wait for ffmpeg to finish
        print("视频合成结束:", video_name)


if __name__ == '__main__':
    url = 'https://search.bilibili.com/all?'
    keyword = '哈哈哈哈哈'  # the video name to search for
    keyword = urllib.parse.quote(keyword)
    param = 'keyword=' + keyword + '&from_source=nav_searchs&pm_id_from=333.851.b_696e7465726e6174696f6e616c486561646572.15'
    url = url + param
    BB = BiliBili(url)
    BB.run_search()
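As hinted above, the loop over the results page is still unfinished. A minimal sketch (an assumption, not part of the original script) of how it might be completed: two extra methods for the BiliBili class, one that yields every (title, video_url) pair on the results page, and one that downloads each of them by reusing run_video.

# Sketch: possible additions to class BiliBili

    def search_video_info_all(self, html):
        # Yield every (title, video_url) pair on the results page
        # instead of returning after the first one.
        soup = BeautifulSoup(html, "html.parser")
        for tag in soup.find_all('div', class_='info'):
            title = tag.find('a', class_='title').get_text()
            video_url = tag.find('a').get('href').replace('//', '')
            yield title, video_url

    def run_search_all(self):
        # Download every result found by the search, reusing run_video above.
        html = self.html(self.url)
        for title, video_url in self.search_video_info_all(html):
            self.run_video(title, video_url, self.url)

With these in place, BB.run_search_all() would walk the whole results page; error handling for results that are missing a title or link is left out for brevity.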

 


Original post: https://www.cnblogs.com/duanminkid/p/14300350.html
