爬取西瓜视频

时间：2020-06-26 16:19:44 阅读：165 评论：0 收藏：0 [点我收藏+]

标签：detail dir decode 爬取 exception gecko XML main 方式

 1 # -*- coding: utf-8 -*-
 2 # __author__ = "maple"
 3 
 4 
 5 from base64 import b64decode
 6 from lxml import etree
 7 import requests
 8 import json
 9 import re
10 import os
11 
12 
class XiGuaSpider:
    """Download a video from an ixigua.com detail page.

    The detail page embeds its state as JSON inside a
    ``window._SSR_HYDRATED_DATA=...`` script tag.  The spider extracts that
    JSON, reads the base64-encoded ``main_url`` of a video entry, and streams
    the file into ``self.video_dirs``.
    """

    # Seconds to wait on connect / between received bytes before giving up.
    REQUEST_TIMEOUT = 30

    # Bytes requested per chunk while streaming the video to disk.
    CHUNK_SIZE = 64 * 1024

    # Script tag that carries the page state as a JSON literal.
    _HYDRATED_DATA_RE = re.compile(
        r'\<script.*?\>window\._SSR_HYDRATED_DATA=(.*?)\</script\>'
    )

    # Everything except CJK ideographs, ASCII digits and ASCII letters.
    _TITLE_UNSAFE_RE = re.compile(
        r"[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a]"
    )

    # Key paths to ``video_list`` inside the page state, in lookup order.
    _VIDEO_LIST_PATHS = (
        ('Projection', 'video', 'videoResource', 'normal', 'video_list'),
        ('Teleplay', 'videoResource', 'normal', 'video_list'),
    )

    # Entries of ``video_list`` to try, in order of preference.
    _VIDEO_KEYS = ('video_3', 'video_2')

    def __init__(self):
        # NOTE(review): the cookie looks like it was copied from a browser
        # session; presumably it expires and has to be refreshed -- confirm.
        self.headers = {
            'Referer': 'https://www.ixigua.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
            'cookie': 'wafid=8b91d940-81ec-4620-af0f-f45d479a62c2; wafid.sig=BZgx1eD0aFGn25mL-y-SEh17cng; ttwid=6841106955945346564; ttwid.sig=glkPgElc0Yh0OEDyNL0P91fmbZg; xiguavideopcwebid=6841106955945346564; xiguavideopcwebid.sig=avM_v_QTwC7VqM26Yqde9eer3xA; _ga=GA1.2.1235075053.1592819342; SLARDAR_WEB_ID=fa1eb835-d608-4ade-850d-bc0409bd541f; _gid=GA1.2.303152420.1593089518; ixigua-a-s=1; Hm_lvt_db8ae92f7b33b6596893cdf8c004a1a2=1593094562,1593095154,1593098009,1593147688; Hm_lpvt_db8ae92f7b33b6596893cdf8c004a1a2=1593153331',
        }

        # Directory the downloaded videos are written to.
        self.video_dirs = './video'

    @staticmethod
    def _sanitize_title(raw_title):
        """Return *raw_title* with every unsafe character replaced by ``-``."""
        return XiGuaSpider._TITLE_UNSAFE_RE.sub("-", raw_title)

    @staticmethod
    def _extract_hydrated_data(html):
        """Return the parsed page-state JSON, or ``None`` if the tag is absent.

        Raises:
            ValueError: the script tag was found but its payload is not JSON.
        """
        match = XiGuaSpider._HYDRATED_DATA_RE.search(html)
        if match is None:
            return None
        return json.loads(match.group(1))

    @staticmethod
    def _find_video_list(data):
        """Return the first ``video_list`` found in *data*, else ``None``."""
        for path in XiGuaSpider._VIDEO_LIST_PATHS:
            node = data
            try:
                for key in path:
                    node = node[key]
            except (KeyError, TypeError, IndexError) as e:
                print('异常信息：', e)
                continue
            return node
        return None

    @staticmethod
    def _pick_video_url(video_list):
        """Return the decoded ``main_url`` of the preferred entry, or ``None``."""
        if not isinstance(video_list, dict):
            return None
        for key in XiGuaSpider._VIDEO_KEYS:
            entry = video_list.get(key)
            if not isinstance(entry, dict) or not entry.get('main_url'):
                continue
            try:
                return b64decode(entry['main_url']).decode('utf-8')
            except ValueError as e:
                # Covers bad base64 (binascii.Error) and bad UTF-8.
                print('异常信息：', e)
        return None

    @staticmethod
    def _content_length(headers):
        """Return the announced body size in bytes, or 0 when unknown."""
        try:
            return max(int(headers.get("content-length") or 0), 0)
        except (TypeError, ValueError):
            return 0

    @staticmethod
    def _print_progress(written, total):
        """Rewrite the current console line with the download progress."""
        if total > 0:
            print("\r文件下载进度:%d%%(%0.2fMB/%0.2fMB)" % (
                written / total * 100, written / 1024 / 1024,
                total / 1024 / 1024),
                  end=" ")
        else:
            # No usable content-length: a percentage cannot be computed.
            print("\r文件下载进度:%0.2fMB" % (written / 1024 / 1024), end=" ")

    def download_file(self, file_path, download_url):
        """Stream *download_url* into *file_path*, printing progress.

        Network and HTTP errors are printed rather than raised, matching
        ``get_response``.  Errors from opening or writing the file propagate.
        """
        print('*' * 100)
        print(f"保存路径：{file_path}")
        print(f'下载URL：{download_url}')
        try:
            with requests.get(url=download_url, headers=self.headers,
                              stream=True,
                              timeout=self.REQUEST_TIMEOUT) as response:
                # Do not save an HTTP error page under a .mp4 name.
                response.raise_for_status()
                total = self._content_length(response.headers)
                written = 0
                # Binary mode: the payload is not text.
                with open(file_path, "wb") as file:
                    for chunk in response.iter_content(
                            chunk_size=self.CHUNK_SIZE):
                        if not chunk:
                            continue
                        file.write(chunk)
                        written += len(chunk)
                        self._print_progress(written, total)
        except requests.RequestException as e:
            print()
            print(e)
            return
        print()

    def get_response(self, url):
        """GET *url* with the spider's headers; return ``None`` on failure."""
        try:
            return requests.get(url, headers=self.headers,
                                timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as e:
            print(e)
            return None

    def parse_detail(self, url):
        """Locate the video behind the detail page *url* and download it.

        Returns ``None`` in every case; problems are reported on stdout.
        """
        response = self.get_response(url)
        if response is None or not response.ok:
            return None
        html = response.text
        document = etree.HTML(html)
        title = ''.join(document.xpath('//*[@class="hasSource"]/text()'))
        if not title:
            title = ''.join(document.xpath('//*[@class="teleplayPage__Description__header"]/h1/text()'))

        title = self._sanitize_title(title)
        if not title:
            # Avoid a file literally named ".mp4": use the last URL segment.
            last_segment = url.rstrip('/').rsplit('/', 1)[-1]
            title = self._sanitize_title(last_segment) or 'video'

        try:
            data = self._extract_hydrated_data(html)
        except ValueError as e:
            print('异常信息：', e)
            return None
        if data is None:
            print('没有找到下载链接。。。')
            return None

        # Dump of the page state, overwritten on every call.
        with open('video.json', 'w', encoding='utf-8') as f:
            json.dump(data, f)

        video_url = self._pick_video_url(self._find_video_list(data))
        if not video_url:
            print('没有找到下载链接。。。')
            return None

        os.makedirs(self.video_dirs, exist_ok=True)
        file_path = f"{self.video_dirs}/{title}.mp4"
        self.download_file(file_path, video_url)
        return None

    def start_requests(self):
        """Process the single hard-coded detail page."""
        url = 'https://www.ixigua.com/i6618828724525597192'
        self.parse_detail(url)

    def run(self):
        """Entry point: start the crawl."""
        self.start_requests()
95 
96 
# Run the spider only when this file is executed as a script, not on import.
if __name__ == '__main__':
    XiGuaSpider().run()

爬取西瓜视频

标签：detail dir decode 爬取 exception gecko XML main 方式

原文地址：https://www.cnblogs.com/shiguanggege/p/13195073.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行