码迷,mamicode.com
首页 > 编程语言 > 详细

Python爬虫学习教程,批量爬取下载抖音视频

时间:2019-07-16 10:38:24      阅读:283      评论:0      收藏:0      [点我收藏+]

标签:打印   存在   from   ram   运行   process   学习教程   __name__   ejs   

这篇文章主要为大家详细介绍了python批量爬取下载抖音视频,具有一定的参考价值,感兴趣的小伙伴们可以参考一下

这篇文章主要为大家详细介绍了python批量爬取下载抖音视频,具有一定的参考价值,感兴趣的小伙伴们可以参考一下

这篇文章主要为大家详细介绍了python批量爬取下载抖音视频,具有一定的参考价值,感兴趣的小伙伴们可以参考一下

技术图片

项目源码展示:

  1 '''
  2 在学习过程中有什么不懂的可以加我的
  3 python学习交流扣扣qun,934109170
  4 群里有不错的学习教程、开发工具与电子书籍。
  5 与你分享python企业当下人才需求及怎么从零基础学习好python,和学习什么内容。
  6 '''
  7 # -*- coding:utf-8 -*-
  8 from contextlib import closing
  9 import requests, json, re, os, sys, random
 10 from ipaddress import ip_address
 11 from subprocess import Popen, PIPE
 12 import urllib
 13 class DouYin(object):
 14  def __init__(self, width = 500, height = 300):
 15  """
 16  抖音App视频下载
 17  """
 18  rip = ip_address(0.0.0.0)
 19  while rip.is_private:
 20   rip = ip_address(..join(map(str, (random.randint(0, 255) for _ in range(4)))))
 21  self.headers = {
 22   accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,
 23   accept-encoding: gzip, deflate, br,
 24   accept-language: zh-CN,zh;q=0.9,
 25   pragma: no-cache,
 26   cache-control: no-cache,
 27   upgrade-insecure-requests: 1,
 28   user-agent: Mozilla/5.0 (Linux; U; Android 5.1.1; zh-cn; MI 4S Build/LMY47V) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/53.0.2785.146 Mobile Safari/537.36 XiaoMi/MiuiBrowser/9.1.3,
 29   X-Real-IP: str(rip),
 30   X-Forwarded-For: str(rip),
 31  }
 32  def get_video_urls(self, user_id, type_flag=f):
 33  """
 34  获得视频播放地址
 35  Parameters:
 36   user_id:查询的用户UID
 37  Returns:
 38   video_names: 视频名字列表
 39   video_urls: 视频链接列表
 40   nickname: 用户昵称
 41  """
 42  video_names = []
 43  video_urls = []
 44  share_urls = []
 45  max_cursor = 0
 46  has_more = 1
 47  i = 0
 48  share_user_url = https://www.douyin.com/share/user/%s % user_id
 49  share_user = requests.get(share_user_url, headers=self.headers)
 50  while share_user.status_code != 200:
 51   share_user = requests.get(share_user_url, headers=self.headers)
 52  _dytk_re = re.compile(r"dytk\s*:\s*‘(.+)‘")
 53  dytk = _dytk_re.search(share_user.text).group(1)
 54  _nickname_re = re.compile(r<p class="nickname">(.+?)<\/p>)
 55  nickname = _nickname_re.search(share_user.text).group(1)
 56  urllib.request.urlretrieve(https://raw.githubusercontent.com/Jack-Cherish/python-spider/master/douyin/fuck-byted-acrawler.js, fuck-byted-acrawler.js)
 57  try:
 58   Popen([node, -v], stdout=PIPE, stderr=PIPE).communicate()
 59  except (OSError, IOError) as err:
 60   print(请先安装 node.js: https://nodejs.org/)
 61   sys.exit()
 62  user_url_prefix = https://www.douyin.com/aweme/v1/aweme/favorite if type_flag == f else https://www.douyin.com/aweme/v1/aweme/post
 63  print(解析视频链接中)
 64  while has_more != 0:
 65   process = Popen([node, fuck-byted-acrawler.js, str(user_id)], stdout=PIPE, stderr=PIPE)
 66   _sign = process.communicate()[0].decode().strip(\n).strip(\r)
 67   user_url = user_url_prefix + /?user_id=%s&max_cursor=%s&count=21&aid=1128&_signature=%s&dytk=%s % (user_id, max_cursor, _sign, dytk)
 68   req = requests.get(user_url, headers=self.headers)
 69   while req.status_code != 200:
 70   req = requests.get(user_url, headers=self.headers)
 71   html = json.loads(req.text)
 72   try:
 73   while html[aweme_list] == []:
 74    i = i + 1
 75    sys.stdout.write(已重新链接 + str(i) + 次 (若超过100次,请ctrl+c强制停止再重来) + \r)
 76    sys.stdout.flush()
 77    process = Popen([node, fuck-byted-acrawler.js, str(user_id)], stdout=PIPE, stderr=PIPE)
 78    _sign = process.communicate()[0].decode().strip(\n).strip(\r)
 79    user_url = user_url_prefix + /?user_id=%s&max_cursor=%s&count=21&aid=1128&_signature=%s&dytk=%s % (user_id, max_cursor, _sign, dytk)
 80    req = requests.get(user_url, headers=self.headers)
 81    while req.status_code != 200:
 82    req = requests.get(user_url, headers=self.headers)
 83    html = json.loads(req.text)
 84   except:
 85   pass
 86   i = 0
 87   for each in html[aweme_list]:
 88   try:
 89    url = https://aweme.snssdk.com/aweme/v1/play/?video_id=%s&line=0&ratio=720p&media_type=4&vr_type=0&test_cdn=None&improve_bitrate=0
 90    uri = each[video][play_addr][uri]
 91    video_url = url % uri
 92   except:
 93    continue
 94   share_desc = each[share_info][share_desc]
 95   if os.name == nt:
 96    for c in r\/:*?"<>|:
 97    nickname = nickname.replace(c, ‘‘).strip().strip(\.)
 98    share_desc = share_desc.replace(c, ‘‘).strip()
 99   share_id = each[aweme_id]
100   if share_desc in [抖音-原创音乐短视频社区, TikTok, ‘‘]:
101    video_names.append(share_id + .mp4)
102   else:
103    video_names.append(share_id + - + share_desc + .mp4)
104   share_urls.append(each[share_info][share_url])
105   video_urls.append(video_url)
106   max_cursor = html[max_cursor]
107   has_more = html[has_more]
108  return video_names, video_urls, share_urls, nickname
109  def get_download_url(self, video_url, watermark_flag):
110  """
111  获得带水印的视频播放地址
112  Parameters:
113   video_url:带水印的视频播放地址
114  Returns:
115   download_url: 带水印的视频下载地址
116  """
117  # 带水印视频
118  if watermark_flag == True:
119   download_url = video_url.replace(/play/, /playwm/)
120  # 无水印视频
121  else:
122   download_url = video_url.replace(/playwm/, /play/)
123  return download_url
124  def video_downloader(self, video_url, video_name, watermark_flag=False):
125  """
126  视频下载
127  Parameters:
128   video_url: 带水印的视频地址
129   video_name: 视频名
130   watermark_flag: 是否下载带水印的视频
131  Returns:
132 133  """
134  size = 0
135  video_url = self.get_download_url(video_url, watermark_flag=watermark_flag)
136  with closing(requests.get(video_url, headers=self.headers, stream=True)) as response:
137   chunk_size = 1024
138   content_size = int(response.headers[content-length])
139   if response.status_code == 200:
140   sys.stdout.write( [文件大小]:%0.2f MB\n % (content_size / chunk_size / 1024))
141   with open(video_name, wb) as file:
142    for data in response.iter_content(chunk_size = chunk_size):
143    file.write(data)
144    size += len(data)
145    file.flush()
146    sys.stdout.write( [下载进度]:%.2f%% % float(size / content_size * 100) + \r)
147    sys.stdout.flush()
148  def run(self):
149  """
150  运行函数
151  Parameters:
152   None
153  Returns:
154   None
155  """
156  self.hello()
157  print(搜索api需要登录,暂时使用UID下载\n分享用户页面,用浏览器打开短链接,原始链接中/share/user/后的数字即是UID)
158  user_id = input(请输入ID (例如95006183):)
159  user_id = user_id if user_id else 95006183
160  watermark_flag = input(是否下载带水印的视频 (0-否(默认), 1-是):)
161  watermark_flag = watermark_flag if watermark_flag!=‘‘ else 0
162  watermark_flag = bool(int(watermark_flag))
163  type_flag = input(f-收藏的(默认), p-上传的:)
164  type_flag = type_flag if type_flag!=‘‘ else f
165  save_dir = input(保存路径 (例如"E:/Download/", 默认"./Download/"):)
166  save_dir = save_dir if save_dir else "./Download/"
167  video_names, video_urls, share_urls, nickname = self.get_video_urls(user_id, type_flag)
168  nickname_dir = os.path.join(save_dir, nickname)
169  if not os.path.exists(save_dir):
170   os.makedirs(save_dir)
171  if nickname not in os.listdir(save_dir):
172   os.mkdir(nickname_dir)
173  if type_flag == f:
174   if favorite not in os.listdir(nickname_dir):
175   os.mkdir(os.path.join(nickname_dir, favorite))
176  print(视频下载中:共有%d个作品!\n % len(video_urls))
177  for num in range(len(video_urls)):
178   print( 解析第%d个视频链接 [%s] 中,请稍后!\n % (num + 1, share_urls[num]))
179   if \\ in video_names[num]:
180   video_name = video_names[num].replace(\\, ‘‘)
181   elif / in video_names[num]:
182   video_name = video_names[num].replace(/, ‘‘)
183   else:
184   video_name = video_names[num]
185   video_path = os.path.join(nickname_dir, video_name) if type_flag!=f else os.path.join(nickname_dir, favorite, video_name)
186   if os.path.isfile(video_path):
187   print(视频已存在)
188   else:
189   self.video_downloader(video_urls[num], video_path, watermark_flag)
190   print(\n)
191  print(下载完成!)
192  def hello(self):
193  """
194  打印欢迎界面
195  Parameters:
196   None
197  Returns:
198   None
199  """
200  print(* * 100)
201  print(\t\t\t\t抖音App视频下载小助手)
202  print(\t\t作者:Jack Cui、steven7851)
203  print(* * 100)
if __name__ == "__main__":
    # Script entry point: build the downloader and start the interactive session.
    douyin = DouYin()
    douyin.run()

运行结果:

技术图片

爬取结果截图

技术图片

 

Python爬虫学习教程,批量爬取下载抖音视频

标签:打印   存在   from   ram   运行   process   学习教程   __name__   ejs   

原文地址:https://www.cnblogs.com/xiaoyiq/p/11192969.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!