码迷,mamicode.com
首页 > Web开发 > 详细

网页爬虫---音乐

时间:2020-07-11 11:16:32      阅读:105      评论:0      收藏:0      [点我收藏+]

标签:findall   stat   main   tar   int   lse   集中   成功   import   

import requests
import time
import re
import os

"""歌手字典"""
# Shared registry filled by the collectors below: singer name -> singer page URL.
song_dict = dict()

def song_static():
    """Collect singer names and page URLs from the static singer index page.

    Fetches ``http://www.9ku.com/music/T_Singer.htm``, extracts every
    ``<a href="..." class="t-t">name</a>`` link and stores it in the
    module-level ``song_dict`` as ``{singer name: absolute URL}``.

    Request failures (timeout, connection error, HTTP error status) are
    printed instead of raised, so the caller always gets the mapping back.

    Returns:
        dict: the shared ``song_dict`` mapping (possibly unchanged on failure).
    """
    reg = r'<a href="(.*?)" class="t-t">(.*?)</a>'
    try:
        response = requests.get('http://www.9ku.com/music/T_Singer.htm', timeout=30)
        # requests only raises HTTPError when asked to; without this call an
        # error page would be parsed silently and yield no singers.
        response.raise_for_status()
        for ul, title in re.findall(reg, response.text):
            song_dict[title] = 'http://www.9ku.com' + ul
    except requests.exceptions.RequestException as e:
        # RequestException is the common base of Timeout, HTTPError and
        # ConnectionError, so no network failure escapes as a traceback.
        print(e)
    return song_dict


#动态歌手地址采集
def song_List():
    """Collect singer names and page URLs from the paginated singer listing.

    Requests ``http://www.9ku.com/geshou/all-all-all/<page>.htm`` for
    page = 2, 3, ... and stores every ``<a href="..." class="t-t">name</a>``
    link in the module-level ``song_dict`` as ``{singer name: absolute URL}``.
    Crawling stops at the first page that contains no such link.

    Request failures are printed instead of raised; singers collected before
    the failure are kept.

    Returns:
        dict: the shared ``song_dict`` mapping.
    """
    # NOTE(review): numbering starts at 2; presumably page 1 is covered by the
    # static index page -- confirm against the site.
    i = 2
    reg = r'<a href="(.*?)" class="t-t">(.*?)</a>'
    print('数据采集中......')
    try:
        while True:
            print('正在采集第{}页数据'.format(i))
            response = requests.get(
                "http://www.9ku.com/geshou/all-all-all/{}.htm".format(i),
                timeout=30,
            )
            try:
                data = re.findall(reg, response.text)
            finally:
                # Release every response, not only the last one.
                response.close()
            if not data:
                # A page without singer links marks the end of the listing.
                break
            for ul, title in data:
                song_dict[title] = 'http://www.9ku.com' + ul
            i += 1
    except requests.exceptions.RequestException as e:
        # Base class of Timeout / HTTPError / ConnectionError: a dropped
        # connection must not abort the program with a traceback.
        print(e)
    print('数据采集完成')
    return song_dict


def _strip_prefix(text, prefix):
    """Return ``text`` without the exact leading ``prefix`` (if present)."""
    return text[len(prefix):] if prefix and text.startswith(prefix) else text


def _strip_suffix(text, suffix):
    """Return ``text`` without the exact trailing ``suffix`` (if present)."""
    return text[:-len(suffix)] if suffix and text.endswith(suffix) else text


def _download_song(play_href, path):
    """Download every MP3 linked from the download page of one song.

    Args:
        play_href: the ``href`` taken from the singer page (``/play/<id>``).
        path: target directory; an empty string means the current directory.
    """
    # str.strip('/play/') would strip a *set of characters* from both ends;
    # only the literal prefix must go.
    song_id = _strip_prefix(play_href, '/play/')
    down_html = requests.get('http://www.9ku.com/down/' + song_id, timeout=30).text
    down_reg = r'<a href="(.*?)" style="display:none">(.*?)</a>'
    for src, title in re.findall(down_reg, down_html):
        # Same pitfall as above: strip('Mp3下载') also ate leading/trailing
        # 'M', 'p', '3' ... characters that belong to the song title.
        song_name = _strip_suffix(title, 'Mp3下载').strip()
        content = requests.get(src, timeout=30).content
        time.sleep(1)  # throttle between downloads
        # Characters that are not allowed in file names would make open() fail
        # or escape the target directory.
        filename = re.sub(r'[\\/:*?"<>|]', '_', song_name) + '.mp3'
        # os.path.join('', name) yields a relative name; the old
        # '%s/%s.mp3' formatting produced '/name.mp3' (filesystem root).
        with open(os.path.join(path, filename), 'wb') as f:
            f.write(content)
        print('{}:下载成功'.format(song_name))


def song_search():
    """Interactively download all songs of a singer chosen by the user.

    Repeatedly prompts for a singer name, looks it up in the module-level
    ``song_dict`` and downloads every song listed on that singer's page.
    An unknown name prints a notice and prompts again. A failed request or
    file write is printed and the next song is tried. The loop ends when
    standard input is closed (EOF).
    """
    play_reg = r'<div class="songName"><a target="_1" href="(.*?)" class="songNameA">'
    while True:
        try:
            name = input("请输入歌手名称:")
        except EOFError:
            # Closed stdin: leave quietly instead of dying with a traceback.
            return
        path = ""  # directory the downloads are saved to
        if name not in song_dict:
            print("未找到歌手")
            continue
        if path:
            os.makedirs(path, exist_ok=True)
        try:
            singer_html = requests.get(song_dict[name], timeout=30).text
        except requests.exceptions.RequestException as e:
            print(e)
            continue
        for play_href in re.findall(play_reg, singer_html):
            try:
                _download_song(play_href, path)
            except (requests.exceptions.RequestException, OSError) as e:
                # One broken song must not stop the remaining downloads.
                print(e)


if __name__ == '__main__':
    # Collect singers from the static index page.
    song_static()
    # Collect singers from the paginated listing pages.
    song_List()
    # Prompt for a singer and download their songs.
    song_search()

网页爬虫---音乐

标签:findall   stat   main   tar   int   lse   集中   成功   import   

原文地址:https://www.cnblogs.com/sheshidu/p/13282811.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!