网页爬虫---音乐

时间：2020-07-11 11:16:32 阅读：105 评论：0 收藏：0 [点我收藏+]

标签：findall stat main tar int lse 集中成功 import

import requests
import time
import re
import os

"""歌手字典"""
# Shared registry filled in by the collectors below: singer name -> singer page URL.
song_dict = dict()

def song_static():
    """Collect singer names and page URLs from the static singer index.

    Fetches ``http://www.9ku.com/music/T_Singer.htm``, extracts every
    ``<a href="..." class="t-t">name</a>`` anchor and records
    ``name -> 'http://www.9ku.com' + href`` in the module-level
    ``song_dict``.

    Request failures (timeout, connection error, HTTP error status) are
    printed instead of raised, so the caller always gets a dictionary back.

    Returns:
        The module-level ``song_dict``, updated in place.
    """
    anchor_pattern = r'<a href="(.*?)" class="t-t">(.*?)</a>'
    try:
        response = requests.get('http://www.9ku.com/music/T_Singer.htm', timeout=30)
        # requests does not raise on 4xx/5xx by itself; without this call the
        # HTTP-error handling below could never trigger.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # RequestException is the common base of Timeout, HTTPError and
        # ConnectionError, so every request failure is reported the same way.
        print(e)
        return song_dict

    for href, title in re.findall(anchor_pattern, response.text):
        song_dict[title] = 'http://www.9ku.com' + href
    return song_dict


# Collect singer page URLs from the paginated singer listing.
def song_List():
    """Collect singer names and page URLs from the paginated singer listing.

    Requests ``http://www.9ku.com/geshou/all-all-all/<page>.htm`` for
    page = 2, 3, ... and records every ``class="t-t"`` anchor in the
    module-level ``song_dict`` as ``name -> 'http://www.9ku.com' + href``.
    Collection stops at the first page that yields no anchors.

    A request failure is printed and ends the collection; entries gathered
    from earlier pages are kept.

    Returns:
        The module-level ``song_dict``, updated in place.
    """
    anchor_pattern = r'<a href="(.*?)" class="t-t">(.*?)</a>'
    # NOTE(review): numbering starts at 2, as in the original code; presumably
    # the first page is covered elsewhere -- confirm.
    page = 2
    print('数据采集中......')
    try:
        while True:
            print('正在采集第{}页数据'.format(page))
            response = requests.get(
                'http://www.9ku.com/geshou/all-all-all/{}.htm'.format(page),
                timeout=30,
            )
            # The status code is deliberately not checked: the end of the
            # listing is recognised by a page without singer anchors, and that
            # page may be an error page.
            singers = re.findall(anchor_pattern, response.text)
            if not singers:
                break
            for href, title in singers:
                song_dict[title] = 'http://www.9ku.com' + href
            page += 1
    except requests.exceptions.RequestException as e:
        # Covers timeouts as well as connection failures.
        print(e)
    print('数据采集完成')
    return song_dict


def _strip_prefix(text, prefix):
    """Return *text* without a leading *prefix*; unchanged if it is absent."""
    return text[len(prefix):] if text.startswith(prefix) else text


def _strip_suffix(text, suffix):
    """Return *text* without a trailing *suffix*; unchanged if it is absent."""
    return text[:-len(suffix)] if suffix and text.endswith(suffix) else text


def _safe_filename(name):
    """Replace path separators and characters that are invalid in file names."""
    return re.sub(r'[\\/:*?"<>|]', '_', name).strip() or 'untitled'


def _download_singer_songs(singer_url, path):
    """Download every song linked from one singer page into directory *path*."""
    song_link_pattern = r'<div class="songName"><a target="_1" href="(.*?)" class="songNameA">'
    download_link_pattern = r'<a href="(.*?)" style="display:none">(.*?)</a>'

    singer_html = requests.get(singer_url, timeout=30).text
    for href in re.findall(song_link_pattern, singer_html):
        # str.strip('/play/') removes any of the characters '/', 'p', 'l',
        # 'a', 'y' from both ends, which can eat into the id itself; remove
        # the surrounding slashes and the literal 'play/' prefix instead.
        song_id = _strip_prefix(href.strip('/'), 'play/')
        try:
            down_html = requests.get(
                'http://www.9ku.com/down/' + song_id, timeout=30
            ).text
            for src, title in re.findall(download_link_pattern, down_html):
                # Same pitfall as above: remove the literal suffix only, so
                # titles that start or end with one of its characters survive.
                song_name = _strip_suffix(title, 'Mp3下载')
                audio = requests.get(src, timeout=30)
                # Do not save an error page under an .mp3 name.
                audio.raise_for_status()
                # The title comes from scraped HTML; sanitise it so it cannot
                # contain path separators.
                target = os.path.join(path, _safe_filename(song_name) + '.mp3')
                with open(target, 'wb') as f:
                    f.write(audio.content)
                print('{}：下载成功'.format(song_name))
                time.sleep(1)  # pause between downloads
        except requests.exceptions.RequestException as e:
            # One failing song must not abort the remaining downloads.
            print(e)


def song_search():
    """Prompt for singer names and download each known singer's songs.

    Repeatedly reads a singer name from standard input. If the name is a key
    of the module-level ``song_dict``, the singer page stored there is fetched
    and every song it links to is downloaded as ``<title>.mp3``; otherwise a
    "not found" message is printed. The loop ends when standard input reaches
    end-of-file.

    Files are written to ``path`` (set below). An empty ``path`` means the
    current working directory; the original ``'%s/%s.mp3'`` formatting turned
    an empty ``path`` into the filesystem root.

    Request failures are printed and the prompt is shown again.
    """
    path = ""  # directory to save downloads in; "" = current directory
    if path:
        os.makedirs(path, exist_ok=True)

    while True:
        try:
            name = input("请输入歌手名称：")
        except EOFError:
            # No more input (e.g. piped stdin): stop instead of crashing.
            return
        if name not in song_dict:
            print("未找到歌手")
            continue
        try:
            _download_singer_songs(song_dict[name], path)
        except requests.exceptions.RequestException as e:
            print(e)

if __name__ == '__main__':
    # Collect singers from the static index page.
    song_static()
    # Collect singers from the paginated listing.
    song_List()
    # Prompt for singer names and download their songs.
    song_search()

网页爬虫---音乐

标签：findall stat main tar int lse 集中成功 import

原文地址：https://www.cnblogs.com/sheshidu/p/13282811.html

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行