
Python crawler for downloading resources from a file-index style website (based on Python 3.6)



import urllib.request
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from Cat.findLinks import get_link
from Cat.Load import Schedule
import os
import time
import errno

# ---- Code of the imported Cat package modules (shown here for reference) ----

def get_link(page):  # collect the href of every link inside the page's <td> cells
    linkData = []
    for cell in page.find_all('td'):
        links = cell.select("a")
        for each in links:
            # if str(each.get('href'))[:1] == '/':  # optional filter, commented out
            data = each.get('href')
            linkData.append(data)
    return linkData


def Schedule(a, b, c):  # progress hook shown while large downloads are running
    '''
    a: number of data blocks already downloaded
    b: size of one data block
    c: size of the remote file
    '''
    per = 100.0 * a * b / c
    if per > 100:
        per = 100
    print('%.2f%%' % per)

# ---- end ----
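To make the return value of get_link concrete, here is a small, hypothetical check; the HTML snippet and file name are invented for illustration, and it assumes the get_link defined above is pasted alongside or importable:

from bs4 import BeautifulSoup

sample = ('<table>'
          '<tr><td><a href="sorc/">sorc/</a></td></tr>'
          '<tr><td><a href="nos.fix.tar">nos.fix.tar</a></td></tr>'
          '</table>')
print(get_link(BeautifulSoup(sample, 'lxml')))
# expected output: ['sorc/', 'nos.fix.tar']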


def mkdir_p(path):  # recursively create nested directories
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5 (use "except OSError, exc:" on Python <2.5)
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise


def file_Down(connet, file):  # download one file, reporting progress through Schedule
    urllib.request.urlretrieve(connet, file, Schedule)


def decice(data):  # return 1 when the link contains '/', i.e. it points to a directory
    a = '/'
    if a in data:
        return 1
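For reference, on Python 3.2 and later (including the Python 3.6 this post targets), the exist_ok flag of os.makedirs already gives the same "create if missing, ignore if present" behaviour, so mkdir_p could be reduced to a one-liner; a minimal sketch:

import os

def mkdir_p(path):
    # Python >= 3.2: do not raise if the directory already exists
    os.makedirs(path, exist_ok=True)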



def findAll():  # main routine
    url = 'http://www.nco.ncep.noaa.gov/pmb/codes/nwprod/nosofs.v3.0.4/'
    page = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(page, 'lxml')  # parse the index page with BeautifulSoup
    links = get_link(soup)
    # print(links)

    for childLink in range(1, len(links)):  # skip the first link (the parent-directory entry)
        connet = urljoin(url, links[childLink])  # build the absolute URL
        page_next = urllib.request.urlopen(connet).read()
        soup_next = BeautifulSoup(page_next, 'lxml')
        link_next = get_link(soup_next)  # <a href=...> links on the second level
        file = os.path.join('D:\\test\\Index' + "\\" + links[childLink])
        # decice(links[childLink])
        # file_cre = os.path.join('D:\\test\\Index', links[childLink])
        if decice(links[childLink]):  # directory entry: create it locally
            mkdir_p(file)
        else:                         # plain file: download it
            file_Down(connet, file)

        print(connet)
        for child_next in range(1, len(link_next)):
            connet_next = urljoin(connet, link_next[child_next])
            page_next = urllib.request.urlopen(connet_next).read()
            soup_nextF = BeautifulSoup(page_next, 'lxml')
            link_nextF = get_link(soup_nextF)  # <a href=...> links on the third level
            fileF = os.path.join('D:/test/Index' + "/", links[childLink] + link_next[child_next])
            if decice(link_next[child_next]):
                mkdir_p(fileF)
            else:
                file_Down(connet_next, fileF)
            print("Start : %s" % time.ctime())
            time.sleep(4)  # pause between requests
            print("End : %s" % time.ctime())
            print(connet_next)

            for child_nextT in range(1, len(link_nextF)):
                connet_nextT = urljoin(connet_next, link_nextF[child_nextT])
                fileT = os.path.join('D:/test/Index' + "/",
                                     links[childLink] + link_next[child_next] + link_nextF[child_nextT])
                if decice(link_nextF[child_nextT]) == 1:
                    mkdir_p(fileT)
                else:
                    file_Down(connet_nextT, fileT)
                print(connet_nextT)


if __name__ == '__main__':
    findAll()
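The three nested loops in findAll repeat the same step at every level: fetch a directory listing, then either recurse into a sub-directory or download a file. For comparison, here is a minimal recursive sketch of that idea reusing the helpers above; the function name crawl, the depth limit, and the local_dir argument are assumptions for illustration, not part of the original script:

def crawl(url, local_dir, depth=3):
    # stop after a fixed number of levels, matching the three loops above
    if depth == 0:
        return
    soup = BeautifulSoup(urllib.request.urlopen(url).read(), 'lxml')
    for href in get_link(soup)[1:]:      # skip the parent-directory entry
        target = os.path.join(local_dir, href.rstrip('/'))
        child = urljoin(url, href)
        if decice(href):                 # directory entry: create it and go one level deeper
            mkdir_p(target)
            crawl(child, target, depth - 1)
        else:                            # plain file: download it
            file_Down(child, target)

# example call, mirroring the original paths:
# crawl('http://www.nco.ncep.noaa.gov/pmb/codes/nwprod/nosofs.v3.0.4/', 'D:/test/Index')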


Original post: http://www.cnblogs.com/setname/p/7366989.html
