
Python crawler for downloading resources from a file-index style website (based on Python 3.6)



import urllib.request
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from Cat.findLinks import get_link
from Cat.Load import Schedule
import os
import time
import errno

# ---- Code of the imported Cat package modules (shown here for reference) ----

def get_link(page):  # collect the href of every link inside the page's <td> cells
    linkData = []
    for cell in page.find_all('td'):
        links = cell.select("a")
        for each in links:
            # if str(each.get('href'))[:1] == '/':  # optional filter, commented out
            data = each.get('href')
            linkData.append(data)
    return linkData


def Schedule(a, b, c):  # progress hook shown while large downloads are running
    '''
    a: number of data blocks already downloaded
    b: size of one data block
    c: size of the remote file
    '''
    per = 100.0 * a * b / c
    if per > 100:
        per = 100
    print('%.2f%%' % per)

# ---- end ----
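To make the return value of get_link concrete, here is a small, hypothetical check; the HTML snippet and file name are invented for illustration, and it assumes the get_link defined above is pasted alongside or importable:

from bs4 import BeautifulSoup

sample = ('<table>'
          '<tr><td><a href="sorc/">sorc/</a></td></tr>'
          '<tr><td><a href="nos.fix.tar">nos.fix.tar</a></td></tr>'
          '</table>')
print(get_link(BeautifulSoup(sample, 'lxml')))
# expected output: ['sorc/', 'nos.fix.tar']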


def mkdir_p(path):  # recursively create nested directories
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5 (use "except OSError, exc:" on Python <2.5)
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise


def file_Down(connet, file):  # download one file, reporting progress through Schedule
    urllib.request.urlretrieve(connet, file, Schedule)


def decice(data):  # return 1 when the link contains '/', i.e. it points to a directory
    a = '/'
    if a in data:
        return 1
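For reference, on Python 3.2 and later (including the Python 3.6 this post targets), the exist_ok flag of os.makedirs already gives the same "create if missing, ignore if present" behaviour, so mkdir_p could be reduced to a one-liner; a minimal sketch:

import os

def mkdir_p(path):
    # Python >= 3.2: do not raise if the directory already exists
    os.makedirs(path, exist_ok=True)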



def findAll():  # main routine
    url = 'http://www.nco.ncep.noaa.gov/pmb/codes/nwprod/nosofs.v3.0.4/'
    page = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(page, 'lxml')  # parse the index page with BeautifulSoup
    links = get_link(soup)
    # print(links)

    for childLink in range(1, len(links)):  # skip the first link (the parent-directory entry)
        connet = urljoin(url, links[childLink])  # build the absolute URL
        page_next = urllib.request.urlopen(connet).read()
        soup_next = BeautifulSoup(page_next, 'lxml')
        link_next = get_link(soup_next)  # <a href=...> links on the second level
        file = os.path.join('D:\\test\\Index' + "\\" + links[childLink])
        # decice(links[childLink])
        # file_cre = os.path.join('D:\\test\\Index', links[childLink])
        if decice(links[childLink]):  # directory entry: create it locally
            mkdir_p(file)
        else:                         # plain file: download it
            file_Down(connet, file)

        print(connet)
        for child_next in range(1, len(link_next)):
            connet_next = urljoin(connet, link_next[child_next])
            page_next = urllib.request.urlopen(connet_next).read()
            soup_nextF = BeautifulSoup(page_next, 'lxml')
            link_nextF = get_link(soup_nextF)  # <a href=...> links on the third level
            fileF = os.path.join('D:/test/Index' + "/", links[childLink] + link_next[child_next])
            if decice(link_next[child_next]):
                mkdir_p(fileF)
            else:
                file_Down(connet_next, fileF)
            print("Start : %s" % time.ctime())
            time.sleep(4)  # pause between requests
            print("End : %s" % time.ctime())
            print(connet_next)

            for child_nextT in range(1, len(link_nextF)):
                connet_nextT = urljoin(connet_next, link_nextF[child_nextT])
                fileT = os.path.join('D:/test/Index' + "/",
                                     links[childLink] + link_next[child_next] + link_nextF[child_nextT])
                if decice(link_nextF[child_nextT]) == 1:
                    mkdir_p(fileT)
                else:
                    file_Down(connet_nextT, fileT)
                print(connet_nextT)


if __name__ == '__main__':
    findAll()
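The three nested loops in findAll repeat the same step at every level: fetch a directory listing, then either recurse into a sub-directory or download a file. For comparison, here is a minimal recursive sketch of that idea reusing the helpers above; the function name crawl, the depth limit, and the local_dir argument are assumptions for illustration, not part of the original script:

def crawl(url, local_dir, depth=3):
    # stop after a fixed number of levels, matching the three loops above
    if depth == 0:
        return
    soup = BeautifulSoup(urllib.request.urlopen(url).read(), 'lxml')
    for href in get_link(soup)[1:]:      # skip the parent-directory entry
        target = os.path.join(local_dir, href.rstrip('/'))
        child = urljoin(url, href)
        if decice(href):                 # directory entry: create it and go one level deeper
            mkdir_p(target)
            crawl(child, target, depth - 1)
        else:                            # plain file: download it
            file_Down(child, target)

# example call, mirroring the original paths:
# crawl('http://www.nco.ncep.noaa.gov/pmb/codes/nwprod/nosofs.v3.0.4/', 'D:/test/Index')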


Original post: http://www.cnblogs.com/setname/p/7366989.html
