发现某站点文章很多,爬取所有文章名和链接,并保存在txt文档中,方便后续查看
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import urllib,urllib2,re,requests
import sys
# Python 2 only: make UTF-8 the implicit str<->unicode codec so the Chinese
# folder names scraped by get() can be formatted and written without
# UnicodeError.  reload() is required because setdefaultencoding is removed
# from the sys module during interpreter start-up.
try:
    reload(sys)
    sys.setdefaultencoding('utf-8')
except NameError:
    # No builtin reload() on Python 3, where str is already Unicode.
    pass

# Site root.  Kept as a one-element list because get() joins it with each
# relative href it finds in a listing page.
domain = ['http://linux.linuxidc.com/']
name_url = []   # [name, url] pairs found on the first-level (index) page
name_url2 = []  # [name, url] pairs found on the second-level pages
name_url3 = []  # [name, url] pairs found on the third-level pages
name_url4 = []  # [name, url] pairs found on the fourth-level pages
# Browser-like User-Agent sent with every request.
_HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"}
_INDEX_URL = 'http://linux.linuxidc.com/index.php'
# First-level folders that are deliberately not crawled.
_SKIPPED_FOLDERS = (
    'http://linux.linuxidc.com/index.php?folder=cHVi',  # "pub" directory
    'http://linux.linuxidc.com/index.php?folder=MjAxMcTq18rBzw==',  # 2011 archive
)
# One folder row of a listing page (rows carrying the folder.png icon).
_FOLDER_DIV_RE = re.compile(r'(<div style="float:left;width:410px"><img src="linuxconf/icons/folder.png"> <a href=".*?">.*?</a></div>)', re.S)
# Relative link and display name inside one folder row.
_HREF_RE = re.compile(r'<a href="(.*?)">')
_NAME_RE = re.compile(r'<a href=".*?">(.*?)</a></div>')


def _list_folders(session, page_url):
    """Fetch *page_url* and return ``[name, absolute_url]`` per folder row."""
    html = session.get(page_url).text
    folders = []
    for row in _FOLDER_DIV_RE.findall(html):
        href = _HREF_RE.search(row)
        name = _NAME_RE.search(row)
        if href is None or name is None:
            # Row did not contain both a link and a name; nothing to record.
            continue
        # ''.join() accepts `domain` whether it is a list of parts or a string.
        folders.append([name.group(1), ''.join(domain) + href.group(1)])
    return folders


def get():
    """Crawl four levels of folder listings and save the deepest entries.

    Starting from the site index, each pass fetches every page collected by
    the previous pass and records the folder rows found on it.  Every
    fourth-level entry is appended to ``get_list.txt`` (relative to the
    current working directory) as a ``name,url`` line, and a progress line
    is printed for it.

    Side effects: extends the module-level lists ``name_url``, ``name_url2``,
    ``name_url3`` and ``name_url4``.

    Raises: whatever ``requests`` raises on a failed request; rows written
    before the failure are kept because the file is closed on the way out.

    NOTE(review): the original paste had lost its indentation; the four
    passes are reconstructed here as sequential loops -- confirm against the
    author's intent.
    """
    import io  # local import: only needed for the UTF-8 output file

    # A single Session reuses (keeps alive) connections.  Opening a new
    # connection for each of the many requests exhausted the connection
    # attempts and ended in requests.exceptions.ConnectionError.
    with requests.Session() as session:
        session.headers.update(_HEADERS)

        # Level 1: folders listed on the index page.
        name_url.extend(_list_folders(session, _INDEX_URL))

        # Level 2: contents of every first-level folder except the skipped ones.
        for _name, folder_url in name_url:
            if folder_url in _SKIPPED_FOLDERS:
                continue
            name_url2.extend(_list_folders(session, folder_url))

        # Level 3.
        for _name, folder_url in name_url2:
            name_url3.extend(_list_folders(session, folder_url))

        # Level 4: these entries are what gets saved.  The file is opened
        # once and closed by the with-block instead of being re-opened (and
        # never closed) for every row.
        with io.open('get_list.txt', 'a', encoding='utf-8') as out:
            for _name, folder_url in name_url3:
                for title, link in _list_folders(session, folder_url):
                    name_url4.append([title, link])
                    print("正在保存--%s" % title)
                    out.write(u"%s,%s\n" % (title, link))
# Run the crawl only when this file is executed as a script.
if __name__ == "__main__":
    get()
# Execution process:
在脚本文件同路径下会生成保存的文件:
文件内容:文章标题,文章链接
报错:
requests.exceptions.ConnectionError: HTTPConnectionPool(host='linux.linuxidc.com', port=80): Max retries exceeded with url: /index.php?folder=MjAxN8Tq18rBzy8z1MIvMjXI1Q== (Caused by NewConnectionError('<requests.packages.urllib3.connection.HTTPConnection object at 0x0000000002B6D198>: Failed to establish a new connection: [Errno 10060] ',))
原因:打开的 HTTP 连接过多且没有关闭,导致无法再建立新连接
解决:使用 requests 的 Session 对象发起请求,复用并保持长连接
#定义
request = requests.Session()
#代码中全部替换为
html = request.get(url,headers=hd).text
本文出自 “M四月天” 博客,请务必保留此出处http://msiyuetian.blog.51cto.com/8637744/1929710
原文地址:http://msiyuetian.blog.51cto.com/8637744/1929710