# Import libraries
import os
import random
import re
import time
import requests
from bs4 import BeautifulSoup
from requests import ConnectionError
from config import *  # provides USER_AGENT_LIST and REFERER_LIST
# Build request headers (random User-Agent and Referer) for mzitu
def res_headers():
    headers = {
        'User-Agent': random.choice(USER_AGENT_LIST),
        'Referer': random.choice(REFERER_LIST),
    }
    return headers
# Build headers with a single random User-Agent
def get_header():
    headers = {
        'User-Agent': random.choice(USER_AGENT_LIST)
    }
    return headers
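The script pulls USER_AGENT_LIST and REFERER_LIST in via "from config import *", but config.py is not shown in the post. A minimal sketch of config.py, assuming it only needs those two lists (the specific strings below are placeholders, not the author's values):

# config.py -- placeholder values; substitute your own UA and referer strings
USER_AGENT_LIST = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0 Safari/605.1.15',
]
REFERER_LIST = [
    'https://www.mzitu.com/',
    'https://www.mzitu.com/all/',
]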
# Scrape a proxy list from xicidaili, then return a verified proxy via check_ip
def get_proxy_list():
    ip_list = []
    base_url = 'https://www.xicidaili.com/wt/'
    header = get_header()
    # Pick a random page of the free-proxy listing
    actual_url = base_url + str(random.randint(1, 300))
    try:
        res = requests.get(url=actual_url, headers=header)
        if res.status_code == 200:
            html = res.text
            # Extract "ip</td><td>port" pairs from the listing table
            pattern = r'(\d+\.\d+\.\d+\.\d+)</td>\s*<td>(\d+)'
            re_list = re.findall(pattern, html)
            for ip_port in re_list:
                ip_list.append(ip_port[0] + ':' + ip_port[1])
            return check_ip(ip_list)
        else:
            return get_proxy_list()
    except ConnectionError:
        return get_proxy_list()
# Verify that a randomly chosen proxy can actually reach the target site
def check_ip(ip_list):
    test_url = 'https://www.mzitu.com/'
    proxy_ip = 'http://' + random.choice(ip_list)
    proxy_ip_dic = {
        'http': proxy_ip
    }
    header = get_header()
    try:
        res = requests.get(test_url, headers=header, proxies=proxy_ip_dic, timeout=8)
        if res.status_code == 200:
            return proxy_ip_dic
        # Proxy did not answer with 200: fetch a fresh list and try again
        return get_proxy_list()
    except ConnectionError:
        return get_proxy_list()
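Once check_ip succeeds, the returned dict plugs straight into requests through the proxies argument. A minimal usage sketch (the target URL here is only an example; verification may take several attempts before a working proxy is found):

proxies = get_proxy_list()
if proxies:
    res = requests.get('https://www.mzitu.com/', headers=get_header(), proxies=proxies, timeout=8)
    print(res.status_code, proxies)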
# Request a page and return its HTML text
def get_page(url):
    headers = res_headers()
    # Create a session; disable keep-alive to avoid exhausting connections
    s = requests.session()
    s.keep_alive = False
    # Fetch the page
    res = s.get(url, headers=headers)
    html = res.text
    return html
# Collect the detail-page URL of every gallery on the archive page
def get_all_girls(url):
    html = get_page(url)
    # Build the soup
    soup = BeautifulSoup(html, 'html.parser')
    # Grab every <a> tag under the element with class 'archives'
    total_info = soup.find(class_='archives').find_all('a')
    # Walk the <a> tags and collect their 'href' values
    all_list = []
    for girls_info in total_info:
        link_url = girls_info['href']
        all_list.append(link_url)
    return all_list
# Collect every image URL in one gallery, then hand them to the downloader
def get_girl_all_page(url):
    html = get_page(url)
    soup = BeautifulSoup(html, 'html.parser')
    # The second-to-last <a> inside class 'pagenavi' holds the page count in its <span>
    max_page = soup.find(class_='pagenavi').find_all('a')[-2].find('span').string
    title = soup.find(class_='main-title').string
    # Visit each page of the gallery and read the 'src' of its <img> tag
    pic_url_list = []
    for i in range(int(max_page)):
        html = get_page(url + '/%s' % (i + 1))
        soup = BeautifulSoup(html, 'html.parser')
        pic_url = soup.find('img').get('src')
        pic_url_list.append(pic_url)
        time.sleep(0.1)
    download_Pic(title, pic_url_list)
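To grab a single gallery without crawling the whole archive, the function can also be called on its own (the gallery URL below is a placeholder, not a real ID from the post):

get_girl_all_page('https://www.mzitu.com/12345')  # placeholder gallery URL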
# Download the images, using the gallery title as the folder name
def download_Pic(title, pic_url_list):
    # Create the destination folder (exist_ok avoids crashing on a re-run)
    os.makedirs(title, exist_ok=True)
    headers = res_headers()
    proxy = get_proxy_list()
    # Sequence number used for the file names
    j = 1
    # Download each image
    for item in pic_url_list:
        # Build the file path and name
        filename = '%s/%s.jpg' % (title, str(j))
        print('downloading....%s : NO.%s' % (title, str(j)))
        with open(filename, 'wb') as f:
            img = requests.get(item, headers=headers, proxies=proxy).content
            f.write(img)
        j += 1
        time.sleep(10)
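Gallery titles sometimes contain characters that are not valid in folder names (for example '?' or '/'). A small helper along these lines could be applied to title before the folder is created; this is a sketch, and safe_name is a hypothetical addition, not part of the original script:

def safe_name(title):
    # Replace characters that are illegal in Windows/Unix directory names
    return re.sub(r'[\\/:*?"<>|]', '_', title).strip()

For example, get_girl_all_page could call download_Pic(safe_name(title), pic_url_list) instead of passing the raw title.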
# Main program
if __name__ == '__main__':
    url = 'https://www.mzitu.com/all'
    pic_list = get_all_girls(url)
    for i in pic_list:
        get_girl_all_page(i)
Source: https://www.cnblogs.com/lijifei/p/12048437.html