码迷,mamicode.com
首页 > 其他好文 > 详细

爬虫 第三弹

时间:2021-04-23 12:20:22      阅读:0      评论:0      收藏:0      [点我收藏+]

标签:ati   end   动作   driver   动态   off   页面   位置   html   

1.selenium 基本操作

from selenium import webdriver
from time import sleep
# Instantiate a Chrome browser object.
# executable_path is the browser driver binary; download the driver build
# that matches the locally installed browser version.
bro = webdriver.Chrome(executable_path='chromedriver.exe')
# Send a request to the target URL.
bro.get('https://www.jd.com/')
sleep(2)
# Locate the element with id "key" (the box the query text is typed into).
search_box = bro.find_element_by_xpath('//*[@id="key"]')
# Type text into the located element.
search_box.send_keys('mac pro')
sleep(2)
# Click the button inside the element with id "search" to submit the query.
bro.find_element_by_xpath('//*[@id="search"]/div/div[2]/button').click()
sleep(2)
# JS injection: scroll to the bottom of the page
# (document.body.scrollHeight is the full height of the document).
bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')

sleep(2)
# Close the browser and end the driver session.
bro.quit()

2.selenium 捕获动态加载数据

from selenium import webdriver
from time import sleep
from lxml import etree
# Instantiate a Chrome browser object.
bro = webdriver.Chrome(executable_path='chromedriver.exe')
bro.get('http://scxk.nmpa.gov.cn:81/xk/')
sleep(1)
# Capture the rendered source of the current page; the data is loaded
# dynamically, so it is read from the browser rather than the raw response.
all_page_text = [bro.page_source]
# Click "next page" four times, capturing the rendered source after each click.
for _ in range(4):
    next_page = bro.find_element_by_xpath('//*[@id="pageIto_next"]')
    next_page.click()
    sleep(1)  # give the next page time to render before reading its source
    all_page_text.append(bro.page_source)
# Parse every captured page and print the title attribute of each list item.
for page_text in all_page_text:
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//*[@id="gzlist"]/li')
    for li in li_list:
        titles = li.xpath('./dl/@title')
        # Skip items without a title instead of failing on an empty result.
        if titles:
            title = titles[0]
            print(title)
sleep(2)
bro.quit()

3.动作链

from selenium import webdriver
from time import sleep
from lxml import etree
from selenium.webdriver import ActionChains #动作连
# Instantiate a Chrome browser object.
bro = webdriver.Chrome(executable_path='chromedriver.exe')
bro.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')

sleep(1)
# Note: an element that sits inside an iframe cannot be located from the
# top-level document; switch into the iframe first.
bro.switch_to.frame('iframeResult')
div_tag = bro.find_element_by_xpath('//*[@id="draggable"]')

# Create an action chain bound to this browser instance.
action = ActionChains(bro)
# Queue "press and hold the mouse button on the element".
action.click_and_hold(div_tag)
for _ in range(5):
    # perform() executes the queued actions immediately.
    action.move_by_offset(15, 17).perform()
    sleep(0.5)
# release() only queues the action; without perform() the mouse button is
# never actually released and the drag never completes.
action.release().perform()
sleep(3)
bro.quit()

4.selenium 模拟登录

from selenium import webdriver
from time import sleep
import base64
import json
import requests
from PIL import Image
def base64_api(uname, pwd, img, typeid):
    """Send an image to the ttshitu recognition API and return its answer.

    Args:
        uname: Account user name for the recognition service.
        pwd: Account password for the recognition service.
        img: Path of the image file to upload.
        typeid: Recognition type id, passed through to the service unchanged.

    Returns:
        ``result["data"]["result"]`` from the JSON response when its
        ``success`` field is truthy, otherwise the response's ``message``
        field. Callers therefore cannot tell success from failure by type
        alone and must validate the returned value themselves.
    """
    # The service expects the image as base64 text inside the JSON payload.
    with open(img, 'rb') as f:
        b64 = base64.b64encode(f.read()).decode()
    data = {"username": uname, "password": pwd, "typeid": typeid, "image": b64}

    result = json.loads(requests.post("http://api.ttshitu.com/predict", json=data).text)

    if result['success']:
        return result["data"]["result"]
    return result["message"]


from selenium.webdriver import ActionChains #动作连
# Instantiate a Chrome browser object.
bro = webdriver.Chrome(executable_path='chromedriver.exe')
bro.get('https://kyfw.12306.cn/otn/resources/login.html')
sleep(2)
# Click the second tab link on the login page
# (presumably the account/password login tab - verify against the live page).
bro.find_element_by_xpath('/html/body/div[2]/div[2]/ul/li[2]/a').click()

sleep(2)
bro.find_element_by_xpath('//*[@id="J-userName"]').send_keys('bobo123456')

sleep(2)
bro.find_element_by_xpath('//*[@id="J-password"]').send_keys('1234567890')

sleep(2)
# Captcha handling.
# 1. Save a screenshot of the whole current page.
bro.save_screenshot('./main.png')
# 2. Work out the crop region from the captcha image element.
img_tag = bro.find_element_by_xpath('//*[@id="J-loginImg"]')

location = img_tag.location
size = img_tag.size
# rangle is the crop box: (left, upper, right, lower).
rangle = (
    int(location['x']),
    int(location['y']),
    int(location['x'] + size['width']),
    int(location['y'] + size['height']),
)

# Crop the captcha out of the full-page screenshot; the context manager
# closes the screenshot file once the crop has been saved.
with Image.open('./main.png') as screenshot:
    frame = screenshot.crop(rangle)
    frame.save('./code.png')

# Recognise the captcha through the recognition platform.
img_path = "./code.png"
result = base64_api(uname='bb328410948', pwd='bb328410948', img=img_path, typeid=21)

print(result)  # recognition result, e.g. 259,141|28,160

# The result is one or more "x,y" points separated by '|'. Splitting on '|'
# also yields a single-item list when there is only one point, so one code
# path handles both cases.
all_list = []
for point in result.split('|'):
    coords = point.split(',')
    all_list.append([int(coords[0]), int(coords[1])])

# Click each recognised position, offset relative to the captcha image.
# NOTE(review): the offset origin of move_to_element_with_offset differs
# between Selenium releases (top-left vs. element centre) - confirm against
# the installed version.
for loc in all_list:
    x = loc[0]
    y = loc[1]
    ActionChains(bro).move_to_element_with_offset(img_tag, x, y).click().perform()
    sleep(1)
bro.find_element_by_xpath('//*[@id="J-login"]').click()
sleep(4)
# Close the browser, consistent with the other examples, so the driver
# process is not left running.
bro.quit()

爬虫 第三弹

标签:ati   end   动作   driver   动态   off   页面   位置   html   

原文地址:https://www.cnblogs.com/0jiaqing0/p/14691589.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!