码迷,mamicode.com
首页 > 其他好文 > 详细

用selenium爬取淘宝美食

时间:2017-07-23 22:49:57      阅读:291      评论:0      收藏:0      [点我收藏+]

标签:ati   title   open   ted   final   美食   enc   port   select   

技术分享
"""Use selenium to scrape the Taobao food (美食) search result pages."""

import re
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery as pq
from config import *

# PhantomJS browser instance; SERVICE_ARGS (from config) is handed to it as service arguments.
driver = webdriver.PhantomJS(service_args=SERVICE_ARGS)
# driver = webdriver.Chrome()
# Shared explicit wait used by every function below; constructed with a timeout value of 10.
wait = WebDriverWait(driver, 10)

# With this explicit window size the page content can be scraped; per the original
# author's observation, omitting this call leads to a TimeOut error.
driver.set_window_size(1400,900)

def search():
    """Open Taobao, search for KEYWORD and scrape the first result page.

    Loads the home page, types KEYWORD into the search box, clicks the
    search button, waits for the pager's "total" element, then calls
    get_products() for the first page.

    Returns:
        The text of the pager's "total" element (main() extracts the page
        count from it).

    A TimeoutException raised by any wait restarts the whole sequence from
    the home page. The retry is a loop rather than a recursive call so that
    repeated timeouts cannot exhaust the interpreter's recursion limit.
    """
    while True:
        print("正在搜索")
        try:
            driver.get("http://www.taobao.com")
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#q"))
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (By.CSS_SELECTOR, "#J_TSearchForm > div.search-button > button")
                )
            )
            search_box.send_keys(KEYWORD)
            submit.click()
            total = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.total")
                )
            )
            # Scrape page 1 before handing the pager text back to the caller.
            get_products()
            return total.text
        except TimeoutException:
            print("TimeOut")

def next_page(page_number):
    """Jump to result page ``page_number`` and scrape it.

    Types the page number into the pager's input box, clicks the pager's
    submit button, waits until the highlighted pager item shows the
    requested number, then calls get_products().

    Args:
        page_number: The result page to load; printed, typed into the pager
            input, and compared (as a string) with the active pager item.

    A TimeoutException raised by any wait restarts the attempt for the same
    page. The retry is a loop rather than a recursive call so that repeated
    timeouts cannot exhaust the interpreter's recursion limit.
    """
    while True:
        print("正在翻页", page_number)
        try:
            page_input = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > input")
                )
            )
            submit = wait.until(
                EC.element_to_be_clickable(
                    (
                        By.CSS_SELECTOR,
                        "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit",
                    )
                )
            )
            page_input.clear()
            page_input.send_keys(page_number)
            submit.click()
            # The active pager item must show the requested number before scraping.
            wait.until(
                EC.text_to_be_present_in_element(
                    (
                        By.CSS_SELECTOR,
                        "#mainsrp-pager > div > div > div > ul > li.item.active > span",
                    ),
                    str(page_number),
                )
            )
            get_products()
            return
        except TimeoutException:
            print("TimeOut")

def get_products():
    """Print one dict per product found on the currently loaded result page.

    Waits until at least one product item is present, parses the driver's
    page source with PyQuery, and prints a dict with the keys ``image``,
    ``price``, ``deal``, ``title``, ``shop`` and ``location`` for each item.

    Raises:
        TimeoutException: if no product item appears before the shared wait
            expires (callers catch this and retry).
    """
    item_selector = "#mainsrp-itemlist .items .item"
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, item_selector)))
    doc = pq(driver.page_source)
    for item in doc(item_selector).items():
        product = {
            "image": item.find(".pic .img").attr("src"),
            "price": item.find(".price").text(),
            # Drops the last three characters of the deal-count text
            # (presumably a fixed suffix such as "人付款" -- confirm against the live page).
            "deal": item.find(".deal-cnt").text()[:-3],
            "title": item.find(".title").text(),
            "shop": item.find(".shop").text(),
            "location": item.find(".location").text(),
        }
        print(product)


def main():
    """Run the whole crawl: search, then walk every remaining result page.

    search() scrapes page 1 and returns the pager's "total" text; the first
    run of digits in that text is taken as the number of pages, and
    next_page() is called for pages 2 through that number.

    Any exception is printed rather than propagated, and the browser is
    always shut down at the end.
    """
    try:
        total_text = search()
        match = re.search(r"(\d+)", total_text)
        if match is None:
            # Explicit error instead of an AttributeError on None.
            raise ValueError(
                "no page count found in pager text: {!r}".format(total_text)
            )
        total_pages = int(match.group(1))
        for num in range(2, total_pages + 1):
            next_page(num)
    except Exception as e:
        print(e)
    finally:
        # quit() ends the WebDriver session and its browser process;
        # close() would only close the current window.
        driver.quit()

# Run the crawl only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
View Code

config文件

技术分享
# Service arguments passed to webdriver.PhantomJS by the crawler script:
# skip image loading and enable the disk cache.
SERVICE_ARGS = ["--load-images=false", "--disk-cache=true"]
# Search term the crawler types into the Taobao search box.
KEYWORD = "美食"
View Code

 

用selenium爬取淘宝美食

标签:ati   title   open   ted   final   美食   enc   port   select   

原文地址:http://www.cnblogs.com/114811yayi/p/7226206.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!