码迷,mamicode.com
首页 > 其他好文 > 详细

使用Pyquery+selenium抓取淘宝商品信息

时间:2018-06-15 21:44:08      阅读:190      评论:0      收藏:0      [点我收藏+]

标签:number   count   nbsp   def   跳转   win   return   cache   sea   

配置文件,配置好数据库名称,表名称,要搜索的产品类目,要爬取的页数

# Crawler configuration: MongoDB target, PhantomJS switches, search keyword
# and the number of result pages to crawl.

# MongoDB host, database name and collection ("table") name.
MONGO_URL = 'localhost'
MONGO_DB = 'taobao'
MONGO_TABLE = 'phone'

# Command-line switches handed to PhantomJS.
SERVICE_ARGS = [
    '--disk-cache=true',  # use the disk cache when running under PhantomJS
    '--load-images=false',  # do not load images when running under PhantomJS
]

# Product keyword to search for.
KEYWORD = '手机'
# Maximum number of result pages to crawl.
MAXPAGE = 5

主程序

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date    : 2018-06-14 22:02:26
# @Author  : Chenjun (320316430@qq.com;)
# @Link    : http://example.org
# @Version : $Id$
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from pyquery import PyQuery as pq
from config import *
import pymongo   #使用mongodb数据库存储,在此python提供pymongo库方便使用

# Module-level setup: connect to MongoDB, select the target database, start
# the browser and create the shared explicit-wait helper.
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)  # headless PhantomJS browser: convenient for crawling and configurable through service args
browser.set_window_size(1400, 900)
wait = WebDriverWait(browser, 10)  # explicit wait (10 s timeout) used while page content loads
 
# Run the initial search and scrape the first page of results.
def search(retries=3):
    """Search Taobao for ``KEYWORD`` and scrape the first result page.

    Opens the Taobao home page, types ``KEYWORD`` into the search box,
    submits the form, reads the pager's "total" element and saves the
    products of the first result page via ``get_products()``.

    Args:
        retries: Maximum number of attempts when a wait times out. Values
            below 1 are treated as 1.

    Returns:
        The text of the pager's "total" element (it contains the total
        number of result pages).

    Raises:
        TimeoutException: If every attempt timed out.
    """
    attempts = max(1, retries)
    for attempt in range(1, attempts + 1):
        print('正在搜索...')
        try:
            browser.get('https://www.taobao.com')
            # Wait until the search box exists in the DOM.
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
            )
            # Wait until the search button can be clicked.
            submit = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR,
                 '#J_TSearchForm > div.search-button > button')))
            search_box.send_keys(KEYWORD)  # simulate the user typing
            submit.click()  # simulate the user clicking
            # Read the pager before saving anything, so a timeout on the
            # pager cannot cause the same products to be saved again when
            # the whole search is retried.
            total = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR,
                 '#mainsrp-pager > div > div > div > div.total')))
            total_text = total.text
            get_products()
            return total_text
        except TimeoutException:
            # Retry a bounded number of times instead of recursing forever.
            if attempt == attempts:
                raise
def next_page(page_number, retries=3):
    """Jump to result page ``page_number`` and scrape its products.

    Types the page number into the pager's input box, clicks the jump
    button, waits until the pager highlights that page and then calls
    ``get_products()``.

    Args:
        page_number: Result page to jump to.
        retries: Maximum number of attempts when a wait times out. Values
            below 1 are treated as 1.

    Raises:
        TimeoutException: If every attempt timed out.
    """
    attempts = max(1, retries)
    target = str(page_number)
    for attempt in range(1, attempts + 1):
        print('正在翻页...')
        try:
            # Wait until the page-number input box exists in the DOM.
            page_box = wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR,
                     '#mainsrp-pager > div > div > div > div.form > input'))
            )
            # Wait until the jump button can be clicked.
            submit = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR,
                 '#mainsrp-pager > div > div > div > div.form > '
                 'span.btn.J_Submit')))
            page_box.clear()  # remove the current page number
            page_box.send_keys(target)  # type the new page number
            submit.click()
            # Wait until the highlighted pager item shows the requested page.
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR,
                 '#mainsrp-pager > div > div > div > ul > '
                 'li.item.active > span'),
                target))
            get_products()
            return
        except TimeoutException:
            # Retry a bounded number of times instead of recursing forever.
            if attempt == attempts:
                raise
# Extract the individual products from the current result page.
def get_products():
    """Parse the current result page and save every product to MongoDB.

    Waits for the product list to appear, parses ``browser.page_source``
    with PyQuery and passes one dict per product to ``save_to_mongo``
    together with its 1-based position on the page.

    Raises:
        TimeoutException: If no product item appears before the wait
            times out.
    """
    # Wait until at least one product item exists in the DOM.
    wait.until(EC.presence_of_element_located((
        By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
    doc = pq(browser.page_source)  # parse the current page's DOM
    items = doc('#mainsrp-itemlist .items .item').items()
    for count, item in enumerate(items, start=1):
        product = {
            'image': item.find('.pic .img').attr('src'),
            'price': item.find('.price').text(),
            # Drops the last three characters of the deal text; presumably
            # the "人付款" suffix -- TODO confirm against the live page.
            'deal': item.find('.deal-cnt').text()[:-3],
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
        }
        save_to_mongo(product, count)


def save_to_mongo(result, count):
    """Insert one product dict into the ``MONGO_TABLE`` collection.

    Storage is best-effort: a failure is reported on stdout and does not
    propagate to the caller.

    Args:
        result: Product dict to store.
        count: Position of the product on the page, used in the log line.
    """
    try:
        # insert_one replaces Collection.insert, which PyMongo 4 removed.
        db[MONGO_TABLE].insert_one(result)
        print(f'存储{count}到了MONGODB成功')
    except Exception as exc:
        # Keep crawling, but show why the insert failed.
        print('存储失败', exc)


def main():
    """Crawl up to ``MAXPAGE`` result pages for ``KEYWORD``.

    Runs the search, extracts the total page count from the pager text,
    caps it at ``MAXPAGE`` and visits pages 2..total. Any error is
    reported on stdout; the browser is shut down in every case.
    """
    try:
        total_text = search()
        # The first run of digits in the pager text is the page count.
        total = int(re.search(r'(\d+)', total_text).group(1))
        total = min(total, MAXPAGE)
        for page in range(2, total + 1):
            next_page(page)
    except Exception as exc:
        print('出错啦!', exc)
    finally:
        # quit() ends the whole driver session, including the PhantomJS
        # process; close() would only close the current window.
        browser.quit()


if __name__ == '__main__':
    main()

 

使用Pyquery+selenium抓取淘宝商品信息

标签:number   count   nbsp   def   跳转   win   return   cache   sea   

原文地址:https://www.cnblogs.com/tarantino/p/9188972.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!