码迷,mamicode.com
首页 > 其他好文 > 详细

记一次爬需要登录之后才能爬取数据的demo

时间:2017-07-01 15:20:48      阅读:319      评论:0      收藏:0      [点我收藏+]

标签:mac   apache   raw   awl   efault   click   blog   sel   thread   

一:工程概况

技术分享

注意:技术分享

 

二:涉及到的类

package com.bigdata.crawler;

import java.io.IOException;
import java.util.List;

import org.apache.commons.io.IOUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.openqa.selenium.By;
import org.openqa.selenium.Keys;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.firefox.FirefoxDriver;
import org.openqa.selenium.interactions.Actions;

import com.bigdata.util.DriverCommon;

/**
 * Demo crawler for a page that is only reachable after logging in.
 *
 * <p>It opens the CNZZ login page in Chrome through Selenium, types the view
 * password, clicks the login image, parses the resulting page source with
 * Jsoup and prints the {@code href} attribute of every {@code a.blue12} link.
 */
public class CnzzCrawler {
	/** Login page that is opened first. */
	private String baseUrl ="http://new.cnzz.com/v1/login.php?siteid=1262437219";
	/** View password typed into the login form (masked in this demo). */
	private String password = "******";

	/**
	 * Fixed pause, in milliseconds, inserted after typing the password and
	 * after clicking login. NOTE(review): presumably this gives the page time
	 * to react before the next step; an explicit wait would be more reliable.
	 */
	private static final long PAUSE_MILLIS = 1000L;

	/** Browser session used for the crawl; null when the no-arg constructor was used. */
	private ChromeDriver driver;

	/** Creates a crawler without a browser session; {@link #start()} will reject it. */
	public CnzzCrawler() {
	}

	/**
	 * Creates a crawler that drives the given browser session.
	 *
	 * @param driver the Chrome session to use; it is shut down at the end of {@link #start()}
	 */
	public CnzzCrawler(ChromeDriver driver) {
		this.driver = driver;
	}

	/**
	 * Logs in, prints every {@code a.blue12} link target found on the page that
	 * follows, and then shuts the browser session down.
	 *
	 * <p>The session is ended with {@code quit()} in a {@code finally} block so
	 * that the browser and its driver process are released even when a lookup
	 * or click fails part-way through.
	 *
	 * @throws IllegalStateException if no driver was supplied
	 */
	public void start(){
		if (driver == null) {
			throw new IllegalStateException("No ChromeDriver supplied; use CnzzCrawler(ChromeDriver)");
		}
		try {
			// Open the login page.
			driver.get(baseUrl);
			// Type the password.
			driver.findElement(By.id("password")).sendKeys(password);
			pause(PAUSE_MILLIS);

			// Click the login image. Full path on the page:
			// body > div.pwdmain > div.pwdcheck > div.pwdcheck4 > div:nth-child(1) > form > div:nth-child(2) > img
			driver.findElement(By.cssSelector("div.pwdcheck4 > div:nth-child(1) > form > div:nth-child(2) > img")).click();
			pause(PAUSE_MILLIS);

			// Parse whatever page the browser is showing now and pull out the links.
			Document doc = Jsoup.parse(driver.getPageSource());
			Elements links = doc.select("a.blue12");
			for (Element link : links) {
				System.out.println(link.attr("href"));
			}
		} finally {
			// quit() ends the whole session; close() would only close the current window.
			driver.quit();
		}
	}

	/**
	 * Sleeps for the given time. If the thread is interrupted the interrupt
	 * flag is restored so callers further up can still observe it.
	 */
	private static void pause(long millis) {
		try {
			Thread.sleep(millis);
		} catch (InterruptedException e) {
			Thread.currentThread().interrupt();
		}
	}

	/**
	 * Points Selenium at the chromedriver binary for the current OS, starts
	 * Chrome and runs the crawl.
	 *
	 * @param args unused
	 * @throws IOException declared by {@code DriverCommon.getDriverName}
	 */
	public static void main(String[] args) throws IOException {
		System.setProperty("webdriver.chrome.driver", DriverCommon.getDriverName(DriverCommon.getOSType()));

		ChromeDriver driver = new ChromeDriver();
		new CnzzCrawler(driver).start();
	}

}

  

package com.bigdata.util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;

/**
 * Helpers for choosing the chromedriver binary that matches the host
 * operating system.
 */
public class DriverCommon {
    /**
     * Detects the operating system type.
     *
     * <p>The {@code os.name} system property decides between {@code "mac"} and
     * {@code "win"}. Any other system is treated as Linux, and
     * {@code getconf LONG_BIT} is run to tell 32-bit from 64-bit.
     *
     * @return one of {@code "mac"}, {@code "win"}, {@code "linux32"} or
     *         {@code "linux64"}; {@code "linux64"} is the fallback when the
     *         word size cannot be determined
     */
    public static String getOSType(){
        String osName = System.getProperty("os.name");
        if (osName.contains("Mac")) {
            return "mac";
        }
        if (osName.contains("Win")) {
            return "win";
        }
        try {
            // Explicit argument list instead of a single command string.
            Process process = new ProcessBuilder("getconf", "LONG_BIT").start();
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(process.getInputStream()))) {
                String line = reader.readLine();
                if (line == null) {
                    // The command printed nothing: default to 64-bit Linux.
                    return "linux64";
                }
                return line.contains("64") ? "linux64" : "linux32";
            }
        } catch (IOException e) {
            e.printStackTrace();
            return "linux64"; // default to 64-bit Linux
        }
    }

    /**
     * Returns the browser driver file name for an OS type.
     *
     * <p>Both the values produced by {@link #getOSType()} ({@code "linux32"},
     * {@code "linux64"}) and the underscore spellings ({@code "linux_32"},
     * {@code "linux_64"}) are accepted. Any unrecognised value maps to the
     * 64-bit Linux driver.
     *
     * @param os the OS type, may be {@code null}
     * @return the driver file name, or {@code null} when {@code os} is {@code null}
     * @throws IOException never thrown by this implementation; kept in the
     *         signature for existing callers
     */
    public static String getDriverName(String os) throws IOException{
        if (os == null) {
            return null;
        }
        switch (os) {
        case "win":
            return "chromedriver.exe";
        case "mac":
            return "chromedriver_mac";
        case "linux32":
        case "linux_32":
            return "chromedriver_linux32";
        case "linux64":
        case "linux_64":
        default:
            return "chromedriver_linux64";
        }
    }
}

  

 

记一次爬需要登录之后才能爬取数据的demo

标签:mac   apache   raw   awl   efault   click   blog   sel   thread   

原文地址:http://www.cnblogs.com/ipetergo/p/7102290.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!