标签:快速 标题 attr att 间隔 sts blank jsoup tor
webmagic简介:
WebMagic是一个简单灵活的Java爬虫框架。你可以快速开发出一个高效、易维护的爬虫。
准备工作:
Maven依赖(我这里用的是Maven创建的web项目做测试):
<dependencies>
  <!-- Unit testing -->
  <dependency>
    <groupId>junit</groupId>
    <artifactId>junit</artifactId>
    <version>4.11</version>
    <scope>test</scope>
  </dependency>

  <!-- Logging: SLF4J API with Logback as the binding -->
  <dependency>
    <groupId>org.slf4j</groupId>
    <artifactId>slf4j-api</artifactId>
    <version>1.7.12</version>
  </dependency>
  <dependency>
    <groupId>ch.qos.logback</groupId>
    <artifactId>logback-core</artifactId>
    <version>1.2.3</version>
  </dependency>
  <!-- Implements the SLF4J API -->
  <dependency>
    <groupId>ch.qos.logback</groupId>
    <artifactId>logback-classic</artifactId>
    <version>1.2.3</version>
  </dependency>

  <!-- Database -->
  <!-- Default (compile) scope: the sample code imports com.mysql.jdbc.Connection directly,
       so the driver has to be on the compile classpath, not only at runtime. -->
  <dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.34</version>
  </dependency>
  <!-- c3p0 connection pool -->
  <dependency>
    <groupId>c3p0</groupId>
    <artifactId>c3p0</artifactId>
    <version>0.9.1.2</version>
  </dependency>

  <!-- DAO framework: MyBatis -->
  <dependency>
    <groupId>org.mybatis</groupId>
    <artifactId>mybatis</artifactId>
    <version>3.4.0</version>
  </dependency>
  <!-- MyBatis/Spring integration -->
  <dependency>
    <groupId>org.mybatis</groupId>
    <artifactId>mybatis-spring</artifactId>
    <version>1.3.0</version>
  </dependency>

  <!-- Servlet / web dependencies -->
  <dependency>
    <groupId>taglibs</groupId>
    <artifactId>standard</artifactId>
    <version>1.1.2</version>
  </dependency>
  <dependency>
    <groupId>jstl</groupId>
    <artifactId>jstl</artifactId>
    <version>1.2</version>
  </dependency>
  <dependency>
    <groupId>com.fasterxml.jackson.core</groupId>
    <artifactId>jackson-databind</artifactId>
    <version>2.5.1</version>
  </dependency>
  <dependency>
    <groupId>javax.servlet</groupId>
    <artifactId>javax.servlet-api</artifactId>
    <version>3.1.0</version>
  </dependency>

  <!-- Spring -->
  <!-- 1. Spring core -->
  <dependency>
    <groupId>org.springframework</groupId>
    <artifactId>spring-core</artifactId>
    <version>4.2.5.RELEASE</version>
  </dependency>
  <dependency>
    <groupId>org.springframework</groupId>
    <artifactId>spring-beans</artifactId>
    <version>4.2.5.RELEASE</version>
  </dependency>
  <dependency>
    <groupId>org.springframework</groupId>
    <artifactId>spring-context</artifactId>
    <version>4.2.5.RELEASE</version>
  </dependency>
  <!-- 2. Spring DAO layer -->
  <dependency>
    <groupId>org.springframework</groupId>
    <artifactId>spring-jdbc</artifactId>
    <version>4.2.5.RELEASE</version>
  </dependency>
  <dependency>
    <groupId>org.springframework</groupId>
    <artifactId>spring-tx</artifactId>
    <version>4.2.5.RELEASE</version>
  </dependency>
  <!-- 3. Spring web -->
  <dependency>
    <groupId>org.springframework</groupId>
    <artifactId>spring-web</artifactId>
    <version>4.2.5.RELEASE</version>
  </dependency>
  <dependency>
    <groupId>org.springframework</groupId>
    <artifactId>spring-webmvc</artifactId>
    <version>4.2.5.RELEASE</version>
  </dependency>
  <!-- 4. Spring test support: same version as the other Spring modules, test classpath only -->
  <dependency>
    <groupId>org.springframework</groupId>
    <artifactId>spring-test</artifactId>
    <version>4.2.5.RELEASE</version>
    <scope>test</scope>
  </dependency>

  <!-- WebMagic crawler -->
  <dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
    <exclusions>
      <!-- Logback is the SLF4J binding of this project; keep WebMagic's log4j binding
           off the classpath so only one binding is present. -->
      <exclusion>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-log4j12</artifactId>
      </exclusion>
    </exclusions>
  </dependency>
</dependencies>
数据库表SQL:
-- One row per crawled blog post entry.
-- NOTE(review): AUTO_INCREMENT=4890 looks like it was carried over from a dump of an
-- already populated table; a fresh table will start numbering ids at 4890.
CREATE TABLE `Boke` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT 'id',
  `title` varchar(255) DEFAULT NULL COMMENT '标题',
  `linke` varchar(255) DEFAULT NULL COMMENT '正文地址',
  `author` varchar(255) DEFAULT NULL COMMENT '作者',
  `authorUrl` varchar(255) DEFAULT NULL COMMENT '作者主页',
  `summary` varchar(1000) DEFAULT NULL COMMENT '简介',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=4890 DEFAULT CHARSET=utf8;
数据库连接工具类:
import java.sql.DriverManager;
import java.sql.SQLException;
import com.mysql.jdbc.Connection;
/**
 * Static helper that opens JDBC connections to the MySQL database used by the crawler.
 *
 * <p>NOTE(review): host, user and password are hard-coded here; they should come from
 * external configuration before this is used outside a local test.
 */
public class MySqlJdbcUtils {

    private static final String DRIVER = "com.mysql.jdbc.Driver";
    private static final String URL =
            "jdbc:mysql://192.168.0.132:3306/xbDB?useUnicode=true&characterEncoding=utf-8";
    private static final String USER = "tradingbp";
    private static final String PASSWORD = "123456";

    /** Static utility class; not meant to be instantiated. */
    private MySqlJdbcUtils() {
    }

    /**
     * Loads the MySQL driver and opens a new connection.
     *
     * <p>The caller owns the returned connection and is responsible for closing it.
     *
     * @return an open connection, or {@code null} when the driver class cannot be loaded
     *         or the connection attempt fails (the failure is reported on stderr)
     */
    public static Connection getOpenConnection() {
        Connection conn = null;
        try {
            // Load the driver class so that it registers itself with DriverManager.
            Class.forName(DRIVER);
            conn = (Connection) DriverManager.getConnection(URL, USER, PASSWORD);
            System.out.println("获得数据库链接");
        } catch (ClassNotFoundException e) {
            System.err.println("JDBC driver not found on the classpath: " + DRIVER);
            e.printStackTrace();
        } catch (SQLException e) {
            System.err.println("Could not connect to " + URL + " as user " + USER);
            e.printStackTrace();
        }
        return conn;
    }

    /**
     * Connectivity smoke test: opens one connection and closes it again.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Connection conn = getOpenConnection();
        if (conn == null) {
            return;
        }
        try {
            conn.close();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }
}
import java.sql.DriverManager;
import java.sql.SQLException;
import com.mysql.jdbc.Connection;
/**
 * JDBC helper for the crawler's MySQL database.
 */
public class MySqlJdbcUtils {

    // Connection settings.
    private static String driver = "com.mysql.jdbc.Driver";
    private static String url = "jdbc:mysql://192.168.0.132:3306/xbDB?useUnicode=true&characterEncoding=utf-8";
    private static String name = "tradingbp";
    private static String pwd = "123456";

    /**
     * Opens a connection to the configured database.
     *
     * @return the opened connection, or {@code null} if the driver could not be loaded
     *         or the connection could not be established (the stack trace is printed)
     */
    public static Connection getOpenConnection() {
        try {
            // Load the driver before asking DriverManager for a connection.
            Class.forName(driver);
            Connection opened = (Connection) DriverManager.getConnection(url, name, pwd);
            System.out.println("获得数据库链接");
            return opened;
        } catch (ClassNotFoundException driverMissing) {
            driverMissing.printStackTrace();
        } catch (SQLException connectFailed) {
            connectFailed.printStackTrace();
        }
        return null;
    }

    /**
     * Manual check: tries to open one connection.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        getOpenConnection();
    }
}
实体类:
/**
 * Plain data holder for one blog post entry taken from a listing page.
 *
 * <p>All properties are strings and stay {@code null} until their setter is called.
 */
public class JavaBokeModel {

    /** Post title. */
    private String title;

    /** URL of the post body. */
    private String linke;

    /** Author display name. */
    private String author;

    /** URL of the author's home page. */
    private String authorUrl;

    /** Short summary of the post. */
    private String summary;

    /** @return the post title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the post title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the URL of the post body */
    public String getLinke() {
        return this.linke;
    }

    /** @param linke the URL of the post body */
    public void setLinke(String linke) {
        this.linke = linke;
    }

    /** @return the author display name */
    public String getAuthor() {
        return this.author;
    }

    /** @param author the author display name */
    public void setAuthor(String author) {
        this.author = author;
    }

    /** @return the URL of the author's home page */
    public String getAuthorUrl() {
        return this.authorUrl;
    }

    /** @param authorUrl the URL of the author's home page */
    public void setAuthorUrl(String authorUrl) {
        this.authorUrl = authorUrl;
    }

    /** @return the post summary */
    public String getSummary() {
        return this.summary;
    }

    /** @param summary the post summary */
    public void setSummary(String summary) {
        this.summary = summary;
    }
}
webmagic 框架爬取数据并保存
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import com.mysql.jdbc.Connection;
import com.nio.webmagic.jdbc.MySqlJdbcUtils;
import com.nio.webmagic.model.JavaBokeModel;
/**
 * WebMagic page processor for the cnblogs Java category listing.
 *
 * <p>For every listing page it extracts each post entry (title, link, author, author home
 * page, summary) and inserts it into the {@code Boke} table. A single JDBC connection is
 * shared by all crawler threads; writes go through the synchronized {@link #insertDb(List)}.
 */
public class JavaBoKePageProcessor implements PageProcessor {

    /** Shared JDBC connection, opened on first use by {@link #getConnection()}. */
    private static Connection conn = null;

    /** CSS query for the title anchor; its text is the title, its href the post link. */
    private static final String TITLEQUERY = "div.post_item_body h3 a.titlelnk";

    /**
     * CSS query for the author anchor; its text is the author, its href the author page.
     * Kept free of trailing whitespace so it cannot be read as a dangling descendant
     * combinator (NOTE(review): whether that happens depends on the jsoup version).
     */
    private static final String AUTHORQUERY = "div.post_item_foot a.lightblue";

    /** CSS query for the summary paragraph. */
    private static final String SUMMARYQUERY = "div.post_item_body p.post_item_summary";

    /** Parameterized insert statement for one post entry. */
    private static final String insertSql =
            "INSERT INTO Boke (title,linke,author,authorUrl,summary)VALUES(?,?,?,?,?)";

    /** Listing pages 2..200, built once and handed to every processed page. */
    private static final List<String> LISTING_URLS = urls();

    /** Crawl settings: 1000 ms sleep between requests, up to 10 retries. */
    private final Site site = Site.me().setSleepTime(1000).setRetryTimes(10);

    /**
     * Returns the shared connection, opening it on first use.
     *
     * @return the shared connection, or {@code null} if it could not be opened
     */
    private static synchronized Connection getConnection() {
        if (conn == null) {
            conn = MySqlJdbcUtils.getOpenConnection();
        }
        return conn;
    }

    /** Closes the shared connection if it is open; a failure to close is only reported. */
    private static synchronized void closeConnection() {
        if (conn == null) {
            return;
        }
        try {
            conn.close();
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            conn = null;
        }
    }

    /**
     * Inserts the given entries one row at a time.
     *
     * <p>The statement is created and closed inside this call, so nothing is left open
     * between pages. If a row fails, the error is printed and the remaining rows of this
     * batch are not inserted.
     *
     * @param javaBokes entries to store; {@code null} or empty is a no-op
     */
    private synchronized void insertDb(List<JavaBokeModel> javaBokes) {
        if (javaBokes == null || javaBokes.isEmpty()) {
            return;
        }
        Connection connection = getConnection();
        if (connection == null) {
            System.err.println("No database connection; skipped " + javaBokes.size() + " record(s)");
            return;
        }
        PreparedStatement statement = null;
        try {
            statement = connection.prepareStatement(insertSql);
            for (JavaBokeModel javaBoke : javaBokes) {
                statement.setString(1, javaBoke.getTitle());
                statement.setString(2, javaBoke.getLinke());
                statement.setString(3, javaBoke.getAuthor());
                statement.setString(4, javaBoke.getAuthorUrl());
                statement.setString(5, javaBoke.getSummary());
                statement.executeUpdate();
            }
        } catch (SQLException e) {
            e.printStackTrace();
        } finally {
            if (statement != null) {
                try {
                    statement.close();
                } catch (SQLException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Builds the addresses of listing pages 2 through 200.
     *
     * @return the page URLs in ascending page order
     */
    private static List<String> urls() {
        List<String> listUrl = new ArrayList<String>();
        for (int i = 2; i <= 200; i++) {
            listUrl.add("http://www.cnblogs.com/cate/java/" + i);
        }
        return listUrl;
    }

    /**
     * Returns the combined text of all elements matching a CSS query.
     *
     * @param doc   parsed HTML fragment
     * @param query jsoup CSS query
     * @return the matched text, empty when nothing matches
     */
    private static String selectText(Document doc, String query) {
        return doc.select(query).text();
    }

    /**
     * Returns the {@code href} attribute found via a CSS query.
     *
     * @param doc   parsed HTML fragment
     * @param query jsoup CSS query
     * @return the href value, empty when nothing matches
     */
    private static String selectHref(Document doc, String query) {
        return doc.select(query).attr("href");
    }

    /**
     * Converts the HTML of one post entry into a model object.
     *
     * @param itemHtml inner HTML of one {@code div.post_item}
     * @return the populated model
     */
    private static JavaBokeModel toModel(String itemHtml) {
        // Parse once and run every query against the same document.
        Document item = Jsoup.parse(itemHtml);
        JavaBokeModel javaBoke = new JavaBokeModel();
        javaBoke.setTitle(selectText(item, TITLEQUERY));
        javaBoke.setLinke(selectHref(item, TITLEQUERY));
        javaBoke.setAuthor(selectText(item, AUTHORQUERY));
        javaBoke.setAuthorUrl(selectHref(item, AUTHORQUERY));
        javaBoke.setSummary(selectText(item, SUMMARYQUERY));
        return javaBoke;
    }

    /**
     * Extraction logic for one downloaded page: queues the listing pages, extracts all
     * post entries of the page and stores them.
     *
     * @see us.codecraft.webmagic.processor.PageProcessor#process(us.codecraft.webmagic.Page)
     */
    @Override
    public void process(Page page) {
        // The same list is offered on every page.
        // NOTE(review): this relies on the scheduler dropping already-seen URLs - confirm.
        page.addTargetRequests(LISTING_URLS);

        // One HTML fragment per post entry; the attribute value is quoted with plain ASCII quotes.
        List<String> htmls = page.getHtml().xpath("//div[@class='post_item']/html()").all();
        List<JavaBokeModel> javaBokes = new ArrayList<JavaBokeModel>(htmls.size());
        for (String html : htmls) {
            javaBokes.add(toModel(html));
        }
        insertDb(javaBokes);
    }

    /**
     * @return the crawl settings (sleep time and retry count) of this processor
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Runs the crawler with 5 threads starting at the category's first page, then closes
     * the shared connection and prints the elapsed time in seconds.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        System.out.println("========小爬虫【启动】喽!=========");
        if (getConnection() == null) {
            System.err.println("Could not open the database connection; crawler not started.");
            return;
        }
        long startTime = System.currentTimeMillis();
        try {
            Spider.create(new JavaBoKePageProcessor())
                    .addUrl("http://www.cnblogs.com/cate/java/")
                    .thread(5)
                    .run();
        } finally {
            // Always release the connection, even if the crawl ends with an exception.
            closeConnection();
        }
        long endTime = System.currentTimeMillis();
        System.out.println("========小爬虫【结束】喽!=========");
        System.out.println("用时为:"+(endTime-startTime)/1000+"s");
    }
}
数据:
标签:快速 标题 attr att 间隔 sts blank jsoup tor
原文地址:https://www.cnblogs.com/aibabel/p/11017558.html