htmlparser 基础网页拔取

时间：2015-01-14 11:08:13 阅读：213 评论：0 收藏：0 [点我收藏+]

要爬取的网页是：http://mm.10086.cn/android/info/300008730468.html?from=www&fw=227062

打开该网页的调试模式（按 F12 打开浏览器开发者工具）

技术分享

找出你想要爬取的数据

代码

package com.baidu;

import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.HasChildFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

import com.baidu.apply.bean.Apply;
import com.baidu.util.WebUtils;
import com.baidu.util.httpDownload;

/**
 * Scrapes one application detail page with HTMLParser.
 *
 * <p>The program reads the application name, the info rows (download count,
 * price, version, ...), downloads the screenshot images and the QR-code
 * images, and finally prints an {@code insert} statement for table
 * {@code t_app}. Nothing is written to a database here; the statement is only
 * printed.
 */
public class HtmlParse {

    /** Detail page that is scraped. */
    private static final String PAGE_URL =
            "http://mm.10086.cn/android/info/300008730468.html?from=www&fw=227062";

    /**
     * Runs the scrape against {@link #PAGE_URL}.
     *
     * @param args unused
     * @throws ParserException if the page cannot be parsed or an expected
     *         element is missing from the page
     */
    public static void main(String[] args) throws ParserException {

        Parser parse = new Parser(PAGE_URL);
        parse.setEncoding("UTF-8");

        Apply apply = new Apply();

        // ---- Application name ----
        Tag titleTag = findFirstDiv(parse, "mj_big_title font-f-yh");
        // Keep the name in a local: it is needed below for the download file
        // names and for the SQL statement. The original only printed it, so
        // every later apply.getAppname() saw an unset value.
        // NOTE(review): Apply presumably also has a setAppname(..) setter; call
        // it here if the bean is used outside this method - confirm.
        String appName = titleTag.toPlainTextString();
        System.out.println(appName);

        // Rewind the parser so the next parse(..) scans the page from the start.
        parse.reset();

        // ---- Application info rows ----
        Tag infoTag = findFirstDiv(parse, "mj_info font-f-yh");
        Node infoContainer = infoTag.getFirstChild();
        NodeList infoRows = infoContainer == null ? null : infoContainer.getChildren();
        if (infoRows == null) {
            throw new ParserException("Info block has no child rows to read");
        }

        // Rows are addressed by their position among the child *tags*; text
        // nodes (e.g. whitespace between tags) are skipped instead of being
        // cast to Tag, which would throw ClassCastException.
        int rowIndex = 0;
        for (int i = 0; i < infoRows.size(); i++) {
            Node row = infoRows.elementAt(i);
            if (!(row instanceof Tag)) {
                continue;
            }
            String text = row.toPlainTextString();
            System.out.println(text);

            if (rowIndex == 0) {
                // The first row is stored whole, without splitting.
                apply.setDownloadsize(text);
            } else {
                // Remaining rows look like "label：value" (full-width colon);
                // only the part right after the first colon is stored.
                String[] parts = text.split("：");
                if (parts.length > 1) {
                    storeInfoField(apply, rowIndex, parts[1]);
                }
            }
            rowIndex++;
        }

        parse.reset();

        // ---- Screenshot images ----
        Tag carousel = findFirstDiv(parse, "mj_lunbo");
        NodeList carouselChildren = carousel.getChildren();
        if (carouselChildren != null) {
            // Direct child <div>s of the carousel that themselves contain an <img>.
            AndFilter divWithImage = new AndFilter(
                    new TagNameFilter("div"),
                    new HasChildFilter(new TagNameFilter("img")));
            NodeList slides = carouselChildren.extractAllNodesThatMatch(divWithImage);

            for (int i = 0; i < slides.size(); i++) {
                Tag slide = (Tag) slides.elementAt(i);
                System.out.println(slide.getAttribute("id"));
                downloadImageOf(slide, appName);
            }
        }

        parse.reset();

        // APK download is currently disabled. The disabled code located
        // <div class="mj_cont_left_t">, took its child at index 2, read that
        // tag's "href" attribute and downloaded it as
        // <appname><randomId>.apk, then reset the parser.

        // ---- QR-code images ----
        Tag qrBlock = findFirstDiv(parse, "mj_ewlist");
        NodeList qrBlockChildren = qrBlock.getChildren();
        // The QR items live under the block's child at index 1.
        if (qrBlockChildren == null
                || qrBlockChildren.size() < 2
                || !(qrBlockChildren.elementAt(1) instanceof Tag)) {
            throw new ParserException("QR-code block does not have the expected child at index 1");
        }
        NodeList qrItems = ((Tag) qrBlockChildren.elementAt(1)).getChildren();
        if (qrItems != null) {
            // Use a dedicated list for the loop: the original reassigned the
            // list it was iterating inside the loop body, which changed the
            // loop bound and elements after the first item.
            for (int i = 0; i < qrItems.size(); i++) {
                Node item = qrItems.elementAt(i);
                if (item instanceof Tag) {
                    downloadImageOf((Tag) item, appName);
                }
            }
        }

        // ---- Build the insert statement (printed only, not executed) ----
        String appid = WebUtils.getRandomId();
        String sql = "insert into t_app (id,appname,version,description,filesize,updatetime,developer,apptype,price,downloadsize,platform,status) "
                + "values(" + sqlQuote(appid) + "," + sqlQuote(appName) + "," + sqlQuote(apply.getVersion()) + "," + sqlQuote(apply.getDescription()) + ","
                + sqlQuote(apply.getFilesize()) + "," + sqlQuote(apply.getUpdatetime()) + "," + sqlQuote(apply.getDeveloper()) + "," + sqlQuote(apply.getApptype()) + ","
                + sqlQuote(apply.getPrice()) + "," + sqlQuote(apply.getDownloadsize()) + "," + sqlQuote(apply.getPlatform()) + ",'0')";
        // NOTE(review): if this statement is ever executed, switch to a
        // java.sql.PreparedStatement with '?' placeholders instead of building
        // the text by hand.
        System.out.println(sql);
    }

    /**
     * Parses the page and returns the first {@code <div>} whose {@code class}
     * attribute equals {@code cssClass}.
     *
     * <p>The caller is responsible for calling {@code parser.reset()} before
     * the next search.
     *
     * @param parser   parser positioned at the start of the page
     * @param cssClass exact value of the {@code class} attribute to match
     * @return the first matching tag, never {@code null}
     * @throws ParserException if parsing fails or no such element exists
     */
    private static Tag findFirstDiv(Parser parser, String cssClass) throws ParserException {
        AndFilter filter = new AndFilter(
                new TagNameFilter("div"),
                new HasAttributeFilter("class", cssClass));
        NodeList matches = parser.parse(filter);
        if (matches == null || matches.size() == 0) {
            throw new ParserException("No <div class=\"" + cssClass + "\"> found on the page");
        }
        return (Tag) matches.elementAt(0);
    }

    /**
     * Stores one info-row value on the bean according to the row position.
     * Positions outside 1..7 are ignored.
     */
    private static void storeInfoField(Apply apply, int rowIndex, String value) {
        switch (rowIndex) {
            case 1:
                apply.setPrice(value);
                break;
            case 2:
                apply.setVersion(value);
                break;
            case 3:
                apply.setFilesize(value);
                break;
            case 4:
                apply.setDeveloper(value);
                break;
            case 5:
                apply.setApptype(value);
                break;
            case 6:
                apply.setUpdatetime(value);
                break;
            case 7:
                apply.setPlatform(value);
                break;
            default:
                break;
        }
    }

    /**
     * Downloads the image referenced by the {@code src} attribute of the first
     * child tag of {@code holder}, saving it as
     * {@code <appName><randomId>.jpg}. Does nothing when there is no child tag
     * or the child has no {@code src} attribute.
     */
    private static void downloadImageOf(Tag holder, String appName) {
        Tag image = firstChildTag(holder);
        if (image == null) {
            return;
        }
        String src = image.getAttribute("src");
        if (src == null) {
            return;
        }
        httpDownload.httpDownload(src, appName + WebUtils.getRandomId() + ".jpg");
    }

    /** Returns the first child of {@code parent} that is a tag, or {@code null} if there is none. */
    private static Tag firstChildTag(Tag parent) {
        NodeList children = parent.getChildren();
        if (children == null) {
            return null;
        }
        for (int i = 0; i < children.size(); i++) {
            Node child = children.elementAt(i);
            if (child instanceof Tag) {
                return (Tag) child;
            }
        }
        return null;
    }

    /**
     * Renders a value as a SQL string literal: wrapped in ASCII single quotes
     * with embedded single quotes doubled. {@code null} becomes {@code NULL}.
     */
    private static String sqlQuote(String value) {
        if (value == null) {
            return "NULL";
        }
        return "'" + value.replace("'", "''") + "'";
    }
}

htmlparser 基础网页拔取

标签：htmlparser

原文地址：http://blog.csdn.net/zh_cinderella/article/details/42706243

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行

htmlparser 基础 网页拔取

htmlparser 基础网页拔取