标签:
2016年一月,刚做完三个课程设计,C++网络版打地鼠,北山超市收银系统J2EE,JAVA聊天程序,累不堪言,置身奋斗之年承受这些是应该的,毕竟自己的技术还太菜了,没有一个开发者应有的底气。
-------------------------------------- 前记
在此之际,一同事介绍了一个项目,做一个教务信息记录抓取到自己的网页显示之。
--------------------------------------缘由
做这个东西首先也百度了下,网上的文章大多没什么营养,不过也基于网上的文章,整合出了实际可行的一条实现路径。
分析:要想抓取网页,抓取程序的运行规则必须遵循http协议的法则,换句话说,就是你的程序做的事情和浏览器差不多。
实现:首先抓取网页通过java的一些内置类,或者Apache的一些类来实现,例如httpclient。其次如果网页有验证措施,还需要使用相应的容器来存储验证对象(基于JAVA的开发),例如session,cookie。
到这里大概流程:模拟网络蜘蛛访问网页--->是否有验证(有/没有)----->(没有)------->直接抓取网页,在抓取网页时根据自己需要的数据做合适的抓取,没有特殊需求的话建议不要全盘抓取,这会影响抓取速度------>现在剩下的就是用土方法或者正则表达式处理抓到的数据。(有验证)-------->保存cookie,或者session,每次提交查询时构造头信息即可,下列代码直接复制运行可能会报错,自己修改之,现在直接上代码:
servlet执行类:
package src.servlet.cls;
import java.io.IOException;
import java.net.URLEncoder;
import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import org.apache.commons.httpclient.Cookie;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.tomcat.util.net.URL;
import src.docatch.cls.CatchDataByCookie;
import src.filter.cls.ChickLoad;
import src.filter.cls.ViewState;
/**
* Servlet implementation class redirecter
*/
@WebServlet("/loadingForCookie")
public class LoadingGetCookie extends HttpServlet {
private static final long serialVersionUID = 1L;
/**
* @see HttpServlet#HttpServlet()
*/
public LoadingGetCookie() {
super();
// TODO Auto-generated constructor stub
}
/**
* @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response)
*/
protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
// TODO Auto-generated method stub
response.getWriter().append("Served at: ").append(request.getContextPath());
doPost(request, response);
}
/**
* @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse response)
*/
protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
org.apache.catalina.util.URLEncoder ue = new org.apache.catalina.util.URLEncoder();
String ViewState = new ViewState().getViewState();
String Account = request.getParameter("tbYHM").trim();
String Password = request.getParameter("tbPSW").trim();
String url = "http://jw.fjcc.edu.cn/Default3.aspx?__VIEWSTATE="+ue.encode(ViewState).toString()+"&tbYHM="+Account+"&tbPSW="+Password+"&ddlSF=%D1%A7%C9%FA&imgDL.x=17&imgDL.y=8";
if(Account == null || Password == null){
//url = "";
System.out.println("参数丢失");
return;
}
///***************************登录成功
//javax.servlet.http.Cookie mycookie = new javax.servlet.http.Cookie();
Cookie[] cookie = null;
//----------------------------------
GetMethod getMethod = new GetMethod(url);//Get方法
HttpClient httpclient = new HttpClient();
httpclient.getHostConfiguration().setHost(url, 80);
if(httpclient.executeMethod(getMethod) == 200){
cookie = httpclient.getState().getCookies();//获取曲奇饼干
for (int i = 0; i < cookie.length; i++) {
System.out.println("cookiename=="+cookie[i].getName());
System.out.println("cookieValue=="+cookie[i].getValue());
//System.out.println("Domain=="+cookie[i].getDomain());
//System.out.println("Path=="+cookie[i].getPath());
//System.out.println("Version=="+cookie[i].getVersion());
response.addCookie(new javax.servlet.http.Cookie(cookie[i].getName(),cookie[i].getValue()));//重新构造参数
}
}
String path;
////由于request的不可刷新性,本类不负责检测账户是否是有效登录
//转入下一级处理
response.sendRedirect((path = String.valueOf(request.getPathInfo())).substring(0, path.length()-4)+"tab.jsp?user="+Account);//转向
/*
response.setCharacterEncoding("GBK");
try{
//后续访问使用
response.getWriter().println(new CatchDataByCookie("http://jw.fjcc.edu.cn/xscj.aspx?xh="+Account, request).getDataByCookie());
}catch(Exception e){
e.printStackTrace();
response.getWriter().println("请求超时!");
}*/
}
}
过滤规则接口:
package src.implement.cls;

/**
 * Contract for classes that fetch content from a web page.
 */
public interface catchFromWeb {

    /**
     * Performs the fetch. Implementations decide where the fetched content is stored.
     */
    void doCatch();
}
核心抓取器(cookie验证):
package src.docatch.cls;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;

import javax.servlet.http.HttpServletRequest;

import src.implement.cls.catchFromWeb;

/**
 * Fetches a remote page while replaying the cookies of an incoming servlet request.
 *
 * <p>The connection is opened in the constructor and consumed by {@link #doCatch()}.
 * An instance is meant for a single fetch: request headers cannot be set again on a
 * {@link URLConnection} that has already connected.
 */
public class CatchDataByCookie implements catchFromWeb {
    // Example target: http://jw.fjcc.edu.cn/xskcxxcx.aspx?xh=131702237&type=xs
    private URL domainUrl;
    private URLConnection conn;

    /** Optional stop marker: reading ends at the first line containing this text. */
    private String indexBreak;

    InputStreamReader isr;
    BufferedReader br;

    /** Page content collected by the last {@link #doCatch()} call (line breaks are dropped). */
    private StringBuilder webContext;

    /** Request whose cookies are forwarded to the remote site. */
    HttpServletRequest req;

    /**
     * Opens a connection to {@code url}. Failures are printed and leave the instance
     * without a connection, in which case later fetches yield empty content.
     *
     * @param url     address of the page to fetch
     * @param request servlet request that supplies the cookies to forward
     */
    public CatchDataByCookie(String url, HttpServletRequest request) {
        try {
            domainUrl = new URL(url);
            conn = domainUrl.openConnection();
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        this.req = request;
    }

    /**
     * Fetches the whole page.
     *
     * @return the collected page content
     */
    public StringBuilder getDataByCookie() {
        doCatch();
        return webContext;
    }

    /**
     * Fetches the page up to, but not including, the first line that contains {@code flag}.
     *
     * @param flag stop marker; {@code null} reads the whole page
     * @return the collected page content
     */
    public StringBuilder getDataByCookie(String flag) {
        indexBreak = flag;
        doCatch();
        return webContext;
    }

    /**
     * Sends the request with the forwarded cookies and reads the response as GBK text
     * into the internal buffer. I/O errors are printed and whatever was read so far is kept.
     */
    @Override
    public void doCatch() {
        webContext = new StringBuilder();
        if (conn == null) {
            // The constructor could not open a connection; nothing to read.
            return;
        }

        String cookieHeader = buildCookieHeader();
        if (cookieHeader.length() > 0) {
            // e.g. ASP.NET_SessionId=nlselh45eorhjmv2kydb4k55
            conn.setRequestProperty("Cookie", cookieHeader);
        }
        conn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.152 Safari/537.36");
        conn.setRequestProperty("Content-Type", "text/plain; charset=utf-8");
        conn.setRequestProperty("Connection", "Keep-Alive");

        try {
            isr = new InputStreamReader(conn.getInputStream(), "GBK");
            br = new BufferedReader(isr);
            String line;
            while ((line = br.readLine()) != null) {
                // Stop as soon as the stop marker shows up in the current line.
                if (indexBreak != null && line.indexOf(indexBreak) != -1) {
                    break;
                }
                webContext.append(line);
            }
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeReaders();
        }
    }

    /**
     * Joins all cookies of the incoming request into one {@code Cookie} header value
     * ({@code name=value; name2=value2}). Setting the header once per cookie would
     * overwrite the previous value and forward only the last cookie.
     */
    private String buildCookieHeader() {
        StringBuilder header = new StringBuilder();
        // getCookies() returns null when the request carries no cookies.
        javax.servlet.http.Cookie[] cookies = (req == null) ? null : req.getCookies();
        if (cookies == null) {
            return "";
        }
        for (javax.servlet.http.Cookie cookie : cookies) {
            if (cookie == null) {
                continue;
            }
            if (header.length() > 0) {
                header.append("; ");
            }
            header.append(cookie.getName()).append('=').append(cookie.getValue());
        }
        return header.toString();
    }

    /** Closes the readers if they were opened; either may be null when opening the stream failed. */
    private void closeReaders() {
        try {
            if (br != null) {
                br.close();
            }
            if (isr != null) {
                isr.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
实现了接口的过滤规则:
package src.filter.cls; import src.docatch.cls.CatchAchievements; public class ViewState { public String getViewState(){ CatchAchievements ca = new CatchAchievements("http://jw.fjcc.edu.cn/"); String result = String.valueOf(ca.getResult("justify"));//输出结果 //********************* result = result.substring(result.indexOf("__VIEWSTATE"), result.length()).trim(); result = result.substring(result.indexOf("value=\"")+7, result.indexOf("/>")-2); return result; } }
标签:
原文地址:http://www.cnblogs.com/homes/p/5169996.html