标签:javascript nodejs 爬虫 url filter
var http = require('http');
var fs = require('fs');
var iconv = require('iconv-lite');   
/**
 * Crawler entry object: stores the start URL and wires together a filter and
 * a page helper unless the caller supplies ready-made ones.
 *
 * @param {Object} argument - Options.
 * @param {string} argument.url - Address of the first page.
 * @param {string} [argument.outpath='g://temp/'] - Output directory handed to the default page object.
 * @param {Object} [argument.filter] - Ready-made filter; when absent one is built from regex/url/custom.
 * @param {RegExp} [argument.regex] - Forwarded to the default filter.
 * @param {Function} [argument.custom] - Forwarded to the default filter.
 * @param {Object|boolean} [argument.pagin=false] - Paging settings ({urlRule, start, end}); a falsy value means a single page.
 * @param {boolean} [argument.download] - Stored exactly as given (no default is applied here) and forwarded to the default page object.
 * @param {Object} [argument.page] - Ready-made page object; when absent one is built.
 */
function webSpider(argument) {
	var options = argument;

	this.url = options.url;
	this.outpath = options.outpath || 'g://temp/';

	if (options.filter) {
		this.filter = options.filter;
	} else {
		this.filter = new filter({
			regex : options.regex,
			url : options.url,
			custom : options.custom
		});
	}

	this.pagin = options.pagin || false;
	this.download = options.download;

	if (options.page) {
		this.page = options.page;
	} else {
		this.page = new page({
			filter : this.filter,
			outpath : this.outpath,
			download : this.download
		});
	}
}

/**
 * Starts the crawl: fetches only the start URL when paging is off,
 * otherwise delegates to paginHandle().
 */
webSpider.prototype.start = function() {
	if (this.pagin == false) {
		this.page.getHtml(this.url);
		return;
	}
	this.paginHandle();
};

/**
 * Multi-page crawl. The start URL is requested as page 1, then every page
 * number from pagin.start to pagin.end (inclusive) is substituted for each
 * "{page}" placeholder in pagin.urlRule and requested in turn.
 */
webSpider.prototype.paginHandle = function() {
	var settings = this.pagin;
	var template = settings.urlRule;
	var pageNo = settings.start;
	var lastPage = settings.end;
	var fetcher = this.page;

	// The first page has its own address, so it is requested separately.
	fetcher.getHtml(this.url ,1);

	// Remaining pages follow the URL template.
	for (; pageNo <= lastPage; pageNo++) {
		fetcher.getHtml(template.replace(/({page})/g, pageNo) ,pageNo);
	}
};
/**
 * Filter object: extracts the wanted links from a page's HTML, either with a
 * regular expression or with a caller-supplied function.
 *
 * @param {Object} argument - Options.
 * @param {RegExp} [argument.regex] - Pattern whose first capture group is the link.
 *        Defaults to the src attribute of every <img> tag.
 * @param {Function} [argument.custom] - custom(html, getTrueFileUrl); when it is a
 *        function it replaces the regex extraction entirely.
 * @param {string} argument.url - Page address; its scheme and host become domainName.
 */
function filter(argument){
	// [^>]*? keeps the match inside one tag. A greedy ".*" here would run to
	// the last src on the line and drop every earlier <img> on that line.
	this.regex = argument.regex || /<img\s[^>]*?src="(.*?)"/g;
	this.custom = argument.custom;
	this.domainName = this.tools.getDomain(argument.url);
	this.url = argument.url;
}
filter.prototype={
	// Helpers that do not use `this`, so they can be handed to callbacks unbound.
	tools : {
		/**
		 * Returns scheme + host of an absolute URL,
		 * e.g. 'http://a.com/x/y.html' -> 'http://a.com'.
		 * @param {string} url - Absolute URL.
		 * @returns {string} Scheme and host without a trailing slash.
		 */
		getDomain : function _getDomain (url) {
			var parts = url.split('/');
			return parts[0]+'//'+parts[2];
		},
		/**
		 * Turns a link found in a page into an absolute URL.
		 * @param {string} fileUrl - Link as written in the page.
		 * @param {string} domain - Scheme + host to prefix relative links with.
		 * @returns {string} Absolute URL.
		 */
		getTrueFileUrl : function _getTrueFileUrl (fileUrl ,domain) {
			// Only a leading scheme makes a link absolute; "http" appearing
			// somewhere inside a relative link (e.g. a query value) does not.
			if(/^https?:\/\//i.test(fileUrl)) return fileUrl;
			// Protocol-relative link: reuse the scheme of the page's domain.
			if(fileUrl.indexOf('//')===0){
				var scheme = /^[a-z][a-z0-9+.-]*:/i.exec(domain);
				return (scheme ? scheme[0] : 'http:')+fileUrl;
			}
			// Join with exactly one slash whatever the two sides carry.
			return domain.replace(/\/+$/,'')+'/'+fileUrl.replace(/^\/+/,'');
		}
	},
	/**
	 * Runs the configured extraction on one page.
	 * @param {string} html - Page source.
	 * @returns {Array|undefined} Result of custom() when one is configured,
	 *          otherwise the byRegex() list; undefined when html is empty.
	 */
	execute : function _execute (html) {
		if(!html){console.log('html is null');return;}
		// Use the custom extractor when one was supplied, otherwise the regex.
		var arr = [];
		if(typeof(this.custom)=='function') {arr = this.custom(html ,this.tools.getTrueFileUrl);}
		else {console.log('file -> regex'); arr = this.byRegex(html);}
		return arr;
	},
	/**
	 * Collects every regex match in html.
	 * @param {string} html - Page source.
	 * @returns {Array<{src: string, id: number}>} Absolute URLs numbered from 1
	 *          in order of appearance.
	 */
	byRegex : function _byRegex (html) {
		var results = [] ,match ,captured ,
			source = this.regex ,
			domain = this.domainName ,
			getFileUrl = this.tools.getTrueFileUrl ,
			id = 1 ,regex;
		if(source.global){
			regex = source;
			// Start from the top even if a previous run was interrupted.
			regex.lastIndex = 0;
		}else{
			// exec() on a non-global regex never advances, which would loop
			// forever below; work on a global copy instead.
			regex = new RegExp(source.source ,'g'+(source.ignoreCase?'i':'')+(source.multiline?'m':''));
		}
		while ((match = regex.exec(html)) !== null) {
			// An empty match does not move lastIndex; step over it manually.
			if(match[0]===''){regex.lastIndex++;continue;}
			// Fall back to the whole match when the pattern has no capture group.
			captured = match[1] == null ? match[0] : match[1];
			console.log('>>:'+captured);
			results.push({src:getFileUrl(captured ,domain) ,id:id});
			id++;
		}
		return results;
	}
};
//Page handler: fetches a page's HTML -> asks the filter object for the list of
//wanted items -> optionally downloads every item of that list.
/**
 * @param {Object} argument - Options.
 * @param {Object} argument.filter - Object with execute(html) returning the items to download.
 * @param {string} argument.outpath - Output directory; must end with '/'.
 * @param {boolean} [argument.download] - Items are downloaded only when this is true.
 * @param {string} [argument.charset='gbk'] - Encoding used to decode fetched pages.
 */
function page(argument) {
	this.filter = argument.filter;
	this.outpath = argument.outpath;
	this.download = argument.download;
	this.charset = argument.charset || 'gbk';
}
page.prototype={
	/**
	 * Fetches one page, decodes it and passes the HTML to the filter.
	 * @param {string} url - Page address.
	 * @param {number} [pagei] - Page number, used as a file-name prefix for downloads.
	 */
	getHtml : function _getHtml (url ,pagei) {
		var self = this ,chunks = [] ,download = this.download ,charset = this.charset || 'gbk';
		http.get(url, function (res) {
			// Collect raw bytes and decode once at the end, so multi-byte
			// characters split across chunks are not corrupted.
			res.on('data', function (chunk) {
				chunks.push(chunk);
			}).on('end', function () {
				var html = iconv.decode(Buffer.concat(chunks) ,charset);
				var arr = self.filter.execute(html);
				if(download==true) self.downloadFiles(arr ,pagei);
			});
		}).on('error',function (err) {
			console.log('getHtml is error', url, err && err.message);
		});
	},
	/**
	 * Downloads every item of a result list.
	 * @param {Array<{src: string, id: *}>} arr - Items; each must carry the link in src.
	 * @param {number} [pagei] - Page number used as file-name prefix ('' when omitted).
	 */
	downloadFiles : function _downloadFiles (arr, pagei) {
		var len, _pagei = pagei || '';
		if(arr && (len=arr.length) > 0){
			for(var i=0,_tele;i<len;i++){
				_tele = arr[i];
				this.downloadFile(_tele.src, this.outpath ,_pagei+'_'+_tele.id);
			}
		}else{
			console.log('results is null');
		}
	},
	/**
	 * Downloads one file into outpath, creating the directory first if needed.
	 * Calls are not queued, so several downloads may run at the same time.
	 * @param {string} src - File address.
	 * @param {string} outpath - Target directory; its last character must be '/'.
	 * @param {string} _i - Prefix put in front of the file name.
	 */
	downloadFile : function _downloadFile (src ,outpath ,_i) {
		// Drop ?query and #fragment: they are not part of the file name and
		// may contain characters that are illegal in file names.
		var cleanSrc = src.split(/[?#]/)[0];
		var filename = _i + '_'+ cleanSrc.substring(cleanSrc.lastIndexOf('/') + 1);
		// mkdir directly instead of testing for existence first: an existing
		// directory is reported as EEXIST, which is fine. The mode is octal 777.
		fs.mkdir(outpath, parseInt('777', 8) ,function (err) {
			if(err && err.code !== 'EEXIST'){
				console.log('####download Error:'+src, err.message);
				return;
			}
			http.get(src, function (res) {
				// Open the file only once a response arrived, so a failed
				// request does not leave an empty file behind.
				var writestream = fs.createWriteStream(outpath + filename);
				writestream.on('finish', function(){
					console.log('download : ' + src);
				}).on('error' ,function() {
					console.log('####download Error:'+src);
				});
				res.pipe(writestream);
			}).on('error' ,function (e) {
				// Without this handler a network failure is an uncaught exception.
				console.log('>>>>#######download error:'+e);
			});
		});
	}
};
module.exports=webSpider;
var fs = require('fs');
var cheerio = require('cheerio');
var webSpider = require('./webSpider');
var downloadZips =[];
/**
 * Crawls the duote.com Android game listing (start page plus pages 2-714),
 * hands every '.list_item .link' detail link to getPageLinks(), and one
 * second after each listing page was parsed appends the download entries
 * collected so far to the output text file.
 */
function getApk () {
	var ws = new webSpider({
		url:'http://www.duote.com/android/game.html',	// first page has its own address
		pagin : {
			urlRule : 'http://www.duote.com/android/game_0_down_{page}.html',
			start : 2,
			end : 714
		},
		custom : function (html ,getpathfun) {	// custom extraction instead of the regex filter
			var $ = cheerio.load(html);
			$('.list_item .link').each(function(i){
				var href = $(this).attr('href');
				// An anchor without href cannot be followed; skip it.
				if(!href) return;
				getPageLinks(getpathfun(href,'http://www.duote.com/') ,i);
			});
			setTimeout(function () {
				// Take the collected entries out of the shared array before
				// writing. The file is appended to, so writing the whole
				// array on every page would repeat all earlier entries.
				var batch = downloadZips.splice(0 ,downloadZips.length);
				if(batch.length > 0) writeFile('E:/webFile/多特apk.txt' ,batch.join(' '));
			},1000);
		},
		download : false 
	});
	ws.start();
}
/**
 * Fetches one detail page and records its download link(s): for every
 * "var sUrl = '...';" assignment found in the page source, a line made of the
 * page's '.tit_area h1' text and the resolved link is pushed onto downloadZips.
 *
 * @param {string} url - Detail page address.
 * @param {number} i - Index of the link on its listing page (currently unused).
 */
function getPageLinks(url ,i){
	var ws = new webSpider({
		url:url,	// single page, no paging
		custom : function (html ,getpathfun) {	// custom extraction instead of the regex filter
			var $ = cheerio.load(html);
			// The title is the same for every match, so look it up once.
			var title = $('.tit_area h1').text();
			// Lazy capture: stop at the first closing "';" so another quoted
			// string later on the same line is not swallowed into the URL.
			var _regex = /var sUrl = '(.*?)';/g ,match;
			while ((match = _regex.exec(html)) !== null) {
				downloadZips.push('\n'+title+"\t\tsrc:"+getpathfun(match[1],'http://app.2345.cn'));
			}
		}
	});
	ws.start();
}
/**
 * Appends text to a file and logs the outcome.
 *
 * @param {string} outpath - Target file path.
 * @param {string} str - Text to append.
 */
function writeFile(outpath ,str){
    // Append on purpose: fs.writeFile would replace the existing file
    // with the new content instead of adding to it.
    var report = function (err) {
        console.log(err ? "fail " + err : "写入文件ok");
    };
    fs.appendFile(outpath, str, report);
}
getApk();
版权声明:本文为博主原创文章,未经博主允许不得转载。
标签:javascript nodejs 爬虫 url filter
原文地址:http://blog.csdn.net/u013934914/article/details/47280281