码迷,mamicode.com
首页 > Web开发 > 详细

ThinkPHP Http工具类(用于远程采集 远程下载) phpSimpleHtmlDom采集类库_Jquery筛选方式 使用phpQuery轻松采集网页内容

时间:2016-06-27 23:05:21      阅读:286      评论:0      收藏:0      [点我收藏+]

标签:

[php]代码库

<?php
// +----------------------------------------------------------------------
// | ThinkPHP [ WE CAN DO IT JUST THINK IT ]
// +----------------------------------------------------------------------
// | Copyright (c) 2009 http://thinkphp.cn All rights reserved.
// +----------------------------------------------------------------------
// +----------------------------------------------------------------------
// | Author: liu21st <liu21st@gmail.com>
// +----------------------------------------------------------------------
 
/**
 * Http 工具类
 * 提供一系列的Http方法
 * @category   ORG
 * @package  ORG
 * @subpackage  Net
 * @author    liu21st <liu21st@gmail.com>
 */
class Http {

    /**
     * Download a remote file to a local path using cURL.
     *
     * The local file is opened in "w" mode, so it is created or truncated
     * even when the transfer itself fails afterwards.
     *
     * @access public
     * @param string $remote URL of the remote file
     * @param string $local  Local path the response body is written to
     * @return bool true when the transfer succeeded; false when the local file
     *              could not be opened or cURL reported an error
     */
    public static function curlDownload($remote, $local) {
        // Open the target first: without a writable handle there is nothing to do.
        $fp = @fopen($local, 'w');
        if ($fp === false) {
            return false;
        }
        $cp = curl_init($remote);
        if ($cp === false) {
            fclose($fp);
            return false;
        }
        curl_setopt($cp, CURLOPT_FILE, $fp);   // stream the body straight into the file
        curl_setopt($cp, CURLOPT_HEADER, 0);   // do not write response headers into the file
        $ok = curl_exec($cp);
        curl_close($cp);
        fclose($fp);
        return $ok !== false;
    }

    /**
     * Fetch a remote document over plain HTTP/1.0 using fsockopen().
     *
     * Intended as a fallback for hosts without the cURL extension. Only the
     * host, port, path and query of $url are used; the scheme is ignored, so
     * https URLs are NOT negotiated over TLS.
     *
     * Recognised keys of $conf (any other key is ignored):
     *   int          limit   maximum number of body bytes to read, 0 = no limit
     *   string|array post    POST body ("k=v&k2=v2" string or array); a non-empty value switches to POST
     *   string       cookie  value of the Cookie request header
     *   string       ip      connect to this address instead of the URL host
     *                        (the Host header still carries the URL host)
     *   int|float    timeout connect and read timeout in seconds (default 15)
     *   bool         block   whether the stream is blocking (default true)
     *
     * @static
     * @access public
     * @param string $url  Remote URL
     * @param array  $conf Optional settings, see above
     * @return string Response body, or '' on any failure (bad $conf, connect
     *                error, timeout while sending)
     */
    public static function fsockopenDownload($url, $conf = array()) {
        $return = '';
        if (!is_array($conf)) {
            return $return;
        }

        $matches = parse_url($url);
        if (!is_array($matches)) {
            // parse_url() returns false for seriously malformed URLs.
            $matches = array();
        }
        $host = isset($matches['host']) ? $matches['host'] : '';
        $port = !empty($matches['port']) ? $matches['port'] : 80;
        $path = !empty($matches['path']) ? $matches['path'] : '/';
        // Keep the query string even when the URL has no path ("http://host?x=1").
        if (isset($matches['query']) && $matches['query'] !== '') {
            $path .= '?' . $matches['query'];
        }

        $defaults = array(
            'limit'   => 0,
            'post'    => '',
            'cookie'  => '',
            'ip'      => '',
            'timeout' => 15,
            'block'   => true,
        );
        // Read only the documented keys; importing arbitrary keys as local
        // variables would let a caller overwrite $host, $path, $return, ...
        $conf    = array_merge($defaults, array_intersect_key($conf, $defaults));
        $limit   = max(0, (int) $conf['limit']);
        $post    = $conf['post'];
        $cookie  = (string) $conf['cookie'];
        $ip      = $conf['ip'];
        $timeout = is_numeric($conf['timeout']) ? (float) $conf['timeout'] : 15.0;
        $block   = (bool) $conf['block'];

        $target = $ip ? $ip : $host;
        if ($target === '') {
            return $return;
        }

        // The user agent of the current request is forwarded when there is one
        // (it is absent under CLI).
        $userAgent  = isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : '';
        $hostHeader = ($port == 80) ? $host : $host . ':' . $port;

        $lines = array();
        if ($post) {
            $post = is_array($post) ? http_build_query($post) : (string) $post;
            $lines[] = "POST $path HTTP/1.0";
            $lines[] = 'Accept: */*';
            $lines[] = 'Accept-Language: zh-cn';
            $lines[] = 'Content-Type: application/x-www-form-urlencoded';
            if ($userAgent !== '') {
                $lines[] = 'User-Agent: ' . $userAgent;
            }
            $lines[] = 'Host: ' . $hostHeader;
            $lines[] = 'Content-Length: ' . strlen($post);
            $lines[] = 'Connection: Close';
            $lines[] = 'Cache-Control: no-cache';
        } else {
            $post = '';
            $lines[] = "GET $path HTTP/1.0";
            $lines[] = 'Accept: */*';
            $lines[] = 'Accept-Language: zh-cn';
            if ($userAgent !== '') {
                $lines[] = 'User-Agent: ' . $userAgent;
            }
            $lines[] = 'Host: ' . $hostHeader;
            $lines[] = 'Connection: Close';
        }
        if ($cookie !== '') {
            $lines[] = 'Cookie: ' . $cookie;
        }
        $out = implode("\r\n", $lines) . "\r\n\r\n" . $post;

        $fp = @fsockopen($target, $port, $errno, $errstr, $timeout);
        if (!$fp) {
            return '';
        }

        stream_set_blocking($fp, $block);
        $seconds = (int) $timeout;
        stream_set_timeout($fp, $seconds, (int) (($timeout - $seconds) * 1000000));
        @fwrite($fp, $out);

        if (!self::streamTimedOut($fp)) {
            // Skip the response headers: they end at the first empty line.
            while (!feof($fp)) {
                $header = @fgets($fp);
                if ($header === "\r\n" || $header === "\n") {
                    break;
                }
                // A timed-out stream never reports EOF; stop instead of spinning.
                if ($header === false && self::streamTimedOut($fp)) {
                    break;
                }
            }

            // Read the body in chunks of at most 8192 bytes, honouring $limit.
            $stop = false;
            while (!feof($fp) && !$stop) {
                $data = fread($fp, ($limit == 0 || $limit > 8192 ? 8192 : $limit));
                if ($data === false) {
                    $data = '';
                }
                $return .= $data;
                if ($limit) {
                    $limit -= strlen($data);
                    $stop = $limit <= 0;
                }
                if ($data === '' && self::streamTimedOut($fp)) {
                    break;
                }
            }
        }
        @fclose($fp);
        return $return;
    }

    /**
     * Tell whether the last operation on a stream hit its read/write timeout.
     *
     * @param resource $fp Open stream
     * @return bool
     */
    private static function streamTimedOut($fp) {
        $meta = stream_get_meta_data($fp);
        return !empty($meta['timed_out']);
    }

    /**
     * Send a file, or an in-memory string, to the browser as a download.
     *
     * Emits the download headers, writes the payload and terminates the script.
     * $filename is looked up as given and then below UPLOAD_PATH (when that
     * constant is defined). If $content is non-empty it is what gets sent and
     * $filename only supplies the default display name and the MIME type.
     * When neither a file nor content is available, the project helper
     * throw_exception() is called (it is not defined in this file).
     *
     * @static
     * @access public
     * @param string  $filename File to download
     * @param string  $showname File name shown to the user (defaults to $filename)
     * @param string  $content  Content to send instead of the file
     * @param integer $expire   Browser cache lifetime in seconds
     * @return void
     */
    public static function download($filename, $showname = '', $content = '', $expire = 180) {
        $content    = (string) $content;
        $hasContent = ($content !== '');

        $isFile = false;
        if (is_file($filename)) {
            $isFile = true;
        } elseif (defined('UPLOAD_PATH') && is_file(UPLOAD_PATH . $filename)) {
            $filename = UPLOAD_PATH . $filename;
            $isFile   = true;
        } elseif (!$hasContent) {
            throw_exception($filename . L('下载文件不存在!'));
            return;
        }

        // Content-Length must describe what is actually sent: explicit content
        // wins over the file, exactly as in the output step below.
        $length = $hasContent ? strlen($content) : filesize($filename);

        if (empty($showname)) {
            $showname = $filename;
        }
        // Strip characters that would break out of the quoted header value.
        $showname = str_replace(array("\r", "\n", '"'), '', basename($showname));

        // Only inspect real files; anything else is sent as a generic binary.
        $type = $isFile ? mime_content_type($filename) : false;
        if (!$type) {
            $type = 'application/octet-stream';
        }

        $disposition = 'attachment; filename="' . $showname . '"';
        if (preg_match('/[^\x20-\x7e]/', $showname)) {
            // RFC 6266 extended form so non-ASCII names survive.
            $disposition .= "; filename*=UTF-8''" . rawurlencode($showname);
        }

        // Send the HTTP headers and start the download.
        header('Pragma: public');
        header('Cache-control: max-age=' . $expire);
        header('Expires: ' . gmdate('D, d M Y H:i:s', time() + $expire) . ' GMT');
        header('Last-Modified: ' . gmdate('D, d M Y H:i:s', time()) . ' GMT');
        header('Content-Disposition: ' . $disposition);
        header('Content-Length: ' . $length);
        header('Content-type: ' . $type);
        // NOTE(review): "none" is not a registered content-coding; kept as in
        // the original, presumably to discourage compression - confirm.
        header('Content-Encoding: none');
        header('Content-Transfer-Encoding: binary');
        if ($hasContent) {
            echo($content);
        } else {
            readfile($filename);
        }
        exit();
    }

    /**
     * Show or return the HTTP request headers.
     *
     * @param string $header Name of a single header to report; empty = all headers
     * @param bool   $echo   true: print the result with <br /> line breaks and
     *                       return nothing; false: return it as a string
     * @return string|null "Name:value\n" lines when $echo is false, otherwise null
     */
    public static function getHeaderInfo($header = '', $echo = true) {
        $headers = self::requestHeaders();
        $output  = '';
        if (!empty($header)) {
            // An absent header is reported with an empty value.
            $info   = isset($headers[$header]) ? $headers[$header] : '';
            $output = $header . ':' . $info . "\n";
        } else {
            foreach ($headers as $key => $val) {
                $output .= "$key:$val\n";
            }
        }
        if ($echo) {
            echo(nl2br($output));
            return;
        }
        return $output;
    }

    /**
     * Collect the request headers, with a $_SERVER based fallback for SAPIs
     * that do not provide getallheaders().
     *
     * @return array Header name => value
     */
    private static function requestHeaders() {
        if (function_exists('getallheaders')) {
            $headers = getallheaders();
            return is_array($headers) ? $headers : array();
        }
        $headers = array();
        foreach ($_SERVER as $key => $val) {
            if (strpos($key, 'HTTP_') === 0) {
                // HTTP_USER_AGENT -> User-Agent
                $name = str_replace(' ', '-', ucwords(strtolower(str_replace('_', ' ', substr($key, 5)))));
                $headers[$name] = $val;
            }
        }
        return $headers;
    }

    /**
     * Send the HTTP status line for one of the known status codes.
     * Unknown codes are ignored.
     *
     * @param int $code HTTP status code
     * @return void
     */
    public static function sendHttpStatus($code) {
        static $_status = array(
            // Informational 1xx
            100 => 'Continue',
            101 => 'Switching Protocols',

            // Success 2xx
            200 => 'OK',
            201 => 'Created',
            202 => 'Accepted',
            203 => 'Non-Authoritative Information',
            204 => 'No Content',
            205 => 'Reset Content',
            206 => 'Partial Content',

            // Redirection 3xx
            300 => 'Multiple Choices',
            301 => 'Moved Permanently',
            302 => 'Found', // 1.1
            303 => 'See Other',
            304 => 'Not Modified',
            305 => 'Use Proxy',
            // 306 is deprecated but reserved
            307 => 'Temporary Redirect',

            // Client Error 4xx
            400 => 'Bad Request',
            401 => 'Unauthorized',
            402 => 'Payment Required',
            403 => 'Forbidden',
            404 => 'Not Found',
            405 => 'Method Not Allowed',
            406 => 'Not Acceptable',
            407 => 'Proxy Authentication Required',
            408 => 'Request Timeout',
            409 => 'Conflict',
            410 => 'Gone',
            411 => 'Length Required',
            412 => 'Precondition Failed',
            413 => 'Request Entity Too Large',
            414 => 'Request-URI Too Long',
            415 => 'Unsupported Media Type',
            416 => 'Requested Range Not Satisfiable',
            417 => 'Expectation Failed',

            // Server Error 5xx
            500 => 'Internal Server Error',
            501 => 'Not Implemented',
            502 => 'Bad Gateway',
            503 => 'Service Unavailable',
            504 => 'Gateway Timeout',
            505 => 'HTTP Version Not Supported',
            509 => 'Bandwidth Limit Exceeded'
        );
        if (isset($_status[$code])) {
            header('HTTP/1.1 ' . $code . ' ' . $_status[$code]);
        }
    }
} // end of class Http
if (!function_exists('mime_content_type')) {
    /**
     * Fallback for hosts where the native mime_content_type() is missing.
     *
     * The type is guessed from the file extension only (case-insensitive);
     * the file is never opened and does not have to exist.
     *
     * @param string $filename File name or path
     * @return string MIME type; 'application/octet-stream' when the extension
     *                is missing or not in the table
     */
    function mime_content_type($filename) {
        static $contentType = array(
            'ai'        => 'application/postscript',
            'aif'       => 'audio/x-aiff',
            'aifc'      => 'audio/x-aiff',
            'aiff'      => 'audio/x-aiff',
            'asc'       => 'application/pgp', //changed by skwashd - was text/plain
            'asf'       => 'video/x-ms-asf',
            'asx'       => 'video/x-ms-asf',
            'au'        => 'audio/basic',
            'avi'       => 'video/x-msvideo',
            'bcpio'     => 'application/x-bcpio',
            'bin'       => 'application/octet-stream',
            'bmp'       => 'image/bmp',
            'c'         => 'text/plain', // or 'text/x-csrc', //added by skwashd
            'cc'        => 'text/plain', // or 'text/x-c++src', //added by skwashd
            'cs'        => 'text/plain', //added by skwashd - for C# src
            'cpp'       => 'text/x-c++src', //added by skwashd
            'cxx'       => 'text/x-c++src', //added by skwashd
            'cdf'       => 'application/x-netcdf',
            'class'     => 'application/octet-stream',//secure but application/java-class is correct
            'com'       => 'application/octet-stream',//added by skwashd
            'cpio'      => 'application/x-cpio',
            'cpt'       => 'application/mac-compactpro',
            'csh'       => 'application/x-csh',
            'css'       => 'text/css',
            'csv'       => 'text/comma-separated-values',//added by skwashd
            'dcr'       => 'application/x-director',
            'diff'      => 'text/diff',
            'dir'       => 'application/x-director',
            'dll'       => 'application/octet-stream',
            'dms'       => 'application/octet-stream',
            'doc'       => 'application/msword',
            'dot'       => 'application/msword',//added by skwashd
            'dvi'       => 'application/x-dvi',
            'dxr'       => 'application/x-director',
            'eps'       => 'application/postscript',
            'etx'       => 'text/x-setext',
            'exe'       => 'application/octet-stream',
            'ez'        => 'application/andrew-inset',
            'gif'       => 'image/gif',
            'gtar'      => 'application/x-gtar',
            'gz'        => 'application/x-gzip',
            'h'         => 'text/plain', // or 'text/x-chdr',//added by skwashd
            'h++'       => 'text/plain', // or 'text/x-c++hdr', //added by skwashd
            'hh'        => 'text/plain', // or 'text/x-c++hdr', //added by skwashd
            'hpp'       => 'text/plain', // or 'text/x-c++hdr', //added by skwashd
            'hxx'       => 'text/plain', // or 'text/x-c++hdr', //added by skwashd
            'hdf'       => 'application/x-hdf',
            'hqx'       => 'application/mac-binhex40',
            'htm'       => 'text/html',
            'html'      => 'text/html',
            'ice'       => 'x-conference/x-cooltalk',
            'ics'       => 'text/calendar',
            'ief'       => 'image/ief',
            'ifb'       => 'text/calendar',
            'iges'      => 'model/iges',
            'igs'       => 'model/iges',
            'jar'       => 'application/x-jar', //added by skwashd - alternative mime type
            'java'      => 'text/x-java-source', //added by skwashd
            'jpe'       => 'image/jpeg',
            'jpeg'      => 'image/jpeg',
            'jpg'       => 'image/jpeg',
            'js'        => 'application/x-javascript',
            'kar'       => 'audio/midi',
            'latex'     => 'application/x-latex',
            'lha'       => 'application/octet-stream',
            'log'       => 'text/plain',
            'lzh'       => 'application/octet-stream',
            'm3u'       => 'audio/x-mpegurl',
            'man'       => 'application/x-troff-man',
            'me'        => 'application/x-troff-me',
            'mesh'      => 'model/mesh',
            'mid'       => 'audio/midi',
            'midi'      => 'audio/midi',
            'mif'       => 'application/vnd.mif',
            'mov'       => 'video/quicktime',
            'movie'     => 'video/x-sgi-movie',
            'mp2'       => 'audio/mpeg',
            'mp3'       => 'audio/mpeg',
            'mpe'       => 'video/mpeg',
            'mpeg'      => 'video/mpeg',
            'mpg'       => 'video/mpeg',
            'mpga'      => 'audio/mpeg',
            'ms'        => 'application/x-troff-ms',
            'msh'       => 'model/mesh',
            'mxu'       => 'video/vnd.mpegurl',
            'nc'        => 'application/x-netcdf',
            'oda'       => 'application/oda',
            'patch'     => 'text/diff',
            'pbm'       => 'image/x-portable-bitmap',
            'pdb'       => 'chemical/x-pdb',
            'pdf'       => 'application/pdf',
            'pgm'       => 'image/x-portable-graymap',
            'pgn'       => 'application/x-chess-pgn',
            'pgp'       => 'application/pgp',//added by skwashd
            'php'       => 'application/x-httpd-php',
            'php3'      => 'application/x-httpd-php3',
            'pl'        => 'application/x-perl',
            'pm'        => 'application/x-perl',
            'png'       => 'image/png',
            'pnm'       => 'image/x-portable-anymap',
            'po'        => 'text/plain',
            'ppm'       => 'image/x-portable-pixmap',
            'ppt'       => 'application/vnd.ms-powerpoint',
            'ps'        => 'application/postscript',
            'qt'        => 'video/quicktime',
            'ra'        => 'audio/x-realaudio',
            'rar'       => 'application/octet-stream',
            'ram'       => 'audio/x-pn-realaudio',
            'ras'       => 'image/x-cmu-raster',
            'rgb'       => 'image/x-rgb',
            'rm'        => 'audio/x-pn-realaudio',
            'roff'      => 'application/x-troff',
            'rpm'       => 'audio/x-pn-realaudio-plugin',
            'rtf'       => 'text/rtf',
            'rtx'       => 'text/richtext',
            'sgm'       => 'text/sgml',
            'sgml'      => 'text/sgml',
            'sh'        => 'application/x-sh',
            'shar'      => 'application/x-shar',
            'shtml'     => 'text/html',
            'silo'      => 'model/mesh',
            'sit'       => 'application/x-stuffit',
            'skd'       => 'application/x-koan',
            'skm'       => 'application/x-koan',
            'skp'       => 'application/x-koan',
            'skt'       => 'application/x-koan',
            'smi'       => 'application/smil',
            'smil'      => 'application/smil',
            'snd'       => 'audio/basic',
            'so'        => 'application/octet-stream',
            'spl'       => 'application/x-futuresplash',
            'src'       => 'application/x-wais-source',
            'stc'       => 'application/vnd.sun.xml.calc.template',
            'std'       => 'application/vnd.sun.xml.draw.template',
            'sti'       => 'application/vnd.sun.xml.impress.template',
            'stw'       => 'application/vnd.sun.xml.writer.template',
            'sv4cpio'   => 'application/x-sv4cpio',
            'sv4crc'    => 'application/x-sv4crc',
            'swf'       => 'application/x-shockwave-flash',
            'sxc'       => 'application/vnd.sun.xml.calc',
            'sxd'       => 'application/vnd.sun.xml.draw',
            'sxg'       => 'application/vnd.sun.xml.writer.global',
            'sxi'       => 'application/vnd.sun.xml.impress',
            'sxm'       => 'application/vnd.sun.xml.math',
            'sxw'       => 'application/vnd.sun.xml.writer',
            't'         => 'application/x-troff',
            'tar'       => 'application/x-tar',
            'tcl'       => 'application/x-tcl',
            'tex'       => 'application/x-tex',
            'texi'      => 'application/x-texinfo',
            'texinfo'   => 'application/x-texinfo',
            'tgz'       => 'application/x-gtar',
            'tif'       => 'image/tiff',
            'tiff'      => 'image/tiff',
            'tr'        => 'application/x-troff',
            'tsv'       => 'text/tab-separated-values',
            'txt'       => 'text/plain',
            'ustar'     => 'application/x-ustar',
            'vbs'       => 'text/plain', //added by skwashd - for obvious reasons
            'vcd'       => 'application/x-cdlink',
            'vcf'       => 'text/x-vcard',
            'vcs'       => 'text/calendar',
            'vfb'       => 'text/calendar',
            'vrml'      => 'model/vrml',
            'vsd'       => 'application/vnd.visio',
            'wav'       => 'audio/x-wav',
            'wax'       => 'audio/x-ms-wax',
            'wbmp'      => 'image/vnd.wap.wbmp',
            'wbxml'     => 'application/vnd.wap.wbxml',
            'wm'        => 'video/x-ms-wm',
            'wma'       => 'audio/x-ms-wma',
            'wmd'       => 'application/x-ms-wmd',
            'wml'       => 'text/vnd.wap.wml',
            'wmlc'      => 'application/vnd.wap.wmlc',
            'wmls'      => 'text/vnd.wap.wmlscript',
            'wmlsc'     => 'application/vnd.wap.wmlscriptc',
            'wmv'       => 'video/x-ms-wmv',
            'wmx'       => 'video/x-ms-wmx',
            'wmz'       => 'application/x-ms-wmz',
            'wrl'       => 'model/vrml',
            'wvx'       => 'video/x-ms-wvx',
            'xbm'       => 'image/x-xbitmap',
            'xht'       => 'application/xhtml+xml',
            'xhtml'     => 'application/xhtml+xml',
            'xls'       => 'application/vnd.ms-excel',
            'xlt'       => 'application/vnd.ms-excel',
            'xml'       => 'application/xml',
            'xpm'       => 'image/x-xpixmap',
            'xsl'       => 'text/xml',
            'xwd'       => 'image/x-xwindowdump',
            'xyz'       => 'chemical/x-xyz',
            'z'         => 'application/x-compress',
            'zip'       => 'application/zip',
        );
        // pathinfo() looks at the last path component only, so a dot in a
        // directory name ("dir.d/file") is not mistaken for an extension.
        $type = strtolower((string) pathinfo((string) $filename, PATHINFO_EXTENSION));
        if (isset($contentType[$type])) {
            return $contentType[$type];
        }
        return 'application/octet-stream';
    }
}
 
if (!function_exists('image_type_to_extension')) {
    /**
     * Fallback for hosts where the native image_type_to_extension() is missing.
     *
     * @param int  $imagetype   One of the IMAGETYPE_* constants
     * @param bool $include_dot Whether to prefix the extension with a dot
     *                          (same optional parameter as the native function)
     * @return string|false File extension, or false for an empty or unknown type
     */
    function image_type_to_extension($imagetype, $include_dot = true) {
        if (empty($imagetype)) {
            return false;
        }
        switch ($imagetype) {
            case IMAGETYPE_GIF     : $ext = 'gif';  break;
            case IMAGETYPE_JPEG    : $ext = 'jpg';  break;
            case IMAGETYPE_PNG     : $ext = 'png';  break;
            case IMAGETYPE_SWF     : $ext = 'swf';  break;
            case IMAGETYPE_PSD     : $ext = 'psd';  break;
            case IMAGETYPE_BMP     : $ext = 'bmp';  break;
            case IMAGETYPE_TIFF_II : $ext = 'tiff'; break;
            case IMAGETYPE_TIFF_MM : $ext = 'tiff'; break;
            case IMAGETYPE_JPC     : $ext = 'jpc';  break;
            case IMAGETYPE_JP2     : $ext = 'jp2';  break;
            case IMAGETYPE_JPX     : $ext = 'jpf';  break;
            case IMAGETYPE_JB2     : $ext = 'jb2';  break;
            case IMAGETYPE_SWC     : $ext = 'swc';  break;
            // IFF images use ".iff"; ".aiff" is the audio interchange format.
            case IMAGETYPE_IFF     : $ext = 'iff';  break;
            case IMAGETYPE_WBMP    : $ext = 'wbmp'; break;
            case IMAGETYPE_XBM     : $ext = 'xbm';  break;
            default                : return false;
        }
        return $include_dot ? '.' . $ext : $ext;
    }
}

 

 

 

 

 

 

 

 

 

 

用 jQuery 方式筛选采集内容,相信很多大牛都知道这个类库,可自学出身的我还是找了很久,phpQuery、Snoopy 等一遍一遍尝试,最后才在无意中找到 phpSimpleHtmlDom,更让人惊喜的是又找到了中文手册。
一个人的学习,漫长而又艰辛,真希望有时候能得到指点,不至于让时间无辜的流失.

基础代码:获取网页建议用 cURL,附加 POST 数据可以在登录后采集。

  1. <?php
  // Example: fetch a page with cURL and list the links found in #leftcolumn.
  require_once('./simple_html_dom.php');

  $url = 'http://www.w3cschool.cc/';
  $curl = curl_init();                            // create the cURL handle
  curl_setopt($curl, CURLOPT_URL, $url);          // target URL
  curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);  // return the response instead of printing it
  curl_setopt($curl, CURLOPT_HEADER, 0);          // body only: response headers must not reach the DOM parser
  $result = curl_exec($curl);                     // run the request
  curl_close($curl);                              // release the handle

  // curl_exec() returns false on failure; there is nothing to parse then.
  $html = ($result === false) ? false : str_get_html($result); // build the DOM
  if ($html) {
      foreach ($html->find('#leftcolumn a') as $element) {
          echo $element->href . '<br>';       // link URL
          echo $element->plaintext . '<br>';  // link text
      }

      $html->clear();
  }
  unset($html);
复制代码

中文手册(作者: S.C. Chen):
http://www.ecartchina.com/php-simple-html-dom/index.htm

采集淘宝测试

  // Example: collect the categories of a Taobao shop and the goods listed in each.
  require_once('simple_html_dom.php');
  set_time_limit(0);                 // "time_limit" is not an ini directive; this lifts the execution limit
  ini_set('memory_limit', '512M');
  $memory = memory_get_usage();
  echo 'memory:' . ($memory / 1024) . 'KB<br/>';
  echo 'time:' . date('H:i:s', time()) . '<br/>';

  /**
   * Fetch a URL with cURL.
   *
   * @param string $url
   * @return string|false Response body, or false when the request failed
   */
  function curl_get_content($url) {
      $curl = curl_init();                            // create the cURL handle
      curl_setopt($curl, CURLOPT_URL, $url);          // target URL
      curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);  // return the response instead of printing it
      curl_setopt($curl, CURLOPT_HEADER, 0);          // do not include response headers
      $result = curl_exec($curl);                     // run the request
      curl_close($curl);                              // release the handle
      return $result;
  }

  // Step 1: category links of the shop.
  $cateUrl  = 'http://the-seventh-sense.taobao.com/';
  $cateCon  = curl_get_content($cateUrl);
  $cateHtml = ($cateCon === false) ? false : str_get_html($cateCon); // build the DOM
  $CateList = array();
  if ($cateHtml) {
      foreach ($cateHtml->find('.J_TAllCatsTree li .fst-cat-hd a[href*=category]') as $element) {
          $CateList[] = array(
              'url'  => urldecode($element->href),  // category URL
              'name' => $element->plaintext,        // category name
          );
      }
      $cateHtml->clear();
  }
  unset($cateHtml);

  // Step 2: goods of every category. The list starts empty so that count()
  // below is valid even when nothing was found.
  $goodsList = array();
  foreach ($CateList as $goodsUrl) {
      $goodsCon  = curl_get_content($goodsUrl['url']);
      $goodsHtml = ($goodsCon === false) ? false : str_get_html($goodsCon); // build the DOM
      if (!$goodsHtml) {
          continue; // skip categories that could not be fetched or parsed
      }
      foreach ($goodsHtml->find('.shop-hesper-bd .item') as $goodsElement) {
          // find(..., 0) returns null when the node is missing.
          $nameNode  = $goodsElement->find('.detail .item-name', 0);
          $priceNode = $goodsElement->find('.detail .c-price', 0);
          $imgNode   = $goodsElement->find('.photo a img', 0);
          $goodsList[] = array(
              'name'     => $nameNode ? $nameNode->plaintext : '',
              'price'    => $priceNode ? $priceNode->plaintext : '',
              'img'      => $imgNode ? $imgNode->src : '',
              'catename' => $goodsUrl['name'],
          );
      }
      $goodsHtml->clear();
      unset($goodsHtml);
  }

  echo '<hr/>';

  $n1 = count($CateList);
  $n2 = count($goodsList);
  echo '采集' . $n1 . '条栏目' . $n2 . '个商品<br/>';

  $memory = memory_get_usage();
  echo 'memory:' . ($memory / 1024) . 'KB<br/>';
  echo 'time:' . date('H:i:s', time()) . '<br/>';
复制代码

beginmemory:971.953125KB
begintime:05:30:19
overmemory:1352.890625KB
overtime:05:30:39
耗时20s,成功采集9个栏目127个商品

 

 

 

phpQuery是一个基于PHP的服务端开源项目,它可以让PHP开发人员轻松处理DOM文档内容,比如获取某新闻网站的头条信息。更有意思的是,它采用了jQuery的思想,你可以像使用jQuery一样处理页面内容,获取你想要的页面信息。

采集头条

先看一实例,现在我要采集新浪网国内新闻的头条,代码如下:

// Load the page and print the first <h1> inside the .blkTop element.
include 'phpQuery/phpQuery.php';
phpQuery::newDocumentFile('http://news.sina.com.cn/china');
echo pq(".blkTop h1:eq(0)")->html();

简单的三行代码,就可以获取头条内容。首先在程序中包含phpQuery.php核心程序,然后调用读取目标网页,最后输出对应标签下的内容。

pq()是一个功能强大的方法,跟jQuery的$()如出一辙,jQuery的选择器基本上都能使用在phpQuery上,只要把“.”变成“->”。如上例中,pq(".blkTop h1:eq(0)")抓取了页面class属性为blkTop的DIV元素,并找到该DIV内部的第一个h1标签,然后用html()方法获取h1标签里的内容(带html标签),也就是我们要获取的头条信息,如果使用text()方法,则只获取头条的文本内容。当然要使用好phpQuery,关键是要找对文档中对应内容的节点。

采集文章列表

下面再来看一个例子,获取helloweba.com网站的blog列表,请看代码:

// Print the <h2> title of every .blog_li entry on the page.
include 'phpQuery/phpQuery.php';
phpQuery::newDocumentFile('http://www.helloweba.com/blog.html');
$artlist = pq(".blog_li");
foreach ($artlist as $li) {
    echo pq($li)->find('h2')->html();
}

通过循环列表中的DIV,找出文章标题并输出,就是这么简单。

解析XML文档

假设现在有一个这样的test.xml文档:

<?xml version="1.0" encoding="utf-8"?> 
<root>
  <contact>
     <name>张三</name> 
     <age>22</age> 
  </contact> 
  <contact>
     <name>王五</name> 
     <age>18</age> 
  </contact> 
</root> 

现在我要获取名字为张三的联系人的年龄,代码如下:

// Load test.xml and print the <age> of the first contact.
include 'phpQuery/phpQuery.php';
phpQuery::newDocumentFile('test.xml');
echo pq('contact > age:eq(0)');

结果输出:22

像jQuery一样,精准查找文档节点,输出节点下的内容,解析一个XML文档就是这么简单。现在你不必为采集网站内容而使用那些头疼的正则算法、内容替换等繁琐的代码了,有了phpQuery,一切就变得轻松多了。

项目官网地址:http://code.google.com/p/phpquery/

ThinkPHP Http工具类(用于远程采集 远程下载) phpSimpleHtmlDom采集类库_Jquery筛选方式 使用phpQuery轻松采集网页内容

标签:

原文地址:http://www.cnblogs.com/caicaizi/p/5621611.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!