标签:style blog http io os 使用 ar for sp
有时候用 file_get_contents() 函数抓取网页会发生乱码现象。有两个原因会导致乱码,一个是编码问题,一个是目标页面开了Gzip。
编码问题好办,把抓取到的内容转一下编码即可(例如 $content = iconv("GBK", "UTF-8//IGNORE", $content);),我们这里讨论的是如何抓取开了Gzip的页面。怎么判断呢?如果获取到的响应头当中有 Content-Encoding: gzip,就说明内容是经过Gzip压缩的。用Firebug看一下就知道页面有没有开Gzip。下面是用Firebug查看我的博客时的请求头信息,Gzip是开了的。
| 1 | 请求头信息原始头信息 | 
| 2 | Accept  text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8 | 
| 3 | Accept-Encoding gzip, deflate | 
| 4 | Accept-Language zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3 | 
| 5 | Connection  keep-alive | 
| 6 | Cookie  __utma=225240837.787252530.1317310581.1335406161.1335411401.1537; __utmz=225240837.1326850415.887.3.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=%E4%BB%BB%E4%BD%95%E9%A1%B9%E7%9B%AE%E9%83%BD%E4%B8%8D%E4%BC%9A%E9%82%A3%E4%B9%88%E7%AE%80%E5%8D%95%20site%3Awww.nowamagic.net; PHPSESSID=888mj4425p8s0m7s0frre3ovc7; __utmc=225240837; __utmb=225240837.1.10.1335411401 | 
| 7 | Host    www.nowamagic.net | 
| 8 | User-Agent  Mozilla/5.0 (Windows NT 5.1; rv:12.0) Gecko/20100101 Firefox/12.0 | 
下面介绍一些解决方案:
1. 使用自带的zlib库
如果服务器已经装了zlib库,用下面的代码可以轻易解决乱码问题。
// The compress.zlib:// stream wrapper (provided by the zlib extension)
// inflates the gzip-compressed response while it is being read.
$data = file_get_contents('compress.zlib://' . $url);
2. 使用CURL代替file_get_contents
/**
 * Fetch a URL with cURL and return the response body.
 *
 * @param string $url  Address to request.
 * @param bool   $gzip When true, ask for a gzip-encoded response and let
 *                     cURL decode it before returning the body.
 *
 * @return string|false Response body, or false when the handle could not be
 *                      created or the transfer failed.
 */
function curl_get($url, $gzip = false)
{
    $curl = curl_init($url);
    if ($curl === false) {
        // Same failure value curl_exec() would give; avoids passing false
        // to curl_setopt() below.
        return false;
    }

    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 10);
    if ($gzip) {
        // The key line: sends "Accept-Encoding: gzip" and makes cURL
        // decode the compressed response.
        curl_setopt($curl, CURLOPT_ENCODING, "gzip");
    }

    $content = curl_exec($curl);
    curl_close($curl);

    return $content;
}
3. 使用gzip解压函数(PHP 5.4 及以上版本的 zlib 扩展已经内置了 gzdecode(),可以直接调用;下面的自定义实现只适用于更早的 PHP 版本)
// PHP >= 5.4 ships gzdecode() in the zlib extension; declaring it again would
// be a fatal "cannot redeclare" error, so this fallback is only defined when
// the native function is missing.
if (!function_exists('gzdecode')) {
    /**
     * Decode a gzip member (RFC 1952): parse the header, inflate the body and
     * verify the trailer.
     *
     * Header flag bits (RFC 1952, section 2.3.1):
     *   0x01 FTEXT, 0x02 FHCRC, 0x04 FEXTRA, 0x08 FNAME, 0x10 FCOMMENT,
     *   0xE0 reserved (must be zero).
     *
     * @param string $data Raw gzip-encoded bytes.
     *
     * @return string|false|null The decompressed data; null when the input is
     *                           not gzip data (bad magic, reserved flag bits
     *                           set, or no compressed body); false when the
     *                           header, compression method, size or CRC is
     *                           invalid.
     */
    function gzdecode($data)
    {
        $len = strlen($data);
        // 10-byte fixed header + 8-byte trailer is the minimum possible size.
        if ($len < 18 || substr($data, 0, 2) !== "\x1f\x8b") {
            return null; // Not GZIP format (see RFC 1952)
        }

        $method = ord($data[2]); // Compression method
        $flags  = ord($data[3]); // Flags

        // Parentheses are required: "!=" binds tighter than "&" in PHP.
        if (($flags & 0x1f) !== $flags) {
            // Reserved bits are set -- NOT ALLOWED by RFC 1952
            return null;
        }

        // Bytes 4-9 hold MTIME, XFL and OS; none of them is needed to decode.
        $headerlen = 10;

        if ($flags & 0x04) {
            // FEXTRA: 2-byte little-endian length followed by that many bytes,
            // located directly after the fixed header.
            if ($len - $headerlen - 2 < 8) {
                return false; // Invalid format
            }
            $extralen = unpack('v', substr($data, $headerlen, 2));
            $extralen = $extralen[1];
            if ($len - $headerlen - 2 - $extralen < 8) {
                return false; // Invalid format
            }
            $headerlen += 2 + $extralen;
        }

        if ($flags & 0x08) {
            // FNAME: zero-terminated original file name
            if ($len - $headerlen - 1 < 8) {
                return false; // Invalid format
            }
            $end = strpos($data, "\0", $headerlen);
            if ($end === false || $len - $end - 1 < 8) {
                return false; // Invalid format
            }
            $headerlen = $end + 1;
        }

        if ($flags & 0x10) {
            // FCOMMENT: zero-terminated comment
            if ($len - $headerlen - 1 < 8) {
                return false; // Invalid format
            }
            $end = strpos($data, "\0", $headerlen);
            if ($end === false || $len - $end - 1 < 8) {
                return false; // Invalid header format
            }
            $headerlen = $end + 1;
        }

        if ($flags & 0x02) {
            // FHCRC: the two low-order bytes of the CRC32 of the header so far
            if ($len - $headerlen - 2 < 8) {
                return false; // Invalid format
            }
            $calccrc   = crc32(substr($data, 0, $headerlen)) & 0xffff;
            $headercrc = unpack('v', substr($data, $headerlen, 2));
            if ($headercrc[1] !== $calccrc) {
                return false; // Bad header CRC
            }
            $headerlen += 2;
        }

        // GZIP trailer: CRC32 and size of the uncompressed data. On 32-bit
        // PHP both values may come out negative (signed integers).
        $datacrc = unpack('V', substr($data, -8, 4));
        $datacrc = $datacrc[1];
        $isize   = unpack('V', substr($data, -4));
        $isize   = $isize[1];

        $bodylen = $len - $headerlen - 8;
        if ($bodylen < 1) {
            // Nothing between header and trailer: no deflate stream present.
            return null;
        }

        if ($method !== 8) {
            // Deflate (8) is the only compression method defined by RFC 1952.
            return false;
        }

        $result = gzinflate(substr($data, $headerlen, $bodylen));
        if ($result === false) {
            return false; // Corrupt deflate stream
        }

        // Verify decompressed size and CRC32.
        // NOTE: may fail for very large data on 32-bit PHP, where $isize can
        //       be negative while strlen() is not.
        if ($isize != strlen($result) || crc32($result) != $datacrc) {
            return false; // Length or CRC doesn't match
        }

        return $result;
    }
}
使用:
// Fetch the raw (still gzip-compressed) response, then decode it.
$html = file_get_contents('http://www.nowamagic.net/librarys/veda/');
if ($html !== false) {
    // Only decode when the download succeeded; file_get_contents()
    // returns false on failure.
    $html = gzdecode($html);
}
就介绍这三个方法,应该能解决大部分gzip引起的抓取乱码问题了。
标签:style blog http io os 使用 ar for sp
原文地址:http://www.cnblogs.com/xiaoyang002/p/4020133.html