先要到网上下载Snoopy.class.php 调用方法: <div class="codetitle"><a style="CURSOR: pointer" data="29757" class="copybut" id="copybut29757" onclick="doCopy('code29757')"> 代码如下:<div class="codebody" id="code29757"> <?php require 'lib/Snoopy.class.php'; require 'lib/WebCrawl.class.php';//包含下面代码 $go=new WebCrawl('http://www.baidu.com'); echo $go->getCharset(); ?> <div class="codetitle"><a style="CURSOR: pointer" data="21471" class="copybut" id="copybut21471" onclick="doCopy('code21471')"> 代码如下:<div class="codebody" id="code21471"> <?php class WebCrawl { private $url; private $request; public $charset_arr=array( 'gb2312', 'utf-8', 'big5', 'gbk', 'ascii', 'cp936', 'ibm037', 'ibm437', 'ibm500', 'asmo-708', 'dos-720', 'ibm737', 'ibm775', 'ibm850', 'ibm852', 'ibm855', 'ibm857', 'ibm00858', 'ibm861', 'ibm860', 'dos-862', 'ibm863', 'ibm864', 'ibm865', 'cp866', 'ibm869', 'ibm870', 'windows-874', 'cp875', 'shift_jis', 'ks_c_5601-1987', 'ibm1026', 'ibm01047', 'ibm01040', 'ibm01041', 'ibm01042', 'ibm01043', 'ibm01044', 'ibm01045', 'ibm01046', 'ibm01048', 'ibm01049', 'utf-16', 'unicodefffe', 'windows-1250', 'windows-1251', 'windows-1252', 'windows-1253', 'windows-1254', 'windows-1255', 'windows-1256', 'windows-1257', 'windows-1258', 'johab', 'macintosh', 'x-mac-japanese', 'x-mac-chinesetrad', 'x-mac-korean', 'x-mac-arabic', 'x-mac-hebrew', 'x-mac-greek', 'x-mac-cyrillic', 'x-mac-chinesesimp', 'x-mac-romanian', 'x-mac-ukrainian', 'x-mac-thai', 'x-mac-ce', 'x-mac-icelandic', 'x-mac-turkish', 'x-mac-croatian', 'x-chinese-cns', 'x-cp20001', 'x-chinese-eten', 'x-cp20003', 'x-cp20004', 'x-cp20005', 'x-ia5', 'x-ia5-german', 'x-ia5-swedish', 'x-ia5-norwegian', 'us-ascii', 'x-cp20261', 'x-cp20269', 'ibm273', 'ibm277', 'ibm278', 'ibm280', 'ibm284', 'ibm285', 'ibm290', 'ibm420', 'ibm423', 'ibm424', 'x-ebcdic-koreanextended', 'ibm-thai', 'koi8-r', 'ibm871', 'ibm880', 'ibm905', 'ibm00924', 'x-cp20936', 'x-cp20949', 'cp1025', 'koi8-u', 'iso-8859-1', 'iso-8859-2', 'iso-8859-3', 'iso-8859-4', 'iso-8859-5', 'iso-8859-6', 'iso-8859-7', 'iso-8859-8', 'iso-8859-9', 'iso-8859-13', 'iso-8859-15', 'x-europa', 'iso-8859-8-i', 'iso-2022-jp', 'csiso2022jp', 'iso-2022-kr', 'x-cp50227', 'euc-jp', 'euc-cn', 'euc-kr', 'hz-gb-2312', 'gb18030', 'x-iscii-de', 'x-iscii-be', 'x-iscii-ta', 'x-iscii-te', 'x-iscii-as', 'x-iscii-or', 'x-iscii-ka', 'x-iscii-ma', 'x-iscii-gu', 'x-iscii-pa', 'utf-7', 'utf-32', 'utf-32be' ); public function __construct($url) { $this->url=$url; } //打开网站 private function open($url) { if($this->request!==null) { if($this->request->status==200) { return true; } else { return false; } } else { $this->request=new Snoopy(); $this->request->fetch($url); if($this->request->status==200) { $this->request->results=strtolower($this->request->results); $charset=$this->getCharset(); if($charset!="utf-8") { if($charset=="windows-1252") { $this->request->results=$this->uni_decode($this->request->results); } else { $this->request->results=mb_convert_encoding($this->request->results,"UTF-8",$charset); } } return true; } else { return false; } } } //获取网站title,keywords,description public function getWebinfo() { $info=array( 'title'=>'', 'keywords'=>'', 'desc'=>'', 'ip'=>'' ); if(!$this->open($this->url)){return $info;exit;} // print_r($this->request->results);exit; preg_match('/([^>])</title>/si',$this->request->results,$titlematch ); if (isset($titlematch) && is_array($titlematch) && count($titlematch) > 0) { $info['title'] = strip_tags($titlematch[1]); } preg_match_all('/<[s]meta[s]name="?' . '([^>"])"?[s]' . 'content="?([^>"])"?[s][/]?[s]>/si',$match); $ft=0; foreach($match[1] as $mt) { if($mt=="keywords" || $mt=="description") { $ft=1; } } if($ft==0) { preg_match_all('/<[s]meta[s]content="?([^>"])"?[s]name="?' . '([^>"])"?[s][/]?[s]*>/si',$match); if (isset($match) && is_array($match) && count($match) == 3) { $originals = $match[0]; $names = $match[2]; $values = $match[1]; if (count($originals) == count($names) && count($names) == count($values)) { $metaTags = array(); for ($i=0,$limiti=count($names); $i < $limiti; $i++) { $metaTags[$names[$i]] = array ( 'html' => htmlentities($originals[$i]), 'value' => $values[$i] ); } } } } else { if (isset($match) && is_array($match) && count($match) == 3) { $originals = $match[0]; $names = $match[1]; $values = $match[2]; if (count($originals) == count($names) && count($names) == count($values)) { $metaTags = array(); for ($i=0, 'value' => $values[$i] ); } } } } $result = array ( 'metaTags' => $metaTags ); if(isset($result['metaTags']['keywords']['value'])) { $info['keywords']=$result['metaTags']['keywords']['value']; } else { $info['keywords']=""; } if(isset($result['metaTags']['description']['value'])) { $info['desc']=$result['metaTags']['description']['value']; } else { $info['desc']=""; } $domain=preg_replace('/http:///si','',$this->url); $ip=@gethostbyname($domain); $ip_arr=explode(".",$ip); if(count($ip_arr)==4) { $info['ip']=$ip; } return $info; } public function t($string,$o) { for($i=0;$i<strlen($string);$i++) { if(ord($string{$i})<128) continue; if((ord($string{$i})&224)==224) { //第一个字节判断通过 $char = $string{++$i}; if((ord($char)&128)==128) { //第二个字节判断通过 $char = $string{++$i}; if((ord($char)&128)==128) { $encoding = "UTF-8"; break; } } } if((ord($string{$i})&192)==192) { //第一个字节判断通过 $char = $string{++$i}; if((ord($char)&128)==128) { //第二个字节判断通过 $encoding = "GB2312"; break; } } } return strtolower($encoding); } function uni_decode ($str,$code = 'utf-8'){ $str = json_decode(preg_replace_callback('/&#(d{5});/',create_function('$dec','return 'u'.dechex($dec[1]);'),'"'.$str.'"')); if($code != 'utf-8'){ $str = iconv('utf-8',$code,$str); } return $str; } //获取网站编码 public function getCharset() { if(!$this->open($this->url)){return false;exit;} //首先从html获取编码 preg_match("/<meta.+?charset=[^w]?([-w]+)/i",$temp) ? strtolower($temp[1]):""; if($temp[1]!="") { if(in_array($temp[1],$this->charset_arr)) { if($temp[1]=="gb2312") { $tmp_charset=$this->t($this->request->results,$temp[1]); if($tmp_charset==$temp[1]) { return $temp[1]; } } else { return $temp[1]; } } } if(!empty($this->request->headers)) { //从header中获取编码 $hstr=strtolower(implode("|||",$this->request->headers)); preg_match("/charset=[^w]?([-w]+)/is",$hstr,$lang) ? strtolower($lang[1]):""; if($lang[1]!="") { return $lang[1]; } } $encode_arr=array("UTF-8","GB2312","GBK","BIG5","ASCII","EUC-JP","Shift_JIS","CP936","ISO-8859-1","JIS","eucjp-win","sjis-win"); $encoded=mb_detect_encoding($this->request->results,$encode_arr); if($encoded) { return strtolower($encoded); } else { return false; } } } ?>
(编辑:李大同)
【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时与联系站长删除相关内容!
|