加入收藏 | 设为首页 | 会员中心 | 我要投稿 李大同 (https://www.lidatong.com.cn/)- 科技、建站、经验、云计算、5G、大数据,站长网!
当前位置: 首页 > 站长学院 > PHP教程 > 正文

PHP制作百度词典查词采集器

发布时间:2020-12-13 02:23:11 所属栏目:PHP教程 来源:网络整理
导读:《:PHP制作百度词典查词采集器》要点: 本文介绍了:PHP制作百度词典查词采集器,希望对您有用。如果有疑问,可以联系我们。 PHP学习 百度dict 采集样本 写的采集百度dict词典翻译后的所有结果数据,当然附带了13.5w单词库和采集简单的案例,这里我把写出的主

《PHP制作百度词典查词采集器》要点:
本文介绍了如何用PHP制作百度词典查词采集器,希望对您有用。如果有疑问,可以联系我们。

PHP学习

百度dict 采集样本

这是我写的一个采集类,用来抓取百度词典(dict.baidu.com)对某个单词翻译后的全部结果数据,项目里还附带了 13.5 万词的单词库和一个简单的采集案例。这里我把主要的类 dict.class.php 放出来,项目地址:http://github.com/widuu/baidu_dict ,有需要的直接 fork 就可以了~么么哒。这东西用的人很少,所以觉得有用的兄弟尽管拿走哈~


<?php
/**
 * dict.class.php 采集百度词典翻译内容
 *
 * @copyright      (C) 2014 widuu
 * @license       http://www.widuu.com
 * @lastmodify     2014-2-15
 */
 
 
// Declare the response as UTF-8 HTML. Guarded because this file is meant to be
// included by other scripts: calling header() after output has started only
// produces a "headers already sent" warning.
if (!headers_sent()) {
    header("Content-Type: text/html; charset=utf-8");
}
/**
 * Scrapes the Baidu dictionary result page for an English word and extracts
 * phonetic symbols, pronunciation audio URLs, example sentences, concise
 * definitions, synonyms/antonyms and phrases from its HTML.
 *
 * The page is downloaded once per content() call and shared by all extractors.
 * Every extractor is regex-based; when the section it looks for is absent it
 * returns an empty result instead of raising notices.
 *
 * NOTE(review): the patterns target the dict.baidu.com markup this class was
 * written against — verify them against the live page before relying on them.
 */
class Dict
{
    /** @var string Word currently being looked up. */
    private $word;

    /** @var string|null HTML fetched for $word; null until the first request. */
    private $html = null;

    /** @var int Maximum number of phrase entries returned by getPhrase(). */
    private static $num = 10;

    public function __construct()
    {
    }

    /**
     * Looks a word up and returns everything that could be extracted.
     *
     * @param string $word English word to look up.
     * @return array array(
     *     "symbol"  => phonetic symbols,   see Pronounced()
     *     "pro"     => pronunciation URLs, see getSay()
     *     "example" => example sentences,  see getExample()
     *     "explain" => concise meanings,   see getExplain()
     *     "synonym" => synonyms/antonyms,  see getSynonym()
     *     "phrase"  => phrases,            see getPhrase()
     * )
     */
    public function content($word)
    {
        $this->word = $word;
        // Drop the page cached for a previous word so it is fetched afresh.
        $this->html = null;

        return array(
            "symbol"  => $this->Pronounced(),
            "pro"     => $this->getSay(),
            "example" => $this->getExample(),
            "explain" => $this->getExplain(),
            "synonym" => $this->getSynonym(),
            "phrase"  => $this->getPhrase(),
        );
    }

    /**
     * Downloads the result page for the current word with cURL.
     *
     * The response is cached on the instance, so the six extractors invoked by
     * content() share a single HTTP request.
     *
     * @return string Page HTML, or '' when the request failed (the cURL error
     *                message is echoed in that case).
     */
    private function getContent()
    {
        if ($this->html !== null) {
            return $this->html;
        }

        $useragent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0";
        // Encode the word so spaces and non-ASCII characters cannot break the query string.
        $url = "http://dict.baidu.com/s?wd=" . urlencode((string) $this->word);

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_USERAGENT, $useragent);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_HTTPGET, true);
        // curl_setopt() takes exactly one option per call.
        curl_setopt($ch, CURLOPT_AUTOREFERER, true);
        curl_setopt($ch, CURLOPT_HEADER, false);
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);

        $result = curl_exec($ch);
        if ($result === false || curl_errno($ch)) {
            echo 'Errno' . curl_error($ch);
        }
        curl_close($ch);

        // Normalise a failed transfer to '' so the preg_* calls always get a string.
        $this->html = is_string($result) ? $result : '';

        return $this->html;
    }

    /**
     * Runs a one-group pattern over the page and returns its first two captures.
     *
     * @param string $pattern PCRE pattern with one capturing group.
     * @return array array('en' => first capture|null, 'us' => second capture|null)
     */
    private function firstTwoMatches($pattern)
    {
        preg_match_all($pattern, $this->getContent(), $found);

        return array(
            'en' => isset($found[1][0]) ? $found[1][0] : null,
            'us' => isset($found[1][1]) ? $found[1][1] : null,
        );
    }

    /**
     * Extracts the phonetic symbols: the text between an `"EN-US">` attribute
     * ending and the following `</b>`.
     *
     * @return array array('en' => British, 'us' => American); null when missing.
     */
    private function Pronounced()
    {
        return $this->firstTwoMatches('/"EN-US">(.*)<\/b>/Ui');
    }

    /**
     * Extracts the pronunciation audio addresses (the first two `url="..."`
     * attribute values on the page).
     *
     * @return array array('en' => British, 'us' => American); null when missing.
     */
    private function getSay()
    {
        return $this->firstTwoMatches('/url="(.*)"/Ui');
    }

    /**
     * Extracts example sentences from the inline `var example_data = ...];`
     * JavaScript literal.
     *
     * Each example is a nested list of token arrays; the first string of every
     * token is taken and the tokens of one sentence are joined with spaces.
     *
     * @return array List of sentence chunks of size two, in page order
     *               (presumably sentence + translation — confirm against the
     *               page data); empty when the page has no example data.
     */
    private function getExample()
    {
        if (!preg_match('/var example_data = (.*)\];/Us', $this->getContent(), $found)) {
            return array();
        }

        $sentences = array();
        // Normalise the leading brackets so that every example starts with "[[[".
        $examples = explode("[[[", "[[[" . ltrim($found[1], "["));
        foreach ($examples as $example) {
            // Within an example every sentence starts with "[[".
            foreach (explode("[[", "[[" . $example) as $segment) {
                preg_match_all('/\["(.*)",/Us', "[" . $segment, $tokens);
                if (!empty($tokens[1])) {
                    $sentences[] = implode(" ", $tokens[1]);
                }
            }
        }

        return array_chunk($sentences, 2);
    }

    /**
     * Extracts the concise definitions block (`id="en-simple-means"`).
     *
     * @return array array(
     *     "x" => array(part of speech => meaning),
     *     "b" => array(label => related word form),
     * ); both empty when the block is missing.
     */
    private function getExplain()
    {
        $meanings = array();
        $forms = array();

        $found = preg_match(
            '/id="en-simple-means">(.*)<div(\s+)class="source">/Us',
            $this->getContent(),
            $section
        );
        if ($found) {
            preg_match_all(
                '/<p><strong>(?P<adj>.*)<\/strong><span>(?P<name>.*)<\/span><\/p>/Us',
                $section[1],
                $senses
            );
            preg_match_all(
                '/<span>(?P<tag>[^<>]+):<a(\s+)href="(.*)">(?P<word>.*)<\/a><\/span>/Us',
                $section[1],
                $inflections
            );

            foreach ($senses["adj"] as $i => $partOfSpeech) {
                $meanings[$partOfSpeech] = $senses["name"][$i];
            }
            foreach ($inflections["tag"] as $i => $label) {
                $forms[$label] = strip_tags($inflections["word"][$i]);
            }
        }

        return array("x" => $meanings, "b" => $forms);
    }

    /**
     * Extracts the synonym/antonym block (`id="en-syn-ant"`).
     *
     * The block is split on `</dl>`; within each piece every part-of-speech
     * heading is followed by a list whose items hold a title and linked words.
     *
     * @return array array(piece index => array(part of speech => array(title => words)));
     *               the docblock of the original says index 0 holds synonyms
     *               and index 1 antonyms. Empty when the block is missing.
     */
    private function getSynonym()
    {
        $found = preg_match(
            '/id="en-syn-ant">(.*)<div(\s+)class="source">/Us',
            $this->getContent(),
            $section
        );
        if (!$found) {
            return array();
        }

        $result = array();
        foreach (explode("</dl>", $section[1]) as $groupIndex => $group) {
            preg_match_all(
                '/<strong>(?P<adj>.*)&nbsp;<\/strong><\/div><div(\s+)class="syn-ant-list"><ul>(?<content>.*)<\/ul>/Us',
                $group,
                $blocks
            );
            foreach ($blocks["content"] as $k => $list) {
                if (empty($list)) {
                    continue;
                }
                preg_match_all('/<li><p>(?P<title>.*)<\/p>(?P<value>.*)<\/li>/Us', $list, $items);
                foreach ($items["title"] as $m => $title) {
                    // Turn each closing </a> into a space so adjacent links stay separated.
                    $words = strip_tags(str_replace("</a>", " ", $items["value"][$m]));
                    $result[$groupIndex][$blocks["adj"][$k]][$title] = $words;
                }
            }
        }

        return $result;
    }

    /**
     * Extracts up to self::$num entries from the phrase block (`id="en-phrase"`).
     *
     * Each `</dd>`-terminated entry is split on `</p>`: the first paragraph is
     * the phrase, the second its meaning, and any further paragraphs are taken
     * in pairs and folded onto the meaning as array(previous value, pair).
     *
     * @return array array(phrase => meaning) for simple entries, or
     *               array(phrase => nested array) when extra pairs exist;
     *               empty when the block is missing.
     */
    private function getPhrase()
    {
        if (!preg_match('/id="en-phrase">(.*)<div class="source">/Us', $this->getContent(), $section)) {
            return array();
        }

        // Keep only the first self::$num entries.
        $entries = array_slice(explode("</dd>", $section[1]), 0, self::$num);

        $result = array();
        foreach ($entries as $entry) {
            $parts = explode("</p>", $entry);
            $n = count($parts);
            if ($n < 2) {
                // Leftover markup after the last </dd>: there is no phrase/meaning pair.
                continue;
            }

            // NOTE(review): the original literal here was "?", which looks like
            // a mis-encoded &nbsp; entity — confirm against the page markup.
            $phrase = trim(str_replace("&nbsp;", "", strip_tags($parts[0])));
            $result[$phrase] = strip_tags($parts[1]);

            if ($n <= 3) {
                continue;
            }

            // Everything between the meaning and the trailing fragment comes in pairs.
            $extras = array_slice($parts, 2, $n - 3);
            foreach (array_chunk($extras, 2) as $pair) {
                $pair = array_map('strip_tags', $pair);
                $result[$phrase] = array($result[$phrase], $pair);
            }
        }

        return $result;
    }

    /**
     * Converts an array to its var_export() string form with slashes added.
     *
     * @param  array  $data       Array to convert.
     * @param  bool   $isformdata When false, new_stripslashes() is not applied
     *                            first. Optional, defaults to 1.
     * @return string Exported string, or '' when $data is empty.
     */
    private function array2string($data, $isformdata = 1)
    {
        // An empty array never compares equal to '', so test for it explicitly.
        if ($data === array() || $data == '') {
            return '';
        }
        if ($isformdata) {
            $data = $this->new_stripslashes($data);
        }

        return addslashes(var_export($data, true));
    }

    /**
     * Applies stripslashes() to a string, or recursively to every array element.
     *
     * @param  mixed $string String or array to process.
     * @return mixed The processed string or array.
     */
    private function new_stripslashes($string)
    {
        if (!is_array($string)) {
            return stripslashes($string);
        }
        foreach ($string as $key => $val) {
            $string[$key] = $this->new_stripslashes($val);
        }

        return $string;
    }
}
// $dict = new Dict();
// $result = $dict->content("express");

以上就是本文的全部内容了,非常实用的功能,希望小伙伴们能够喜欢.

欢迎参与《PHP制作百度词典查词采集器》讨论,分享您的想法,编程之家 52php.cn为您提供专业教程。

(编辑:李大同)

【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时与联系站长删除相关内容!

    推荐文章
      热点阅读