使用正则表达式抓取网易云课堂中的数据

发布时间：2020-12-14 00:34:34　所属栏目：百科　来源：网络整理

导读：要抓取数据的页面如下：代码： package com.url;import java.io.BufferedReader;import java.io.InputStreamReader;import java.net.URL;import java.net.URLConnection;import java.util.Vector;import java.util.regex.Matcher;import java.util.regex.Pa

要抓取数据的页面如下：

代码：

package com.url;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Small line-oriented scraper for a NetEase Cloud Classroom (study.163.com) search page.
 *
 * <p>Each {@code Find*} method downloads the page at the given URL as UTF-8 text, scans it
 * line by line for a fixed marker string, and prints what it finds to standard output.
 * Chinese text is pulled out of the matching lines with a regular expression.
 */
public class PaChong {
    /** Links collected by {@link #FindOne(URL)}, in the order they were found. */
    static Vector<String> url1 = new Vector<>();

    /** Read timeout applied to every connection, in milliseconds. */
    private static final int READ_TIMEOUT_MS = 10000;

    /**
     * One or more consecutive characters in the range U+4E00..U+9FA5.
     * The backslashes are required: without them the character class matches the ASCII
     * letters and digits of the escape text instead of Chinese characters.
     */
    private static final Pattern CHINESE = Pattern.compile("([\\u4e00-\\u9fa5]+)");

    /**
     * Opens the URL with the read timeout set and wraps its body in a UTF-8 reader.
     * The caller is responsible for closing the returned reader.
     */
    private static BufferedReader openReader(URL url) throws java.io.IOException {
        URLConnection conn = url.openConnection();
        conn.setReadTimeout(READ_TIMEOUT_MS);
        return new BufferedReader(new InputStreamReader(conn.getInputStream(), "UTF-8"));
    }

    /**
     * Finds every line containing {@code about/aboutus.htm#/about?}, builds an absolute
     * link from it, stores the link in {@link #url1} and prints it.
     *
     * @param url page to download and scan
     * @throws Exception if the page cannot be opened or read
     */
    public static void FindOne(URL url) throws Exception {
        try (BufferedReader read = openReader(url)) {
            String line;
            while ((line = read.readLine()) != null) {
                // Locate the marker inside the downloaded page source.
                int index = line.indexOf("about/aboutus.htm#/about?");
                if (index < 0) {
                    continue;
                }
                String link = "http://study.163.com/" + line.substring(index);
                // The link ends at the closing quote of the attribute value; when the line
                // has no quote after the marker, the rest of the line is kept as-is.
                int quote = link.indexOf('"');
                if (quote >= 0) {
                    link = link.substring(0, quote);
                }
                url1.add(link);
                System.out.println(link);
            }
        }
    }

    /**
     * Prints the Chinese text of every line that contains a {@code <title>} tag.
     *
     * @param url page to download and scan
     * @throws Exception if the page cannot be opened or read
     */
    public static void FindTitle(URL url) throws Exception {
        try (BufferedReader read = openReader(url)) {
            String line;
            while ((line = read.readLine()) != null) {
                if (line.indexOf("<title>") >= 0) {
                    System.out.println(PaChong.getChinese(line));
                }
            }
        }
    }

    /**
     * Prints the Chinese text of every line that contains a {@code data-name=} attribute,
     * considering only the part of the line starting at its first double quote.
     *
     * @param url page to download and scan
     * @throws Exception if the page cannot be opened or read
     */
    public static void FindContent(URL url) throws Exception {
        try (BufferedReader read = openReader(url)) {
            String line;
            while ((line = read.readLine()) != null) {
                if (line.indexOf("data-name=") < 0) {
                    continue;
                }
                // Start at the first quote; fall back to the whole line when there is none
                // rather than failing with an out-of-range substring index.
                int quote = Math.max(line.indexOf('"'), 0);
                String content = line.substring(quote);
                System.out.println(PaChong.getChinese(content));
            }
        }
    }

    /**
     * Extracts the Chinese text from a string.
     *
     * @param paramValue text to search; must not be {@code null}
     * @return every run of Chinese characters in order of appearance, each followed by two
     *         spaces; the empty string when there is none
     */
    public static String getChinese(String paramValue) {
        StringBuilder str = new StringBuilder();
        Matcher matcher = CHINESE.matcher(paramValue);
        while (matcher.find()) {
            str.append(matcher.group(0)).append("  ");
        }
        return str.toString();
    }

    /**
     * Runs the three scans against the course-search page for the keyword "JAVA".
     * Section headers go to standard error, results to standard output.
     */
    public static void main(String[] args) throws Exception {
        URL url = new URL("http://study.163.com/courses-search?keyword=JAVA");  // page to crawl
        System.err.println("提取的相关介绍网页如下：");
        FindOne(url);
        System.err.println("提取的网页标题如下：");
        FindTitle(url);
        System.err.println("提取的网页内容如下：");
        FindContent(url);
    }

}

截图：

（编辑：李大同）

【声明】本站内容均来自网络，其相关言论仅代表作者个人观点，不代表本站立场。若无意侵犯到您的权利，请及时联系站长删除相关内容！