使用正则表达式抓取网易云课堂中的数据
发布时间:2020-12-14 00:34:34 所属栏目:百科 来源:网络整理
导读:要抓取数据的页面如下: 代码: package com.url;import java.io.BufferedReader;import java.io.InputStreamReader;import java.net.URL;import java.net.URLConnection;import java.util.Vector;import java.util.regex.Matcher;import java.util.regex.Pa
要抓取数据的页面如下:
代码:
package com.url; import java.io.BufferedReader; import java.io.InputStreamReader; import java.net.URL; import java.net.URLConnection; import java.util.Vector; import java.util.regex.Matcher; import java.util.regex.Pattern; public class PaChong { static Vector<String> url1 = new Vector<>(); public static void FindOne(URL url) throws Exception{ URLConnection Conn = url.openConnection(); Conn.setReadTimeout(10000); BufferedReader read = new BufferedReader(new InputStreamReader(Conn.getInputStream(),"UTF-8")); String line = ""; while((line = read.readLine())!=null){ int index = line.indexOf("about/aboutus.htm#/about?"); //截取URL搜索到的网页源码中的包含该字段的源码 if(index>=0){ String URL ="http://study.163.com/"+line.substring(index); try { URL = URL.substring(0,URL.indexOf(""")); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } url1.add(URL); System.out.println(URL); } } } public static void FindTitle(URL url) throws Exception{ URLConnection Conn = url.openConnection(); Conn.setReadTimeout(10000); BufferedReader read = new BufferedReader(new InputStreamReader(Conn.getInputStream(),"UTF-8")); String line = ""; while((line = read.readLine())!=null){ int Titleindex = line.indexOf("<title>"); if(Titleindex>=0){ System.out.println(PaChong.getChinese(line)); } } } public static void FindContent(URL url) throws Exception{ URLConnection Conn = url.openConnection(); Conn.setReadTimeout(10000); BufferedReader read = new BufferedReader(new InputStreamReader(Conn.getInputStream(),"UTF-8")); String line = ""; while((line = read.readLine())!=null){ // int Contentindex = line.indexOf("<a data-index="); int Contentindex = line.indexOf("data-name="); if(Contentindex>=0){ String content = line.substring(line.indexOf(""")); System.out.println(PaChong.getChinese(content)); } } } //正则表达式提取搜索到网页中需要的中文字符 public static String getChinese(String paramValue) { String regex = "([u4e00-u9fa5]+)"; String str = ""; Matcher matcher = Pattern.compile(regex).matcher(paramValue); while 
(matcher.find()) { str+= matcher.group(0); str+= " "; } return str; } public static void main(String[] args) throws Exception { // TODO Auto-generated method stub URL url = new URL("http://study.163.com/courses-search?keyword=JAVA"); //爬取的链接 System.err.println("提取的相关介绍网页如下:"); FindOne(url); System.err.println("提取的网页标题如下:"); FindTitle(url); System.err.println("提取的网页内容如下:"); FindContent(url); } } 截图:
(编辑:李大同) 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时联系站长删除相关内容!