使用TDD方式开发:列出CSDN所有博客文章
最近,在做一个Code Kata,突然想把自己CSDN博客上面所有的文章全部列出来,而且是先写测试,再写实现(传说中的TDD)。下面把它分享出来。笔者是基于org.htmlparser.htmlparser来进行页面解析的。如果大家需要用的话,请在pom.xml文件里面加入下面的依赖。 <dependency> <groupId>org.htmlparser</groupId> <artifactId>htmlparser</artifactId> <version>2.1</version> </dependency> 值得一提的是,在使用org.htmlparser.htmlparser的时候,恰当合理的Filter(过滤器)非常重要,如果使用得当,往往会事半功倍。下面把常用的16个Filter(过滤器)列出来。 16个不同的Filter,也可以分为几类。
* 逻辑运算Filter:
* 其他Filter:
#1 TDD中测试先行,测试程序部分
package com.winneryum.csdn;
import static org.junit.Assert.*;
import java.util.List;
import org.junit.Test;
/**
 * Test-first checks for {@code CSDNPageParser}.
 *
 * Each test only verifies that at least one URL comes back and then prints
 * what was found; the calls go through the parser's real page-fetching code.
 */
public class CSDNPageParserTest {

    /** The category list of a blog must not be empty. */
    @Test
    public void testListAllCategoryURLByCSDNIdURL() {
        // Blog under test: http://blog.csdn.net/chancein007/
        CSDNPageParser parser = new CSDNPageParser("chancein007");

        List<String> categoryUrls = parser.listAllCategoryURLsByCSDNId();

        assertFalse(categoryUrls.isEmpty());
        System.out.println(categoryUrls);
    }

    /** A single category page must yield at least one article URL. */
    @Test
    public void testListPagesByCategoryURLs() {
        String categoryUrl = "http://blog.csdn.net//chancein007/article/category/2331239";
        CSDNPageParser parser = new CSDNPageParser();

        List<String> articleUrls = parser.listPagesByCategoryURL(categoryUrl);

        assertFalse(articleUrls.isEmpty());
        System.out.println(articleUrls);
    }

    /** Walking every category must yield at least one article URL; each URL is printed on its own line. */
    @Test
    public void testGetAllPageURLs() {
        CSDNPageParser parser = new CSDNPageParser("chancein007");

        List<String> everyArticleUrl = parser.getAllPageURLs();

        assertFalse(everyArticleUrl.isEmpty());
        for (String articleUrl : everyArticleUrl) {
            System.out.println(articleUrl);
        }
    }
}
#2 程序实现部分
package com.winneryum.csdn;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.List;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.http.ConnectionManager;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
/**
 * Collects category and article URLs from a CSDN blog by parsing its HTML
 * pages with org.htmlparser.
 *
 * <p>All lookups are best-effort: when a page cannot be fetched or parsed the
 * {@link ParserException} is printed and the URLs gathered so far (possibly
 * none) are returned.
 */
public class CSDNPageParser {

    /** Scheme and host that is prepended to every {@code href} read from a page. */
    public final static String CSDN_ROOT_URL="http://blog.csdn.net";

    /** Encoding set on every parser that fetches a page by URL. */
    private static final String PAGE_ENCODING = "UTF-8";

    /**
     * Position, among the elements whose id is {@code panel_Category}, of the
     * element whose links are collected as category URLs.
     * NOTE(review): the original code hard-coded index 1 (the second match);
     * presumably the blog page contains more than one element with this id -
     * confirm against the live markup.
     */
    private static final int CATEGORY_PANEL_INDEX = 1;

    /**
     * Blog owner id used to build the blog root URL. Stays {@code null} when the
     * no-argument constructor is used, in which case the root URL would contain
     * the literal text "null".
     */
    private String csdnID;

    /**
     * Creates a parser bound to one blog.
     *
     * @param csdnID the CSDN user id that appears in the blog URL
     */
    public CSDNPageParser(String csdnID) {
        this.csdnID=csdnID;
    }

    /**
     * Creates a parser without a blog id. Of the public methods only
     * {@link #listPagesByCategoryURL(String)} does not read the id.
     */
    public CSDNPageParser() {
    }

    /** Builds the blog home page URL: root URL, slash, blog id, slash. */
    private String getCSDNRootBlogURL(){
        return CSDN_ROOT_URL+"/"+csdnID+"/";
    }

    /**
     * Lists the category URLs found on the blog home page.
     *
     * @return absolute category URLs; empty when the expected panel is missing
     *         or the page could not be fetched or parsed
     */
    public List<String> listAllCategoryURLsByCSDNId() {
        List<String> categoryURLs=new ArrayList<String>();
        try {
            Parser onLineHtmlParser = createOnlineParser(getCSDNRootBlogURL());
            NodeFilter filter = new HasAttributeFilter("id", "panel_Category");
            NodeList nodes = onLineHtmlParser.extractAllNodesThatMatch(filter);
            // Guard the fixed index: a page with fewer matches yields no
            // categories instead of failing with an unchecked exception.
            if (nodes.size() <= CATEGORY_PANEL_INDEX) {
                return categoryURLs;
            }
            String categorySegment = nodes.elementAt(CATEGORY_PANEL_INDEX).toHtml();
            categoryURLs.addAll(extractLinkURLs(categorySegment));
        } catch (ParserException e) {
            // Best-effort lookup: report the failure and return what we have.
            e.printStackTrace();
        }
        return categoryURLs;
    }

    /**
     * Lists the article URLs linked from the {@code <h1>} elements of one
     * category page. Only the page at {@code categoryURL} itself is read.
     *
     * @param categoryURL absolute URL of a category page
     * @return absolute article URLs; empty when none were found or the page
     *         could not be fetched or parsed
     */
    public List<String> listPagesByCategoryURL(String categoryURL) {
        List<String> pageURLs=new ArrayList<String>();
        try {
            Parser onLineHtmlParser = createOnlineParser(categoryURL);
            NodeList h1Nodes = onLineHtmlParser.extractAllNodesThatMatch(new TagNameFilter("h1"));
            pageURLs.addAll(extractLinkURLs(h1Nodes.toHtml()));
        } catch (ParserException e) {
            // Best-effort lookup: report the failure and return what we have.
            e.printStackTrace();
        }
        return pageURLs;
    }

    /**
     * Lists the article URLs of every category of the blog, category by
     * category. No de-duplication is performed on the combined list.
     *
     * @return absolute article URLs in category order; possibly empty
     */
    public List<String> getAllPageURLs() {
        List<String> allPageURLs=new ArrayList<String>();
        for (String categoryURL : listAllCategoryURLsByCSDNId()) {
            allPageURLs.addAll(listPagesByCategoryURL(categoryURL));
        }
        return allPageURLs;
    }

    /**
     * Creates a parser for {@code url} after registering a "User-Agent"
     * request property of "Firefox" on the shared connection manager.
     */
    private Parser createOnlineParser(String url) throws ParserException {
        Parser onLineHtmlParser = new Parser();
        ConnectionManager connectionManager = Parser.getConnectionManager();
        // The library hands the request properties back as a Hashtable; only
        // String keys and values are stored here.
        @SuppressWarnings("unchecked")
        Hashtable<String, String> hashTable = connectionManager.getRequestProperties();
        hashTable.put("User-Agent", "Firefox");
        connectionManager.setRequestProperties(hashTable);
        onLineHtmlParser.setURL(url);
        onLineHtmlParser.setEncoding(PAGE_ENCODING);
        return onLineHtmlParser;
    }

    /**
     * Re-parses an HTML fragment and returns the {@code href} of every
     * {@code <a>} tag in it, prefixed with {@link #CSDN_ROOT_URL}.
     */
    private List<String> extractLinkURLs(String htmlSegment) throws ParserException {
        List<String> urls = new ArrayList<String>();
        // Nothing to parse. NOTE(review): Parser(String) presumably treats a
        // string that is not markup as a resource location - confirm.
        if (htmlSegment == null || htmlSegment.trim().length() == 0) {
            return urls;
        }
        Parser segmentParser = new Parser(htmlSegment);
        NodeList linkNodes = segmentParser.extractAllNodesThatMatch(new TagNameFilter("a"));
        for (int i = 0; i < linkNodes.size(); i++) {
            TagNode linkNode = (TagNode) linkNodes.elementAt(i);
            String href = linkNode.getAttribute("href");
            // An anchor without href would otherwise produce "...netnull".
            if (href != null) {
                urls.add(CSDN_ROOT_URL + href);
            }
        }
        return urls;
    }
}
#3 注意事项
注意上面代码中的这一段:
ConnectionManager connectionManager=Parser.getConnectionManager ();
Hashtable hashTable=connectionManager.getRequestProperties();
hashTable.put("User-Agent","Firefox");
connectionManager.setRequestProperties(hashTable);
如果没有这段代码,CSDN网站就会认为这是一个机器在访问CSDN网站,就会返回403 Forbidden状态码,程序会抛出下面的异常:
org.htmlparser.util.ParserException: Exception getting input stream from http://blog.csdn.net/chancein007/ (Server returned HTTP response code: 403 for URL: http://blog.csdn.net/chancein007/).;
#4 运行结果
http://blog.csdn.net/chancein007/article/details/53731148 http://blog.csdn.net/chancein007/article/details/52675923 http://blog.csdn.net/chancein007/article/details/52305324 http://blog.csdn.net/chancein007/article/details/52109198 http://blog.csdn.net/chancein007/article/details/50646864 http://blog.csdn.net/chancein007/article/details/50507571 http://blog.csdn.net/chancein007/article/details/50507557 http://blog.csdn.net/chancein007/article/details/46508301 http://blog.csdn.net/chancein007/article/details/46444385 http://blog.csdn.net/chancein007/article/details/42132741 http://blog.csdn.net/chancein007/article/details/42033201 http://blog.csdn.net/chancein007/article/details/41652415 http://blog.csdn.net/chancein007/article/details/41628911 http://blog.csdn.net/chancein007/article/details/41276881 http://blog.csdn.net/chancein007/article/details/41158659 http://blog.csdn.net/chancein007/article/details/32181133 http://blog.csdn.net/chancein007/article/details/30587585 http://blog.csdn.net/chancein007/article/details/30475085 http://blog.csdn.net/chancein007/article/details/30365737 http://blog.csdn.net/chancein007/article/details/27123273 http://blog.csdn.net/chancein007/article/details/50984565 http://blog.csdn.net/chancein007/article/details/50909318 http://blog.csdn.net/chancein007/article/details/50909263 http://blog.csdn.net/chancein007/article/details/50909156 http://blog.csdn.net/chancein007/article/details/42131677 http://blog.csdn.net/chancein007/article/details/42120567 http://blog.csdn.net/chancein007/article/details/42120019 http://blog.csdn.net/chancein007/article/details/42119581 http://blog.csdn.net/chancein007/article/details/41950775 http://blog.csdn.net/chancein007/article/details/41280997 http://blog.csdn.net/chancein007/article/details/41157887 http://blog.csdn.net/chancein007/article/details/41156783 http://blog.csdn.net/chancein007/article/details/41156609 http://blog.csdn.net/chancein007/article/details/52939738 
http://blog.csdn.net/chancein007/article/details/52939565 http://blog.csdn.net/chancein007/article/details/52939194 http://blog.csdn.net/chancein007/article/details/52684365 http://blog.csdn.net/chancein007/article/details/52676040 http://blog.csdn.net/chancein007/article/details/52624514 http://blog.csdn.net/chancein007/article/details/52609930 http://blog.csdn.net/chancein007/article/details/52601915 http://blog.csdn.net/chancein007/article/details/52551955 http://blog.csdn.net/chancein007/article/details/52551852 http://blog.csdn.net/chancein007/article/details/52551722 http://blog.csdn.net/chancein007/article/details/52551686 http://blog.csdn.net/chancein007/article/details/46552887 http://blog.csdn.net/chancein007/article/details/46539231 http://blog.csdn.net/chancein007/article/details/46490361 http://blog.csdn.net/chancein007/article/details/46476601 http://blog.csdn.net/chancein007/article/details/46470389 http://blog.csdn.net/chancein007/article/details/46470201 http://blog.csdn.net/chancein007/article/details/46447351 http://blog.csdn.net/chancein007/article/details/46318359 http://blog.csdn.net/chancein007/article/details/46317799 http://blog.csdn.net/chancein007/article/details/46293391 http://blog.csdn.net/chancein007/article/details/46293031 http://blog.csdn.net/chancein007/article/details/41157887 http://blog.csdn.net/chancein007/article/details/34514439 http://blog.csdn.net/chancein007/article/details/29642625 http://blog.csdn.net/chancein007/article/details/28016097 http://blog.csdn.net/chancein007/article/details/28001087 http://blog.csdn.net/chancein007/article/details/27384237 http://blog.csdn.net/chancein007/article/details/27239605 http://blog.csdn.net/chancein007/article/details/25926035 http://blog.csdn.net/chancein007/article/details/7318315 http://blog.csdn.net/chancein007/article/details/46310051 http://blog.csdn.net/chancein007/article/details/46301553 http://blog.csdn.net/chancein007/article/details/46242685 
http://blog.csdn.net/chancein007/article/details/46241983 http://blog.csdn.net/chancein007/article/details/46241413 http://blog.csdn.net/chancein007/article/details/46238469 http://blog.csdn.net/chancein007/article/details/46238217 http://blog.csdn.net/chancein007/article/details/46137277 http://blog.csdn.net/chancein007/article/details/46136925 http://blog.csdn.net/chancein007/article/details/34537989 http://blog.csdn.net/chancein007/article/details/30340095 http://blog.csdn.net/chancein007/article/details/29653831 http://blog.csdn.net/chancein007/article/details/29645055 http://blog.csdn.net/chancein007/article/details/28142261 http://blog.csdn.net/chancein007/article/details/28104355 http://blog.csdn.net/chancein007/article/details/28083799 http://blog.csdn.net/chancein007/article/details/28023411 http://blog.csdn.net/chancein007/article/details/53983755 http://blog.csdn.net/chancein007/article/details/53889470 http://blog.csdn.net/chancein007/article/details/53792477 http://blog.csdn.net/chancein007/article/details/53731662 http://blog.csdn.net/chancein007/article/details/53731148 http://blog.csdn.net/chancein007/article/details/53730991 http://blog.csdn.net/chancein007/article/details/52109198 http://blog.csdn.net/chancein007/article/details/52109226 http://blog.csdn.net/chancein007/article/details/52108986 http://blog.csdn.net/chancein007/article/details/51813468 http://blog.csdn.net/chancein007/article/details/41950775 http://blog.csdn.net/chancein007/article/details/7316076 http://blog.csdn.net/chancein007/article/details/41157887 http://blog.csdn.net/chancein007/article/details/7315951 http://blog.csdn.net/chancein007/article/details/7315936 http://blog.csdn.net/chancein007/article/details/7315922 http://blog.csdn.net/chancein007/article/details/54296014 http://blog.csdn.net/chancein007/article/details/54295796 http://blog.csdn.net/chancein007/article/details/54260855 http://blog.csdn.net/chancein007/article/details/53120622 
http://blog.csdn.net/chancein007/article/details/53120527 http://blog.csdn.net/chancein007/article/details/51813468 http://blog.csdn.net/chancein007/article/details/51813421 http://blog.csdn.net/chancein007/article/details/51813351 http://blog.csdn.net/chancein007/article/details/51813218 http://blog.csdn.net/chancein007/article/details/51813089 http://blog.csdn.net/chancein007/article/details/46293851 http://blog.csdn.net/chancein007/article/details/41280997 http://blog.csdn.net/chancein007/article/details/26297455 http://blog.csdn.net/chancein007/article/details/5154175 http://blog.csdn.net/chancein007/article/details/5154051 http://blog.csdn.net/chancein007/article/details/46136219 http://blog.csdn.net/chancein007/article/details/32992877 http://blog.csdn.net/chancein007/article/details/32986523 http://blog.csdn.net/chancein007/article/details/29822487 http://blog.csdn.net/chancein007/article/details/7316044 http://blog.csdn.net/chancein007/article/details/7315987 http://blog.csdn.net/chancein007/article/details/7307017 http://blog.csdn.net/chancein007/article/details/7306937 http://blog.csdn.net/chancein007/article/details/50645199 http://blog.csdn.net/chancein007/article/details/50645197 http://blog.csdn.net/chancein007/article/details/46059489 http://blog.csdn.net/chancein007/article/details/46059049 http://blog.csdn.net/chancein007/article/details/41178345 http://blog.csdn.net/chancein007/article/details/30813569 http://blog.csdn.net/chancein007/article/details/26297455 http://blog.csdn.net/chancein007/article/details/53189912 http://blog.csdn.net/chancein007/article/details/53002892 http://blog.csdn.net/chancein007/article/details/52940032 http://blog.csdn.net/chancein007/article/details/53014952 http://blog.csdn.net/chancein007/article/details/53014738 http://blog.csdn.net/chancein007/article/details/53002981 http://blog.csdn.net/chancein007/article/details/42277345 http://blog.csdn.net/chancein007/article/details/27116691 
http://blog.csdn.net/chancein007/article/details/54016636 http://blog.csdn.net/chancein007/article/details/30494313 http://blog.csdn.net/chancein007/article/details/30467815 http://blog.csdn.net/chancein007/article/details/37722755 http://blog.csdn.net/chancein007/article/details/27242265 http://blog.csdn.net/chancein007/article/details/52170752 http://blog.csdn.net/chancein007/article/details/52069057 http://blog.csdn.net/chancein007/article/details/53933872 http://blog.csdn.net/chancein007/article/details/53959603 http://blog.csdn.net/chancein007/article/details/27122719 http://blog.csdn.net/chancein007/article/details/54343653 http://blog.csdn.net/chancein007/article/details/54238017 http://blog.csdn.net/chancein007/article/details/27243793 http://blog.csdn.net/chancein007/article/details/54344730
(编辑:李大同) 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时联系站长删除相关内容!