ICU正则表达式运用方法
发布时间:2020-12-14 02:06:09 所属栏目:百科 来源:网络整理
导读:最近由于项目需求学习了一下ICU下的正则表达式的用法,在运用之前先在官网上下载ICU库,并编译,将include下的头文件路径与lib下的库文件路径都添加到编译器的option选项下,环境是在vs2008下,之后的编码如下 #include <iostream> #include <list> //#include <unic…
最近由于项目需求学习了一下ICU下的正则表达式的用法,在运用之前现在官网上下载ICU库,并编译,将include下的头文件路径与lib下的库文件路径都添加到编译器的option选项下,环境实在vs2008下,之后的编码如下 #include <iostream> #include <list> //#include <unicode/uregex.h> #include "unicode/utypes.h" //#include "unicode/parseerr.h" #include "testConvAnyCodeUtf8.h" #include "unicode/ucnv.h" #include "apr_file_io.h" #include "unicode/regex.h" using namespace std; void MatchFromFile(char* GBpath,list<UnicodeString> listring); int reg_exp_match(const char*pat,const char* sour); void print(list<UnicodeString> listString); #pragma comment(lib,"conv_anycode_utf8.lib") #pragma comment(lib,"libapr-1.lib") #pragma comment(lib,"icuind.lib") #pragma comment(lib,"icuucd.lib") #define SUM 3 #define BUFFSIZE 1024 int g_num[SUM] = {0}; int main() { list<UnicodeString> listString; char* pathname = "F:3_27的1.txt"; /*listString.push_back(UnicodeString("s123s"));*/ listString.push_back(UnicodeString("123")); /*listString.push_back(UnicodeString("[u4e00-u9fa5$]+好好学习"));*/ listString.push_back(UnicodeString("好好学习")); listString.push_back(UnicodeString("d[o]m")); MatchFromFile(pathname,listString); print(listString); system("pause"); return 0; } void MatchFromFile(char* GBpath,list<UnicodeString> listring) { char* path = conv_anycode_utf8(GBpath,"UTF-8","GB18030"); list<UnicodeString>::const_iterator _regter = listring.begin(); apr_pool_t *pool = NULL; apr_file_t *file = NULL; apr_status_t res = APR_SUCCESS; char* inbuf = new char[BUFFSIZE + 1]; char *result; int len = 0; UConverter *cv = NULL; UErrorCode status = U_ZERO_ERROR; memset(inbuf,BUFFSIZE + 1); apr_initialize(); res = apr_pool_create(&pool,NULL); if(res != APR_SUCCESS) { printf("create pool failed!n"); return; } res = apr_file_open(&file,path,APR_READ,APR_OS_DEFAULT,pool); if(res != APR_SUCCESS) { printf("open file fail!n"); return ; } int i = 0; while(!apr_file_gets(inbuf,BUFFSIZE+1,file)) { while(_regter != listring.end()) { len = (*_regter).length(); result = new char[BUFFSIZE+1]; 
memset(result,BUFFSIZE*sizeof(char)+1); const UChar* psrc = (*_regter).getBuffer(); cv = ucnv_open("GB18030",&status); int32_t num = ucnv_fromUChars(cv,result,psrc,len,&status); if(status != U_ZERO_ERROR) { printf("Convert fail!n"); break; } int n = reg_exp_match(result,inbuf); g_num[i] += n; delete [] result; ucnv_close(cv); i++; _regter++; } _regter = listring.begin(); i = 0; } delete []inbuf; } void print(list<UnicodeString> listring) { char *result; int len = 0; UConverter *cv = NULL; UErrorCode status = U_ZERO_ERROR; list<UnicodeString>::const_iterator iter = listring.begin(); for(int j = 0;(j < SUM) && (iter != listring.end());j++,iter++) { len = (*iter).length(); result = new char[BUFFSIZE+1]; memset(result,BUFFSIZE*sizeof(char)+1); const UChar* psrc = (*iter).getBuffer(); cv = ucnv_open("GB18030",&status); int32_t num = ucnv_fromUChars(cv,&status); /* ucnv_toUChars(*/ if(status != U_ZERO_ERROR) { printf("Convert fail!n"); break; } printf("%s出现:%d次n",g_num[j]); delete [] result; ucnv_close(cv); } } int reg_exp_match(const char*pat,const char* sour) { int num = 0; int LEN = strlen(sour); int len = strlen(pat); UConverter *cv = NULL; RegexPattern *REPattern = NULL;///正则表达式 RegexMatcher *REMatcher = NULL;//匹配器 UErrorCode status = U_ZERO_ERROR; cv = ucnv_open("GB18030",&status); UChar* patStr = new UChar[len + 1]; memset(patStr,len + 1); ucnv_toUChars(cv,patStr,len+1,pat,&status); ucnv_close(cv); UnicodeString patString(patStr); //Unicode正则表达式组装,这些函数经常代替构造函数来创建RegexPattern对象 REPattern = RegexPattern::compile(patString,status); if (U_FAILURE(status)) { return 0; } //把母串转换为Unicode UChar* result = new UChar[LEN + 1]; memset(result,LEN+1); cv = ucnv_open("utf-8",&status); ucnv_toUChars(cv,LEN+1,sour,LEN,&status); UnicodeString inputString(result); //剔除字符串中的一些序列 UnicodeString unEscapedInput = inputString.unescape(); //创建一个正则表达式匹配器 REMatcher = REPattern->matcher(unEscapedInput,status); if (U_FAILURE(status)) { return 0; } if (U_FAILURE(status)) { return 0; } int64_t 
pos = 0; UnicodeString str = REMatcher->group(status); while(pos < LEN) { if(REMatcher->find(pos,status)) { num++; pos += REMatcher->end64(status); } else { break; } } return num; } (编辑:李大同) 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时与联系站长删除相关内容! |
相关内容