使用ICU库中正则表达式匹配关键字示例

发布时间：2020-12-14 01:55:57　所属栏目：百科　来源：网络整理

导读：本示例先用 ICU 的字符集探测器（ucsdet）识别缓冲区的编码，再用转换器（ucnv）把内容转换为 Unicode，最后统计关键字出现的次数。代码开头部分为：#include <iostream>、#include "unicode/unistr.h"、#include <unicode/ucsdet.h>、#include <unicode/ucnv.h>、#include <string.h>，随后根据 _DEBUG、linux、_WIN64 等宏，用 #pragma comment(lib, "icuin64d.lib")、#pragma comment(lib, "icuuc64d.lib") 等指令选择要链接的 ICU 库（完整代码见下文）。

#include <string.h>

#include <iostream>

#include <unicode/ucnv.h>
#include <unicode/ucsdet.h>
#include <unicode/unistr.h>

// On non-Linux builds, use MSVC-style `#pragma comment(lib, ...)` to link the
// ICU import libraries that match the build flavour: the "d"-suffixed names
// for _DEBUG builds, and the 64- or 32-bit names depending on _WIN64.
// Both `linux` and `__linux__` are tested because strict ISO modes of GCC and
// Clang only define the double-underscore spelling.
#if !defined(linux) && !defined(__linux__)
	#ifdef _DEBUG
		#ifdef _WIN64
		#pragma comment(lib,"icuin64d.lib")
		#pragma comment(lib,"icuuc64d.lib")
		#else
		#pragma comment(lib,"icuin32d.lib")
		#pragma comment(lib,"icuuc32d.lib")
		#endif
	#else
		#ifdef _WIN64
		#pragma comment(lib,"icuin64.lib")
		#pragma comment(lib,"icuuc64.lib")
		#else
		#pragma comment(lib,"icuin32.lib")
		#pragma comment(lib,"icuuc32.lib")
		#endif
	#endif
#endif

// Working-buffer size, in bytes, for code that converts input in chunks.
// Left as a mutable global so existing code that reads or tunes it keeps working.
int32_t BUFFSIZE = 8;

/**
 * Counts occurrences of USubStr inside USrcStr, scanning left to right.
 *
 * After each hit the search resumes `index` UTF-16 code units past the start
 * of that hit. Passing USubStr.length() therefore counts non-overlapping
 * matches, while passing 1 counts overlapping ones.
 *
 * @param USrcStr text to search in.
 * @param USubStr substring to look for.
 * @param index   distance to advance after each match. Values below 1 are
 *                treated as 1, because a zero or negative step would find the
 *                same match again and never terminate.
 * @return the number of matches found.
 */
int FindSubNum(UnicodeString USrcStr,UnicodeString USubStr,int index)
{
	const int32_t step = (index < 1) ? 1 : index;
	const int32_t srcLen = USrcStr.length();

	int32_t num = 0;
	int32_t pos = USrcStr.indexOf(USubStr);
	while(pos != -1)
	{
		++num;
		// Stop when the next start would fall past the end of the text; this
		// also keeps `pos + step` from overflowing for very large steps.
		if(step > srcLen - pos)
		{
			break;
		}
		pos = USrcStr.indexOf(USubStr,pos + step);
	}
	return num;
}

int FindSubFromBuf(char* buff,int buflen,UnicodeString liststring)
{
	if((buff == NULL) || liststring.isEmpty() || buflen == 0)
	{
		return 0;
	}
	UCharsetDetector* dector = NULL;
	UErrorCode status = U_ZERO_ERROR;
	UConverter *conv = NULL;
	const UCharsetMatch *csm = NULL;
	int num = 0;
	//先检测buff里的字符编码格式
	dector = ucsdet_open(&status);
	if(status != U_ZERO_ERROR)
	{
		//std::cout<<"open charset detector failed!n";
		ucsdet_close(dector);
		return 0;
	}
	ucsdet_setText(dector,buff,buflen,&status);
	if(status != U_ZERO_ERROR)
	{
		//std::cout<<"set fail!n";
		ucsdet_close(dector);
		return 0;
	}
	csm = ucsdet_detect(dector,&status);
	const char* detected = ucsdet_getName(csm,&status);
	ucsdet_close(dector);
	//然后转化为Unicode编码进行比较
	UChar *target = NULL;        //指向存储转换后的字符串的结尾
	UChar *targetLimit = NULL;   //指向缓冲区尾部的指针
	const char *source = NULL;        //指向源代码页缓冲区
	const char *sourceLimit = NULL;   //指向缓冲区的尾部的字节
	int32_t *offset = NULL;      //表示什么也不做*/
	int32_t numread = 0;  //实际读了多少字节
	int32_t buffsize = 0;
	conv = ucnv_open(detected,&status);
	if(status != U_ZERO_ERROR)
	{
		std::cout<<"open converter failed!n";
		ucnv_close(conv);
		return 0;
	}
	buffsize = BUFFSIZE/ucnv_getMinCharSize(conv);
	char* read = buff;
	char* inbuf = new char[BUFFSIZE*sizeof(char) +1];
	UChar* uBuf = new UChar[BUFFSIZE*sizeof(UChar) + 2];
	memset(inbuf,BUFFSIZE*sizeof(char) + 1);
	memset(uBuf,BUFFSIZE*sizeof(UChar) + 2);
	UnicodeString readbuff= UnicodeString("");
	UnicodeString tempUStr = UnicodeString("");
	while((read-buff)<buflen)
	{
		memcpy(inbuf,read,BUFFSIZE);
		int fin_len = buflen-(read-buff);
		if (fin_len > BUFFSIZE)
		{
			fin_len = BUFFSIZE;
		}
		read = read + fin_len;
		numread = strlen(inbuf);
	/*	UnicodeString readbuff;*/

		source = inbuf;
		sourceLimit = inbuf + numread;
		do
		{
			target = uBuf;
			targetLimit = uBuf + buffsize;//分食
			ucnv_toUnicode(conv,&target,targetLimit,&source,sourceLimit,NULL,(read-buff == buflen)?true:false,&status);
			if(status != U_ZERO_ERROR)
			{
				//std::cout<<"Convert fail!n";
				if (uBuf)
				{
					delete [] uBuf;
					uBuf = NULL;
				}
				if (inbuf)
				{
					delete []inbuf;
					inbuf = NULL;
				}
				ucnv_close(conv);
				return 0;
			}
		}
		while(source < sourceLimit);
		//用uBuf初始化UnicString对象
		readbuff = UnicodeString(uBuf);
		int32_t readbuflen = readbuff.length();
		//第一次先给midUStr赋空，防止在进行
		int32_t plen = readbuff.length();
		UnicodeString temp = tempUStr + readbuff; 
		int len = liststring.length();
		num += FindSubNum(temp,liststring,len);
		//每保存一段内存块的回退字节就清空一次便于下一次继续存放
		//保留本次的后 len-1个字符
		tempUStr = temp.tempSubString(temp.length() - (len-1),len);
		memset(inbuf,BUFFSIZE+1);
		memset(uBuf,BUFFSIZE*sizeof(UChar)+2);
	}
	if (uBuf)
	{
		delete [] uBuf;
		uBuf = NULL;
	}
	if (inbuf)
	{
		delete []inbuf;
		inbuf = NULL;
	}
	ucnv_close(conv);
	return num;
}

（编辑：李大同）

【声明】本站内容均来自网络，其相关言论仅代表作者个人观点，不代表本站立场。若无意中侵犯了您的权利，请及时联系站长删除相关内容！