加入收藏 | 设为首页 | 会员中心 | 我要投稿 李大同 (https://www.lidatong.com.cn/)- 科技、建站、经验、云计算、5G、大数据,站长网!
当前位置: 首页 > 编程开发 > asp.Net > 正文

敏感词汇过滤DFA算法

发布时间:2020-12-15 21:26:43 所属栏目:asp.Net 来源:网络整理
导读:本文给出敏感词过滤 DFA 算法的 C# 实现:从词库文件 dic.txt 读取敏感词,构建以嵌套 Dictionary 表示的 DFA(字典树)模型,并提供敏感词的检测、提取与替换功能,文末附控制台调用示例。

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace SensitiveWordFilter
{
/// <summary>
/// Sensitive-word filter built on a DFA (character trie) model.
/// <see cref="SensitiveWordInit"/> builds the trie; <see cref="SensitivewordFilter"/>
/// walks it to detect, extract, and mask dictionary words inside arbitrary text.
/// </summary>
public class SensitiveWord
{
    /// <summary>
    /// Reserved trie key. Every non-root node stores its end-of-word marker under this
    /// key, so the character itself can never be part of a dictionary word.
    /// </summary>
    private static readonly char IsEndChar = '$';

    // Marker values stored under IsEndChar.
    private const string EndMarker = "1";
    private const string NotEndMarker = "0";

    // A match must consume at least this many dictionary characters to be reported;
    // single-character entries are therefore never reported (same rule as before).
    private const int MinWordLength = 2;

    /// <summary>
    /// Loads the sensitive-word list and builds the DFA model from it.
    /// Original author: dxm.
    /// </summary>
    public class SensitiveWordInit
    {
        // Character encoding of the dictionary file.
        private static readonly String ENCODING = "UTF-8";

        // Dictionary file name; a relative path, so it is resolved against the
        // current working directory of the process.
        private static readonly String DictionaryPath = "dic.txt";

        /// <summary>
        /// Builds the DFA model from the dictionary file (one word per line).
        /// </summary>
        /// <returns>The root node of the trie.</returns>
        public Dictionary<char, object> initKeyWord()
        {
            return addSensitiveWordToHashMap(readSensitiveWordFile());
        }

        /// <summary>
        /// Builds the DFA model from an in-memory word list instead of the dictionary file.
        /// </summary>
        /// <param name="words">Words to register; null and empty entries are ignored.</param>
        /// <returns>The root node of the trie.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="words"/> is null.</exception>
        public Dictionary<char, object> initKeyWord(IEnumerable<String> words)
        {
            if (words == null)
            {
                throw new ArgumentNullException("words");
            }

            return addSensitiveWordToHashMap(words);
        }

        /// <summary>
        /// Inserts every word into a trie of nested dictionaries keyed by character.
        /// Each non-root node carries a marker under the '$' key: "1" when a word ends
        /// at that node, "0" otherwise. For the words "ab" and "abcd" the result is:
        /// <code>
        /// a = {
        ///       $ = "0"
        ///       b = {
        ///             $ = "1"
        ///             c = {
        ///                   $ = "0"
        ///                   d = {
        ///                         $ = "1"
        ///                   }
        ///             }
        ///       }
        /// }
        /// </code>
        /// </summary>
        /// <param name="words">Words to insert.</param>
        /// <returns>The root node of the trie (the root itself has no marker).</returns>
        private Dictionary<char, object> addSensitiveWordToHashMap(IEnumerable<String> words)
        {
            Dictionary<char, object> root = new Dictionary<char, object>();

            foreach (String rawWord in words)
            {
                if (rawWord == null)
                {
                    continue;
                }

                // Remove the reserved character before inserting, so the end marker is
                // always written on the last real character. Skipping it inside the
                // insertion loop left words such as "ab$" without an end marker.
                String word = rawWord.Replace(IsEndChar.ToString(), string.Empty);
                if (word.Length == 0)
                {
                    continue;
                }

                Dictionary<char, object> node = root;
                foreach (char key in word)
                {
                    object child;
                    if (!node.TryGetValue(key, out child))
                    {
                        // New branch: starts out as "not a word end".
                        Dictionary<char, object> created = new Dictionary<char, object>();
                        created.Add(IsEndChar, NotEndMarker);
                        node.Add(key, created);
                        child = created;
                    }

                    node = (Dictionary<char, object>)child;
                }

                // The node reached by the last character completes a word.
                node[IsEndChar] = EndMarker;
            }

            return root;
        }

        /// <summary>
        /// Reads the dictionary file line by line into a set. Lines are trimmed and
        /// blank lines are skipped.
        /// </summary>
        /// <returns>The distinct words found in the file.</returns>
        private HashSet<String> readSensitiveWordFile()
        {
            HashSet<String> wordSet = new HashSet<String>();

            foreach (string line in File.ReadLines(DictionaryPath, Encoding.GetEncoding(ENCODING)))
            {
                string word = line.Trim();
                if (word.Length > 0)
                {
                    wordSet.Add(word);
                }
            }

            return wordSet;
        }
    }

    /// <summary>
    /// Detects, extracts, and masks sensitive words using the DFA model.
    /// </summary>
    public class SensitivewordFilter
    {
        // Root of the trie built by SensitiveWordInit.
        private Dictionary<char, object> sensitiveWordMap = null;

        // Minimum-match rule (see CheckSensitiveWord for the exact behaviour).
        public static int minMatchTYpe = 1;

        // Maximum-match rule: the longest dictionary word wins.
        public static int maxMatchType = 2;

        // Lazily created shared instance returned by getInstance().
        private static SensitivewordFilter inst = null;

        /// <summary>
        /// Creates the filter from the dictionary file; used by <see cref="getInstance"/>.
        /// </summary>
        private SensitivewordFilter()
        {
            sensitiveWordMap = new SensitiveWordInit().initKeyWord();
        }

        /// <summary>
        /// Creates a filter from an in-memory word list instead of the dictionary file.
        /// </summary>
        /// <param name="words">Words to register; null and empty entries are ignored.</param>
        /// <exception cref="ArgumentNullException"><paramref name="words"/> is null.</exception>
        public SensitivewordFilter(IEnumerable<String> words)
        {
            sensitiveWordMap = new SensitiveWordInit().initKeyWord(words);
        }

        /// <summary>
        /// Returns the shared, file-backed instance, creating it on first use.
        /// No locking is performed here, so concurrent first calls may each build an instance.
        /// </summary>
        /// <returns>The shared filter.</returns>
        public static SensitivewordFilter getInstance()
        {
            if (null == inst)
            {
                inst = new SensitivewordFilter();
            }

            return inst;
        }

        /// <summary>
        /// Tells whether the text contains at least one sensitive word.
        /// </summary>
        /// <param name="txt">Text to scan; null or empty yields false.</param>
        /// <param name="matchType">Match rule, see <see cref="CheckSensitiveWord"/>.</param>
        /// <returns>True as soon as any position starts a sensitive word.</returns>
        public bool isContaintSensitiveWord(String txt, int matchType = 1)
        {
            if (string.IsNullOrEmpty(txt))
            {
                return false;
            }

            for (int i = 0; i < txt.Length; i++)
            {
                // The first hit decides the answer; no need to scan the rest.
                if (CheckSensitiveWord(txt, i, matchType) > 0)
                {
                    return true;
                }
            }

            return false;
        }

        /// <summary>
        /// Collects the distinct sensitive words found in the text.
        /// </summary>
        /// <param name="txt">Text to scan; null or empty yields an empty set.</param>
        /// <param name="matchType">Match rule, see <see cref="CheckSensitiveWord"/>.</param>
        /// <returns>The matched substrings, exactly as they appear in the text.</returns>
        public HashSet<String> getSensitiveWord(String txt, int matchType = 1)
        {
            HashSet<String> sensitiveWordList = new HashSet<String>();
            if (string.IsNullOrEmpty(txt))
            {
                return sensitiveWordList;
            }

            for (int i = 0; i < txt.Length; i++)
            {
                int length = CheckSensitiveWord(txt, i, matchType);
                if (length > 0)
                {
                    sensitiveWordList.Add(txt.Substring(i, length));

                    // Minus one because the for statement increments i itself.
                    i = i + length - 1;
                }
            }

            return sensitiveWordList;
        }

        /// <summary>
        /// Replaces every character of every sensitive word with <paramref name="replaceChar"/>.
        /// </summary>
        /// <param name="txt">Text to mask; null or empty is returned unchanged.</param>
        /// <param name="replaceChar">Replacement emitted once per masked character.</param>
        /// <param name="matchType">Match rule, see <see cref="CheckSensitiveWord"/>.</param>
        /// <returns>The masked text.</returns>
        public String replaceSensitiveWord(String txt, String replaceChar, int matchType = 1)
        {
            if (string.IsNullOrEmpty(txt))
            {
                return txt;
            }

            // The output is assembled left to right rather than edited in place, so a
            // replacement longer than one character cannot shift later match positions.
            StringBuilder sb = new StringBuilder(txt.Length);
            int i = 0;
            while (i < txt.Length)
            {
                int length = CheckSensitiveWord(txt, i, matchType);
                if (length > 0)
                {
                    sb.Append(getReplaceChars(replaceChar, length));
                    i += length;
                }
                else
                {
                    sb.Append(txt[i]);
                    i++;
                }
            }

            return sb.ToString();
        }

        /// <summary>
        /// Builds the mask for one match.
        /// </summary>
        /// <param name="replaceChar">Replacement unit.</param>
        /// <param name="length">Number of repetitions.</param>
        /// <returns><paramref name="replaceChar"/> repeated <paramref name="length"/> times.</returns>
        private String getReplaceChars(String replaceChar, int length)
        {
            StringBuilder sb = new StringBuilder();
            for (int i = 0; i < length; i++)
            {
                sb.Append(replaceChar);
            }

            return sb.ToString();
        }

        /// <summary>
        /// Checks whether a sensitive word starts at <paramref name="beginIndex"/>.
        /// <para>
        /// With <see cref="minMatchTYpe"/> the scan stops at the first complete word,
        /// except that when that word is a prefix of a longer entry and the text keeps
        /// matching, it is extended to the next complete word (the behaviour this class
        /// has always had). Any other value is treated as <see cref="maxMatchType"/> and
        /// yields the longest complete word.
        /// </para>
        /// <para>
        /// The reserved '$' character is skipped when it occurs inside a candidate match
        /// and is counted in the returned span; a match never starts on it.
        /// </para>
        /// </summary>
        /// <param name="txt">Text to scan; null or empty yields 0.</param>
        /// <param name="beginIndex">
        /// Position at which the word must start. An index at or past the end yields 0;
        /// a negative index fails on the first character access.
        /// </param>
        /// <param name="matchType">Match rule.</param>
        /// <returns>
        /// The number of text characters covered by the match, usable directly with
        /// <c>txt.Substring(beginIndex, result)</c>, or 0 when no word of at least two
        /// characters starts here.
        /// </returns>
        public int CheckSensitiveWord(String txt, int beginIndex, int matchType)
        {
            if (string.IsNullOrEmpty(txt))
            {
                return 0;
            }

            Dictionary<char, object> node = sensitiveWordMap;
            int matchedChars = 0;   // dictionary characters consumed so far
            int completedWords = 0; // complete words passed on this walk
            int bestSpan = 0;       // text span of the last complete word seen
            int bestChars = 0;      // dictionary characters in that word

            for (int i = beginIndex; i < txt.Length; i++)
            {
                char current = txt[i];

                if (current == IsEndChar)
                {
                    if (matchedChars == 0)
                    {
                        // A match never starts on the reserved character.
                        break;
                    }

                    // Inside a candidate: ignore it. The span below is computed from
                    // the index, so the skipped character is still covered.
                    continue;
                }

                object next;
                if (!node.TryGetValue(current, out next))
                {
                    // Dead end: keep whatever complete word was recorded before it.
                    break;
                }

                node = (Dictionary<char, object>)next;
                matchedChars++;

                object marker;
                if (node.TryGetValue(IsEndChar, out marker) && marker is string && EndMarker == (string)marker)
                {
                    bestSpan = i - beginIndex + 1;
                    bestChars = matchedChars;
                    completedWords++;

                    // Minimum rule: one extension past the first complete word is allowed.
                    if (SensitivewordFilter.minMatchTYpe == matchType && completedWords >= 2)
                    {
                        break;
                    }
                }
            }

            return bestChars >= MinWordLength ? bestSpan : 0;
        }
    }
}

}

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace SensitiveWordFilter
{
/// <summary>
/// Console demo: masks the sensitive words of a sample sentence and prints the
/// text before and after replacement.
/// </summary>
class Program
{
    /// <summary>
    /// Entry point. Obtains the shared filter, replaces each character of every
    /// sensitive word in the sample text with "*", prints both versions, and then
    /// waits for a key press.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        SensitiveWord.SensitivewordFilter filter = SensitiveWord.SensitivewordFilter.getInstance();
        String txt = "$fuckfuck you你麻痹e菜太菜了fuckyou从飞啊 fuck you";
        String hou = filter.replaceSensitiveWord(txt, "*");
        Console.WriteLine("替换前的文字为:" + txt);
        Console.WriteLine("替换后的文字为:" + hou);

        // Keep the console window open until a key is pressed.
        Console.ReadKey();
    }
}
}

(编辑:李大同)

【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时与联系站长删除相关内容!

    推荐文章
      热点阅读