-
敏感词过滤
2018-09-11 10:12:40敏感词过滤 -
网站敏感词过滤的实现(附敏感词库)
2017-07-10 17:50:05
现在基本上所有的网站都需要设置敏感词过滤,似乎已经成了一个网站的标配,如果你的网站没有,或者你没有做相应的处理,那么小心相关部门请你喝茶哦。
最近在调研Java web网站的敏感词过滤的实现,网上找了相关资料,经过我的验证,把我的调研结果写出来,供大家参考。

一、敏感词过滤工具类
把敏感词词库内容加载到ArrayList集合中,通过双层循环,查找与敏感词列表相匹配的字符串,如果找到以*号替换,最终得到替换后的字符串。
此种方式匹配度较高,匹配速度良好。
初始化敏感词库:
//初始化敏感词库 public void InitializationWork() { replaceAll = new StringBuilder(replceSize); for(int x=0;x < replceSize;x++) { replaceAll.append(replceStr); } //加载词库 arrayList = new ArrayList<String>(); InputStreamReader read = null; BufferedReader bufferedReader = null; try { read = new InputStreamReader(SensitiveWord.class.getClassLoader().getResourceAsStream(fileName),encoding); bufferedReader = new BufferedReader(read); for(String txt = null;(txt = bufferedReader.readLine()) != null;){ if(!arrayList.contains(txt)) arrayList.add(txt); } } catch (UnsupportedEncodingException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); }finally{ try { if(null != bufferedReader) bufferedReader.close(); } catch (IOException e) { e.printStackTrace(); } try { if(null != read) read.close(); } catch (IOException e) { e.printStackTrace(); } } }
过滤敏感词信息:
public String filterInfo(String str) { sensitiveWordSet = new HashSet<String>(); sensitiveWordList= new ArrayList<>(); StringBuilder buffer = new StringBuilder(str); HashMap<Integer, Integer> hash = new HashMap<Integer, Integer>(arrayList.size()); String temp; for(int x = 0; x < arrayList.size();x++) { temp = arrayList.get(x); int findIndexSize = 0; for(int start = -1;(start=buffer.indexOf(temp,findIndexSize)) > -1;) { //System.out.println("###replace="+temp); findIndexSize = start+temp.length();//从已找到的后面开始找 Integer mapStart = hash.get(start);//起始位置 if(mapStart == null || (mapStart != null && findIndexSize > mapStart))//满足1个,即可更新map { hash.put(start, findIndexSize); //System.out.println("###敏感词:"+buffer.substring(start, findIndexSize)); } } } Collection<Integer> values = hash.keySet(); for(Integer startIndex : values) { Integer endIndex = hash.get(startIndex); //获取敏感词,并加入列表,用来统计数量 String sensitive = buffer.substring(startIndex, endIndex); //System.out.println("###敏感词:"+sensitive); if (!sensitive.contains("*")) {//添加敏感词到集合 sensitiveWordSet.add(sensitive); sensitiveWordList.add(sensitive); } buffer.replace(startIndex, endIndex, replaceAll.substring(0,endIndex-startIndex)); } hash.clear(); return buffer.toString(); }
下载地址:SensitiveWord
链接: https://pan.baidu.com/s/12RcZ8-jNHMAR__VscRUDfQ 密码: qmcw (如果失效,请使用文末地址下载)

二、Java关键词过滤
这个方式采用的是正则表达式匹配,速度上比第一种稍慢,匹配度良好。
主要代码:
// 从words.properties初始化正则表达式字符串 private static void initPattern() { StringBuffer patternBuffer = new StringBuffer(); try { //words.properties InputStream in = KeyWordFilter.class.getClassLoader().getResourceAsStream("keywords.properties"); Properties property = new Properties(); property.load(in); Enumeration<?> enu = property.propertyNames(); patternBuffer.append("("); while (enu.hasMoreElements()) { String scontent = (String) enu.nextElement(); patternBuffer.append(scontent + "|"); //System.out.println(scontent); keywordsCount ++; } patternBuffer.deleteCharAt(patternBuffer.length() - 1); patternBuffer.append(")"); //System.out.println(patternBuffer); // unix换成UTF-8 // pattern = Pattern.compile(new // String(patternBuf.toString().getBytes("ISO-8859-1"), "UTF-8")); // win下换成gb2312 // pattern = Pattern.compile(new String(patternBuf.toString() // .getBytes("ISO-8859-1"), "gb2312")); // 装换编码 pattern = Pattern.compile(patternBuffer.toString()); } catch (IOException ioEx) { ioEx.printStackTrace(); } } private static String doFilter(String str) { Matcher m = pattern.matcher(str); // while (m.find()) {// 查找符合pattern的字符串 // System.out.println("The result is here :" + m.group()); // } // 选择替换方式,这里以* 号代替 str = m.replaceAll("*"); return str; }
下载地址:KeyWordFilter
链接: http://pan.baidu.com/s/1kVBl803 密码: xi24 (如果失效,请使用文末地址下载)

三、DFA算法进行过滤
这种方式听起来高大上,采用DFA算法,这个算法个人不太懂,经测试发现,匹配度不行,速度良好。或许可以改良,还请大神进行改良。
主要有两个文件:SensitivewordFilter.java 和 SensitiveWordInit.java
主要代码:
public int CheckSensitiveWord(String txt,int beginIndex,int matchType){ boolean flag = false; //敏感词结束标识位:用于敏感词只有1位的情况 int matchFlag = 0; //匹配标识数默认为0 char word = 0; Map nowMap = sensitiveWordMap; for(int i = beginIndex; i < txt.length() ; i++){ word = txt.charAt(i); nowMap = (Map) nowMap.get(word); //获取指定key if(nowMap != null){ //存在,则判断是否为最后一个 matchFlag++; //找到相应key,匹配标识+1 if("1".equals(nowMap.get("isEnd"))){ //如果为最后一个匹配规则,结束循环,返回匹配标识数 flag = true; //结束标志位为true if(SensitivewordFilter.minMatchTYpe == matchType){ //最小规则,直接返回,最大规则还需继续查找 break; } } } else{ //不存在,直接返回 break; } } if(matchFlag < 2 || !flag){ //长度必须大于等于1,为词 matchFlag = 0; } return matchFlag; }
下载地址:SensitivewordFilter
链接: http://pan.baidu.com/s/1ccsa66 密码: mc1x (如果失效,请使用文末地址下载)

四、多叉树查找算法
这个方式采用了多叉树查找算法,至于这个算法是怎么回事,大家可以去查看数据结构相关内容。提供了jar包,直接调用进行过滤。
经测试,这个方法匹配度良好,速度稍慢。
调用方式:
//敏感词过滤 FilteredResult result = WordFilterUtil.filterText(str, '*'); //获取过滤后的内容 System.out.println("替换后的字符串为:\n"+result.getFilteredContent()); //获取原始字符串 System.out.println("原始字符串为:\n"+result.getOriginalContent()); //获取替换的敏感词 System.out.println("替换的敏感词为:\n"+result.getBadWords());
下载地址:WordFilterUtil
链接: http://pan.baidu.com/s/1nvftzeD 密码: 5t2h (如果失效,请使用文末地址下载)

以上就是我的调研结果,希望对大家有所帮助。
最后,附上大量敏感词库下载地址:
最新下载地址: https://t00y.com/file/1764647-442914556
参考了以下文章:
其他
- 个人博客:http://www.sendtion.cn
- CSDN:http://blog.csdn.net/shuyou612
- GitHub:https://github.com/sendtion
-
java敏感词过滤_java敏感词过滤
2021-03-05 15:48:32java敏感词过滤敏感词:“美元”,“中国”,“北京大学”,“北大”,“南京大学”DFAUtils`import java.util.HashMap;import java.util.LinkedList;import java.util.Map;public class DFAUtils {/*** 添加敏感词到...java敏感词过滤
敏感词:“美元”,“中国”,“北京大学”,“北大”,“南京大学”
DFAUtils
import java.util.HashMap;
import java.util.LinkedList;
import java.util.Map;
/**
 * Process-wide sensitive-word dictionary stored as a character trie.
 *
 * <p>Every public method takes the same lock, so words can be added, removed
 * and looked up from several threads. Words are compared per UTF-16 code
 * unit.
 */
public class DFAUtils {

    /** Lowest code unit this class treats as a Chinese character (U+3400). */
    private static final char CJK_FIRST = 13312;

    /** Highest code unit this class treats as a Chinese character (U+9FBF). */
    private static final char CJK_LAST = 40895;

    private final static Object lock = new Object();

    /** Root of the trie; replaced as a whole by {@link #clearSensitiveWord()}. */
    private static Node root = new Node();

    /** One trie node: outgoing edges plus an "a word ends here" flag. */
    private static final class Node {
        private final Map<Character, Node> children = new HashMap<>();
        private boolean terminal;
    }

    /**
     * Adds a word to the trie. Adding a word that is already present has no
     * effect.
     *
     * @param sensitiveWord the word to add; {@code null} or empty is ignored
     */
    public static void addSensitiveWord(String sensitiveWord) {
        if (null == sensitiveWord || sensitiveWord.length() == 0) {
            return;
        }
        synchronized (lock) {
            Node node = root;
            for (int i = 0; i < sensitiveWord.length(); i++) {
                Character key = sensitiveWord.charAt(i);
                Node child = node.children.get(key);
                if (child == null) {
                    child = new Node();
                    node.children.put(key, child);
                }
                node = child;
            }
            node.terminal = true;
        }
    }

    /**
     * Scans {@code content} from left to right and returns the first
     * sensitive word found.
     *
     * <p>At each start position a one-character word is reported only when it
     * stands alone, i.e. neither neighbour is a Chinese character (the text
     * boundary counts as "not Chinese"). Otherwise the shortest word of two
     * or more characters beginning at that position is reported.
     *
     * @param content text to scan
     * @return the matched word, or {@code null} when nothing matches, the
     *         content is {@code null}/empty, or no words are registered
     */
    public static String checkSensitiveWord(String content) {
        if (null == content || content.length() == 0) {
            return null;
        }
        synchronized (lock) {
            final int length = content.length();
            for (int start = 0; start < length; start++) {
                Node node = root.children.get(content.charAt(start));
                if (node == null) {
                    continue;
                }
                if (node.terminal && isStandalone(content, start)) {
                    return content.substring(start, start + 1);
                }
                // Try to extend to a longer word starting at the same position.
                for (int end = start + 1; end < length; end++) {
                    node = node.children.get(content.charAt(end));
                    if (node == null) {
                        break;
                    }
                    if (node.terminal) {
                        return content.substring(start, end + 1);
                    }
                }
            }
            return null;
        }
    }

    /**
     * Removes a word from the trie and prunes the nodes that no longer lead
     * to any word. Words that merely share a prefix with the removed word,
     * or extend it, are kept.
     *
     * @param sensitiveWord the word to remove; {@code null}, empty or unknown
     *                      words are ignored
     */
    public static void delSensitiveWord(String sensitiveWord) {
        if (null == sensitiveWord || sensitiveWord.length() == 0) {
            return;
        }
        synchronized (lock) {
            final int length = sensitiveWord.length();
            // path[i] is the node reached after consuming i characters.
            Node[] path = new Node[length + 1];
            path[0] = root;
            for (int i = 0; i < length; i++) {
                Node next = path[i].children.get(sensitiveWord.charAt(i));
                if (next == null) {
                    return; // the word is not stored
                }
                path[i + 1] = next;
            }
            if (!path[length].terminal) {
                return; // only a prefix of some longer word
            }
            path[length].terminal = false;
            // Walk back up, dropping nodes that are now dead ends.
            for (int i = length; i > 0; i--) {
                Node node = path[i];
                if (node.terminal || !node.children.isEmpty()) {
                    break;
                }
                path[i - 1].children.remove(sensitiveWord.charAt(i - 1));
            }
        }
    }

    /**
     * Lists every word currently stored. A word is listed before the longer
     * words that extend it; the order is otherwise unspecified.
     *
     * @return a new list owned by the caller; empty when no words are stored
     */
    public static LinkedList<String> getSevsitiveWords() {
        LinkedList<String> listWords = new LinkedList<>();
        synchronized (lock) {
            collectWords(root, new StringBuilder(), listWords);
        }
        return listWords;
    }

    /**
     * Tells whether exactly this word is stored in the trie.
     *
     * @param sensitiveWord the word to look up
     * @return {@code true} only if the word itself was added and not removed
     */
    public static boolean containSensitiveWord(String sensitiveWord) {
        if (null == sensitiveWord || sensitiveWord.length() == 0) {
            return false;
        }
        synchronized (lock) {
            // Exact lookup: going through checkSensitiveWord would stop at the
            // first (shortest) hit and miss "ab" whenever "a" is stored too.
            Node node = root;
            for (int i = 0; i < sensitiveWord.length(); i++) {
                node = node.children.get(sensitiveWord.charAt(i));
                if (node == null) {
                    return false;
                }
            }
            return node.terminal;
        }
    }

    /** Removes every word. */
    public static void clearSensitiveWord() {
        synchronized (lock) {
            root = new Node();
        }
    }

    /** Depth-first walk that appends each stored word below {@code node} to {@code listWords}. */
    private static void collectWords(Node node, StringBuilder prefix, LinkedList<String> listWords) {
        for (Map.Entry<Character, Node> edge : node.children.entrySet()) {
            prefix.append(edge.getKey().charValue());
            Node child = edge.getValue();
            if (child.terminal) {
                listWords.add(prefix.toString());
            }
            collectWords(child, prefix, listWords);
            prefix.setLength(prefix.length() - 1);
        }
    }

    /** True when neither neighbour of {@code content[index]} is a Chinese character. */
    private static boolean isStandalone(String content, int index) {
        boolean before = index == 0 || !isChinese(content.charAt(index - 1));
        boolean after = index + 1 >= content.length() || !isChinese(content.charAt(index + 1));
        return before && after;
    }

    /** True when {@code c} lies in the inclusive range U+3400..U+9FBF. */
    private static boolean isChinese(char c) {
        return c >= CJK_FIRST && c <= CJK_LAST;
    }
}
DFAUtilsTest
import org.junit.Assert;
import org.junit.Test;
import java.util.LinkedList;
/**
 * JUnit tests for {@code DFAUtils}.
 *
 * <p>{@code DFAUtils} keeps its dictionary in static state, so every test
 * starts by clearing it.
 */
public class DFAUtilsTest {

    /** Asserts how many words the dictionary currently reports. */
    private static void assertWordCount(int expected) {
        Assert.assertEquals(expected, DFAUtils.getSevsitiveWords().size());
    }

    /*==========================AddSensitiveWord-start==========================*/
    @Test
    public void testAddSensitiveWord01() {
        DFAUtils.clearSensitiveWord();
        DFAUtils.addSensitiveWord("中");
        assertWordCount(1);
        DFAUtils.addSensitiveWord("中哈");
        assertWordCount(2);
        // Re-adding existing words must not create duplicates.
        DFAUtils.addSensitiveWord("中");
        assertWordCount(2);
        DFAUtils.addSensitiveWord("中哈");
        assertWordCount(2);
        // Removing a prefix word keeps the longer word.
        DFAUtils.delSensitiveWord("中");
        assertWordCount(1);
        DFAUtils.addSensitiveWord("中中");
        assertWordCount(2);
        DFAUtils.addSensitiveWord("中中中");
        assertWordCount(3);
        DFAUtils.addSensitiveWord("人");
        assertWordCount(4);
        DFAUtils.addSensitiveWord("中");
        assertWordCount(5);
    }
    /*==========================AddSensitiveWord-end============================*/

    /*==========================CheckSensitiveWord-start==========================*/
    @Test
    public void testCheckSensitiveWord01() {
        DFAUtils.clearSensitiveWord();
        DFAUtils.addSensitiveWord("大");
        DFAUtils.addSensitiveWord("大学");
        DFAUtils.addSensitiveWord("中中中国中中中");
        // The assertions below need these words to be registered; the
        // published fixture list did not contain them.
        DFAUtils.addSensitiveWord("滚");
        DFAUtils.addSensitiveWord("滚滚");
        DFAUtils.addSensitiveWord("操");
        assertWordCount(6);

        // A one-character word matches only when no Chinese character is
        // directly before or after it.
        Assert.assertEquals("滚", DFAUtils.checkSensitiveWord("滚"));
        Assert.assertNull(DFAUtils.checkSensitiveWord("翻滚"));
        Assert.assertEquals("滚", DFAUtils.checkSensitiveWord("滚 "));
        Assert.assertEquals("滚", DFAUtils.checkSensitiveWord(" 滚"));
        Assert.assertNull(DFAUtils.checkSensitiveWord("体操"));
        // Multi-character words match regardless of their neighbours.
        Assert.assertEquals("滚滚", DFAUtils.checkSensitiveWord("你好滚滚"));
        Assert.assertNull(DFAUtils.checkSensitiveWord("滚你好滚"));
        Assert.assertNull(DFAUtils.checkSensitiveWord("滚轮胎"));
        Assert.assertNull(DFAUtils.checkSensitiveWord("你你国国"));
        // Partial prefixes of the long word must not match; the full word does.
        Assert.assertEquals("中中中国中中中",
                DFAUtils.checkSensitiveWord("中中国中中 中中中中国中中中"));
    }
    /*==========================CheckSensitiveWord-end============================*/

    /*==========================DelSensitiveWord-start==========================*/
    @Test
    public void testDelSensitiveWord01() {
        DFAUtils.clearSensitiveWord();
        DFAUtils.addSensitiveWord("中");
        assertWordCount(1);
        DFAUtils.delSensitiveWord("");
        assertWordCount(1);
        DFAUtils.delSensitiveWord("国");
        assertWordCount(1);
        DFAUtils.delSensitiveWord("中");
        assertWordCount(0);
    }

    @Test
    public void testDelSensitiveWord02() {
        DFAUtils.clearSensitiveWord();
        DFAUtils.addSensitiveWord("中中");
        assertWordCount(1);
        DFAUtils.delSensitiveWord("");
        assertWordCount(1);
        // Neither a prefix nor an extension of a stored word removes it.
        DFAUtils.delSensitiveWord("中");
        assertWordCount(1);
        DFAUtils.delSensitiveWord("中中中");
        assertWordCount(1);
        DFAUtils.delSensitiveWord("中中");
        assertWordCount(0);
    }

    @Test
    public void testDelSensitiveWord03() {
        DFAUtils.clearSensitiveWord();
        DFAUtils.addSensitiveWord("中中");
        DFAUtils.addSensitiveWord("中");
        assertWordCount(2);
        DFAUtils.delSensitiveWord("");
        assertWordCount(2);
        DFAUtils.delSensitiveWord("中中中");
        assertWordCount(2);
        // Surrounding whitespace is significant.
        DFAUtils.delSensitiveWord(" 中中");
        assertWordCount(2);
        DFAUtils.delSensitiveWord("中中 ");
        assertWordCount(2);
        DFAUtils.delSensitiveWord("中");
        assertWordCount(1);
        DFAUtils.delSensitiveWord("中中");
        assertWordCount(0);
        // The dictionary stays usable after it has been emptied by deletes.
        DFAUtils.addSensitiveWord("中");
        assertWordCount(1);
        DFAUtils.delSensitiveWord("中中中");
        assertWordCount(1);
        DFAUtils.delSensitiveWord("中中");
        assertWordCount(1);
        DFAUtils.delSensitiveWord("中");
        assertWordCount(0);
    }

    @Test
    public void testDelSensitiveWord04() {
        DFAUtils.clearSensitiveWord();
        DFAUtils.addSensitiveWord("中中中111");
        DFAUtils.addSensitiveWord("中中");
        DFAUtils.addSensitiveWord("中");
        assertWordCount(3);
        DFAUtils.delSensitiveWord("");
        assertWordCount(3);
        // Deleting the longest word keeps the shorter words on its path.
        DFAUtils.delSensitiveWord("中中中111");
        assertWordCount(2);
        DFAUtils.addSensitiveWord("中中中111");
        // "中中中" is only a path prefix, not a stored word.
        DFAUtils.delSensitiveWord("中中中");
        assertWordCount(3);
        DFAUtils.delSensitiveWord("中中");
        assertWordCount(2);
        DFAUtils.addSensitiveWord("中中 ");
        assertWordCount(3);
        DFAUtils.delSensitiveWord("中");
        assertWordCount(2);
        // "中中" was already removed; "中中 " must survive.
        DFAUtils.delSensitiveWord("中中");
        assertWordCount(2);
    }
    /*==========================DelSensitiveWord-end============================*/

    /*==========================ContainSensitiveWord-start==========================*/
    @Test
    public void testContainSensitiveWord01() {
        DFAUtils.clearSensitiveWord();
        DFAUtils.addSensitiveWord("滚");
        DFAUtils.addSensitiveWord("中中中国中中中");
        // Needed by the positive lookup below; missing from the published fixture list.
        DFAUtils.addSensitiveWord("操");
        assertWordCount(3);
        Assert.assertFalse(DFAUtils.containSensitiveWord(" "));
        Assert.assertTrue(DFAUtils.containSensitiveWord("操"));
    }
    /*==========================ContainSensitiveWord-end============================*/
}