敏感词检测算法
思路:DFA算法
確定性有窮自動機(DFA),常用於正則表達式的匹配;此處採用最長左子式匹配
/*** 檢測敏感詞** @param scriptText* @param matchType* @return*/public static Set<String> checkSensitiveWord(String scriptText, int matchType) {Set<String> sensitiveWordSet = new HashSet<>();for (int i = 0; i < scriptText.length(); i++) {int length = testSensitiveWord(scriptText, i, matchType, sensitiveWordMap);if (length > 0) {sensitiveWordSet.add(scriptText.substring(i, i + length));i = i + length - 1;}}return sensitiveWordSet;}構建敏感詞map
public static void initSensitiveWordMap(List<WordSenstive> wordSenstives) {log.info("開始初始化敏感詞map");List<String> collect = wordSenstives.stream().map(a -> a.getSenstiveWord()).collect(Collectors.toList());Set<String> keyWordSet = new HashSet<String>(collect);Map<String, String> newWorMap = null;String key = null;Map nowMap = null;sensitiveWordMap = new HashMap(keyWordSet.size());Iterator<String> iterator = keyWordSet.iterator();while (iterator.hasNext()) {key = iterator.next();if (key == null) {continue;}nowMap = sensitiveWordMap;for (int i = 0; i < key.length(); i++) {char keyChar = key.charAt(i);Object wordMap = nowMap.get(keyChar);if (wordMap != null) {nowMap = (Map) wordMap;} else {newWorMap = new HashMap<String, String>();newWorMap.put("isEnd", "0");nowMap.put(keyChar, newWorMap);nowMap = newWorMap;}if (i == key.length() - 1) {nowMap.put("deepCount", i + 1 + "");nowMap.put("isEnd", "1");}}}log.info("敏感詞map構建完成");}匹配敏感詞
private static int testSensitiveWord(String scriptText, int index, int matchType, Map sensitiveWordMap) {boolean flag = false;int matchFlag = 0;char word = 0;Map nowMap = sensitiveWordMap;for (int i = index; i < scriptText.length(); i++) {word = scriptText.charAt(i);nowMap = (Map) nowMap.get(word);if (nowMap != null) {matchFlag++;//找到相應的key,匹配標識+1if ("1".equals(nowMap.get("isEnd"))) {Integer deepCount = Integer.valueOf((String) nowMap.get("deepCount"));flag = isWord(scriptText, i, deepCount);if (1 == matchType || flag) {//1:最小匹配,2:全匹配break;}}} else {break;}}if (matchFlag < 2 || !flag) {matchFlag = 0;}return matchFlag;}匹配是否是單詞
private static boolean isWord(String scriptText, int i, int deepCount) {boolean isWord = true;if (i - deepCount >= 0 && scriptText.charAt(i - deepCount) > 96 && scriptText.charAt(i - deepCount) < 123) {isWord = false;}return isWord;}總結
- 上一篇: 双缓冲 android,Android
- 下一篇: python 字符串format使用