package com.ruoyi.common.utils;
|
|
import com.hankcs.hanlp.HanLP;
|
import com.hankcs.hanlp.seg.common.Term;
|
import org.apache.commons.lang3.StringUtils;
|
|
import java.util.*;
|
import java.util.stream.Collectors;
|
|
import static com.hankcs.hanlp.utility.TextUtility.isChinese;
|
|
/**
|
* 医院信息分词工具类
|
* 使用 HanLP 专业中文分词库进行分词处理
|
*
|
* @author ruoyi
|
* @date 2026-01-20
|
*/
|
public class HospitalTokenizerUtil {
|
|
/**
|
* 停用词集合(需要过滤的常见词汇)
|
* 注意:“区”、“中”等在医院名称中有意义,不应过滤
|
*/
|
private static final Set<String> STOP_WORDS = new HashSet<>(Arrays.asList(
|
"医院", "诊所", "卫生", "镇", "乡",
|
"街道", "路", "号", "栋", "单元", "室", "层", "楼", "的", "了",
|
"在", "与", "和", "及", "等", "之", "于", "为", "有", "无","(",")","(",")","、",",","。","!","?",";",":","“","”","‘","’"
|
));
|
|
/**
|
* 高权重词语(医疗机构特征词)
|
* 注意:地区名不再放在高权重词中,避免分院因包含其他地区名而获得额外加分
|
*/
|
private static final Set<String> HIGH_WEIGHT_WORDS = new HashSet<>(Arrays.asList(
|
"人民", "中医", "中西医", "中西医结合", "医疗", "妇幼", "儿童", "肤科",
|
"口腔", "眼科", "骨科", "整形", "精神", "康复", "急救", "医学院",
|
"医科大学", "专科",
|
"军区", "军医", "中心", "附属", "省立", "市立", "区立", "脑科", "总院", "慈善", "保健院", "口腔", "祈福", "眼科", "铁路", "附一", "附二", "附三", "附四", "附五", "附六",
|
"第一", "第二", "第三", "第四", "第五", "第六", "第七", "第八", "第九", "第十",
|
"肿瘤"
|
));
|
|
/**
|
* 医院名称分词的高频关键词字典(用于强制提取完整医疗相关短语)
|
* 仅包含医疗机构相关词,不包含具体行政地名,避免地区硬编码
|
*/
|
private static final Set<String> HOSPITAL_KEYWORD_DICT = new HashSet<>(Arrays.asList(
|
"中医院", "中医医院", "市医院", "省医院", "人民医院", "中心医院", "口腔医院",
|
"华侨医院", "儿童医院", "眼科中心", "福利院", "门诊部", "中山大学", "附属医院",
|
"孙逸仙","门诊"
|
));
|
|
/** 组合词生成的最小字符长度 */
|
private static final int MIN_COMBINED_LEN = 4;
|
/** 组合词生成的最大字符长度 */
|
private static final int MAX_COMBINED_LEN = 30;
|
/** 组合词生成时包含的最大分词数量(深度) */
|
private static final int MAX_COMBINED_WORDS = 10;
|
|
/**
|
* 对医院信息进行分词(使用 HanLP)
|
*
|
* @param hospName 医院名称
|
* @param hospShort 医院简称
|
* @param province 省份
|
* @param city 城市
|
* @param area 区域
|
* @param address 详细地址
|
* @return 分词结果(逗号分隔的关键词字符串)
|
*/
|
public static String tokenize(String hospName, String hospShort, String province,
|
String city, String area, String address) {
|
Set<String> keywords = new LinkedHashSet<>();
|
|
// 1. 行政区划:只作为独立关键词,不参与组合
|
if (StringUtils.isNotBlank(province)) {
|
keywords.add(province.trim());
|
}
|
if (StringUtils.isNotBlank(city)) {
|
keywords.add(city.trim());
|
}
|
if (StringUtils.isNotBlank(area)) {
|
keywords.add(area.trim());
|
}
|
|
// 2. 医院名称:去掉省、市前缀,只对“区+医院主体”做分词和组合
|
if (StringUtils.isNotBlank(hospName)) {
|
String nameForSeg = hospName.trim();
|
|
// 去掉前面的省份
|
if (StringUtils.isNotBlank(province) && nameForSeg.startsWith(province)) {
|
nameForSeg = nameForSeg.substring(province.length());
|
}
|
// 再去掉城市
|
if (StringUtils.isNotBlank(city) && nameForSeg.startsWith(city)) {
|
nameForSeg = nameForSeg.substring(city.length());
|
}
|
// 区保留:例如 "越秀区中医医院",这样可以生成 "越秀区中医院"、"中医院" 等组合词
|
keywords.addAll(extractKeywordsByHanLP(nameForSeg));
|
// 基于医院全称,强制提取高频医疗关键词(如“中医院”“儿童医院”等)
|
addDictPhrases(hospName, keywords);
|
}
|
|
// 3. 医院简称:通常不带省市区,直接分词
|
if (StringUtils.isNotBlank(hospShort)) {
|
keywords.addAll(extractKeywordsByHanLP(hospShort));
|
addDictPhrases(hospShort, keywords);
|
}
|
|
// 4. 过滤停用词和无效词
|
keywords = keywords.stream()
|
.filter(keyword -> !STOP_WORDS.contains(keyword))
|
.filter(keyword -> keyword.length() > 0)
|
.filter(HospitalTokenizerUtil::isValidKeyword)
|
.collect(java.util.stream.Collectors.toCollection(LinkedHashSet::new));
|
|
return String.join(",", keywords);
|
}
|
|
/**
|
* 使用 HanLP 从文本中提取关键词
|
*
|
* @param text 文本
|
* @return 关键词集合
|
*/
|
private static Set<String> extractKeywordsByHanLP(String text) {
|
Set<String> keywords = new LinkedHashSet<>();
|
|
if (StringUtils.isBlank(text)) {
|
return keywords;
|
}
|
|
try {
|
// 使用 HanLP 进行分词
|
List<Term> terms = HanLP.segment(text.trim());
|
|
// 添加完整文本(如果不太长)
|
if (text.length() <= 20) {
|
keywords.add(text.trim());
|
}
|
|
// 提取分词结果
|
List<String> validWords = new ArrayList<>();
|
for (Term term : terms) {
|
String word = term.word;
|
|
// 过滤单字符(除非是重要的中文字符)
|
if (word.length() == 1 && !isChinese(word.charAt(0))) {
|
continue;
|
}
|
|
// 添加有效的分词
|
if (isValidKeyword(word)) {
|
keywords.add(word);
|
validWords.add(word);
|
}
|
}
|
|
// 【关键优化】生成连续组合词
|
// 但要过滤掉括号内容,避免生成无意义的分院组合词
|
// 例如:["越秀区", "中医", "院"] → 生成 "越秀区中医", "中医院", "越秀区中医院"
|
|
// 移除括号内容用于生成组合词
|
String textWithoutBrackets = text
|
.replaceAll("([^)]*)", "") // 移除中文括号
|
.replaceAll("\\([^\\)]*\\)", "") // 移除英文括号
|
.replaceAll("【[^】]*】", "") // 移除方括号
|
.trim();
|
|
// 对移除括号后的文本重新分词
|
List<Term> cleanTerms = HanLP.segment(textWithoutBrackets);
|
List<String> cleanValidWords = new ArrayList<>();
|
for (Term term : cleanTerms) {
|
String word = term.word;
|
if (word.length() == 1 && !isChinese(word.charAt(0))) {
|
continue;
|
}
|
if (isValidKeyword(word)) {
|
cleanValidWords.add(word);
|
}
|
}
|
|
// 基于干净的分词生成组合词
|
for (int len = 2; len <= Math.min(MAX_COMBINED_WORDS, cleanValidWords.size()); len++) {
|
for (int i = 0; i <= cleanValidWords.size() - len; i++) {
|
StringBuilder combined = new StringBuilder();
|
for (int j = i; j < i + len; j++) {
|
combined.append(cleanValidWords.get(j));
|
}
|
String combinedWord = combined.toString();
|
|
// 只添加长度合理的组合词
|
if (combinedWord.length() >= MIN_COMBINED_LEN && combinedWord.length() <= MAX_COMBINED_LEN) {
|
keywords.add(combinedWord);
|
// 针对“越秀区中医院”这类模式,额外生成去掉“区”的简化关键词,如“越秀中医院”
|
String simplified = simplifyDistrictInKeyword(combinedWord);
|
if (simplified != null && simplified.length() >= MIN_COMBINED_LEN && simplified.length() <= MAX_COMBINED_LEN) {
|
keywords.add(simplified);
|
}
|
}
|
}
|
}
|
|
} catch (Exception e) {
|
// HanLP 分词失败时,降级使用简单分词
|
keywords.addAll(extractKeywordsByNGram(text));
|
}
|
|
return keywords;
|
}
|
|
/**
|
* 降级方案:使用简单的 N-Gram 分词
|
*
|
* @param text 文本
|
* @return 关键词集合
|
*/
|
private static Set<String> extractKeywordsByNGram(String text) {
|
Set<String> keywords = new LinkedHashSet<>();
|
|
if (StringUtils.isBlank(text)) {
|
return keywords;
|
}
|
|
text = text.trim();
|
int length = text.length();
|
|
// 生成2-4字符的N-Gram
|
for (int n = 2; n <= 4 && n <= length; n++) {
|
for (int i = 0; i <= length - n; i++) {
|
String ngram = text.substring(i, i + n);
|
if (isValidKeyword(ngram)) {
|
keywords.add(ngram);
|
}
|
}
|
}
|
|
return keywords;
|
}
|
|
/**
|
* 判断关键词是否有效
|
*
|
* @param keyword 关键词
|
* @return 是否有效
|
*/
|
private static boolean isValidKeyword(String keyword) {
|
if (StringUtils.isBlank(keyword)) {
|
return false;
|
}
|
|
// 过滤纯数字
|
if (keyword.matches("^\\d+$")) {
|
return false;
|
}
|
|
// 过滤纯符号
|
if (keyword.matches("^[\\p{P}\\p{S}]+$")) {
|
return false;
|
}
|
|
// 至少包含一个中文或字母
|
return keyword.matches(".*[\\u4e00-\\u9fa5a-zA-Z].*");
|
}
|
|
/**
|
* 针对含有“区中医院/区中医”的组合词,生成去掉“区”的简化形式
|
* 例如:"越秀区中医院" → "越秀中医院","越秀区中医" → "越秀中医"
|
*/
|
private static String simplifyDistrictInKeyword(String keyword) {
|
if (StringUtils.isBlank(keyword)) {
|
return null;
|
}
|
// 通用规则:去掉“区”这个行政层级标识,但仅限于“区中医院/区中医”这种医疗场景
|
if (keyword.contains("区中医院")) {
|
return keyword.replaceFirst("区中医院", "中医院");
|
}
|
if (keyword.contains("区中医")) {
|
return keyword.replaceFirst("区中医", "中医");
|
}
|
return null;
|
}
|
|
/**
|
* 基于医院名称/简称,强制提取医院关键词字典中的短语
|
*/
|
private static void addDictPhrases(String text, Set<String> keywords) {
|
if (StringUtils.isBlank(text) || keywords == null) {
|
return;
|
}
|
for (String phrase : HOSPITAL_KEYWORD_DICT) {
|
if (text.contains(phrase)) {
|
keywords.add(phrase);
|
}
|
}
|
}
|
|
/* 移除医院名称中的地域前缀(省/市/自治区等)
|
* 通用处理,不硬编码具体地名
|
*
|
* @param hospName 医院名称
|
* @return 移除地域前缀后的名称
|
*/
|
private static String removeLocationPrefixes(String hospName) {
|
if (StringUtils.isBlank(hospName)) {
|
return hospName;
|
}
|
|
String result = hospName;
|
|
// 移除常见的行政区划后缀
|
// 省级: XX省、XX市(直辖市)、XX自治区
|
result = result.replaceFirst("^[\\u4e00-\\u9fa5]{2,10}省", "");
|
result = result.replaceFirst("^[\\u4e00-\\u9fa5]{2,10}自治区", "");
|
|
// 地级市:XX市
|
result = result.replaceFirst("^[\\u4e00-\\u9fa5]{2,10}市", "");
|
|
// 县级:XX区、XX县、XX市(县级市)
|
result = result.replaceFirst("^[\\u4e00-\\u9fa5]{2,10}区", "");
|
result = result.replaceFirst("^[\\u4e00-\\u9fa5]{2,10}县", "");
|
|
return result.trim();
|
}
|
|
/**
|
* 计算两个分词集合的匹配度(优化版)
|
* 考虑因素:
|
* 0. 【核心】完整搜索文本在keywords中存在 → 高分(+100分)
|
* 1. 完整匹配加分(单个词匹配)
|
* 1.5 超级加分:完整搜索文本包含在医院名中(+80分),未匹配内容渐进惩罚
|
* 2. 词语权重(重要词汇加分)
|
* 3. 连续匹配加分
|
* 4. 字符相似度
|
* 5. 负向匹配惩罚(医院名中出现搜索词之外的地区名 -30分)
|
* 6. 分院轻微降权(-10分)
|
* 7. 括号内容轻微惩罚(-5分)
|
*
|
* @param searchKeywords 搜索分词(逗号分隔)
|
* @param hospitalKeywords 医院分词(逗号分隔)
|
* @param hospName 医院名称(用于完整匹配判断)
|
* @param districtNames 地区名称集合(用于负向匹配检查,可为null)
|
* @return 匹配分数
|
*/
|
public static int calculateMatchScore(String searchKeywords, String hospitalKeywords, String hospName, Set<String> districtNames) {
|
if (StringUtils.isBlank(searchKeywords) || StringUtils.isBlank(hospitalKeywords)) {
|
return 0;
|
}
|
|
List<String> searchWords = Arrays.asList(searchKeywords.split(","));
|
List<String> hospWords = Arrays.asList(hospitalKeywords.split(","));
|
Set<String> searchWordsSet = new HashSet<>(searchWords);
|
Set<String> hospWordsSet = new HashSet<>(hospWords);
|
|
int totalScore = 0;
|
|
// 0. 【核心优化】首先判断是否存在“完整匹配”
|
// 约定:searchKeywords 的第一个分词为原始搜索文本
|
String fullSearchText = searchWords.get(0);
|
boolean keywordFullMatch = hospWordsSet.contains(fullSearchText);
|
boolean nameFullMatch = (hospName != null && hospName.contains(fullSearchText));
|
|
if (keywordFullMatch || nameFullMatch) {
|
// 完整匹配优先:直接给固定极高分,确保排在最前面
|
totalScore = 1000; // 提升基础分为1000,作为分数天花板
|
|
// 对完整匹配结果,仍然可以应用地区惩罚和分院/括号轻微降权,保证语义正确
|
if (districtNames != null && !districtNames.isEmpty()) {
|
totalScore -= calculateNegativeMatchPenalty(searchWordsSet, districtNames, hospName);
|
}
|
|
// if (isBranchHospital(hospName)) {
|
// totalScore -= 10; // 分院扣10分
|
// }
|
|
// if (hospName != null && (hospName.contains("(") || hospName.contains("(") || hospName.contains("【"))) {
|
// totalScore -= 5; // 括号内容轻微扣分
|
// }
|
|
return Math.max(0, totalScore);
|
}
|
|
// 1. 完整匹配加分(单个词匹配)
|
for (String searchWord : searchWords) {
|
if (searchWord.length() >= 4 && hospName != null && hospName.contains(searchWord)) {
|
totalScore += 50; // 完整词匹配加分
|
}
|
}
|
|
// 1.5 超级加分:搜索文本与医院名的完整相似度
|
if (hospName != null) {
|
// 完全包含加分
|
if (hospName.contains(fullSearchText)) {
|
totalScore += 500; // 提升包含关系的分数,确保包含搜索全称的结果排名靠前
|
} else {
|
// 计算整体相似度
|
int similarity = calculateStringSimilarity(fullSearchText, hospName);
|
if (similarity > 80) {
|
totalScore += similarity / 2; // 高度相似也加分,但权重降低
|
}
|
}
|
|
// 未匹配内容渐进惩罚:医院名中有搜索词之外的内容
|
String cleanedHospName = removeLocationPrefixes(hospName);
|
int unmatchedLength = cleanedHospName.length() - fullSearchText.length();
|
if (unmatchedLength > 0) {
|
// 渐进惩罚:1-5字扣1分/字,6-10字扣2分/字,11+字扣3分/字
|
if (unmatchedLength <= 5) {
|
totalScore -= unmatchedLength * 1;
|
} else if (unmatchedLength <= 10) {
|
totalScore -= 5 + (unmatchedLength - 5) * 2;
|
} else {
|
totalScore -= 5 + 10 + (unmatchedLength - 10) * 3;
|
}
|
}
|
}
|
|
// 2. 分词匹配计分(优先匹配较长的搜索词,命中即止)
|
List<String> sortedSearchWords = new ArrayList<>(searchWords);
|
sortedSearchWords.sort((a, b) -> Integer.compare(b.length(), a.length())); // 按长度从长到短
|
boolean anyMatch = false;
|
|
for (String searchWord : sortedSearchWords) {
|
boolean isLong = searchWord.length() >= 4;
|
if (hospWords.contains(searchWord)) {
|
int wordScore;
|
if (isLong) {
|
// 长词完整匹配:高分
|
wordScore = 40 + searchWord.length() * 4;
|
} else {
|
// 短词完整匹配:低分
|
wordScore = 10 + searchWord.length() * 2;
|
}
|
|
// 高权重词额外加分
|
if (HIGH_WEIGHT_WORDS.contains(searchWord)) {
|
wordScore += 15;
|
}
|
|
totalScore += wordScore;
|
anyMatch = true;
|
|
// 【核心修改】只要匹配到一个分词(无论长短),就中断后续匹配,遵循长词优先原则
|
break;
|
} else {
|
// 2.3 部分匹配(包含关系),只对较长搜索词考虑
|
if (isLong) {
|
for (String hospWord : hospWords) {
|
if (hospWord.contains(searchWord) || searchWord.contains(hospWord)) {
|
int partialScore = Math.min(searchWord.length(), hospWord.length()) * 2;
|
totalScore += partialScore;
|
anyMatch = true;
|
break;
|
}
|
}
|
if (anyMatch) {
|
break; // 命中即止
|
}
|
}
|
}
|
}
|
|
// 如果已经有匹配,则应用负向惩罚、分院/括号调整并返回
|
if (anyMatch) {
|
if (districtNames != null && !districtNames.isEmpty()) {
|
totalScore -= calculateNegativeMatchPenalty(searchWordsSet, districtNames, hospName);
|
}
|
if (isBranchHospital(hospName)) {
|
totalScore -= 10;
|
}
|
if (hospName != null && (hospName.contains("(") || hospName.contains("(") || hospName.contains("【"))) {
|
totalScore -= 5;
|
}
|
return Math.max(0, totalScore);
|
}
|
|
// 3. 连续匹配加分
|
totalScore += calculateContinuousMatchBonus(searchWords, hospWords);
|
|
// 4. 字符相似度加分(对于长词)
|
for (String searchWord : searchWords) {
|
if (searchWord.length() >= 4) {
|
for (String hospWord : hospWords) {
|
if (hospWord.length() >= 4) {
|
int similarity = calculateStringSimilarity(searchWord, hospWord);
|
if (similarity > 70) { // 相似度超过70%
|
totalScore += similarity / 10;
|
}
|
}
|
}
|
}
|
}
|
|
// 5. 负向匹配惩罚:医院名中包含搜索词之外的高权重地区名
|
if (districtNames != null && !districtNames.isEmpty()) {
|
totalScore -= calculateNegativeMatchPenalty(searchWordsSet, districtNames, hospName);
|
}
|
|
// 6. 分院轻微降权:主院优先,但不要过度惩罚
|
if (isBranchHospital(hospName)) {
|
totalScore -= 10; // 分院扣10分(改为固定扣分,而非打折)
|
}
|
|
// 7. 括号内容轻微惩罚:括号内通常是次要信息
|
if (hospName != null && (hospName.contains("(") || hospName.contains("(") || hospName.contains("【"))) {
|
totalScore -= 5; // 包含括号扣5分(改为固定扣分)
|
}
|
|
return Math.max(0, totalScore); // 确保分数不为负
|
}
|
|
/**
|
* 计算负向匹配惩罚
|
* 如果医院名中包含搜索词之外的地区名称,应该降低排名
|
*
|
* @param searchWords 搜索词集合
|
* @param districtNames 所有医院的地区名称集合(从医院表的 hopsArea 字段提取)
|
* @param hospName 当前医院名称
|
* @return 惩罚分数
|
*/
|
private static int calculateNegativeMatchPenalty(Set<String> searchWords, Set<String> districtNames, String hospName) {
|
if (hospName == null || districtNames == null || districtNames.isEmpty()) {
|
return 0;
|
}
|
|
int penalty = 0;
|
|
// 检查医院名中的地区名
|
for (String district : districtNames) {
|
if (StringUtils.isBlank(district)) {
|
continue;
|
}
|
|
// 如果医院名包含该地区名
|
if (hospName.contains(district)) {
|
// 检查是否在搜索词中出现
|
boolean inSearchWords = false;
|
|
// 1. 直接匹配:搜索词集合中包含该地区名
|
if (searchWords.contains(district)) {
|
inSearchWords = true;
|
} else {
|
// 2. 部分匹配:搜索词的任何一个词包含该地区名
|
for (String searchWord : searchWords) {
|
if (searchWord.contains(district)) {
|
inSearchWords = true;
|
break;
|
}
|
}
|
}
|
|
// 如果医院名包含该地区名,但搜索词中没有,则扣分
|
if (!inSearchWords) {
|
penalty += 30; // 包含不相关地区名,扣30分
|
}
|
}
|
}
|
|
return penalty;
|
}
|
|
/**
|
* 判断是否为分院
|
*/
|
private static boolean isBranchHospital(String hospName) {
|
if (hospName == null) {
|
return false;
|
}
|
|
// 分院特征关键词
|
String[] branchKeywords = {
|
"分院", "分部", "门诊部","门诊", "社区卫生", "卫生站", "卫生服务中心",
|
"东院", "西院", "南院", "北院", "新院", "老院",
|
"人民医院","附属医院","福利院","分院"
|
|
|
};
|
|
for (String keyword : branchKeywords) {
|
if (hospName.contains(keyword)) {
|
return true;
|
}
|
}
|
|
// 包含具体路名/街道名也可能是分院
|
String[] roadKeywords = {
|
"路分院", "街分院", "道分院", "大道分院"
|
};
|
|
for (String keyword : roadKeywords) {
|
if (hospName.contains(keyword)) {
|
return true;
|
}
|
}
|
|
return false;
|
}
|
|
/**
|
* 计算连续匹配加分
|
*/
|
private static int calculateContinuousMatchBonus(List<String> searchWords, List<String> hospWords) {
|
int bonus = 0;
|
int consecutiveCount = 0;
|
|
for (int i = 0; i < searchWords.size() - 1; i++) {
|
String word1 = searchWords.get(i);
|
String word2 = searchWords.get(i + 1);
|
|
// 判断是否在医院分词中连续出现
|
boolean found = false;
|
for (int j = 0; j < hospWords.size() - 1; j++) {
|
if (hospWords.get(j).equals(word1) && hospWords.get(j + 1).equals(word2)) {
|
consecutiveCount++;
|
found = true;
|
break;
|
}
|
}
|
|
if (found) {
|
bonus += consecutiveCount * 5; // 连续越长加分越多
|
} else {
|
consecutiveCount = 0;
|
}
|
}
|
|
return bonus;
|
}
|
|
/**
|
* 计算字符串相似度(使用Levenshtein距离)
|
*
|
* @param s1 字符串1
|
* @param s2 字符串2
|
* @return 相似度百分比 (0-100)
|
*/
|
private static int calculateStringSimilarity(String s1, String s2) {
|
if (s1.equals(s2)) {
|
return 100;
|
}
|
|
int maxLen = Math.max(s1.length(), s2.length());
|
if (maxLen == 0) {
|
return 100;
|
}
|
|
int distance = levenshteinDistance(s1, s2);
|
return (int) ((1 - (double) distance / maxLen) * 100);
|
}
|
|
/**
|
* 计算Levenshtein距离(编辑距离)
|
*/
|
private static int levenshteinDistance(String s1, String s2) {
|
int len1 = s1.length();
|
int len2 = s2.length();
|
|
int[][] dp = new int[len1 + 1][len2 + 1];
|
|
for (int i = 0; i <= len1; i++) {
|
dp[i][0] = i;
|
}
|
|
for (int j = 0; j <= len2; j++) {
|
dp[0][j] = j;
|
}
|
|
for (int i = 1; i <= len1; i++) {
|
for (int j = 1; j <= len2; j++) {
|
int cost = s1.charAt(i - 1) == s2.charAt(j - 1) ? 0 : 1;
|
dp[i][j] = Math.min(
|
Math.min(dp[i - 1][j] + 1, dp[i][j - 1] + 1),
|
dp[i - 1][j - 1] + cost
|
);
|
}
|
}
|
|
return dp[len1][len2];
|
}
|
|
/**
|
* 对文本进行分词(前端传入的搜索关键词)
|
*
|
* @param text 搜索文本
|
* @return 分词结果(逗号分隔)
|
*/
|
public static String tokenizeSearchText(String text) {
|
if (StringUtils.isBlank(text)) {
|
return "";
|
}
|
|
Set<String> keywords = extractKeywordsByHanLP(text.trim());
|
|
// 过滤停用词
|
keywords = keywords.stream()
|
.filter(keyword -> !STOP_WORDS.contains(keyword))
|
.filter(keyword -> keyword.length() > 0)
|
.collect(java.util.stream.Collectors.toCollection(LinkedHashSet::new));
|
|
return String.join(",", keywords);
|
}
|
}
|