Javascript 从两个字符串中查找最匹配单词的算法

Javascript 从两个字符串中查找最匹配单词的算法,javascript,arrays,string,Javascript,Arrays,String,我有两个字符串,我的目标是匹配尽可能多的单词 问题是这两个字符串相似但不相等(例如,其中一个字符串缺少一个单词,或单词拼写错误) 例如: var str1 = "I like this soup because it is very tasty, like the one that my grandma used to make"; var str2 = "I really lie this soup, it is very tasty, like the one

我有两个字符串,我的目标是匹配尽可能多的单词

问题是这两个字符串相似但不相等(例如,其中一个字符串缺少一个单词,或单词拼写错误)

例如:

// Reference sentence: the text the other sentence is compared against.
var str1 = "I like this soup because it is very tasty, like the one that my grandma used to make";
// Candidate sentence: adds "really", has "lie" where str1 has "like", omits "because", and has "use" for "used".
var str2 = "I really lie this soup, it is very tasty, like the one that my grandma use to make";
在本例中,“str1”是正确的字符串,因此我必须从“str2”中匹配出尽可能多的单词;“str2”里多了一个不必要的单词“really”,并且“like”被误拼成了“lie”。

现在,一个简单的解决办法是检查像这样的每个单词

// Tokenise both sentences on runs of whitespace and/or commas.
var split1 = str1.split(/[\s,]+/);
var split2 = str2.split(/[\s,]+/);
var i, j = 0;
var found;
// Naive in-order scan. `j` is shared by every outer iteration and never moves
// backwards, so words of split2 that were already passed are not revisited.
for (i = 0; i < split1.length; i++) {
   found = false;
   while (j < split2.length && !found) {
      if (split1[i] === split2[j]) {
         found = true;
         //do something here
      }
      // Advance even after a hit, so the next word starts after the match.
      j++;
   }
}
// Naive in-order scan: for each word of str1, look forward in str2 for an
// identical word. `j` is shared across outer iterations and never reset.
var split1 = str1.split(/[\s,]+/);
var split2 = str2.split(/[\s,]+/);
var i, j = 0;
var found;
for (i = 0; i < split1.length; i++) {
   found = false;
   for (; j < split2.length && !found; j++) {
      if (split1[i] == split2[j]) {
         found = true;
         //do something here
      }
   }
}
但实际上有一个大问题:str2的第二个“like”可能与str1的第一个“like”匹配


最后,如果算法的目标是匹配尽可能多的单词,如果我找不到匹配的单词,则继续执行算法。

如果我全面掌握了您的需求,那么对象
freqC
应该满足您的需要

// Word-frequency tally: freq1 counts the words of str1, freqC counts the
// words of str1 and str2 combined.
const str1 = 'I like this soup because it is very tasty, like the one that my grandma used to make';
const str2 = 'I really lie this soup, it is very tasty, like the one that my grandma use to make';

// Lower-case, tokenise on whitespace/commas, then count each word.
const freq1 = str1.toLowerCase().split(/[\s,]+/).reduce((accumulator, key) => {
  accumulator[key] = (accumulator[key] || 0) + 1;
  return accumulator;
}, {});
// Seed with a copy of freq1 so that freq1 keeps the counts for str1 alone.
const freqC = str2.toLowerCase().split(/[\s,]+/).reduce((accumulator, key) => {
  accumulator[key] = (accumulator[key] || 0) + 1;
  return accumulator;
}, Object.assign({}, freq1));

console.info(freqC);
如果我全面掌握了您的需求,那么对象
freqC
应该满足您的需求

// Word-frequency tally: freq1 counts the words of str1, freqC counts the
// words of str1 and str2 combined.
const str1 = 'I like this soup because it is very tasty, like the one that my grandma used to make';
const str2 = 'I really lie this soup, it is very tasty, like the one that my grandma use to make';
// Lower-case, tokenise on whitespace/commas, then count each word.
const freq1 = str1.toLowerCase().split(/[\s,]+/).reduce((accumulator, key) => {
  accumulator[key] = (accumulator[key] || 0) + 1;
  return accumulator;
}, {});
// Seed with a copy of freq1 so that freq1 keeps the counts for str1 alone.
const freqC = str2.toLowerCase().split(/[\s,]+/).reduce((accumulator, key) => {
  accumulator[key] = (accumulator[key] || 0) + 1;
  return accumulator;
}, Object.assign({}, freq1));
console.info(freqC)

您可以使用编辑距离(例如 Levenshtein 距离)来检查两个单词是否相似,下文将其记为 Ld。现在,如果您知道 str1 是正确的,那么您可以执行以下操作:

/**
 * Returns the largest number of word pairs that can be matched, in order,
 * between a reference word list and a candidate word list.
 *
 * Two words match when `Ld(a, b) < limit`. `Ld` is the word-distance function
 * mentioned in the surrounding text; it is not defined here and must be in
 * scope. Matched pairs keep their relative order in both arrays and every
 * word takes part in at most one pair.
 *
 * Implemented as a bottom-up longest-common-subsequence table, so it also
 * considers skipping words of `correct` and runs in
 * O(correct.length * interesting.length) calls to `Ld`.
 *
 * @param {string[]} correct     Words of the reference sentence.
 * @param {string[]} interesting Words of the sentence being checked.
 * @param {number}   limit       Exclusive upper bound on `Ld` for a match.
 * @param {number}   [cIndex]    First index of `correct` to consider (falsy means 0).
 * @param {number}   [iIndex]    First index of `interesting` to consider (falsy means 0).
 * @returns {number} Maximum number of ordered matches.
 */
function getMostMatches(correct, interesting, limit, cIndex, iIndex) {
    var cStart = cIndex || 0;
    var iStart = iIndex || 0;
    var width = Math.max(interesting.length - iStart, 0);
    // below[k]: best score for correct[c + 1 ..] against interesting[iStart + k ..].
    // The extra trailing slot is the "nothing left to match" case and stays 0.
    var below = new Array(width + 1).fill(0);
    for (var c = correct.length - 1; c >= cStart; c--) {
        var row = new Array(width + 1).fill(0);
        for (var k = width - 1; k >= 0; k--) {
            if (Ld(correct[c], interesting[iStart + k]) < limit) {
                // Pairing these two words is never worse than skipping either one.
                row[k] = 1 + below[k + 1];
            } else {
                // Skip the current word of `correct` or the current word of `interesting`.
                row[k] = Math.max(below[k], row[k + 1]);
            }
        }
        below = row;
    }
    return below[0];
}

// Tokenise on whitespace and commas so punctuation does not stay attached to
// a word (e.g. "soup," vs "soup") and distort the word-to-word comparison.
var correct = str1.split(/[\s,]+/);
var interesting = str2.split(/[\s,]+/);
/**
 * Returns the largest number of word pairs that can be matched, in order,
 * between a reference word list and a candidate word list.
 *
 * Two words match when `Ld(a, b) < limit`. `Ld` is the word-distance function
 * mentioned in the surrounding text; it is not defined here and must be in
 * scope. Matched pairs keep their relative order in both arrays and every
 * word takes part in at most one pair.
 *
 * @param {string[]} correct     Words of the reference sentence.
 * @param {string[]} interesting Words of the sentence being checked.
 * @param {number}   limit       Exclusive upper bound on `Ld` for a match.
 * @param {number}   [cIndex]    First index of `correct` to consider (falsy means 0).
 * @param {number}   [iIndex]    First index of `interesting` to consider (falsy means 0).
 * @returns {number} Maximum number of ordered matches.
 */
function getMostMatches(correct, interesting, limit, cIndex, iIndex) {
    var cStart = cIndex || 0;
    var iStart = iIndex || 0;
    var width = Math.max(interesting.length - iStart, 0);
    // below[k]: best score for correct[c + 1 ..] against interesting[iStart + k ..].
    var below = new Array(width + 1).fill(0);
    for (var c = correct.length - 1; c >= cStart; c--) {
        var row = new Array(width + 1).fill(0);
        for (var k = width - 1; k >= 0; k--) {
            if (Ld(correct[c], interesting[iStart + k]) < limit) {
                // Pairing these two words is never worse than skipping either one.
                row[k] = 1 + below[k + 1];
            } else {
                row[k] = Math.max(below[k], row[k + 1]);
            }
        }
        below = row;
    }
    return below[0];
}
// Tokenise on whitespace and commas so punctuation does not stay attached to
// a word (e.g. "soup," vs "soup").
var correct = str1.split(/[\s,]+/);
var interesting = str2.split(/[\s,]+/);
您可以使用编辑距离(例如 Levenshtein 距离)来检查两个单词是否相似,下文将其记为 Ld。现在,如果您知道 str1 是正确的,那么您可以执行以下操作:

/**
 * Returns the largest number of word pairs that can be matched, in order,
 * between a reference word list and a candidate word list.
 *
 * Two words match when `Ld(a, b) < limit`. `Ld` is the word-distance function
 * mentioned in the surrounding text; it is not defined here and must be in
 * scope. Matched pairs keep their relative order in both arrays and every
 * word takes part in at most one pair.
 *
 * Implemented as a bottom-up longest-common-subsequence table, so it also
 * considers skipping words of `correct` and runs in
 * O(correct.length * interesting.length) calls to `Ld`.
 *
 * @param {string[]} correct     Words of the reference sentence.
 * @param {string[]} interesting Words of the sentence being checked.
 * @param {number}   limit       Exclusive upper bound on `Ld` for a match.
 * @param {number}   [cIndex]    First index of `correct` to consider (falsy means 0).
 * @param {number}   [iIndex]    First index of `interesting` to consider (falsy means 0).
 * @returns {number} Maximum number of ordered matches.
 */
function getMostMatches(correct, interesting, limit, cIndex, iIndex) {
    var cStart = cIndex || 0;
    var iStart = iIndex || 0;
    var width = Math.max(interesting.length - iStart, 0);
    // below[k]: best score for correct[c + 1 ..] against interesting[iStart + k ..].
    // The extra trailing slot is the "nothing left to match" case and stays 0.
    var below = new Array(width + 1).fill(0);
    for (var c = correct.length - 1; c >= cStart; c--) {
        var row = new Array(width + 1).fill(0);
        for (var k = width - 1; k >= 0; k--) {
            if (Ld(correct[c], interesting[iStart + k]) < limit) {
                // Pairing these two words is never worse than skipping either one.
                row[k] = 1 + below[k + 1];
            } else {
                // Skip the current word of `correct` or the current word of `interesting`.
                row[k] = Math.max(below[k], row[k + 1]);
            }
        }
        below = row;
    }
    return below[0];
}

// Tokenise on whitespace and commas so punctuation does not stay attached to
// a word (e.g. "soup," vs "soup") and distort the word-to-word comparison.
var correct = str1.split(/[\s,]+/);
var interesting = str2.split(/[\s,]+/);
/**
 * Returns the largest number of word pairs that can be matched, in order,
 * between a reference word list and a candidate word list.
 *
 * Two words match when `Ld(a, b) < limit`. `Ld` is the word-distance function
 * mentioned in the surrounding text; it is not defined here and must be in
 * scope. Matched pairs keep their relative order in both arrays and every
 * word takes part in at most one pair.
 *
 * @param {string[]} correct     Words of the reference sentence.
 * @param {string[]} interesting Words of the sentence being checked.
 * @param {number}   limit       Exclusive upper bound on `Ld` for a match.
 * @param {number}   [cIndex]    First index of `correct` to consider (falsy means 0).
 * @param {number}   [iIndex]    First index of `interesting` to consider (falsy means 0).
 * @returns {number} Maximum number of ordered matches.
 */
function getMostMatches(correct, interesting, limit, cIndex, iIndex) {
    var cStart = cIndex || 0;
    var iStart = iIndex || 0;
    var width = Math.max(interesting.length - iStart, 0);
    // below[k]: best score for correct[c + 1 ..] against interesting[iStart + k ..].
    var below = new Array(width + 1).fill(0);
    for (var c = correct.length - 1; c >= cStart; c--) {
        var row = new Array(width + 1).fill(0);
        for (var k = width - 1; k >= 0; k--) {
            if (Ld(correct[c], interesting[iStart + k]) < limit) {
                // Pairing these two words is never worse than skipping either one.
                row[k] = 1 + below[k + 1];
            } else {
                row[k] = Math.max(below[k], row[k + 1]);
            }
        }
        below = row;
    }
    return below[0];
}
// Tokenise on whitespace and commas so punctuation does not stay attached to
// a word (e.g. "soup," vs "soup").
var correct = str1.split(/[\s,]+/);
var interesting = str2.split(/[\s,]+/);

要扩展我之前的评论,使用patienceDiff/patienceDiffPlus算法(请参阅)可能非常适合您的情况,因为patienceDiff算法通常适用于突出显示两个字符串之间的增量,这两个字符串非常相似,只有一些细微的差异。您案例中的算法可以如下使用,第一步是删除逗号并将句子拆分为单词数组

var str1 = "I like this soup because it is very tasty, like the one that my grandma used to make";
var str2 = "I really lie this soup, it is very tasty, like the one that my grandma use to make";

// Remove every comma, then break each sentence into words on single spaces.
let a = str1.replace( /,/g, '' ).split( ' ' );
let b = str2.replace( /,/g, '' ).split( ' ' );
// Word-level diff; patienceDiffPlus is provided outside this snippet.
let pdp = patienceDiffPlus( a, b );

console.log( pdp );
…导致

Object
  lineCountDeleted: 3
  lineCountInserted: 3
  lineCountMoved: 0
  lines: Array(21)
    0: {line: "I", aIndex: 0, bIndex: 0}
    1: {line: "like", aIndex: 1, bIndex: -1}
    2: {line: "really", aIndex: -1, bIndex: 1}
    3: {line: "lie", aIndex: -1, bIndex: 2}
    4: {line: "this", aIndex: 2, bIndex: 3}
    5: {line: "soup", aIndex: 3, bIndex: 4}
    6: {line: "because", aIndex: 4, bIndex: -1}
    7: {line: "it", aIndex: 5, bIndex: 5}
    8: {line: "is", aIndex: 6, bIndex: 6}
    9: {line: "very", aIndex: 7, bIndex: 7}
    10: {line: "tasty", aIndex: 8, bIndex: 8}
    11: {line: "like", aIndex: 9, bIndex: 9}
    12: {line: "the", aIndex: 10, bIndex: 10}
    13: {line: "one", aIndex: 11, bIndex: 11}
    14: {line: "that", aIndex: 12, bIndex: 12}
    15: {line: "my", aIndex: 13, bIndex: 13}
    16: {line: "grandma", aIndex: 14, bIndex: 14}
    17: {line: "used", aIndex: 15, bIndex: -1}
    18: {line: "use", aIndex: -1, bIndex: 15}
    19: {line: "to", aIndex: 16, bIndex: 16}
    20: {line: "make", aIndex: 17, bIndex: 17}
    length: 21
…其中:

  • 如果 aIndex 为 -1,则该项只出现在 b 数组中,a 数组中没有与之对应的值(例如上面的 "really")
  • 如果 bIndex 为 -1,则该项只出现在 a 数组中,b 数组中没有与之对应的值(例如上面的 "because")
  • 如果 aIndex 和 bIndex 都不是 -1(即都大于等于 0),则在两个数组的相应索引处找到了匹配项
还请注意,如果您逐个字符执行
patienceDiff
,即将句子拆分为字符数组

// Character-level comparison: split each sentence into single characters.
let a = str1.split( '' );
// The candidate sentence needs its own binding `b`; it is the second
// argument of the patienceDiff call below.
let b = str2.split( '' );
let pdp = patienceDiff( a, b );

console.log( pdp );
…那么结果将是

0: {line: "I", aIndex: 0, bIndex: 0}
1: {line: " ", aIndex: 1, bIndex: 1}
2: {line: "r", aIndex: -1, bIndex: 2}
3: {line: "e", aIndex: -1, bIndex: 3}
4: {line: "a", aIndex: -1, bIndex: 4}
5: {line: "l", aIndex: -1, bIndex: 5}
6: {line: "l", aIndex: -1, bIndex: 6}
7: {line: "y", aIndex: -1, bIndex: 7}
8: {line: " ", aIndex: -1, bIndex: 8}
9: {line: "l", aIndex: 2, bIndex: 9}
10: {line: "i", aIndex: 3, bIndex: 10}
11: {line: "k", aIndex: 4, bIndex: -1}
12: {line: "e", aIndex: 5, bIndex: 11}
13: {line: " ", aIndex: 6, bIndex: 12}
14: {line: "t", aIndex: 7, bIndex: 13}
15: {line: "h", aIndex: 8, bIndex: 14}
16: {line: "i", aIndex: 9, bIndex: 15}
17: {line: "s", aIndex: 10, bIndex: 16}
18: {line: " ", aIndex: 11, bIndex: 17}
      o
      o
      o
84: {line: " ", aIndex: 76, bIndex: 74}
85: {line: "t", aIndex: 77, bIndex: 75}
86: {line: "o", aIndex: 78, bIndex: 76}
87: {line: " ", aIndex: 79, bIndex: 77}
88: {line: "m", aIndex: 80, bIndex: 78}
89: {line: "a", aIndex: 81, bIndex: 79}
90: {line: "k", aIndex: 82, bIndex: 80}
91: {line: "e", aIndex: 83, bIndex: 81}

…这表明 b 数组中多了单词 'really',并且 b 数组里的单词 'like' 缺少了字符 'k'。根据您希望匹配单词的粒度,逐个字符使用 patienceDiff 算法可能更适合您的需要。

要扩展我之前的评论,使用patienceDiff/patienceDiffPlus算法(请参阅)可能非常适合您的情况,由于patienceDiff算法通常适用于突出显示两个字符串之间的增量,这两个字符串非常相似,只有一些细微的差异。您案例中的算法可以如下使用,第一步是删除逗号并将句子拆分为单词数组

var str1 = "I like this soup because it is very tasty, like the one that my grandma used to make";
var str2 = "I really lie this soup, it is very tasty, like the one that my grandma use to make";

// Remove every comma, then break each sentence into words on single spaces.
let a = str1.replace( /,/g, '' ).split( ' ' );
let b = str2.replace( /,/g, '' ).split( ' ' );
// Word-level diff; patienceDiffPlus is provided outside this snippet.
let pdp = patienceDiffPlus( a, b );

console.log( pdp );
…导致

Object
  lineCountDeleted: 3
  lineCountInserted: 3
  lineCountMoved: 0
  lines: Array(21)
    0: {line: "I", aIndex: 0, bIndex: 0}
    1: {line: "like", aIndex: 1, bIndex: -1}
    2: {line: "really", aIndex: -1, bIndex: 1}
    3: {line: "lie", aIndex: -1, bIndex: 2}
    4: {line: "this", aIndex: 2, bIndex: 3}
    5: {line: "soup", aIndex: 3, bIndex: 4}
    6: {line: "because", aIndex: 4, bIndex: -1}
    7: {line: "it", aIndex: 5, bIndex: 5}
    8: {line: "is", aIndex: 6, bIndex: 6}
    9: {line: "very", aIndex: 7, bIndex: 7}
    10: {line: "tasty", aIndex: 8, bIndex: 8}
    11: {line: "like", aIndex: 9, bIndex: 9}
    12: {line: "the", aIndex: 10, bIndex: 10}
    13: {line: "one", aIndex: 11, bIndex: 11}
    14: {line: "that", aIndex: 12, bIndex: 12}
    15: {line: "my", aIndex: 13, bIndex: 13}
    16: {line: "grandma", aIndex: 14, bIndex: 14}
    17: {line: "used", aIndex: 15, bIndex: -1}
    18: {line: "use", aIndex: -1, bIndex: 15}
    19: {line: "to", aIndex: 16, bIndex: 16}
    20: {line: "make", aIndex: 17, bIndex: 17}
    length: 21
…其中:

  • 如果 aIndex 为 -1,则该项只出现在 b 数组中,a 数组中没有与之对应的值(例如上面的 "really")