Javascript/jQuery查找重复的文本_Javascript_Jquery_Text_Duplicates

Javascript/jQuery查找重复的文本

javascript jquery text

Javascript/jQuery查找重复的文本,javascript,jquery,text,duplicates,Javascript,Jquery,Text,Duplicates,如何在文本文档中查找重复项。副本可以是一组连续的单词或句子。句子不必以点结尾。假设页面包含一个200行的文档，其中有两个句子是相同的，我们希望在单击“检查重复按钮”时将这两个句子作为重复突出显示有趣的问题-这里有一个关于我如何做到这一点的想法：-无论如何都不优化 var text = $('p').text(), words = text.split(' '), sortedWords = words.slice(0).sort(), duplicateWords =

如何在文本文档中查找重复项？重复项可以是一组连续的单词，也可以是句子；句子不一定以句号结尾。假设页面包含一个 200 行的文档，其中有两个句子完全相同，我们希望在单击“检查重复”按钮时，将这两个句子作为重复项高亮显示。

有趣的问题——下面是我的一个实现思路（没有做任何优化）：

/**
 * Duplicate highlighter for the text inside the page's <p> element(s).
 *
 * Reads the paragraph text once, splits it into words (on single spaces) and
 * into sentences (on "."), and works out which pieces occur more than once.
 * Clicking `a.words` / `a.sentences` re-renders the paragraph with every
 * duplicated piece wrapped in a <mark> element.
 *
 * Requires jQuery to be loaded before this script runs.
 */

/**
 * Collects the items that occur more than once.
 *
 * Items are compared after trimming surrounding whitespace, and empty items
 * (produced by repeated separators or a trailing ".") are ignored so that
 * "nothing" is never reported as a duplicate.
 *
 * @param {string[]} items - Pieces of text to compare.
 * @returns {Object} Prototype-less lookup: key = trimmed duplicated item, value = true.
 */
function findDuplicates(items) {
  var seen = Object.create(null);
  var duplicates = Object.create(null);
  for (var i = 0; i < items.length; i++) {
    var key = items[i].trim();
    if (key === '') {
      continue;
    }
    if (seen[key]) {
      duplicates[key] = true;
    }
    seen[key] = true;
  }
  return duplicates;
}

/**
 * Escapes the characters that are significant in HTML.
 * The text was read with .text() but is written back with .html(), so it
 * must be escaped or literal "<" / "&" in the document would become markup.
 *
 * @param {string} str - Plain text.
 * @returns {string} Text that is safe to insert as HTML.
 */
function escapeHtml(str) {
  return str
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;');
}

/**
 * Builds the HTML for the paragraph with duplicated pieces wrapped in <mark>.
 *
 * @param {string[]} items - The pieces in document order.
 * @param {Object} duplicates - Lookup returned by findDuplicates().
 * @param {string} separator - Separator the pieces were split on.
 * @returns {string} HTML markup.
 */
function highlight(items, duplicates, separator) {
  var rendered = [];
  for (var i = 0; i < items.length; i++) {
    var safe = escapeHtml(items[i]);
    rendered.push(duplicates[items[i].trim()] ? '<mark>' + safe + '</mark>' : safe);
  }
  return rendered.join(separator);
}

var text = $('p').text(),
    words = text.split(' '),
    sentences = text.split('.'),
    // $.unique() is only specified for arrays of DOM elements, so the
    // duplicate sets are computed with plain JavaScript instead.
    duplicateWords = findDuplicates(words),
    duplicateSentences = findDuplicates(sentences);

$('a.words').click(function () {
  $('p').html(highlight(words, duplicateWords, ' '));
  return false; // suppress the link's default navigation
});

$('a.sentences').click(function () {
  $('p').html(highlight(sentences, duplicateSentences, '.'));
  return false; // suppress the link's default navigation
});
更新2
我的正则表达式功底很弱，但这个（相当混乱的！）版本似乎可以正常工作。我很确定还有更好的实现方法，但现在我只能先做到这里了！祝你好运。
更新3
仔细想想，我认为上一次更新中的代码并不好，所以把它删掉了（原文此处附有指向旧代码的链接）。其要点是使用正则表达式来匹配单词，大致如下：

// Example pattern: matches exactly the literal text "word", optionally followed by one period (captured in group 1).
/^word(\.?)$/

以下是使用后缀树的解决方案：

/**
 * Word-level suffix trie used to find word sequences that repeat in a text.
 *
 * The text is split into words (runs of \w characters). Every suffix of the
 * word list is inserted into the trie, so the node reached by following a
 * phrase from the root carries, in `count`, the number of times that phrase
 * occurs. Because every suffix is inserted, the work grows quadratically
 * with the number of words.
 *
 * Node shape: { count, word, parent, "<word>_": childNode, ... }.
 * Child keys carry a trailing "_" so they can never clash with the
 * bookkeeping keys `count`, `word` and `parent`.
 *
 * @param {string} text - Text to index.
 */
function SuffixTree(text) {
  // String.prototype.match returns null (not []) when nothing matches.
  var words = text.match(/\b\w+/g) || [];
  // wave[j] is the node reached by the phrase that starts at word j and
  // ends at the word processed most recently.
  var wave = [];

  // Always create the root, even for word-less input, so duplicates() is safe to call.
  this.tree = this.node("", false, null);

  for (var i = 0; i < words.length; ++i) {
    var key = words[i] + "_";
    wave.push(this.tree); // a new suffix starts at every word
    for (var j = 0; j < wave.length; ++j) {
      var current = wave[j];
      if (typeof current[key] !== 'undefined') {
        current[key].count++;
      } else {
        // The parent must be the THIRD argument; passing it second left
        // every node without a parent, so phrases could not be rebuilt.
        current[key] = this.node(words[i], false, current);
      }
      wave[j] = current[key];
    }
  }
}

SuffixTree.prototype = {
  // Not referenced by any method in this block; kept so existing readers of
  // the prototype do not break.
  dummy: { count: 1 },

  /**
   * Creates a trie node.
   *
   * @param {string} word - Word represented by this node ("" for the root).
   * @param {*} num - Unused; kept for signature compatibility.
   * @param {Object|null} parent - Parent node, or null/undefined for the root.
   * @returns {{count: number, word: string, parent: (Object|null|undefined)}}
   */
  node: function (word, num, parent) {
    return {
      count: 1,
      word: word,
      parent: parent
    };
  },

  /**
   * Reports the repeated phrases of the indexed text.
   *
   * Only "right-maximal" phrases are considered (see bypass()), and a phrase
   * is dropped when it is contained, on word boundaries, in a longer
   * reported candidate. After the call `this.dups` holds all candidates as
   * { s, depth, count } sorted by ascending word count.
   *
   * @param {function(string, number)} h - Called as h(phrase, occurrences)
   *   once per reported duplicate.
   */
  duplicates: function (h) {
    this.dups = [];
    this.bypass(this.tree, h, 0);

    // A proper numeric comparator (returns 0 for ties) keeps the sort well defined.
    this.dups.sort(function (d1, d2) {
      return d1.depth - d2.depth;
    });

    var total = this.dups.length;
    for (var i = 0; i < total; ++i) {
      var found = this.dups[i];
      this.dups[i] = {
        // Pad with spaces so the containment test below only matches whole words.
        s: " " + this.sentence(found.a) + " ",
        depth: found.depth,
        count: found.a.count
      };
    }

    for (var k = 0; k < total; ++k) {
      var candidate = this.dups[k];
      var isContained = false;
      // Longer phrases sort later, so only later entries can contain this one.
      for (var m = k + 1; m < total; ++m) {
        if (this.dups[m].s.indexOf(candidate.s) !== -1) {
          isContained = true;
          break;
        }
      }
      if (!isContained) {
        h(candidate.s.substr(1, candidate.s.length - 2), candidate.count);
      }
    }
  },

  /**
   * Depth-first walk that records in `this.dups` every node that occurs more
   * than once and has no child with the same count (i.e. the phrase cannot
   * be extended by one more word without losing an occurrence).
   *
   * @param {Object} a - Node to visit.
   * @param {function} h - Unused; kept for signature compatibility.
   * @param {number} depth - Number of words on the path from the root to `a`.
   */
  bypass: function (a, h, depth) {
    if (!a || a.constructor !== Object) return;
    var isMaximal = true;
    for (var key in a) {
      // Skip bookkeeping keys; following `parent` would recurse forever.
      if (key === 'parent' || key === 'count' || key === 'word') continue;
      var child = a[key];
      if (child.count === a.count) isMaximal = false;
      this.bypass(child, h, depth + 1);
    }
    if (isMaximal && a.count > 1) {
      this.dups.push({
        a: a,
        depth: depth
      });
    }
  },

  /**
   * Rebuilds the phrase for a node by walking up to (but not including) the root.
   *
   * @param {Object} a - Trie node.
   * @returns {string} The node's words joined by single spaces ("" for the root).
   */
  sentence: function (a) {
    var parts = [];
    for (var n = a; n && n.parent; n = n.parent) {
      parts.unshift(n.word);
    }
    return parts.join(" ");
  }
};

// --- Demo -------------------------------------------------------------------
var text = "This is a text with some duplicates: words, sentences of different length. For example here is a duplicate word. This sentence has some duplicates. But not all of us can find clones.";
var T = new SuffixTree(text);
// Callback invoked for every duplicate: s = phrase, c = number of occurrences.
var h = function (s, c) {
  var line = s + "[" + c + "] ";
  if (typeof document !== 'undefined' && typeof document.write === 'function') {
    document.write(line);
  } else {
    // No DOM available (e.g. Node.js): fall back to the console.
    console.log(line);
  }
};
T.duplicates(h);

函数后缀树（文本）{ var regex=/\b\w+/g； var words=text.match（regex）； var波=[]； var words\u l=words.length； if（words_l==0）返回false； this.tree=this.node（“，false）；对于（变量i=0；id2.depth？1:-1；}）；对于（变量i=0；i1）{ this.dups.push（{a:a，depth:depth}）； } }, 句子：功能（a）{ var s=一个词； while（a=a.parent）{ s=a.word+“”+s； } 返回s； } }; var text=“这是一个有一些重复的文本：单词、不同长度的句子。例如，这里有一个重复的单词。这个句子有一些重复。但不是所有人都能找到克隆。”； var T=新后缀树（文本）； var h=函数（s，c）{ 文件。写（s+“[”+c+“] ”； }; T.重复（h）；
1）将输入文本拆分为单词数组。 2）构建后缀树。 3）查找树的最长后缀。 4）删除包含在其他句子中的句子（即删除作为“this is a”一部分的“is”）
您可以将正则表达式更改为考虑html标记
我希望这对你有帮助

P.S. `h` 是每找到一个重复项时都会被调用的回调函数。
您的 JavaScript 代码引用了名为 jQuery 的 JavaScript 库，
但您没有在 HTML 中引入它，因此代码会运行失败。您可以通过在页面中添加指向 jQuery 的 <script> 标签来引入它。

今天的提示：使用浏览器中的开发人员工具。在控制台中，您可以看到javascript的哪些部分出现故障。
比如说<代码>…文本……
+1，感谢您的努力。我查找重复单词的思路与此类似，这里的难点在于句子：在您的实现中，句子必须以句号结尾才能被识别。因此，如果两个相同的句子出现在不同的位置，而其中一个后面没有句号，它就不会被识别。我想这里要找的与其说是句子，不如说是一串连续的单词，不是吗；）没错。我正在寻找解决这个问题的思路和最佳方法。谢谢您的努力，这是个很好的方法。我喜欢您在第三次更新中的做法，虽然还有可以改进的地方，但总体来说是一个良好的开端。
/**
 * Word-level suffix trie used to find word sequences that repeat in a text.
 *
 * The text is split into words (runs of \w characters). Every suffix of the
 * word list is inserted into the trie, so the node reached by following a
 * phrase from the root carries, in `count`, the number of times that phrase
 * occurs. Because every suffix is inserted, the work grows quadratically
 * with the number of words.
 *
 * Node shape: { count, word, parent, "<word>_": childNode, ... }.
 * Child keys carry a trailing "_" so they can never clash with the
 * bookkeeping keys `count`, `word` and `parent`.
 *
 * @param {string} text - Text to index.
 */
function SuffixTree(text) {
  // String.prototype.match returns null (not []) when nothing matches.
  var words = text.match(/\b\w+/g) || [];
  // wave[j] is the node reached by the phrase that starts at word j and
  // ends at the word processed most recently.
  var wave = [];

  // Always create the root, even for word-less input, so duplicates() is safe to call.
  this.tree = this.node("", false, null);

  for (var i = 0; i < words.length; ++i) {
    var key = words[i] + "_";
    wave.push(this.tree); // a new suffix starts at every word
    for (var j = 0; j < wave.length; ++j) {
      var current = wave[j];
      if (typeof current[key] !== 'undefined') {
        current[key].count++;
      } else {
        // The parent must be the THIRD argument; passing it second left
        // every node without a parent, so phrases could not be rebuilt.
        current[key] = this.node(words[i], false, current);
      }
      wave[j] = current[key];
    }
  }
}

SuffixTree.prototype = {
  // Not referenced by any method in this block; kept so existing readers of
  // the prototype do not break.
  dummy: { count: 1 },

  /**
   * Creates a trie node.
   *
   * @param {string} word - Word represented by this node ("" for the root).
   * @param {*} num - Unused; kept for signature compatibility.
   * @param {Object|null} parent - Parent node, or null/undefined for the root.
   * @returns {{count: number, word: string, parent: (Object|null|undefined)}}
   */
  node: function (word, num, parent) {
    return {
      count: 1,
      word: word,
      parent: parent
    };
  },

  /**
   * Reports the repeated phrases of the indexed text.
   *
   * Only "right-maximal" phrases are considered (see bypass()), and a phrase
   * is dropped when it is contained, on word boundaries, in a longer
   * reported candidate. After the call `this.dups` holds all candidates as
   * { s, depth, count } sorted by ascending word count.
   *
   * @param {function(string, number)} h - Called as h(phrase, occurrences)
   *   once per reported duplicate.
   */
  duplicates: function (h) {
    this.dups = [];
    this.bypass(this.tree, h, 0);

    // A proper numeric comparator (returns 0 for ties) keeps the sort well defined.
    this.dups.sort(function (d1, d2) {
      return d1.depth - d2.depth;
    });

    var total = this.dups.length;
    for (var i = 0; i < total; ++i) {
      var found = this.dups[i];
      this.dups[i] = {
        // Pad with spaces so the containment test below only matches whole words.
        s: " " + this.sentence(found.a) + " ",
        depth: found.depth,
        count: found.a.count
      };
    }

    for (var k = 0; k < total; ++k) {
      var candidate = this.dups[k];
      var isContained = false;
      // Longer phrases sort later, so only later entries can contain this one.
      for (var m = k + 1; m < total; ++m) {
        if (this.dups[m].s.indexOf(candidate.s) !== -1) {
          isContained = true;
          break;
        }
      }
      if (!isContained) {
        h(candidate.s.substr(1, candidate.s.length - 2), candidate.count);
      }
    }
  },

  /**
   * Depth-first walk that records in `this.dups` every node that occurs more
   * than once and has no child with the same count (i.e. the phrase cannot
   * be extended by one more word without losing an occurrence).
   *
   * @param {Object} a - Node to visit.
   * @param {function} h - Unused; kept for signature compatibility.
   * @param {number} depth - Number of words on the path from the root to `a`.
   */
  bypass: function (a, h, depth) {
    if (!a || a.constructor !== Object) return;
    var isMaximal = true;
    for (var key in a) {
      // Skip bookkeeping keys; following `parent` would recurse forever.
      if (key === 'parent' || key === 'count' || key === 'word') continue;
      var child = a[key];
      if (child.count === a.count) isMaximal = false;
      this.bypass(child, h, depth + 1);
    }
    if (isMaximal && a.count > 1) {
      this.dups.push({
        a: a,
        depth: depth
      });
    }
  },

  /**
   * Rebuilds the phrase for a node by walking up to (but not including) the root.
   *
   * @param {Object} a - Trie node.
   * @returns {string} The node's words joined by single spaces ("" for the root).
   */
  sentence: function (a) {
    var parts = [];
    for (var n = a; n && n.parent; n = n.parent) {
      parts.unshift(n.word);
    }
    return parts.join(" ");
  }
};

// --- Demo -------------------------------------------------------------------
var text = "This is a text with some duplicates: words, sentences of different length. For example here is a duplicate word. This sentence has some duplicates. But not all of us can find clones.";
var T = new SuffixTree(text);
// Callback invoked for every duplicate: s = phrase, c = number of occurrences.
var h = function (s, c) {
  var line = s + "[" + c + "] ";
  if (typeof document !== 'undefined' && typeof document.write === 'function') {
    document.write(line);
  } else {
    // No DOM available (e.g. Node.js): fall back to the console.
    console.log(line);
  }
};
T.duplicates(h);