用于过滤字符串中常见单词的JavaScript代码_Javascript_String

用于过滤字符串中常见单词的JavaScript代码

javascript string

用于过滤字符串中常见单词的JavaScript代码,javascript,string,Javascript,String,我正在尝试构建JavaScript代码，该代码读取一个字符串（比如一个英语文本句子），然后输出另一个字符串（逗号分隔的）单词，这些单词是“不常见的”。比如： var sentence="The dog ran to the other side of the field."; var common_words="the, it is, we all, a, an, by, to, you, me, he, she, they, we, how, it, i, are, to,

我正在尝试构建JavaScript代码，该代码读取一个字符串（比如一个英语文本句子），然后输出另一个字符串（逗号分隔的）单词，这些单词是“不常见的”。比如：

    // Example input: the English sentence whose "uncommon" words should be extracted.
    var sentence="The dog ran to the other side of the field."; 

    // Comma-separated list of common words/phrases that must be left out of the output.
    var common_words="the, it is, we all, a, an, by, to, you, me, he, she, they, we, how, it, i, are, to, for, of";

--一些JavaScript代码--

我如何才能做到这一点？

首先构建一个常用单词的关联数组，然后对句子进行分词，并输出其中未包含在该数组中的单词。例如

这个怎么样

// Removes every listed common word/phrase from the sentence: whole-word matches only (\b),
// case-insensitive (i), all occurrences (g).
// NOTE(review): the trailing "..." presumably stands for the rest of the common-word list;
// as written it is three regex wildcards, so replace it with the real alternatives.
// replace() returns a new string and leaves `sentence` unchanged, so the result must be
// assigned or used by the caller.
sentence.replace(/\b(?:the|it is|we all|an?|by|to|you|[mh]e|she|they|we...)\b/ig, '');

这将从你的句子中删除所有常用词。只需按您想要的方式拆分剩余字符串。

给您：

/**
 * Returns the words of `sentence` that are not listed in `common`.
 *
 * Words are lower-cased before comparison and in the result. Contractions such
 * as "don't" are kept as one word so they can be matched against the list.
 * Multi-word entries in `common` (e.g. "it is") never match, because the
 * sentence is compared one word at a time.
 *
 * @param {string} sentence - Text to scan; null/undefined is treated as empty.
 * @param {string|string[]} common - Common words, either as a comma-separated
 *     string or as an array. Surrounding whitespace and letter case are ignored.
 * @returns {string[]} Lower-cased uncommon words in order of appearance
 *     (duplicates are kept); an empty array when the sentence has no words.
 */
function getUncommon(sentence, common) {
    var text = sentence == null ? '' : String(sentence),
        // match() yields null when nothing matches, so fall back to an empty list.
        // Typographic apostrophes are normalised so "don’t" equals "don't".
        tokens = text.replace(/[\u2018\u2019]/g, "'").toLowerCase().match(/\w+(?:'\w+)*/g) || [],
        commonList = Array.isArray(common) ? common : String(common == null ? '' : common).split(','),
        // Prototype-less object: words like "constructor" must not look "common"
        // just because Object.prototype defines a property of that name.
        commonObj = Object.create(null),
        uncommonArr = [],
        i;

    for ( i = 0; i < commonList.length; i++ ) {
        commonObj[ String(commonList[i]).trim().toLowerCase() ] = true;
    }

    for ( i = 0; i < tokens.length; i++ ) {
        if ( !commonObj[ tokens[i] ] ) {
            uncommonArr.push(tokens[i]);
        }
    }

    return uncommonArr;
}

函数（句子，常用）{
var wordArr=句子匹配（/\w+/g），
commonObj={}，
uncomonarr=[]，
字，我；
common=common.split（'，'）；
对于（i=0；i


现场演示：
我想这是一个开始：
// Builds `uncommon_words`: a ", "-joined string of the lower-cased words of `sentence`
// that do not appear in the comma-separated `common_words` list.
// NOTE(review): `sentence` and `common_words` are expected to be defined before this
// script runs (they are not declared here).

// Split the sentence into words and drop the empty pieces that split() produces
// around punctuation and consecutive separators.
var sentence_arr = sentence.split(/(?=\w)\b|\W/).filter(function (piece) {
    return piece.length > 0;
});
// Lower-case the list once and tolerate any spacing around the commas.
var common_arr = common_words.trim().toLowerCase().split(/\s*,\s*/);

var uncommon_arr = [];
for (var i = 0; i < sentence_arr.length; i++) {
    var candidate = sentence_arr[i].toLowerCase();
    // Keep a word only when it matches none of the common words; comparing inside a
    // nested loop would push it once per non-matching entry instead.
    if (common_arr.indexOf(candidate) === -1) {
        uncommon_arr.push(candidate);
    }
}

var uncommon_words = uncommon_arr.join(', ');

var-statement\u-arr=statement.split（/（？=\w）\b |\w/）；
var common_arr=常用词。拆分（'，'）；
var_arr=array（）；
对于（变量i=0；i<句子长度；i++）{
对于（var j=0；j

完全未经测试，但要点是把两个字符串分别拆分成单词，然后将句子中的每个单词与常用词列表中的每个成员逐一比较。这种做法比较朴素，完全不具备可扩展性，但对于这样的小示例已经足够。
String#diff函数返回一个差异列表（即不常见的词）。terms可以作为数组或字符串提供。
您可以这样调用它：sentence.diff(terms)。
下面是一个单元测试：
// Smoke test for String.prototype.diff: logs 'pass' when the words left over after
// removing the terms are exactly dog, ran, other, side, field — otherwise 'fail'.
var sentence = 'The dog ran to the other side of the field.';
// The terms are given here as a comma-separated string; an array works as well.
var terms = 'the, it is, we all, a, an, by, to, you, me, he, she, they, we, how, it, i, are, to, for, of';

console.log(
  sentence.diff(terms).join(',') === 'dog,ran,other,side,field' ? 'pass' : 'fail'
);

下面是 String.prototype.diff 函数的定义：
/**
 * Returns the words of this string that are not contained in `terms`.
 *
 * Words are the non-empty pieces obtained by splitting the string on non-word
 * characters; they are returned in their original order and letter case.
 * Matching against the terms is case-insensitive and ignores whitespace around
 * each term. Multi-word terms (e.g. "it is") never match a single word.
 *
 * @param {string|Array} terms - Terms to exclude, as a comma-separated string
 *     or as an array. Non-string array entries are compared by their string
 *     form; null/undefined entries are ignored.
 * @returns {string[]} The remaining words. An empty array is returned when
 *     `terms` is falsy (including the empty string) or is neither a string
 *     nor an array.
 */
String.prototype.diff = function(terms){
  if (!terms) {
    return [];
  }

  var termList = typeof terms === 'string' ? terms.split(',') : terms;

  if (!Array.isArray(termList)) {
    return [];
  }

  // Prototype-less lookup table: constant-time membership tests, and words such
  // as "constructor" are not mistaken for terms via Object.prototype.
  var excluded = Object.create(null);
  termList.forEach(function(term){
    if (term !== null && term !== undefined) {
      excluded[String(term).trim().toLowerCase()] = true;
    }
  });

  return String(this).split(/\W/).filter(function(word){
    return word.length > 0 && !excluded[word.toLowerCase()];
  });
};

String.prototype.diff=函数（术语）{
如果（！条款）{
返回[]；
}
如果（术语类型==='string'）{
terms=terms.split（/，[\s]*/）；
}
if（术语类型！='object'| |！Array.isArray（术语））{
返回[]；
}
术语=术语.map（函数（术语）{
返回项.toLowerCase（）；
});
var words=this.split（/[\W]/）.filter（函数（word）{
返回单词长度；
});
返回单词。过滤器（函数（单词）{
返回terms.indexOf（word.toLowerCase（））<0；
});
};
要删除的单词称为停用词（stop words），列表如下：
["a", "able", "about", "across", "after", "all", "almost", "also", "am", "among", "an", "and", "any", "are", "as", "at", "be", "because", "been", "but", "by", "can", "cannot", "could", "dear", "did", "do", "does", "either", "else", "ever", "every", "for", "from", "get", "got", "had", "has", "have", "he", "her", "hers", "him", "his", "how", "however", "i", "if", "in", "into", "is", "it", "its", "just", "least", "let", "like", "likely", "may", "me", "might", "most", "must", "my", "neither", "no", "nor", "not", "of", "off", "often", "on", "only", "or", "other", "our", "own", "rather", "said", "say", "says", "she", "should", "since", "so", "some", "than", "that", "the", "their", "them", "then", "there", "these", "they", "this", "tis", "to", "too", "twas", "us", "wants", "was", "we", "were", "what", "when", "where", "which", "while", "who", "whom", "why", "will", "with", "would", "yet", "you", "your", "ain't", "aren't", "can't", "could've", "couldn't", "didn't", "doesn't", "don't", "hasn't", "he'd", "he'll", "he's", "how'd", "how'll", "how's", "i'd", "i'll", "i'm", "i've", "isn't", "it's", "might've", "mightn't", "must've", "mustn't", "shan't", "she'd", "she'll", "she's", "should've", "shouldn't", "that'll", "that's", "there's", "they'd", "they'll", "they're", "they've", "wasn't", "we'd", "we'll", "we're", "weren't", "what'd", "what's", "when'd", "when'll", "when's", "where'd", "where'll", "where's", "who'd", "who'll", "who's", "why'd", "why'll", "why's", "won't", "would've", "wouldn't", "you'd", "you'll", "you're", "you've"]

资料来源如下：

所以你的代码应该是
/**
 * Returns the words of `sentence` that are not stop words.
 *
 * The stop-word list comes from getStopWords(). Words are lower-cased before
 * comparison and in the result. Contractions such as "don't" are kept as a
 * single word so that the contraction entries of the stop-word list can match.
 *
 * @param {string} sentence - Text to scan; null/undefined is treated as empty.
 * @returns {string[]} Lower-cased non-stop words in order of appearance
 *     (duplicates are kept); an empty array when the sentence has no words.
 */
function getNoneStopWords(sentence) {
    var stopWords = getStopWords(),
        text = sentence == null ? '' : String(sentence),
        // match() yields null when nothing matches, so fall back to an empty list.
        // Typographic apostrophes are normalised so "don’t" equals "don't".
        tokens = text.replace(/[\u2018\u2019]/g, "'").toLowerCase().match(/\w+(?:'\w+)*/g) || [],
        // Prototype-less object: words like "constructor" must not be dropped
        // just because Object.prototype defines a property of that name.
        stopLookup = Object.create(null),
        uncommonArr = [],
        i;

    for (i = 0; i < stopWords.length; i++) {
        stopLookup[ String(stopWords[i]).trim().toLowerCase() ] = true;
    }

    for (i = 0; i < tokens.length; i++) {
        if (!stopLookup[ tokens[i] ]) {
            uncommonArr.push(tokens[i]);
        }
    }
    return uncommonArr;
}

    /**
     * Supplies the English stop-word list (plain words followed by contractions).
     *
     * @returns {string[]} A newly created array on every call.
     */
    function getStopWords() {
        var stopWords = [
            "a", "able", "about", "across", "after", "all", "almost", "also", "am", "among",
            "an", "and", "any", "are", "as", "at", "be", "because", "been", "but",
            "by", "can", "cannot", "could", "dear", "did", "do", "does", "either", "else",
            "ever", "every", "for", "from", "get", "got", "had", "has", "have", "he",
            "her", "hers", "him", "his", "how", "however", "i", "if", "in", "into",
            "is", "it", "its", "just", "least", "let", "like", "likely", "may", "me",
            "might", "most", "must", "my", "neither", "no", "nor", "not", "of", "off",
            "often", "on", "only", "or", "other", "our", "own", "rather", "said", "say",
            "says", "she", "should", "since", "so", "some", "than", "that", "the", "their",
            "them", "then", "there", "these", "they", "this", "tis", "to", "too", "twas",
            "us", "wants", "was", "we", "were", "what", "when", "where", "which", "while",
            "who", "whom", "why", "will", "with", "would", "yet", "you", "your",
            "ain't", "aren't", "can't", "could've", "couldn't", "didn't", "doesn't", "don't", "hasn't", "he'd",
            "he'll", "he's", "how'd", "how'll", "how's", "i'd", "i'll", "i'm", "i've", "isn't",
            "it's", "might've", "mightn't", "must've", "mustn't", "shan't", "she'd", "she'll", "she's", "should've",
            "shouldn't", "that'll", "that's", "there's", "they'd", "they'll", "they're", "they've", "wasn't", "we'd",
            "we'll", "we're", "weren't", "what'd", "what's", "when'd", "when'll", "when's", "where'd", "where'll",
            "where's", "who'd", "who'll", "who's", "why'd", "why'll", "why's", "won't", "would've", "wouldn't",
            "you'd", "you'll", "you're", "you've"
        ];
        return stopWords;
    }

函数getNoneStopWords（句子）{
var common=getStopWords（）；
var wordArr=句子匹配（/\w+/g），
commonObj={}，
uncomonarr=[]，
字，我；
对于（i=0；i

当您按空格拆分句子时，这不会捕获“it is”或“we all”。此外，用于排除的数组不需要布尔值，而在给定键值对时它必须转换为对象。另外，您不是每次在两个循环中迭代时都调用split吗？看起来空格并不是单词的一部分。关于对象与数组的说法是正确的。split不是在循环中调用的，它定义了循环。我指的是原始问题中的 common_words="the, it is, we all…"——虽然变量名是“words”，但该列表包含以空格分隔的短语。

["a", "able", "about", "across", "after", "all", "almost", "also", "am", "among", "an", "and", "any", "are", "as", "at", "be", "because", "been", "but", "by", "can", "cannot", "could", "dear", "did", "do", "does", "either", "else", "ever", "every", "for", "from", "get", "got", "had", "has", "have", "he", "her", "hers", "him", "his", "how", "however", "i", "if", "in", "into", "is", "it", "its", "just", "least", "let", "like", "likely", "may", "me", "might", "most", "must", "my", "neither", "no", "nor", "not", "of", "off", "often", "on", "only", "or", "other", "our", "own", "rather", "said", "say", "says", "she", "should", "since", "so", "some", "than", "that", "the", "their", "them", "then", "there", "these", "they", "this", "tis", "to", "too", "twas", "us", "wants", "was", "we", "were", "what", "when", "where", "which", "while", "who", "whom", "why", "will", "with", "would", "yet", "you", "your", "ain't", "aren't", "can't", "could've", "couldn't", "didn't", "doesn't", "don't", "hasn't", "he'd", "he'll", "he's", "how'd", "how'll", "how's", "i'd", "i'll", "i'm", "i've", "isn't", "it's", "might've", "mightn't", "must've", "mustn't", "shan't", "she'd", "she'll", "she's", "should've", "shouldn't", "that'll", "that's", "there's", "they'd", "they'll", "they're", "they've", "wasn't", "we'd", "we'll", "we're", "weren't", "what'd", "what's", "when'd", "when'll", "when's", "where'd", "where'll", "where's", "who'd", "who'll", "who's", "why'd", "why'll", "why's", "won't", "would've", "wouldn't", "you'd", "you'll", "you're", "you've"]

/**
 * Returns the words of `sentence` that are not stop words.
 *
 * The stop-word list comes from getStopWords(). Words are lower-cased before
 * comparison and in the result. Contractions such as "don't" are kept as a
 * single word so that the contraction entries of the stop-word list can match.
 *
 * @param {string} sentence - Text to scan; null/undefined is treated as empty.
 * @returns {string[]} Lower-cased non-stop words in order of appearance
 *     (duplicates are kept); an empty array when the sentence has no words.
 */
function getNoneStopWords(sentence) {
    var stopWords = getStopWords(),
        text = sentence == null ? '' : String(sentence),
        // match() yields null when nothing matches, so fall back to an empty list.
        // Typographic apostrophes are normalised so "don’t" equals "don't".
        tokens = text.replace(/[\u2018\u2019]/g, "'").toLowerCase().match(/\w+(?:'\w+)*/g) || [],
        // Prototype-less object: words like "constructor" must not be dropped
        // just because Object.prototype defines a property of that name.
        stopLookup = Object.create(null),
        uncommonArr = [],
        i;

    for (i = 0; i < stopWords.length; i++) {
        stopLookup[ String(stopWords[i]).trim().toLowerCase() ] = true;
    }

    for (i = 0; i < tokens.length; i++) {
        if (!stopLookup[ tokens[i] ]) {
            uncommonArr.push(tokens[i]);
        }
    }
    return uncommonArr;
}

    /**
     * Supplies the English stop-word list (plain words followed by contractions).
     *
     * @returns {string[]} A newly created array on every call.
     */
    function getStopWords() {
        var stopWords = [
            "a", "able", "about", "across", "after", "all", "almost", "also", "am", "among",
            "an", "and", "any", "are", "as", "at", "be", "because", "been", "but",
            "by", "can", "cannot", "could", "dear", "did", "do", "does", "either", "else",
            "ever", "every", "for", "from", "get", "got", "had", "has", "have", "he",
            "her", "hers", "him", "his", "how", "however", "i", "if", "in", "into",
            "is", "it", "its", "just", "least", "let", "like", "likely", "may", "me",
            "might", "most", "must", "my", "neither", "no", "nor", "not", "of", "off",
            "often", "on", "only", "or", "other", "our", "own", "rather", "said", "say",
            "says", "she", "should", "since", "so", "some", "than", "that", "the", "their",
            "them", "then", "there", "these", "they", "this", "tis", "to", "too", "twas",
            "us", "wants", "was", "we", "were", "what", "when", "where", "which", "while",
            "who", "whom", "why", "will", "with", "would", "yet", "you", "your",
            "ain't", "aren't", "can't", "could've", "couldn't", "didn't", "doesn't", "don't", "hasn't", "he'd",
            "he'll", "he's", "how'd", "how'll", "how's", "i'd", "i'll", "i'm", "i've", "isn't",
            "it's", "might've", "mightn't", "must've", "mustn't", "shan't", "she'd", "she'll", "she's", "should've",
            "shouldn't", "that'll", "that's", "there's", "they'd", "they'll", "they're", "they've", "wasn't", "we'd",
            "we'll", "we're", "weren't", "what'd", "what's", "when'd", "when'll", "when's", "where'd", "where'll",
            "where's", "who'd", "who'll", "who's", "why'd", "why'll", "why's", "won't", "would've", "wouldn't",
            "you'd", "you'll", "you're", "you've"
        ];
        return stopWords;
    }