Javascript HTML和JS：用标记围绕文档中的每个单词_Javascript_Html_Regex

Javascript HTML和JS：用标记围绕文档中的每个单词

javascript html regex

Javascript HTML和JS：用标记围绕文档中的每个单词,javascript,html,regex,Javascript,Html,Regex,我正在尝试使用Javascript修改现有的HTML文档，这样我就可以用一个带有计数器的span标记来包围网页中的每个文本字。这是一个非常具体的问题，因此我将提供一个示例： <body><p>hello, <br> change this</p> <img src="lorempixel.com/200/200> <br></body></html> 这应改为： <body><

我正在尝试使用Javascript修改现有的HTML文档，这样我就可以用一个带有计数器的span标记来包围网页中的每个文本字。这是一个非常具体的问题，因此我将提供一个示例：

<body><p>hello, <br>
change this</p> 
<img src="lorempixel.com/200/200> <br></body></html>

这应改为：

  <body><p><span id="1">hello,</span>
  <br> <span id="2"> change</span><span id="3"> this</span> </p>
  <br> <img src="lorempixel.com/200/200> <br></body></html>

我正在考虑使用regex的解决方案，但它们变得非常复杂，而且我不知道如何在忽略标记的同时只更改文本，而不完全破坏页面。

任何想法都很感激

不要在原始HTML上使用正则表达式。只在文本上使用它。这是因为正则表达式只能匹配正则语言，无法处理任意深度的嵌套，而HTML是一种递归（可嵌套）的语言。您需要一个递归下降解析器来正确处理HTML。

首先是DOM的一些有用特性：

- document.body是页面内容在DOM中的根节点。
- DOM的每个节点都有一个childNodes集合（类数组），甚至包括注释、文本和属性节点。
- 元素节点（如<span>或<p>）本身不包含文本，而是包含保存文本的文本节点。
- 所有节点都具有nodeType属性，文本节点的类型为3。
- 所有节点都有一个nodeValue属性，该属性根据节点的类型保存不同的内容。对于文本节点，nodeValue包含实际文本。

因此，使用上面的信息，我们可以用一个span来包围所有单词。

首先是一个简单的实用程序函数，它允许我们处理DOM：

/**
 * Recursive-descent traversal of a DOM subtree.
 *
 * Calls `callback` on `node` and then on each of its descendants, parent
 * before children. A node whose nodeName is 'SCRIPT' is not visited, and
 * neither is anything beneath it, so JavaScript source is left alone.
 *
 * @param {Node} node - Root of the subtree to visit.
 * @param {function(Node)} callback - Invoked once for every visited node.
 */
function walkDOM (node,callback) {
    if (node.nodeName === 'SCRIPT') {
        return; // ignore javascript
    }

    callback(node);

    // childNodes is looked up again on every pass, so the loop always works
    // against the node's current list of children.
    var index = 0;
    while (index < node.childNodes.length) {
        walkDOM(node.childNodes[index],callback);
        index += 1;
    }
}

请注意，我分两个步骤来完成这项工作，以避免将单词包装两次

现在我们可以处理文本节点：

// Pass 1: gather every text node under <body> before anything is changed.
// The walk only records nodes; it does not modify the document.
var textNodes = [];
walkDOM(document.body, function (candidate) {
    var isText = candidate.nodeType == 3; // 3 is the nodeType of text nodes
    if (isText) {
        textNodes.push(candidate);
    }
});

// simple utility functions to avoid a lot of typing:

/**
 * Inserts new_element as a sibling placed directly before element,
 * using element's parent to perform the insertion.
 *
 * @param {Node} new_element - Node to insert.
 * @param {Node} element - Existing node that new_element goes in front of.
 */
function insertBefore (new_element, element) {
    var container = element.parentNode;
    container.insertBefore(new_element,element);
}
/**
 * Detaches element from the document by asking its parent to remove it.
 *
 * @param {Node} element - Node to remove; it must currently have a parent.
 */
function removeElement (element) {
    var container = element.parentNode;
    container.removeChild(element);
}
/**
 * Builds a <span> element whose only child is a text node holding txt.
 *
 * Every own enumerable property of attrs is assigned onto the element as a
 * property of the same name (e.g. {id: 5} sets span.id). Inherited
 * properties of attrs are ignored.
 *
 * @param {string} txt - Text to place inside the span.
 * @param {Object} [attrs] - Properties to copy onto the new span.
 * @returns {HTMLSpanElement} The newly created span (not yet in the document).
 */
function makeSpan (txt, attrs) {
    var span = document.createElement('span');
    var key;

    for (key in attrs) {
        if (!attrs.hasOwnProperty(key)) {
            continue; // skip anything that comes from the prototype chain
        }
        span[key] = attrs[key];
    }

    span.appendChild(makeText(txt));
    return span;
}
/**
 * Shorthand for document.createTextNode.
 *
 * @param {string} txt - Content of the text node.
 * @returns {Text} A new text node that is not yet attached to the document.
 */
function makeText (txt) {
    var textNode = document.createTextNode(txt);
    return textNode;
}

// Pass 2: replace each collected text node with one <span> per word.
// Spans receive sequential numeric ids, starting at 1, in the order the
// text nodes appear in textNodes.
var id_count = 1;
for (var i=0; i<textNodes.length; i++) {
    var n = textNodes[i];
    var txt = n.nodeValue;

    // Leave empty and whitespace-only nodes (e.g. the indentation between
    // tags) untouched: they hold no word, so wrapping them would only add
    // empty spans and waste ids.
    if (!/\S/.test(txt)) {
        continue;
    }

    // Split into alternating runs of whitespace and non-whitespace. Unlike
    // split(' '), this treats tabs and newlines as separators too, never
    // yields empty "words" for leading/trailing/repeated spaces, and lets
    // the original spacing be copied across unchanged.
    var tokens = txt.match(/\s+|\S+/g);

    for (var j=0; j<tokens.length; j++) {
        if (/\S/.test(tokens[j])) {
            // A word: wrap it in its own numbered span.
            insertBefore(makeSpan(tokens[j],{id:id_count++}),n);
        } else {
            // A whitespace run: keep it as plain text between the spans.
            insertBefore(makeText(tokens[j]),n);
        }
    }
    // Now remove the original text node:
    removeElement(n);
}

给你。它很麻烦，但100%安全——它永远不会损坏页面中的其他标记或javascript。我上面的许多实用函数都可以用您选择的库替换。但是不要走捷径，把整个文档当作一个巨大的HTML字符串来处理，除非您愿意用纯javascript编写一个HTML解析器。

这种处理总是比您想象的要复杂得多。以下代码将包装与\S+（非空白字符序列）匹配的字符序列，而不会包装与\s+（空白字符序列）匹配的字符序列。

它还允许跳过某些元素的内容，例如脚本、输入、按钮、选择等。请注意，childNodes返回的live集合必须转换为静态数组，否则会受到添加的新节点的影响。另一种方法是使用element.querySelectorAll，但childNodes有更广泛的支持

/**
 * Takes a snapshot of an array-like object as a real array.
 *
 * Reads obj.length once, then copies obj[0] .. obj[length - 1] in order.
 * The result is a new array that does not change when obj later does.
 *
 * @param {Object} obj - Anything with a length and numeric indices.
 * @returns {Array} A new array holding the indexed values of obj.
 */
function toArray(obj) {
  var copy = [];
  var total = obj.length;
  var idx = 0;

  while (idx < total) {
    copy[idx] = obj[idx];
    idx += 1;
  }

  return copy;
}


// Wrap the words of an element and its descendant elements in spans.
//
// Every run of non-whitespace characters found in a text node is moved into
// its own <span class="foo"> whose id is taken from a counter shared by all
// calls (first id is 0). Whitespace runs are kept as plain text between the
// spans. Child elements are processed recursively, except for those listed
// in `skip`. Empty text nodes are left untouched.
//
// el: element to start from. If it is missing, or has no parentNode, the
//     document body is used instead.
var wrapContent = (function() {
  // Shared across calls so ids keep increasing over the whole document
  var count = 0;

  // True for any string containing at least one non-whitespace character
  var hasNonSpace = /\S/;

  // Tag names (lower case) of elements to skip, there are more to add.
  // 'style' is listed because wrapping CSS text in spans corrupts the sheet.
  var skip = {'script':'', 'style':'', 'button':'', 'input':'', 'select':'',
              'textarea':'', 'option':''};

  // Own-property test, so inherited names such as 'constructor' on
  // Object.prototype are never mistaken for entries of skip
  var hasOwn = Object.prototype.hasOwnProperty;

  return function(el) {

    // If element provided, start there, otherwise use the body
    el = el && el.parentNode? el : document.body;

    // Get all child nodes as a static array: childNodes is live and would
    // otherwise change underneath the loop as spans are inserted
    var nodes = toArray(el.childNodes);
    var node, text, frag, sp;

    // For each child node...
    for (var i=0, iLen=nodes.length; i<iLen; i++) {
      node = nodes[i];

      // If it's an element, recurse into it unless it is to be skipped
      if (node.nodeType === 1) {
        if (!hasOwn.call(skip, node.tagName.toLowerCase())) {
          wrapContent(node);
        }

      // If it's a text node, wrap words
      } else if (node.nodeType === 3) {

        // Match sequences of whitespace and non-whitespace
        text = node.data.match(/\s+|\S+/g);

        // An empty text node yields no match: there is nothing to wrap and
        // nothing to replace it with, so leave it exactly as it is
        if (!text) {
          continue;
        }

        // Build the replacement off-document in a fragment
        frag = document.createDocumentFragment();

        for (var j=0, jLen=text.length; j<jLen; j++) {

          // If not whitespace, wrap it and append to the fragment
          if (hasNonSpace.test(text[j])) {
            sp = document.createElement('span');
            sp.id = count++;
            sp.className = 'foo';
            sp.appendChild(document.createTextNode(text[j]));
            frag.appendChild(sp);

          // Otherwise, just append it to the fragment
          } else {
            frag.appendChild(document.createTextNode(text[j]));
          }
        }

        // Replace the original node with the fragment built for it
        node.parentNode.replaceChild(frag, node);
      }
    }
  };
}());

// Run wrapContent once the page has loaded. An onload handler is called
// with the load event rather than an element; presumably that object has no
// parentNode, so the `el && el.parentNode` check in wrapContent falls back
// to document.body. Note that assigning onload replaces any handler that
// was previously stored in window.onload.
window.onload = wrapContent;

以上仅针对最常见的情况，它需要更多的工作和彻底的测试。

只是好奇，您想稍后对spans做些什么？那么已经位于a标签中的单词呢？@ROX：我们希望稍后能够使用Javascript，在这个特定的id上调用innerHTML方法来改变span的背景。@但愿你能试试看，也许这正是你想要的；警告：此处的代码未经测试，但理论上是正确的。在将其引入生产代码之前，请先测试它是否存在bug。注意：由于walkDOM是递归的，因此可以从您选择的元素/节点开始，而不是从document.body开始。我认为在循环之前，您需要将childNodes转换为某种数组或静态对象。它是活动的（live），因此当您添加节点时，它的长度会发生变化，而这种变化与添加节点和递增i并不同步。它需要跳过的也不仅仅是脚本元素，例如textarea、option和button元素的内容不是标记。与其自己遍历DOM，不如使用TreeWalker。@RobG：这就是为什么我要分两个阶段进行操作的原因。当我添加节点时，我已经不再在childNodes上迭代了。哦，酷，对我来说，没有理由的否决投票就像+10——我今天要检查这个解决方案！昨天我不得不去睡觉。