Warning: file_get_contents(/data/phpspider/zhask/data//catemap/9/javascript/388.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Javascript 清理HTML标记属性_Javascript_Regex_Html Parsing_Tag Soup - Fatal编程技术网

Javascript 清理HTML标记属性

Javascript 清理HTML标记属性,javascript,regex,html-parsing,tag-soup,Javascript,Regex,Html Parsing,Tag Soup,我需要使用JavaScript浏览大量HTML,将属性引号语法调整为全部双引号。我不需要担心“disabled”或“selected”等仅关键点的属性 以下是我当前的测试用例: var text = "<input class=daily_input type='hidden' size='1' value=3 disabled />"; var regex = /<([\w]+)([^>]+)([\w]+)=['"]?([^'\s|"\s|\s]*)['"]?([^&g

我需要使用 JavaScript 处理大量 HTML,将属性的引号写法统一调整为双引号。对于 "disabled" 或 "selected" 这类只有属性名、没有属性值的属性,我不需要处理。

以下是我当前的测试用例:

// Sample tag mixing unquoted, single-quoted and value-less attributes.
var text = "<input class=daily_input type='hidden' size='1' value=3 disabled />";
// Matches one whole tag per hit. Group 2, ([^>]+), is greedy and swallows everything up to
// the LAST "name=" pair in the tag, so groups 3/4 only ever capture the final attribute.
// Inside the [...] of group 4 the "|" is a literal pipe character, not an alternation.
var regex = /<([\w]+)([^>]+)([\w]+)=['"]?([^'\s|"\s|\s]*)['"]?([^>]+)>/gi;
// Rebuilds the tag with only the captured (last) attribute value wrapped in double quotes.
text = text.replace( regex, "<$1$2$3=\"$4\"$5>" );

console.log(text); // logs <input class=daily_input type='hidden' size='1' value="3" disabled />
我如何更改它以捕获和调整每个属性,而不仅仅是最后一个属性?已经摆弄了很长一段时间没有结果了。感谢您的帮助

// Pass 1 turns single-quoted values into double-quoted ones ('x' -> "x");
// pass 2 wraps values that do not start with a quote in double quotes.
// replace() returns a new string, so the result must be assigned (e.g. text = text.replace(...)).
text.replace(/='([^']*)'/g, '="$1"').replace(/=([^"'][^ >]*)/g, '="$1"')
或(一个替换):


我知道这是一个迟来的答案,但你始终可以使用 sanitize-html(即下面代码中 require 的库)。它是为 Node 编写的,但我非常确定你也可以运行该库(或者你自己的代码)

注意,它使用lodash,因此如果您已经在使用它,那么您可能需要调整包

这个例子包含的内容比你需要的更多……我使用这个库来清理输入的代码,并将其转换为 Markdown 以存储在数据库中。(原句此后的内容在抓取时被截断。)

//convert/html-to-filtered-markdown.js
"严格使用",;
var sanitize=require('sanitize-html')//https://www.npmjs.org/package/sanitize-html
,toMarkdown=require('to-markdown')。toMarkdown
;
module.exports=函数转换器HtmlToFilteredMarkdown(输入,选项){
如果(!input)返回“”;
选项=选项| |{};
//基本清理、规范化行尾、规范化/减少空白和额外的行尾
var response=(输入| |“”).toString().trim()
.replace(/(\r\n |\r |\n)/g,'\n')//规范化行尾
.replace(/“/g,”)//删除花哨的引号
.replace(/“/g,”)//删除花哨的引号
.replace(//'/g,'\'')//删除花哨的引号
.replace(//'/g,'\'')//删除花哨的引号
;
//清理html输入
响应=净化(响应{
//不允许使用表元素
允许的标签:['h1','h2','h3','h4','h5','h6','blockquote','p','a','ul','ol','nl','li','b','i','strong','em','STREK','code','hr','br','div','table','THAD','caption','tbody','tr','th td','pre'],
//制作订单列表
转换标签:{
‘ol’:‘ul’
}
}).replace(//\r\n |\r |\n/g,'\n')//规范化行尾;
如果(!options.tables){
response=response.replace(//[\s\n]*\[\s\n]*/g,'\n\n')//将divs/tables块替换为段落
}
//进一步清理输入
响应=响应
.将(//[\s\n]*\[\s\n]*/g,'\n\n')///divs和p替换为简单的多行表达式
.replace(//\>\\\\\\\\\\\/g,'\n\n')//cleanup在结束标记后,例如:…\n\n将通过消毒剂减少
.replace(/\\s+\\s+\n?/,'>\n')//在标记关闭后删除空间
.replace(/\&?nbsp\;?/g',)//将nbsp还原为空格

.replace(/\首先,谢谢!这很有效。我唯一的问题是是否可以在一个replace()中完成所有操作。HTML文件可能非常大,效率是关键。不过,我会处理它。@thechriskelley:在一个“replace”中添加了一个解决方案
// Two-pass version: single-quoted values first, then unquoted values.
// replace() returns a new string; assign the result (e.g. text = text.replace(...)).
text.replace(/='([^']*)'/g, '="$1"').replace(/=([^"'][^ >]*)/g, '="$1"')
// Single-pass version: only one of the two capture groups participates in any given match
// and the other expands to an empty string, so the replacement must emit both $1 and $2.
// (Emitting only $1 would turn every unquoted value into ="".)
text.replace(/='([^']*)'|=([^"'][^ >]*)/g, '="$1$2"')
// convert/html-to-filtered-markdown.js

'use strict';

var sanitize = require('sanitize-html') //https://www.npmjs.org/package/sanitize-html
    ,toMarkdown = require('to-markdown').toMarkdown
    ;

module.exports = function convertHtmlToFilteredMarkdown(input, options) {
  if (!input) return '';

  options = options || {};

  //basic cleanup, normalize line endings, normalize/reduce whitespace and extra line endings
  var response = (input || '').toString().trim()
    .replace(/(\r\n|\r|\n)/g, '\n') //normalize line endings
    .replace(/“/g, '"') //remove fancy quotes
    .replace(/”/g, '"') //remove fancy quotes
    .replace(/‘/g, '\'') //remove fancy quotes
    .replace(/’/g, '\'') //remove fancy quotes
    ;

  //sanitize html input
  response = sanitize(response, {
    //don't allow table elements
    allowedTags: [ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'blockquote', 'p', 'a', 'ul', 'ol', 'nl', 'li', 'b', 'i', 'strong', 'em', 'strike', 'code', 'hr', 'br', 'div', 'table', 'thead', 'caption', 'tbody', 'tr', 'th', 'td', 'pre' ],

    //make orderd lists
    transformTags: {
      'ol': 'ul'
    }
  }).replace(/\r\n|\r|\n/g,'\n') //normalize line endings;

  if (!options.tables) {
    response = response.replace(/[\s\n]*\<(\/?)(table|thead|tbody|tr|th|td)\>[\s\n]*/g, '\n\n') //replace divs/tables blocks as paragraphs
  }

  //cleanup input further
  response = response
    .replace(/[\s\n]*\<(\/?)(div|p)\>[\s\n]*/g, '\n\n') //divs and p's to simple multi-line expressions
    .replace(/\>#/g, '\n\n#') //cleanup #'s' after closing tag, ex: <a>...</a>\n\n# will be reduced via sanitizer
    .replace(/\\s+\</,'<') //remove space before a tag open
    .replace(/\>\s+\n?/,'>\n') //remove space after a tag close
    .replace(/\&?nbsp\;?/g,' ') //revert nbsp to space
    .replace(/\<\h[12]/g,'<h3').replace(/\<\/\h[12]/g,'</h3') //reduce h1/h2 to h3
    ;

  //convert response to markdown
  response = toMarkdown(response);

  //normalize line endings
  response = response
    .replace(/(?:^|\n)##?[\b\s]/g,'\n### ') //reduce h1 and h2 to h3
    .replace(/(\r\n|\r|\n)/g, '\n') //normalize line endings
    .trim()

  return response + '\n';
}