Xml 解析HTML并在Google脚本中使用xPath请求_Xml_Xpath_Google Apps Script_Google Sheets Api

Xml 解析HTML并在Google脚本中使用xPath请求

xml xpath google-apps-script

Xml 解析HTML并在Google脚本中使用xPath请求,xml,xpath,google-apps-script,google-sheets-api,Xml,Xpath,Google Apps Script,Google Sheets Api,在我的电子表格中，我需要能够从网站中提取HTML，然后对其使用xPath请求。为此，我编写了以下代码： var artistPages = []; _.forEach(artistLinks, function (d, i) { var pageHtml = readRemoteXML(d); Logger.log(xPath("/html/body[@class='margin50']/div[@class='container main-page']/div[@class='row'

在我的电子表格中，我需要能够从网站中提取HTML，然后对其使用xPath请求。为此，我编写了以下代码：

// Declared here but not populated anywhere in this snippet.
var artistPages = [];

// For every artist link: download the page, parse it, run the album query
// against the parsed document and log whatever the query returns.
_.forEach(artistLinks, function (artistLink, position) {
  var artistDocument = readRemoteXML(artistLink);
  var albumQuery = "/html/body[@class='margin50']/div[@class='container main-page']/div[@class='row']/div[@class='col-xs-12 col-md-6 text-center']/div[@id='listAlbum']/div[@class='album']";
  var albumResult = xPath(albumQuery, artistDocument);
  Logger.log(albumResult);
});

它使用XPathXMLParsing库，可在以下位置找到：

这些功能的代码如下所示：

/**
 * Entry point of the path lookup: resolves `path` starting from the root
 * element of a parsed document.
 *
 * @param {string} path Slash-separated path handed on to xPathStep.
 * @param {Object} xmlFile Parsed document exposing getRootElement().
 * @returns {*} Whatever xPathStep returns for the root element.
 */
function xPath(path, xmlFile) {
  return xPathStep(path, xmlFile.getRootElement());
}

/**
 * Resolves a simplified, slash-separated path against an element (or a list
 * of elements) one step at a time and returns the text or attribute value
 * found at the end of the path.
 *
 * Step forms, checked in this order:
 *   - `tag[N]`   : entry N (0-based) of node.getChildren(tag);
 *   - empty step : produced by a leading or doubled slash; selects ALL
 *                  children named by the following segment, which is
 *                  consumed together with the empty step;
 *   - `@name`    : value of attribute `name` on the current node. Any step
 *                  that merely contains `@name` lands here, so attribute
 *                  predicates such as `tag[@a='b']` are NOT applied as
 *                  filters;
 *   - otherwise  : node.getChild(step).
 *
 * Missing children, out-of-range indexes and missing attributes resolve to
 * `undefined` instead of throwing.
 *
 * @param {string} path Remaining path to resolve.
 * @param {Object|Object[]} node Element-like object exposing getChildren,
 *     getChild, getAttribute and getText, or an array of such objects.
 * @returns {*} Text or attribute value, `undefined` when nothing matches, or
 *     an array of results when a list of nodes was traversed.
 */
function xPathStep(path, node) {
  // A list of nodes is resolved entry by entry, producing a list of results.
  if (Array.isArray(node)) {
    return node.map(function (singleNode) {
      return xPathStep(path, singleNode);
    });
  }

  // Nothing to descend into (missing child, out-of-range index, ...).
  if (!node) {
    return;
  }

  var paths = path.split('/');
  var firstChild = paths[0];
  var remainingPath = paths.slice(1).join('/');

  var indexMatch = firstChild.match(/(\w+)\[(\d+)\]/);
  var attributeMatch = firstChild.match(/@(\w+)/);

  var nextNode;
  var tagName;

  if (indexMatch) {
    tagName = indexMatch[1];
    nextNode = node.getChildren(tagName)[Number(indexMatch[2])];
  } else if (firstChild === '') {
    // If another name follows, use it as the tag to match and drop it from
    // the remaining path.
    tagName = '';
    if (paths.length > 1) {
      tagName = paths[1];
      remainingPath = paths.slice(2).join('/');
    }
    nextNode = node.getChildren(tagName);
  } else if (attributeMatch) {
    // Attribute steps are terminal. getAttribute yields null for an unknown
    // attribute, and an empty-string value is still a valid result.
    var attribute = node.getAttribute(attributeMatch[1]);
    return attribute ? attribute.getValue() : undefined;
  } else {
    nextNode = node.getChild(firstChild);
  }

  if (remainingPath !== '') {
    return xPathStep(remainingPath, nextNode);
  }

  // End of the path: read the text, tolerating a missing node and handling a
  // child list the same way the array branch above does.
  var textOf = function (element) {
    return element ? (element.getText && element.getText()) : undefined;
  };
  return Array.isArray(nextNode) ? nextNode.map(textOf) : textOf(nextNode);
}

/**
 * Downloads the resource at the given URL and parses its body as XML.
 *
 * @param {string} xmlFileUrl URL passed to UrlFetchApp.fetch.
 * @returns {Object} Result of XmlService.parse for the response text.
 */
function readRemoteXML(xmlFileUrl) {
  var response = UrlFetchApp.fetch(xmlFileUrl);
  var responseText = response.getContentText();
  return XmlService.parse(responseText);
}

唯一的问题是，该库只能处理格式良好的纯XML，而我要解析的是HTML，这意味着解析器会遇到各种问题，例如标记未关闭。

`UrlFetchApp.fetch().getContentText()` 返回的是一个字符串，需要先将其解析为XML对象，才能在其上运行xPath查询。因此，我实际上想要的是：一种运行xPath查询的替代方法，或者一种在XmlService解析之前修复所有XML问题的方法。提前谢谢你！

你不能使用吗？jQuery可以用于google脚本吗？有一些技巧，我找不到任何jQuery google脚本包，你说的“一些技巧”是什么意思？我可以在回答中详细说明，但只有在与你相关的情况下。那么，解析HTML就足够了吗？