用 Swift 解析 HTML 表格

用 Swift 解析 HTML 表格,html,swift,ios8,Html,Swift,Ios8,我正在尝试将此信息从网页导入 iOS 应用程序。HTML 代码: 我们的目标是以类似这样的格式输出: Production Line - Time - Delay Production Line 1 - 9:00 minutes (10 min) - No delay Production Line 2 - 7:57 minutes (4 min) - +3:57 编辑/更新:Swift 4.x extension

我正在尝试将此信息从网页导入iOS应用程序

HTML代码:

我们的目标是以类似这样的格式输出

Production Line  -      Time  -                     Delay
Production Line 1 -     9:00 minutes  (10 min) -    No delay
Production Line 2 -     7:57 minutes  (4 min) -     +3:57
编辑/更新:Swift 4.x

extension Data {
    /// The receiver imported as an HTML document with UTF-8 character encoding,
    /// or `nil` when `NSAttributedString` throws while reading it.
    var html2AttributedString: NSAttributedString? {
        do {
            let attributed = try NSAttributedString(
                data: self,
                options: [
                    .documentType: NSAttributedString.DocumentType.html,
                    .characterEncoding: String.Encoding.utf8.rawValue
                ],
                documentAttributes: nil
            )
            return attributed
        } catch {
            // The error is intentionally dropped; callers only see `nil`.
            return nil
        }
    }

    /// The plain text of `html2AttributedString`, or an empty string when the
    /// HTML could not be imported.
    var html2String: String {
        guard let attributed = html2AttributedString else {
            return ""
        }
        return attributed.string
    }
}


XPath中存在一些问题,请参阅以下代码:

// Sample HTML built by concatenating string literals: a table whose first row is a
// single header cell (colspan 3) followed by three data rows. Each data row holds
// four <b> elements (line name, time, minutes in parentheses, delay).
let html = "<table border='0' cellpadding='3' cellspacing='0' width='85%'><tr><td width='100%' colspan='3' bgcolor='#C9C9E7'><b>Update as of 3:57:00 PM (CDT) Thu., Apr. 16, 2015</b><br></td></tr><tr>" +
        "<td width='50%' bgcolor='#FFFFFF'><b>Production Line 1</b></td>" +
        "<td width='35%' bgcolor='#FFFFFF'><b>9:00 minutes</b><b>(10 min)&nbsp;</b></td>" +
        "<td width='15%' bgcolor='#FFFFFF'><b>No delay</b></td>" +
        "</tr><tr>" +
        "<td width='50%' bgcolor='#FFFFFF'><b>Production Line 2</b></td>" +
        "<td width='35%' bgcolor='#FFFFFF'><b>7:57 minutes  </b><b>(4 min)&nbsp;</b></td>" +
        "<td width='15%' bgcolor='#FFFFFF'><b>+3:57</b></td>" +
        "</tr><tr>" +
        "<td width='50%' bgcolor='#FFFFFF'><b>Production Line 3</b></td>" +
        "<td width='35%' bgcolor='#FFFFFF'><b>10:35 minutes  </b><b>(8 min)&nbsp;</b></td>" +
        "<td width='15%' bgcolor='#FFFFFF'><b>+2:35</b></td>" +
    "</tr></table>"



    // Receives any error reported by the parser initializer below.
    var err : NSError?
    // NOTE(review): HTMLParser is not defined in this snippet; presumably a
    // third-party HTML parsing library. Confirm which one and which version.
    var parser  = HTMLParser(html: html, error: &err)
    // On a parse error, print it and terminate the process with exit status 1.
    if err != nil {
        println(err)
        exit(1)
    }        

    // Presumably the root node of the parsed document (not visible here; verify
    // against the parser's API). The XPath query below is optional-chained on it.
    var table = parser.html

    // Select every <b> that is a direct child of a <td> whose position() is
    // greater than 1; the intent is to skip the header cell.
    // NOTE(review): in standard XPath, position() is evaluated among the sibling
    // <td> elements of each row, so this may also skip the first cell of every
    // data row. Confirm the result against the parser actually used.
    if let inputNodes = table?.xpath("//td[position() > 1]/b") {

        // Column header line for the printed table.
        println("Production Line  -      Time  -                     Delay")            
        for (index, node) in enumerate(inputNodes) {
            // Start a new output row before every group of four nodes.
            if index % 4 == 0 {
                println("\n")
            }
            // Print the node's text followed by a separator, staying on the same line.
            print(node.contents + "-    ")
        }
    }

您可以根据需要对输出进行个性化设置。我希望这对你有帮助

你真的没有其他途径获取这些数据吗?HTML不是数据交换格式,而是用于内容布局和显示的标记语言。根据我的经验,从任何网站(甚至你自己的网站)解析HTML都是一件痛苦的事情,因为每一次设计的改变都会把你的导入弄得一团糟。不,我没有别的办法,我也希望有;不过这个html的布局不会改变。如果有人能帮我把它转换成CSV、XML或其他我可以放在表格中的东西,我会很高兴。我看到了这个扩展:)@MwcsMac关于XPath查询,你的代码错了。我希望我能把这两个都标记为正确答案。我喜欢这两种方法。我希望我能把这两种方法都标为正确答案。两种方法我都喜欢。
extension Data {
    /// Interprets the receiver as UTF-8 encoded HTML and returns the imported
    /// attributed string, or `nil` if `NSAttributedString` throws during import.
    var html2AttributedString: NSAttributedString? {
        let imported: NSAttributedString?
        do {
            imported = try NSAttributedString(
                data: self,
                options: [
                    .documentType: NSAttributedString.DocumentType.html,
                    .characterEncoding: String.Encoding.utf8.rawValue
                ],
                documentAttributes: nil
            )
        } catch {
            // Import failures are reported to callers as `nil` only.
            imported = nil
        }
        return imported
    }

    /// Plain-text rendering of the HTML; an empty string when the import failed.
    var html2String: String {
        if let attributed = html2AttributedString {
            return attributed.string
        }
        return ""
    }
}
// Sample input: the UTF-8 bytes of an HTML fragment held in a multi-line string literal.
// Each table has a header row (a single <td> with colspan 3) followed by three data
// rows of three cells (line name, time, delay).
// NOTE(review): the same table markup appears twice back to back in this literal;
// confirm whether the duplication is intentional.
let data = Data("""
<table border="0" cellpadding="3" cellspacing="0" width="85%"><tr><td width="100%" colspan="3" bgcolor="#C9C9E7"><b>Update as of 3:57:00 PM (CDT) Thu., Apr. 16, 2015</b><br></td></tr><tr>
<td width="50%" bgcolor="#FFFFFF">Production Line 1</td>
<td width="35%" bgcolor="#FFFFFF">9:00 minutes  (10 min)&nbsp;</td>
<td width="15%" bgcolor="#FFFFFF">No delay</td>
</tr><tr>
<td width="50%" bgcolor="#FFFFFF"><b>Production Line 2</b></td>
<td width="35%" bgcolor="#FFFFFF"><b>7:57 minutes  </b><b>(4 min)&nbsp;</b></td>
<td width="15%" bgcolor="#FFFFFF"><b>+3:57</b></td>
</tr><tr>
<td width="50%" bgcolor="#FFFFFF"><b>Production Line 3</b></td>
<td width="35%" bgcolor="#FFFFFF"><b>10:35 minutes  </b><b>(8 min)&nbsp;</b></td>
<td width="15%" bgcolor="#FFFFFF"><b>+2:35</b></td>
</tr></table><table border="0" cellpadding="3" cellspacing="0" width="85%"><tr><td width="100%" colspan="3" bgcolor="#C9C9E7"><b>Update as of 3:57:00 PM (CDT) Thu., Apr. 16, 2015</b><br></td></tr><tr>
<td width="50%" bgcolor="#FFFFFF">Production Line 1</td>
<td width="35%" bgcolor="#FFFFFF">9:00 minutes  (10 min)&nbsp;</td>
<td width="15%" bgcolor="#FFFFFF">No delay</td>
</tr><tr>
<td width="50%" bgcolor="#FFFFFF"><b>Production Line 2</b></td>
<td width="35%" bgcolor="#FFFFFF"><b>7:57 minutes  </b><b>(4 min)&nbsp;</b></td>
<td width="15%" bgcolor="#FFFFFF"><b>+3:57</b></td>
</tr><tr>
<td width="50%" bgcolor="#FFFFFF"><b>Production Line 3</b></td>
<td width="35%" bgcolor="#FFFFFF"><b>10:35 minutes  </b><b>(8 min)&nbsp;</b></td>
<td width="15%" bgcolor="#FFFFFF"><b>+2:35</b></td>
</tr></table>
""".utf8)
// Convert the HTML to plain text and split it into lines.
let output = data.html2String
let components = output.components(separatedBy: .newlines)

// Layout this loop relies on: line 0 is the table header, followed by groups of
// three consecutive lines (production line, time, delay).
// NOTE(review): this depends on how NSAttributedString renders the table cells;
// confirm against real output.
let firstRowIndex = 1
let fieldsPerRow = 3
let maxRows = 3

// Exclusive upper bound for the row start index: at most `maxRows` rows, and never
// past the point where a complete row (start index + 2) would fall outside
// `components`. Without the clamp, a short or empty conversion result (for example
// when the HTML import fails and `output` is "") traps with index out of range.
let rowStartLimit = min(firstRowIndex + maxRows * fieldsPerRow - 1,
                        components.count - fieldsPerRow + 1)

for index in stride(from: firstRowIndex, to: rowStartLimit, by: fieldsPerRow) {
    let line = components[index]
    let time = components[index + 1]
    let delay = components[index + 2]
    print([line, time, delay].joined(separator: " - "))
}
// Sample HTML built by concatenating string literals: a table whose first row is a
// single header cell (colspan 3) followed by three data rows. Each data row holds
// four <b> elements (line name, time, minutes in parentheses, delay).
let html = "<table border='0' cellpadding='3' cellspacing='0' width='85%'><tr><td width='100%' colspan='3' bgcolor='#C9C9E7'><b>Update as of 3:57:00 PM (CDT) Thu., Apr. 16, 2015</b><br></td></tr><tr>" +
        "<td width='50%' bgcolor='#FFFFFF'><b>Production Line 1</b></td>" +
        "<td width='35%' bgcolor='#FFFFFF'><b>9:00 minutes</b><b>(10 min)&nbsp;</b></td>" +
        "<td width='15%' bgcolor='#FFFFFF'><b>No delay</b></td>" +
        "</tr><tr>" +
        "<td width='50%' bgcolor='#FFFFFF'><b>Production Line 2</b></td>" +
        "<td width='35%' bgcolor='#FFFFFF'><b>7:57 minutes  </b><b>(4 min)&nbsp;</b></td>" +
        "<td width='15%' bgcolor='#FFFFFF'><b>+3:57</b></td>" +
        "</tr><tr>" +
        "<td width='50%' bgcolor='#FFFFFF'><b>Production Line 3</b></td>" +
        "<td width='35%' bgcolor='#FFFFFF'><b>10:35 minutes  </b><b>(8 min)&nbsp;</b></td>" +
        "<td width='15%' bgcolor='#FFFFFF'><b>+2:35</b></td>" +
    "</tr></table>"



    // Receives any error reported by the parser initializer below.
    var err : NSError?
    // NOTE(review): HTMLParser is not defined in this snippet; presumably a
    // third-party HTML parsing library. Confirm which one and which version.
    var parser  = HTMLParser(html: html, error: &err)
    // On a parse error, print it and terminate the process with exit status 1.
    if err != nil {
        println(err)
        exit(1)
    }        

    // Presumably the root node of the parsed document (not visible here; verify
    // against the parser's API). The XPath query below is optional-chained on it.
    var table = parser.html

    // Select every <b> that is a direct child of a <td> whose position() is
    // greater than 1; the intent is to skip the header cell.
    // NOTE(review): in standard XPath, position() is evaluated among the sibling
    // <td> elements of each row, so this may also skip the first cell of every
    // data row. Confirm the result against the parser actually used.
    if let inputNodes = table?.xpath("//td[position() > 1]/b") {

        // Column header line for the printed table.
        println("Production Line  -      Time  -                     Delay")            
        for (index, node) in enumerate(inputNodes) {
            // Start a new output row before every group of four nodes.
            if index % 4 == 0 {
                println("\n")
            }
            // Print the node's text followed by a separator, staying on the same line.
            print(node.contents + "-    ")
        }
    }
Production Line  -      Time  -                     Delay

Production Line 1-    9:00 minutes-    (10 min) -    No delay-    

Production Line 2-    7:57 minutes  -    (4 min) -    +3:57-    

Production Line 3-    10:35 minutes  -    (8 min) -    +2:35-