Java 如何使用HTMLDocument查找属性？_Java_Html_Dom_Attributes

Java 如何使用HTMLDocument查找属性？

java html dom

Java 如何使用HTMLDocument查找属性？,java,html,dom,attributes,Java,Html,Dom,Attributes,HTML的术语可能与XML的术语不同，但下面是一个HTML文档，从中检索属性。这里属性a1、a2、a3是主体标记的一部分 <html> <head> Hello World </head> <body a1="ABC" a2="3974" a3="A1B2"> <------These attributes <H1>Start He

HTML的术语可能与XML的术语不同，但下面是一个HTML文档，从中检索属性。这里属性a1、a2、a3是主体标记的一部分

<html>
  <head>
      Hello World
  </head>
  <body a1="ABC" a2="3974" a3="A1B2">     <------These attributes
    <H1>Start Here<H1>
    <p>This is the body</p>
  </body>
</html>


你好，世界
我不了解HtmlKit，但是使用正则表达式（regex）可以获得类似的结果：
/**
 * Demonstrates extracting the attributes of the {@code <body>} tag from an
 * HTML string using regular expressions, printing one
 * {@code Key :: name , Value "value"} line per attribute to standard output.
 *
 * <p>This is a lightweight approach for simple, well-formed tags; it only
 * recognises double-quoted attribute values.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    String html = "<html>\r\n" + 
            "  <head>\r\n" + 
            "      Hello World\r\n" + 
            "  </head>\r\n" + 
            "  <body a1=\"ABC\" a2=\"3974\" a3=\"A1B2\">\r\n" + 
            "    <H1>Start Here<H1>\r\n" + 
            "    <p>This is the body</p>\r\n" + 
            "  </body>\r\n" + 
            "</html>";

    // Both patterns are loop-invariant, so compile them once up front.
    Pattern regexBodyPattern = Pattern.compile("<body[^>]*>", Pattern.MULTILINE);
    // Group 1: attribute name (at least one character, no whitespace or '=').
    // Group 2: the double-quoted value, quotes included. Any character except
    // a double quote is accepted, so values containing spaces, hyphens or
    // other punctuation are no longer skipped.
    Pattern regexBodyAttrPattern = Pattern.compile("([^\\s=]+)=(\"[^\"]*\")", Pattern.MULTILINE);

    Matcher matcher = regexBodyPattern.matcher(html);
    while (matcher.find()) {
        String bodyTag = matcher.group();
        Matcher attrMatcher = regexBodyAttrPattern.matcher(bodyTag);
        while (attrMatcher.find()) {
            System.out.println("Key :: " + attrMatcher.group(1) + " , Value " + attrMatcher.group(2));
        }
    }
}

也许这会有帮助：
import java.io.*;
import java.net.*;
import java.util.*;
import javax.swing.*;
import javax.swing.text.*;
import javax.swing.text.html.*;

/**
 * Command-line utility that parses an HTML document with Swing's
 * {@link HTMLEditorKit} and prints every attribute of each element named
 * {@code body}, one {@code name : value} pair per line.
 *
 * <p>Usage: {@code java AttributeHTML <file-or-url>}
 */
class AttributeHTML
{
    /**
     * Parses the document named by {@code args[0]} and prints the attributes
     * of its {@code body} element(s) to standard output.
     *
     * <p>The process exits with status 0 on success and 1 when the argument
     * is missing or the document cannot be read or parsed.
     *
     * @param args a single element: a local file name or an http(s) URL
     */
    public static void main(String[] args)
    {
        if (args.length < 1)
        {
            System.err.println("Usage: java AttributeHTML <file-or-url>");
            System.exit(1);
            return;
        }

        EditorKit kit = new HTMLEditorKit();
        Document doc = kit.createDefaultDocument();

        // The Document class does not yet handle charset's properly.
        doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE);

        int status = 0;

        // try-with-resources guarantees the reader (and the underlying file
        // or network stream) is closed even when parsing fails.
        try (Reader rd = getReader(args[0]))
        {
            // Parse the HTML into the document model.
            kit.read(rd, doc, 0);

            // Walk every element of the parsed document.
            ElementIterator it = new ElementIterator(doc);
            Element elem;

            while ( (elem = it.next()) != null )
            {
                if ("body".equals(elem.getName()))
                {
                    AttributeSet as = elem.getAttributes();
                    Enumeration<?> names = as.getAttributeNames();

                    while ( names.hasMoreElements() )
                    {
                        Object name = names.nextElement();
                        Object value = as.getAttribute( name );

                        System.out.println( "\t" + name + " : " + value );
                    }
                }
            }
        }
        catch (Exception e)
        {
            e.printStackTrace();
            status = 1;
        }

        // Report success or failure through the exit status.
        System.exit(status);
    }

    /**
     * Returns a reader on the HTML data. If {@code uri} begins with
     * {@code "http:"} or {@code "https:"} it is treated as a URL; otherwise
     * it is assumed to be a local file name.
     *
     * @param uri URL or local file name of the HTML document
     * @return a reader over the document's content; the caller must close it
     * @throws IOException if the URL or the file cannot be opened
     */
    static Reader getReader(String uri)
        throws IOException
    {
        // Retrieve from Internet.
        if (uri.startsWith("http:") || uri.startsWith("https:"))
        {
            URLConnection conn = new URL(uri).openConnection();
            return new InputStreamReader(conn.getInputStream());
        }
        // Retrieve from file.
        else
        {
            return new FileReader(uri);
        }
    }
}

测试使用：
java AttributeHTML yourFile.html

要检索属性，可以提供自己的ParserCallback
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.List;

import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.parser.ParserDelegator;

/**
 * Example that lists the attributes of the start tags in an HTML document
 * by supplying a custom {@link ParserCallback} to Swing's
 * {@link ParserDelegator}.
 */
public class HTMLParserTest2
{

  /** Document parsed when no path is given on the command line. */
  private static final String DEFAULT_PATH = "d:/temp/Example.html";

  /**
   * Parses an HTML file and prints the collected {@code tag-name=value}
   * entries to standard output.
   *
   * @param args optional; {@code args[0]} is the path of the HTML file to
   *             parse, defaulting to {@code d:/temp/Example.html}
   * @throws Exception if the file cannot be opened or read
   */
  public static void main(String args[]) throws Exception {

    String path = args.length > 0 ? args[0] : DEFAULT_PATH;

    // try-with-resources closes the reader even if parsing throws.
    try (BufferedReader br = new BufferedReader(new FileReader(path))) {
      System.out.println(HTMLParserTest2.extractTagsAttributes(br));
      // output :  [title-_implied_=true, body-a1=ABC, body-a2=3974, body-a3=A1B2]
    }
    System.exit(0);
  }
  
  /**
   * Parses the HTML supplied by {@code r} and returns one entry per
   * attribute of every start tag, formatted as {@code tag-name=value}.
   *
   * <p>Only tags reported through {@code handleStartTag} contribute;
   * tags reported through {@code handleSimpleTag} are ignored. The reader
   * is not closed by this method.
   *
   * @param r source of the HTML text
   * @return the collected entries, in the order the parser reported them
   * @throws IOException if reading from {@code r} fails
   */
  public static List<String> extractTagsAttributes(Reader r) throws IOException {
     final List<String> list = new ArrayList<String>();

     // ParserCallback's other handlers are already no-ops, so only the
     // start-tag handler needs to be overridden.
     ParserCallback parserCallback = new ParserCallback() {
       @Override
       public void handleStartTag(Tag tag, MutableAttributeSet attribute, int pos) {
          Enumeration<?> e = attribute.getAttributeNames();
          while (e.hasMoreElements()) {
             Object name = e.nextElement();
             Object value = attribute.getAttribute(name);
             list.add(tag.toString() + "-" + name + "=" + value);
          }
       }
     };

     // 'true' tells the parser to ignore any charset directive in the document.
     new ParserDelegator().parse(r, parserCallback, true);
     return list;
  }
}

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.List;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.parser.ParserDelegator;
public class HTMLParserTest2
{
  public static void main(String args[]) throws Exception {
    Reader reader = new FileReader("d:/temp/Example.html");
    BufferedReader br = new BufferedReader(reader);
    System.out.println(HTMLParserTest2.extractTagsAttributes(br));
    // output :  [title-_implied_=true, body-a1=ABC, body-a2=3974, body-a3=A1B2]
    System.exit(0);
  }
  public static List<String> extractTagsAttributes(Reader r) throws IOException {
     final ArrayList<String> list = new ArrayList<String>();
     ParserDelegator parserDelegator = new ParserDelegator();
     ParserCallback parserCallback = new ParserCallback() {
       @Override
      public void handleText(final char[] data, final int pos) {  }
       @Override
      public void handleStartTag(Tag tag, MutableAttributeSet attribute, int pos) { 
          Enumeration<?> e=attribute.getAttributeNames();
          while(e.hasMoreElements()) {
             Object name=e.nextElement();
             Object value=attribute.getAttribute(name);
             list.add(tag.toString() + "-" + name + "=" +value);
          }
       }
      @Override
      public void handleEndTag(Tag t, final int pos) {  }
      @Override
      public void handleSimpleTag(Tag t, MutableAttributeSet a, final int pos) { }
      @Override
      public void handleComment(final char[] data, final int pos) { }
      @Override
      public void handleError(final java.lang.String errMsg, final int pos) { }
     };
     parserDelegator.parse(r, parserCallback, true);
     return list;
  }
}
评论：
建议寻找一个与HTML5兼容的现代解析器，Java中有很多这样的解析器。
谢谢。我唯一担心的是，如果内容发生变化，可能需要修改正则表达式模式。（不太确定，因为我用得不多。）
@Unhandled Exception 如果只需要查找body标记的属性，这个正则表达式可以读取它的所有属性。
谢谢。这正是我想做的，但我自己的尝试显然失败得很惨。这个方法效果很好。
import java.io.*;
import java.net.*;
import java.util.*;
import javax.swing.*;
import javax.swing.text.*;
import javax.swing.text.html.*;

/**
 * Command-line utility that parses an HTML document with Swing's
 * {@link HTMLEditorKit} and prints every attribute of each element named
 * {@code body}, one {@code name : value} pair per line.
 *
 * <p>Usage: {@code java AttributeHTML <file-or-url>}
 */
class AttributeHTML
{
    /**
     * Parses the document named by {@code args[0]} and prints the attributes
     * of its {@code body} element(s) to standard output.
     *
     * <p>The process exits with status 0 on success and 1 when the argument
     * is missing or the document cannot be read or parsed.
     *
     * @param args a single element: a local file name or an http(s) URL
     */
    public static void main(String[] args)
    {
        if (args.length < 1)
        {
            System.err.println("Usage: java AttributeHTML <file-or-url>");
            System.exit(1);
            return;
        }

        EditorKit kit = new HTMLEditorKit();
        Document doc = kit.createDefaultDocument();

        // The Document class does not yet handle charset's properly.
        doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE);

        int status = 0;

        // try-with-resources guarantees the reader (and the underlying file
        // or network stream) is closed even when parsing fails.
        try (Reader rd = getReader(args[0]))
        {
            // Parse the HTML into the document model.
            kit.read(rd, doc, 0);

            // Walk every element of the parsed document.
            ElementIterator it = new ElementIterator(doc);
            Element elem;

            while ( (elem = it.next()) != null )
            {
                if ("body".equals(elem.getName()))
                {
                    AttributeSet as = elem.getAttributes();
                    Enumeration<?> names = as.getAttributeNames();

                    while ( names.hasMoreElements() )
                    {
                        Object name = names.nextElement();
                        Object value = as.getAttribute( name );

                        System.out.println( "\t" + name + " : " + value );
                    }
                }
            }
        }
        catch (Exception e)
        {
            e.printStackTrace();
            status = 1;
        }

        // Report success or failure through the exit status.
        System.exit(status);
    }

    /**
     * Returns a reader on the HTML data. If {@code uri} begins with
     * {@code "http:"} or {@code "https:"} it is treated as a URL; otherwise
     * it is assumed to be a local file name.
     *
     * @param uri URL or local file name of the HTML document
     * @return a reader over the document's content; the caller must close it
     * @throws IOException if the URL or the file cannot be opened
     */
    static Reader getReader(String uri)
        throws IOException
    {
        // Retrieve from Internet.
        if (uri.startsWith("http:") || uri.startsWith("https:"))
        {
            URLConnection conn = new URL(uri).openConnection();
            return new InputStreamReader(conn.getInputStream());
        }
        // Retrieve from file.
        else
        {
            return new FileReader(uri);
        }
    }
}

java AttributeHTML yourFile.html

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.List;

import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.parser.ParserDelegator;

/**
 * Example that lists the attributes of the start tags in an HTML document
 * by supplying a custom {@link ParserCallback} to Swing's
 * {@link ParserDelegator}.
 */
public class HTMLParserTest2
{

  /** Document parsed when no path is given on the command line. */
  private static final String DEFAULT_PATH = "d:/temp/Example.html";

  /**
   * Parses an HTML file and prints the collected {@code tag-name=value}
   * entries to standard output.
   *
   * @param args optional; {@code args[0]} is the path of the HTML file to
   *             parse, defaulting to {@code d:/temp/Example.html}
   * @throws Exception if the file cannot be opened or read
   */
  public static void main(String args[]) throws Exception {

    String path = args.length > 0 ? args[0] : DEFAULT_PATH;

    // try-with-resources closes the reader even if parsing throws.
    try (BufferedReader br = new BufferedReader(new FileReader(path))) {
      System.out.println(HTMLParserTest2.extractTagsAttributes(br));
      // output :  [title-_implied_=true, body-a1=ABC, body-a2=3974, body-a3=A1B2]
    }
    System.exit(0);
  }
  
  /**
   * Parses the HTML supplied by {@code r} and returns one entry per
   * attribute of every start tag, formatted as {@code tag-name=value}.
   *
   * <p>Only tags reported through {@code handleStartTag} contribute;
   * tags reported through {@code handleSimpleTag} are ignored. The reader
   * is not closed by this method.
   *
   * @param r source of the HTML text
   * @return the collected entries, in the order the parser reported them
   * @throws IOException if reading from {@code r} fails
   */
  public static List<String> extractTagsAttributes(Reader r) throws IOException {
     final List<String> list = new ArrayList<String>();

     // ParserCallback's other handlers are already no-ops, so only the
     // start-tag handler needs to be overridden.
     ParserCallback parserCallback = new ParserCallback() {
       @Override
       public void handleStartTag(Tag tag, MutableAttributeSet attribute, int pos) {
          Enumeration<?> e = attribute.getAttributeNames();
          while (e.hasMoreElements()) {
             Object name = e.nextElement();
             Object value = attribute.getAttribute(name);
             list.add(tag.toString() + "-" + name + "=" + value);
          }
       }
     };

     // 'true' tells the parser to ignore any charset directive in the document.
     new ParserDelegator().parse(r, parserCallback, true);
     return list;
  }
}