Java 解析ApacheTikaXML输出返回未知标记_Java_Xml_Sax_Saxparser_Apache Tika

Java 解析ApacheTikaXML输出返回未知标记

java xml

Java 解析ApacheTikaXML输出返回未知标记,java,xml,sax,saxparser,apache-tika,Java,Xml,Sax,Saxparser,Apache Tika,基本上，我解析来自ApacheTika的几个xml输出，以使用获取元数据（通过元标记）和嵌入式文件列表。但是，我发现我的地图有几个键未知标记（0x…）。我想知道这是否是由Tika不完整的标记输出引起的，因为我得到的错误只与未关闭的标记相关——我怀疑这是在XML正文中，而不是我想要的输出（meta，div）。然而，如果写入映射的代码只有meta标记和div（带有嵌入式类），那么这是相当不合逻辑的，因为这只是文档的一小部分 public class Parse { private class

基本上，我解析来自 Apache Tika 的几个 XML 输出，以便使用 SAX

获取元数据（通过 meta 标签）和嵌入式文件列表。但是，我发现我的 Map 中有几个键是

Unknown tag（0x…

）。我想知道这是否是由Tika不完整的标记输出引起的，因为我得到的错误只与未关闭的标记相关——我怀疑这是在XML正文中，而不是我想要的输出（meta，div）。然而，如果写入映射的代码只有meta标记和div（带有嵌入式类），那么这是相当不合逻辑的，因为这只是文档的一小部分

/**
 * Parses the XML produced by Apache Tika with a SAX parser and copies the
 * {@code <meta>} name/content pairs and the ids of embedded documents
 * ({@code <div class="embedded">}) into an {@code Entity}.
 */
public class Parse {
    /**
     * SAX handler that collects {@code <meta>} entries and embedded-document ids
     * into {@link #entityList}. It does not use the enclosing {@code Parse}
     * instance, so it is declared static.
     */
    private static class internalXMLReader extends DefaultHandler{
        /**
         * Collected values: each meta {@code name} maps to its {@code content}
         * string, and the key {@code "embedded"} maps to a {@code List<String>}
         * of the {@code id} attributes of embedded divs.
         */
        public final Map<String, Object> entityList = new HashMap<>();

        /**
         * Records {@code <meta name=... content=...>} elements and
         * {@code <div class="embedded" id=...>} elements; every other element is ignored.
         *
         * @param uri        namespace URI (unused)
         * @param localname  local name (unused)
         * @param qName      qualified element name, compared case-insensitively
         * @param attributes attributes of the element
         * @throws SAXException never thrown by this implementation
         */
        @Override
        public void startElement(String uri, String localname, String qName, Attributes attributes) throws SAXException{
            if(qName.equalsIgnoreCase("meta")){
                String key = attributes.getValue("name");
                // Attributes.getValue returns null for a missing attribute; a meta
                // element without a name cannot be stored under a meaningful key.
                if(key == null){
                    return;
                }
                String content = attributes.getValue("content");
                if((content != null) && key.contains("Content-Type")){
                    // Keep only the media type, e.g. "text/html; charset=UTF-8" -> "text/html".
                    // The earlier replace(' ', '\0') inserted NUL characters instead of
                    // removing spaces, so the value is trimmed here instead.
                    int semicolon = content.indexOf(';');
                    if(semicolon >= 0){
                        content = content.substring(0, semicolon).trim();
                    }
                }
                entityList.put(key, content);
            }
            else if(qName.equalsIgnoreCase("div")){
                String cssClass = attributes.getValue("class");
                if((cssClass != null) && cssClass.equalsIgnoreCase("embedded")){
                    String key = "embedded";
                    Object existing = entityList.get(key);
                    List<String> inlist;
                    if(existing instanceof List){
                        // Only this handler ever stores a List under "embedded",
                        // and it always stores a List<String>.
                        @SuppressWarnings("unchecked")
                        List<String> typed = (List<String>) existing;
                        inlist = typed;
                    }
                    else{
                        // First embedded div, or the key was taken by a meta entry
                        // named "embedded": start a fresh list.
                        inlist = new LinkedList<>();
                        entityList.put(key, inlist);
                    }
                    inlist.add(attributes.getValue("id"));
                }
            }
        }

        @Override
        public void endElement(String uri, String localname, String qName) throws SAXException{
            // Intentionally empty: no validation or per-element cleanup is needed.
        }

        @Override
        public void characters(char ch[], int start, int length) throws SAXException{
            // Intentionally empty: element text content is not read yet.
        }
    }

    /**
     * Parses {@code xml} and adds the collected metadata map to {@code in}.
     * I/O, parser-configuration and SAX errors are logged at SEVERE level and
     * {@code in} is returned without metadata being added.
     *
     * @param xml the XML document text to parse
     * @param in  the entity that receives the metadata via {@code addMeta}
     * @return the same {@code in} instance
     */
    public Entity parse(String xml, Entity in){
        try{
            InputSource xmlinput = new InputSource(new StringReader(xml));
            SAXParserFactory factory = SAXParserFactory.newInstance();
            SAXParser parser = factory.newSAXParser();
            internalXMLReader handler = new internalXMLReader();
            parser.parse(xmlinput, handler);
            in.addMeta(handler.entityList);
        }
        catch(IOException | ParserConfigurationException | SAXException ex){
            Logger.getLogger(TikaParseNCluste.class.getName()).log(Level.SEVERE, null, ex);
        }
        return in;
    }
}

公共类解析{
私有类internalXMLReader扩展了DefaultHandler{
public final Map entityList=new HashMap（）；
@凌驾
public void startElement（字符串uri、字符串localname、字符串qName、属性）引发SAXException{
字符串键，内容；
if（qName.equalsIgnoreCase（“meta”））{
key=attributes.getValue（“名称”）；
content=attributes.getValue（“内容”）；
if（key.contains（“内容类型”））{
字符串tmp[]=attributes.getValue（“内容”）.replace（“”，“\0”）.split（“”）；
如果（tmp.length>1）{
内容=tmp[0]；
}
}
entityList.put（键、内容）；
}
else if（qName.equalsIgnoreCase（“div”））{
if（（attributes.getValue（“class”）！=null）和&（attributes.getValue（“class”）.equalsIgnoreCase（“嵌入式”））{
key=“嵌入式”；
列表中的列表；
if（entityList.containsKey（“嵌入式”）和&（entityList.get（“嵌入式”）instanceof List））{
inlist=（List）entityList.get（key）；
}
否则{
inlist=新的LinkedList（）；
entityList.put（键，inlist）；
}
inlist.add（attributes.getValue（“id”）；
}
}
}
@凌驾
public void endElement（字符串uri、字符串localname、字符串qName）引发SAXException{
//不，我只是不想验证之类的。。
}
@凌驾
公共无效字符（char ch[]，int start，int length）引发异常{
//不，我们还没有读过这篇文章
}
}
公共实体解析（字符串xml，中的实体）{
试一试{
InputSource xmlinput=新的InputSource（新的StringReader（xml））；
SAXParserFactory=SAXParserFactory.newInstance（）；
SAXParser parser=factory.newSAXParser（）；
internalXMLReader处理程序=新的internalXMLReader（）；
parser.parse（xmlinput，handler）；
in.addMeta（handler.entityList）；
}
捕获（IOException | ParserConfiguration Exception | SAXException ex）{
Logger.getLogger（TikaParseNCluste.class.getName（））.log（Level.SEVERE，null，ex）；
}
返回；
}
}

也许我应该看看我的800多个xml文件。

谷歌没有这方面的信息，我很不耐烦。但是运行了

grep -i -l -r "name=\"unknown" .

我得到了几个jpg文件

也许这就是原因。我不希望ApacheTika会给出这样的输出。因此，我将代码更改为：

...
if(qName.equalsIgnoreCase("meta")){
                key = attributes.getValue("name");
                // Skip meta elements without a name, and the "Unknown tag (0x...)"
                // entries found in the output for some jpg files.
                if((key != null) && (!key.contains("Unknown"))){
                    content = attributes.getValue("content");
                    if((content != null) && key.contains("Content-Type")){
                        // Keep only the media type before ';'. The earlier
                        // replace(' ', '\0') inserted NUL characters instead of
                        // stripping spaces, so the value is trimmed here instead.
                        int semicolon = content.indexOf(';');
                        if(semicolon >= 0){
                            content = content.substring(0, semicolon).trim();
                        }
                    }
                    entityList.put(key, content);
                }
            }
...

我不知道这是一个 bug 还是别的什么。到目前为止，用关键字 “apache tika unknown tag” 在谷歌上快速搜索，只会把我带回到这个问题本身。