使用Java从HTML中提取文本,包括源行号和代码

使用Java从HTML中提取文本,包括源行号和代码,java,html,html-parsing,jsoup,jericho-html-parser,Java,Html,Html Parsing,Jsoup,Jericho Html Parser,如何使用Java从HTML中提取文本的问题已经被查看和复制了无数次: 在我目前的状态中,我找到的答案是我正在使用 org.jsoup ) 并尝试一种工作方式,比如: intpos=html.indexOf(textNode.outerHtml()); 无法可靠地找到原始html。因此,我假设我可能不得不切换到另一个库或方法。正如上面的链接所指出的,“杰里科可以做到”。但是缺少指向实际工作代码的指针 我从耶利哥到了哪里: Source htmlSource=新源代码(html); 布尔bod

如何使用Java从HTML中提取文本的问题已经被查看和复制了无数次:

根据我找到的答案,我目前使用的是 JSoup(org.jsoup),并尝试了类似下面这样的变通办法:

int pos = html.indexOf(textNode.outerHtml());
但这种方式无法可靠地定位到原始 HTML 中的位置。因此,我想我可能不得不换用另一个库或另一种方法。正如上面的链接所指出的,“Jericho 可以做到”,但缺少指向实际可用代码的链接。

我用 Jericho 目前做到了这一步:

Source htmlSource=new Source(html);
boolean bodyFound=false;
// loop over all elements
for (net.htmlparser.jericho.Element el:htmlSource.getAllElements()) {
    if (el.getName().equals("body")) {
        bodyFound=true;
    }
    if (bodyFound) {
        TagType tagType = el.getStartTag().getTagType();
        if (tagType==StartTagType.NORMAL) {
            String text=el.getTextExtractor().toString();
            if (!text.trim().equals("")) {
                int cpos = el.getBegin();
                System.out.println(el.getName()+"("+tagType.toString()+") line "+ htmlSource.getRow(cpos)+":"+text);
            }
        } // if
    } // if
} // for
这已经相当不错了,因为它将为您提供如下输出:

body(normal) line 91: Some Header. Some Text
div(normal) line 93: Some Header
div(normal) line 95: Some Text
但现在接下来的问题是TextExtractor递归地输出所有子节点的整个文本,以便文本多次显示


什么样的工作解决方案可以像上面的JSoup解决方案一样进行过滤(请注意文本元素的正确顺序),但却像上面的Jericho代码片段那样显示源代码行?

您需要而 JSoup 恰恰缺少的这个功能比较难实现。建议使用 Jericho,并实现类似下面的方法来查找元素的直接文本节点(immediate text nodes):

package main.java.com.adacom.task;

import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.EndTag;
import net.htmlparser.jericho.Segment;
import net.htmlparser.jericho.Source;
import net.htmlparser.jericho.StartTag;
import net.htmlparser.jericho.StartTagType;
import net.htmlparser.jericho.Tag;
import net.htmlparser.jericho.TagType;

/**
 * Demo that prints, for every normal element from {@code <body>} onwards, the
 * element's own (immediate) text together with the source row of the element.
 *
 * <p>"Own text" means text that is a direct child of the element; text that
 * belongs to nested child elements is reported for those children instead.
 */
public class MainParser {

    /**
     * Parses a small hard-coded HTML sample and prints one line per element
     * that has non-empty own text.
     *
     * @param args unused
     */
    public static void main(String[] args) {

        String html = "<body><div>divtextA<span>spanTextA<p>pText</p>spanTextB</span>divTextB</div></body>";

        Source htmlSource = new Source(html);
        boolean bodyFound = false;
        // loop over all elements; everything before the body element is ignored
        for (Element el : htmlSource.getAllElements()) {
            if (el.getName().equals("body")) {
                bodyFound = true;
            }
            if (!bodyFound) {
                continue;
            }
            TagType tagType = el.getStartTag().getTagType();
            if (tagType != StartTagType.NORMAL) {
                continue;
            }
            // already trimmed by getOwnTextSegmentsString
            String text = getOwnTextSegmentsString(el);
            if (text.isEmpty()) {
                continue;
            }
            int cpos = el.getBegin();
            System.out.println(el.getName() + "(" + tagType.toString() + ") line " + htmlSource.getRow(cpos) + ":" + text);
        }

    }

    /**
     * Returns an iterator over the own text segments of the given element.
     * This function is not used; it is shown here only for reference.
     *
     * @param elem the element whose immediate text nodes are wanted
     * @return iterator over the non-tag segments that are direct children of {@code elem}
     */
    public static Iterator<Segment> getOwnTextSegmentsIterator(Element elem) {
        return collectOwnTextSegments(elem).iterator();
    }

    /**
     * Returns the own text of the given element as a single string.
     *
     * @param elem the element whose immediate text nodes are wanted
     * @return the own text segments in document order, separated by single
     *         spaces and trimmed; empty if the element has no own text
     */
    public static String getOwnTextSegmentsString(Element elem) {
        StringBuilder strBuilder = new StringBuilder();
        for (Segment own : collectOwnTextSegments(elem)) {
            strBuilder.append(own.toString()).append(' ');
        }
        return strBuilder.toString().trim();
    }

    /**
     * Collects the non-tag segments that are direct children of the element.
     *
     * <p>Nested elements are skipped by position (up to the end of the child
     * element) rather than by counting start and end tags. Counting breaks as
     * soon as a start tag has no matching end tag (for example {@code <br>},
     * {@code <img>} or a comment): the counter never returns to zero and all
     * following own text would be dropped.
     */
    private static List<Segment> collectOwnTextSegments(Element elem) {
        final List<Segment> ownSegments = new LinkedList<Segment>();
        final Iterator<Segment> it = elem.getContent().getNodeIterator();
        // source position up to which nodes belong to a nested child element
        int skipUntil = 0;
        while (it.hasNext()) {
            Segment cur = it.next();
            if (cur.getBegin() < skipUntil) {
                continue; // still inside a child element
            }
            if (cur instanceof StartTag) {
                Element child = ((StartTag) cur).getElement();
                // an element without an end tag spans just its start tag
                skipUntil = (child != null) ? child.getEnd() : cur.getEnd();
                continue;
            }
            if (cur instanceof Tag) {
                continue; // stray end tag or other tag: never text
            }
            ownSegments.add(cur);
        }
        return ownSegments;
    }

}
package main.java.com.adacom.task;
导入java.util.Iterator;
导入java.util.LinkedList;
导入java.util.List;
导入net.htmlparser.jericho.Element;
导入net.htmlparser.jericho.EndTag;
导入net.htmlparser.jericho.Segment;
导入net.htmlparser.jericho.Source;
导入net.htmlparser.jericho.StartTag;
导入net.htmlparser.jericho.StartTagType;
导入net.htmlparser.jericho.Tag;
导入net.htmlparser.jericho.TagType;
公共类主解析器{
/**
*@param args
*/
公共静态void main(字符串[]args){
String html=“divtextAspanTextApText

spanTextBdivTextB”; 源htmlSource=新源(html); 布尔bodyFound=false; //在所有元素上循环 for(net.htmlparser.jericho.Element el:htmlSource.getAllegements()){ if(el.getName().equals(“body”)){ bodyFound=true; } 如果(bodyFound){ TagType TagType=el.getStartTag().getTagType(); if(tagType==StartTagType.NORMAL){ 字符串文本=getOwnTextSegmentsString(el); 如果(!text.trim()等于(“”){ int cpos=el.getBegin(); System.out.println(el.getName()+”(“+tagType.toString()+”)行“+htmlSource.getRow(cpos)+”:“+text); } }//如果 }//如果 }//为了 } /** *此函数未被使用。此处显示此函数仅供参考 */ 公共静态迭代器GetownTextSegmentSiteOrator(Element elem){ 最终迭代器it=elem.getContent().getNodeIterator(); 最终列表结果=新建LinkedList(); int tagCounter=0; while(it.hasNext()){ 段cur=it.next(); if(当前StartTag实例) tagCounter++; else if(当前EndTag实例) 标记计数器--; 如果(!(cur instanceof Tag)&&tagCounter==0){ 系统输出打印项次(cur); 结果:添加(cur); } } 返回结果。迭代器(); } 公共静态字符串getOwnTextSegmentsString(元素elem){ 最终迭代器it=elem.getContent().getNodeIterator(); StringBuilder strBuilder=新StringBuilder(); int tagCounter=0; while(it.hasNext()){ 段cur=it.next(); if(当前StartTag实例) tagCounter++; else if(当前EndTag实例) 标记计数器--; 如果(!(cur instanceof Tag)&&tagCounter==0){ strBuilder.append(cur.toString()+“”); } } 返回strBuilder.toString().trim(); } }
这里有一个Junit测试,测试预期的输出,还有一个基于Jericho的SourceTextExtractor,它使Junit测试能够工作,它基于原始Jericho TextExtractor源代码

/**
 * Checks that SourceTextExtractor reports every text segment of two sample
 * pages with the expected start/end line and column and the expected text.
 */
@Test
public void testTextExtract() {
    // sample pages, see https://github.com/paepcke/CorEx/blob/master/src/extraction/HTMLUtils.java
    String htmls[] = {
            "<!DOCTYPE html>\n" + "<html>\n" + "<body>\n" + "\n"
                    + "<h1>My First Heading</h1>\n" + "\n"
                    + "<p>My first paragraph.</p>\n" + "\n" + "</body>\n" + "</html>",
            "<html>\n"
                    + "<body>\n"
                    + "\n"
                    + "<div id=\"myDiv\" name=\"myDiv\" title=\"Example Div Element\">\n"
                    + "  <h5>Subtitle</h5>\n"
                    + "  <p>This paragraph would be your content paragraph...</p>\n"
                    + "  <p>Here's another content article right here.</p>\n"
                    + "</div>" + "\n" + "Text at end of body</body>\n" + "</html>" };
    // number of text segments expected per sample page
    int expectedSize[] = { 2, 4 };
    // expected prefix of TextResult.getInfo() per page and segment
    String expectedInfo[][]={
        {
            "line 5 col 5 to  line 5 col 21: My First Heading",
            "line 7 col 4 to  line 7 col 23: My first paragraph."
        },
        {
            "line 5 col 7 to  line 5 col 15: Subtitle",
            "line 6 col 6 to  line 6 col 55: This paragraph would be your content paragraph...",
            "line 7 col 6 to  line 7 col 48: Here's another content article right here.",
            "line 8 col 7 to  line 9 col 20: Text at end of body"
        }
    };
    for (int i = 0; i < htmls.length; i++) {
        SourceTextExtractor extractor=new SourceTextExtractor();
        List<TextResult> textParts = extractor.extractTextSegments(htmls[i]);
        // Check the count before indexing expectedInfo: a surplus segment must
        // fail as an assertion, not as an ArrayIndexOutOfBoundsException.
        assertEquals(expectedSize[i], textParts.size());
        int j=0;
        for (TextResult textPart : textParts) {
            String info = textPart.getInfo();
            System.out.println(info);
            assertTrue(info.startsWith(expectedInfo[i][j]));
            j++;
        }
    }
}
@测试
公共void testTextExtract(){
// https://github.com/paepcke/CorEx/blob/master/src/extraction/HTMLUtils.java
字符串htmls[]={
“\n”+“\n”+“\n”+“\n”
+“我的第一个标题\n”+“\n”
+我的第一段。

\n“+”\n“+”\n“+”, “\n” +“\n” +“\n” +“\n” +“字幕\n” +“此段落将是您的内容段落…

\n” +“这里有另一篇内容文章。

\n” +正文末尾的“+”\n“+”文本\n“+”}; int expectedSize[]={2,4}; 字符串应为fo[][]={ { “第5行第5列至第5行第21列:我的第一个标题”, “第7行第4列至第7行第23列:我的第一段。” }, { “第5行第7列至第5行第15列:副标题”, “第6行第6列至第6行第55列:本段为您的
/**
 * TextExtractor that makes source line and col references available
 * http://grepcode.com/file_/repo1.maven.org/maven2/net.htmlparser.jericho/jericho-html/3.3/net/htmlparser/jericho/TextExtractor.java/?v=source
 */
public class SourceTextExtractor {

    public static class TextResult {
        private String text;
        private Source root;
        private Segment segment;
        private int line;
        private int col;

        /**
         * get a textResult
         * @param root
         * @param segment
         */
        public TextResult(Source root,Segment segment) {
            this.root=root;
            this.segment=segment;
            final StringBuilder sb=new StringBuilder(segment.length());
            sb.append(segment);
            setText(CharacterReference.decodeCollapseWhiteSpace(sb));
            int spos = segment.getBegin();  
            line=root.getRow(spos);
            col=root.getColumn(spos);

        }

        /**
         * gets info about this TextResult
         * @return
         */
        public String getInfo() {
            int epos=segment.getEnd();

            String result=
                    " line "+   line+" col "+col+
                    " to "+
                    " line "+   root.getRow(epos)+" col "+root.getColumn(epos)+
                    ":"+getText();
            return result;
        }

        /**
         * @return the text
         */
        public String getText() {
            return text;
        }

        /**
         * @param text the text to set
         */
        public void setText(String text) {
            this.text = text;
        }

        public int getLine() {
            return line;
        }

        public int getCol() {
            return col;
        }

    }

    /**
     * extract textSegments from the given html
     * @param html
     * @return
     */
    public List<TextResult> extractTextSegments(String html) {
        Source htmlSource=new Source(html);
        List<TextResult> result = extractTextSegments(htmlSource);
        return result;
    }

    /**
     * get the TextSegments from the given root segment
     * @param root
     * @return
     */
    public List<TextResult> extractTextSegments(Source root) {
        List<TextResult> result=new ArrayList<TextResult>();
        for (NodeIterator nodeIterator=new NodeIterator(root); nodeIterator.hasNext();) {
            Segment segment=nodeIterator.next();
            if (segment instanceof Tag) {
                final Tag tag=(Tag)segment;
                if (tag.getTagType().isServerTag()) {
                    // elementContainsMarkup should be made into a TagType property one day.
                    // for the time being assume all server element content is code, although this is not true for some Mason elements.
                    final boolean elementContainsMarkup=false;
                    if (!elementContainsMarkup) {
                        final net.htmlparser.jericho.Element element=tag.getElement();
                        if (element!=null && element.getEnd()>tag.getEnd()) nodeIterator.skipToPos(element.getEnd());
                    }
                    continue;
                }
                if (tag.getTagType()==StartTagType.NORMAL) {
                    final StartTag startTag=(StartTag)tag;
                    if (tag.name==HTMLElementName.SCRIPT || tag.name==HTMLElementName.STYLE ||  (!HTMLElements.getElementNames().contains(tag.name))) {
                        nodeIterator.skipToPos(startTag.getElement().getEnd());
                        continue;
                    }

                }
                // Treat both start and end tags not belonging to inline-level elements as whitespace:
                if (tag.getName()==HTMLElementName.BR || !HTMLElements.getInlineLevelElementNames().contains(tag.getName())) {
                    // sb.append(' ');
                }
            } else {
                if (!segment.isWhiteSpace())
                    result.add(new TextResult(root,segment));
            }
        }
        return result;
    }

    /**
     * extract the text from the given segment
     * @param segment
     * @return
     */
    public String extractText(net.htmlparser.jericho.Segment pSegment) {

        // http://grepcode.com/file_/repo1.maven.org/maven2/net.htmlparser.jericho/jericho-html/3.3/net/htmlparser/jericho/TextExtractor.java/?v=source
        // this would call the code above
        // String result=segment.getTextExtractor().toString();
        final StringBuilder sb=new StringBuilder(pSegment.length());
        for (NodeIterator nodeIterator=new NodeIterator(pSegment); nodeIterator.hasNext();) {
            Segment segment=nodeIterator.next();
            if (segment instanceof Tag) {
                final Tag tag=(Tag)segment;
                if (tag.getTagType().isServerTag()) {
                    // elementContainsMarkup should be made into a TagType property one day.
                    // for the time being assume all server element content is code, although this is not true for some Mason elements.
                    final boolean elementContainsMarkup=false;
                    if (!elementContainsMarkup) {
                        final net.htmlparser.jericho.Element element=tag.getElement();
                        if (element!=null && element.getEnd()>tag.getEnd()) nodeIterator.skipToPos(element.getEnd());
                    }
                    continue;
                }
                if (tag.getTagType()==StartTagType.NORMAL) {
                    final StartTag startTag=(StartTag)tag;
                    if (tag.name==HTMLElementName.SCRIPT || tag.name==HTMLElementName.STYLE ||  (!HTMLElements.getElementNames().contains(tag.name))) {
                        nodeIterator.skipToPos(startTag.getElement().getEnd());
                        continue;
                    }

                }
                // Treat both start and end tags not belonging to inline-level elements as whitespace:
                if (tag.getName()==HTMLElementName.BR || !HTMLElements.getInlineLevelElementNames().contains(tag.getName())) {
                    sb.append(' ');
                }
            } else {
                sb.append(segment);
            }
        }
        final String result=net.htmlparser.jericho.CharacterReference.decodeCollapseWhiteSpace(sb);
        return result;
    }
}