使用Java从HTML中提取文本，包括源行号和代码_Java_Html_Html Parsing_Jsoup_Jericho Html Parser

使用Java从HTML中提取文本，包括源行号和代码
java html
使用Java从HTML中提取文本，包括源行号和代码,java,html,html-parsing,jsoup,jericho-html-parser,Java,Html,Html Parsing,Jsoup,Jericho Html Parser,如何使用Java从HTML中提取文本的问题已经被查看和复制了无数次：在我目前的状态中，我找到的答案是我正在使用 org.jsoup ) 并尝试一种工作方式，比如： intpos=html.indexOf（textNode.outerHtml（））；无法可靠地找到原始html。因此，我假设我可能不得不切换到另一个库或方法。正如上面的链接所指出的，“杰里科可以做到”。但是缺少指向实际工作代码的指针我从耶利哥到了哪里： Source htmlSource=新源代码（html）；布尔bod
如何使用Java从HTML中提取文本的问题已经被查看和复制了无数次：
根据我找到的答案，我目前使用的是 JSoup（org.jsoup），并尝试了类似下面的做法：
int pos = html.indexOf(textNode.outerHtml());

但这种方式无法可靠地在原始 HTML 中定位文本。因此，我认为可能不得不换用另一个库或另一种方法。正如上面的链接所指出的，“Jericho 可以做到”，但缺少可实际运行的示例代码。
下面是我用 Jericho 做到的程度：
Source htmlSource=new Source(html);
boolean bodyFound=false;
// loop over all elements
for (net.htmlparser.jericho.Element el:htmlSource.getAllElements()) {
    if (el.getName().equals("body")) {
        bodyFound=true;
    }
    if (bodyFound) {
        TagType tagType = el.getStartTag().getTagType();
        if (tagType==StartTagType.NORMAL) {
            String text=el.getTextExtractor().toString();
            if (!text.trim().equals("")) {
                int cpos = el.getBegin();
                System.out.println(el.getName()+"("+tagType.toString()+") line "+ htmlSource.getRow(cpos)+":"+text);
            }
        } // if
    } // if
} // for

这已经相当不错了，因为它将为您提供如下输出：
body(normal) line 91: Some Header. Some Text
div(normal) line 93: Some Header
div(normal) line 95: Some Text

但接下来的问题是：TextExtractor 会递归地输出所有子节点的全部文本，导致同一段文本被多次显示。
什么样的可行方案既能像上面的 JSoup 方案那样进行过滤（请注意文本元素的正确顺序），又能像上面的 Jericho 代码片段那样显示源代码行号？您需要的这个功能 JSoup 并不具备，而且实现起来比较困难。
可以使用 Jericho，并实现类似下面的代码来查找元素的直接文本节点：
package main.java.com.adacom.task;

import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.EndTag;
import net.htmlparser.jericho.Segment;
import net.htmlparser.jericho.Source;
import net.htmlparser.jericho.StartTag;
import net.htmlparser.jericho.StartTagType;
import net.htmlparser.jericho.Tag;
import net.htmlparser.jericho.TagType;

/**
 * Demo that prints, for every normal element at or after {@code <body>}, the source row of
 * the element together with the text that belongs directly to that element (text of nested
 * child elements is excluded).
 */
public class MainParser {

    /**
     * Parses a small hard-coded HTML sample and prints one line per element that has
     * non-empty own text.
     *
     * @param args ignored
     */
    public static void main(String[] args) {

        String html = "<body><div>divtextA<span>spanTextA<p>pText</p>spanTextB</span>divTextB</div></body>";

        Source htmlSource = new Source(html);
        boolean bodyFound = false;
        // loop over all elements; everything before the body element is ignored
        for (Element el : htmlSource.getAllElements()) {
            if (el.getName().equals("body")) {
                bodyFound = true;
            }
            if (!bodyFound) {
                continue;
            }
            TagType tagType = el.getStartTag().getTagType();
            if (tagType != StartTagType.NORMAL) {
                continue;
            }
            // getOwnTextSegmentsString already trims its result
            String text = getOwnTextSegmentsString(el);
            if (!text.isEmpty()) {
                int cpos = el.getBegin();
                System.out.println(el.getName() + "(" + tagType.toString() + ") line "
                        + htmlSource.getRow(cpos) + ":" + text);
            }
        }
    }

    /**
     * Returns an iterator over the text segments that are direct children of the given
     * element. This function is not used by {@link #main}; it is shown for reference only.
     *
     * @param elem the element whose own text segments are wanted
     * @return iterator over the non-tag segments located directly inside {@code elem}
     */
    public static Iterator<Segment> getOwnTextSegmentsIterator(Element elem) {
        return collectOwnTextSegments(elem).iterator();
    }

    /**
     * Returns the own text of the given element: all text segments that are direct children
     * of the element, joined by a single space, with the overall result trimmed.
     *
     * @param elem the element whose own text is wanted
     * @return the joined own text, or an empty string if there is none
     */
    public static String getOwnTextSegmentsString(Element elem) {
        StringBuilder strBuilder = new StringBuilder();
        for (Segment textSegment : collectOwnTextSegments(elem)) {
            strBuilder.append(textSegment.toString()).append(' ');
        }
        return strBuilder.toString().trim();
    }

    /**
     * Collects the non-tag segments located directly inside the given element.
     *
     * Child elements are skipped by their end offset instead of by counting start and end
     * tags: a start tag without a matching end tag (for example {@code <br>} or a comment)
     * would otherwise leave a counter unbalanced and hide all following own text.
     */
    private static List<Segment> collectOwnTextSegments(Element elem) {
        final List<Segment> ownText = new LinkedList<Segment>();
        final Iterator<Segment> it = elem.getContent().getNodeIterator();
        // end offset of the child element currently being skipped; -1 when not inside one
        int skipUntil = -1;
        while (it.hasNext()) {
            Segment cur = it.next();
            if (cur.getBegin() < skipUntil) {
                continue; // still inside a child element
            }
            if (cur instanceof StartTag) {
                Element child = ((StartTag) cur).getElement();
                if (child != null) {
                    skipUntil = child.getEnd();
                }
            } else if (!(cur instanceof Tag)) {
                ownText.add(cur);
            }
            // any other tag at this level (e.g. a stray end tag) carries no text
        }
        return ownText;
    }

}

package main.java.com.adacom.task；
导入java.util.Iterator；
导入java.util.LinkedList；
导入java.util.List；
导入net.htmlparser.jericho.Element；
导入net.htmlparser.jericho.EndTag；
导入net.htmlparser.jericho.Segment；
导入net.htmlparser.jericho.Source；
导入net.htmlparser.jericho.StartTag；
导入net.htmlparser.jericho.StartTagType；
导入net.htmlparser.jericho.Tag；
导入net.htmlparser.jericho.TagType；
公共类主解析器{
/**
*@param args
*/
公共静态void main（字符串[]args）{
String html=“divtextAspanTextApTextspanTextBdivTextB”；
源htmlSource=新源（html）；
布尔bodyFound=false；
//在所有元素上循环
for（net.htmlparser.jericho.Element el:htmlSource.getAllegements（））{
if（el.getName（）.equals（“body”））{
bodyFound=true；
}
如果（bodyFound）{
TagType TagType=el.getStartTag（）.getTagType（）；
if（tagType==StartTagType.NORMAL）{
字符串文本=getOwnTextSegmentsString（el）；
如果（！text.trim（）等于（“”）{
int cpos=el.getBegin（）；
System.out.println（el.getName（）+”（“+tagType.toString（）+”）行“+htmlSource.getRow（cpos）+”：“+text）；
}
}//如果
}//如果
}//为了
}
/**
*此函数未被使用。此处显示此函数仅供参考
*/ 
公共静态迭代器GetownTextSegmentSiteOrator（Element elem）{
最终迭代器it=elem.getContent（）.getNodeIterator（）；
最终列表结果=新建LinkedList（）；
int tagCounter=0；
while（it.hasNext（））{
段cur=it.next（）；
if（当前StartTag实例）
tagCounter++；
else if（当前EndTag实例）
标记计数器--；
如果（！（cur instanceof Tag）&&tagCounter==0）{
系统输出打印项次（cur）；
结果：添加（cur）；
}
}
返回结果。迭代器（）；
}
公共静态字符串getOwnTextSegmentsString（元素elem）{
最终迭代器it=elem.getContent（）.getNodeIterator（）；
StringBuilder strBuilder=新StringBuilder（）；
int tagCounter=0；
while（it.hasNext（））{
段cur=it.next（）；
if（当前StartTag实例）
tagCounter++；
else if（当前EndTag实例）
标记计数器--；
如果（！（cur instanceof Tag）&&tagCounter==0）{
strBuilder.append（cur.toString（）+“”）；
}
}
返回strBuilder.toString（）.trim（）；
}
}
下面是一个检验预期输出的 JUnit 测试，以及一个基于 Jericho 的 SourceTextExtractor（它使该测试得以通过），其实现基于 Jericho 原始的 TextExtractor 源代码：
/**
 * Checks that SourceTextExtractor returns the expected number of text parts for two sample
 * documents and that each part's info string starts with the expected position and text.
 */
@Test
public void testTextExtract() {
    // https://github.com/paepcke/CorEx/blob/master/src/extraction/HTMLUtils.java
    String[] htmls = {
            "<!DOCTYPE html>\n" + "<html>\n" + "<body>\n" + "\n"
                    + "<h1>My First Heading</h1>\n" + "\n"
                    + "<p>My first paragraph.</p>\n" + "\n" + "</body>\n" + "</html>",
            "<html>\n"
                    + "<body>\n"
                    + "\n"
                    + "<div id=\"myDiv\" name=\"myDiv\" title=\"Example Div Element\">\n"
                    + "  <h5>Subtitle</h5>\n"
                    + "  <p>This paragraph would be your content paragraph...</p>\n"
                    + "  <p>Here's another content article right here.</p>\n"
                    + "</div>" + "\n" + "Text at end of body</body>\n" + "</html>" };
    int[] expectedSize = { 2, 4 };
    String[][] expectedInfo = {
        {
            "line 5 col 5 to  line 5 col 21: My First Heading",
            "line 7 col 4 to  line 7 col 23: My first paragraph."
        },
        {
            "line 5 col 7 to  line 5 col 15: Subtitle",
            "line 6 col 6 to  line 6 col 55: This paragraph would be your content paragraph...",
            "line 7 col 6 to  line 7 col 48: Here's another content article right here.",
            "line 8 col 7 to  line 9 col 20: Text at end of body"
        }
    };
    for (int i = 0; i < htmls.length; i++) {
        SourceTextExtractor extractor = new SourceTextExtractor();
        List<TextResult> textParts = extractor.extractTextSegments(htmls[i]);
        // print everything first so the full extraction is visible even when a check fails
        for (TextResult textPart : textParts) {
            System.out.println(textPart.getInfo());
        }
        // Check the count before indexing expectedInfo: an unexpected extra part must show
        // up as an assertion failure, not as an ArrayIndexOutOfBoundsException.
        assertEquals(expectedSize[i], textParts.size());
        for (int j = 0; j < textParts.size(); j++) {
            assertTrue(textParts.get(j).getInfo().startsWith(expectedInfo[i][j]));
        }
    }
}

@测试
公共void testTextExtract（）{
// https://github.com/paepcke/CorEx/blob/master/src/extraction/HTMLUtils.java
字符串htmls[]={
“\n”+“\n”+“\n”+“\n”
+“我的第一个标题\n”+“\n”
+我的第一段。\n“+”\n“+”\n“+”，
“\n”
+“\n”
+“\n”
+“\n”
+“字幕\n”
+“此段落将是您的内容段落…\n”
+“这里有另一篇内容文章。\n”
+正文末尾的“+”\n“+”文本\n“+”}；
int expectedSize[]={2,4}；
字符串应为fo[][]={
{ 
“第5行第5列至第5行第21列：我的第一个标题”，
“第7行第4列至第7行第23列：我的第一段。”
},
{ 
“第5行第7列至第5行第15列：副标题”，
“第6行第6列至第6行第55列：本段为您的
/**
 * TextExtractor that makes source line and col references available
 * http://grepcode.com/file_/repo1.maven.org/maven2/net.htmlparser.jericho/jericho-html/3.3/net/htmlparser/jericho/TextExtractor.java/?v=source
 */
public class SourceTextExtractor {

    public static class TextResult {
        private String text;
        private Source root;
        private Segment segment;
        private int line;
        private int col;

        /**
         * get a textResult
         * @param root
         * @param segment
         */
        public TextResult(Source root,Segment segment) {
            this.root=root;
            this.segment=segment;
            final StringBuilder sb=new StringBuilder(segment.length());
            sb.append(segment);
            setText(CharacterReference.decodeCollapseWhiteSpace(sb));
            int spos = segment.getBegin();  
            line=root.getRow(spos);
            col=root.getColumn(spos);

        }

        /**
         * gets info about this TextResult
         * @return
         */
        public String getInfo() {
            int epos=segment.getEnd();

            String result=
                    " line "+   line+" col "+col+
                    " to "+
                    " line "+   root.getRow(epos)+" col "+root.getColumn(epos)+
                    ":"+getText();
            return result;
        }

        /**
         * @return the text
         */
        public String getText() {
            return text;
        }

        /**
         * @param text the text to set
         */
        public void setText(String text) {
            this.text = text;
        }

        public int getLine() {
            return line;
        }

        public int getCol() {
            return col;
        }

    }

    /**
     * extract textSegments from the given html
     * @param html
     * @return
     */
    public List<TextResult> extractTextSegments(String html) {
        Source htmlSource=new Source(html);
        List<TextResult> result = extractTextSegments(htmlSource);
        return result;
    }

    /**
     * get the TextSegments from the given root segment
     * @param root
     * @return
     */
    public List<TextResult> extractTextSegments(Source root) {
        List<TextResult> result=new ArrayList<TextResult>();
        for (NodeIterator nodeIterator=new NodeIterator(root); nodeIterator.hasNext();) {
            Segment segment=nodeIterator.next();
            if (segment instanceof Tag) {
                final Tag tag=(Tag)segment;
                if (tag.getTagType().isServerTag()) {
                    // elementContainsMarkup should be made into a TagType property one day.
                    // for the time being assume all server element content is code, although this is not true for some Mason elements.
                    final boolean elementContainsMarkup=false;
                    if (!elementContainsMarkup) {
                        final net.htmlparser.jericho.Element element=tag.getElement();
                        if (element!=null && element.getEnd()>tag.getEnd()) nodeIterator.skipToPos(element.getEnd());
                    }
                    continue;
                }
                if (tag.getTagType()==StartTagType.NORMAL) {
                    final StartTag startTag=(StartTag)tag;
                    if (tag.name==HTMLElementName.SCRIPT || tag.name==HTMLElementName.STYLE ||  (!HTMLElements.getElementNames().contains(tag.name))) {
                        nodeIterator.skipToPos(startTag.getElement().getEnd());
                        continue;
                    }

                }
                // Treat both start and end tags not belonging to inline-level elements as whitespace:
                if (tag.getName()==HTMLElementName.BR || !HTMLElements.getInlineLevelElementNames().contains(tag.getName())) {
                    // sb.append(' ');
                }
            } else {
                if (!segment.isWhiteSpace())
                    result.add(new TextResult(root,segment));
            }
        }
        return result;
    }

    /**
     * extract the text from the given segment
     * @param segment
     * @return
     */
    public String extractText(net.htmlparser.jericho.Segment pSegment) {

        // http://grepcode.com/file_/repo1.maven.org/maven2/net.htmlparser.jericho/jericho-html/3.3/net/htmlparser/jericho/TextExtractor.java/?v=source
        // this would call the code above
        // String result=segment.getTextExtractor().toString();
        final StringBuilder sb=new StringBuilder(pSegment.length());
        for (NodeIterator nodeIterator=new NodeIterator(pSegment); nodeIterator.hasNext();) {
            Segment segment=nodeIterator.next();
            if (segment instanceof Tag) {
                final Tag tag=(Tag)segment;
                if (tag.getTagType().isServerTag()) {
                    // elementContainsMarkup should be made into a TagType property one day.
                    // for the time being assume all server element content is code, although this is not true for some Mason elements.
                    final boolean elementContainsMarkup=false;
                    if (!elementContainsMarkup) {
                        final net.htmlparser.jericho.Element element=tag.getElement();
                        if (element!=null && element.getEnd()>tag.getEnd()) nodeIterator.skipToPos(element.getEnd());
                    }
                    continue;
                }
                if (tag.getTagType()==StartTagType.NORMAL) {
                    final StartTag startTag=(StartTag)tag;
                    if (tag.name==HTMLElementName.SCRIPT || tag.name==HTMLElementName.STYLE ||  (!HTMLElements.getElementNames().contains(tag.name))) {
                        nodeIterator.skipToPos(startTag.getElement().getEnd());
                        continue;
                    }

                }
                // Treat both start and end tags not belonging to inline-level elements as whitespace:
                if (tag.getName()==HTMLElementName.BR || !HTMLElements.getInlineLevelElementNames().contains(tag.getName())) {
                    sb.append(' ');
                }
            } else {
                sb.append(segment);
            }
        }
        final String result=net.htmlparser.jericho.CharacterReference.decodeCollapseWhiteSpace(sb);
        return result;
    }
}