使用Java从HTML中提取文本,包括源行号和代码
如何使用Java从HTML中提取文本的问题已经被查看和复制了无数次: 在我目前的状态中,我找到的答案是我正在使用使用Java从HTML中提取文本,包括源行号和代码,java,html,html-parsing,jsoup,jericho-html-parser,Java,Html,Html Parsing,Jsoup,Jericho Html Parser,如何使用Java从HTML中提取文本的问题已经被查看和复制了无数次: 在我目前的状态中,我找到的答案是我正在使用 org.jsoup ) 并尝试一种工作方式,比如: intpos=html.indexOf(textNode.outerHtml()); 无法可靠地找到原始html。因此,我假设我可能不得不切换到另一个库或方法。正如上面的链接所指出的,“杰里科可以做到”。但是缺少指向实际工作代码的指针 我从耶利哥到了哪里: Source htmlSource=新源代码(html); 布尔bod
org.jsoup
)
并尝试一种工作方式,比如:
int pos = html.indexOf(textNode.outerHtml());
这种方式无法可靠地在原始 HTML 中定位文本。因此,我估计可能不得不换用另一个库或另一种方法。正如上面的链接所指出的,“Jericho 可以做到”,但是缺少指向可实际运行的代码的示例。
我用 Jericho 目前做到了这一步:
Source htmlSource=new Source(html);
boolean bodyFound=false;
// loop over all elements
for (net.htmlparser.jericho.Element el:htmlSource.getAllElements()) {
  if (el.getName().equals("body")) {
    bodyFound=true;
  }
  if (bodyFound) {
    TagType tagType = el.getStartTag().getTagType();
    if (tagType==StartTagType.NORMAL) {
      String text = el.getTextExtractor().toString();
      if (!text.trim().equals("")) {
        int cpos = el.getBegin();
        System.out.println(el.getName()+"("+tagType.toString()+") line "+ htmlSource.getRow(cpos)+":"+text);
      }
    } // if
  } // if
} // for
这已经相当不错了,因为它将为您提供如下输出:
body(normal) line 91: Some Header. Some Text
div(normal) line 93: Some Header
div(normal) line 95: Some Text
但现在接下来的问题是TextExtractor递归地输出所有子节点的整个文本,以便文本多次显示
什么样的工作解决方案可以像上面的JSoup解决方案一样进行过滤(请注意文本元素的正确顺序),但却像上面的Jericho代码片段那样显示源代码行?您需要的功能和JSoup缺少的功能更难实现。
可以使用 Jericho,并实现类似下面的方法,用于查找元素的直接(immediate)文本节点:
package main.java.com.adacom.task;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.EndTag;
import net.htmlparser.jericho.Segment;
import net.htmlparser.jericho.Source;
import net.htmlparser.jericho.StartTag;
import net.htmlparser.jericho.StartTagType;
import net.htmlparser.jericho.Tag;
import net.htmlparser.jericho.TagType;
/**
 * Demonstrates how to obtain the text that belongs directly to an element
 * (excluding the text of its child elements) with the Jericho HTML parser,
 * together with the source row at which the element starts.
 */
public class MainParser {

    /**
     * Parses a small sample document and prints, for every normal element from
     * {@code <body>} onwards that has text of its own, the element name, its
     * tag type, the source row of its start and its own text.
     *
     * @param args command line arguments (unused)
     */
    public static void main(String[] args) {
        String html = "<body><div>divtextA<span>spanTextA<p>pText</p>spanTextB</span>divTextB</div></body>";
        Source htmlSource = new Source(html);
        boolean bodyFound = false;
        // loop over all elements
        for (Element el : htmlSource.getAllElements()) {
            if (el.getName().equals("body")) {
                bodyFound = true;
            }
            if (!bodyFound) {
                continue; // still in front of <body>
            }
            TagType tagType = el.getStartTag().getTagType();
            if (tagType != StartTagType.NORMAL) {
                continue; // comments, doctype, server tags ...
            }
            String text = getOwnTextSegmentsString(el);
            if (!text.isEmpty()) { // getOwnTextSegmentsString already trims
                int cpos = el.getBegin();
                System.out.println(el.getName() + "(" + tagType.toString() + ") line " + htmlSource.getRow(cpos) + ":" + text);
            }
        }
    }

    /**
     * Returns an iterator over the text segments that belong directly to the
     * given element. Each segment is also printed to standard output.
     * This function is not used; it is shown here only for reference.
     *
     * @param elem the element whose own text segments are wanted
     * @return iterator over the own text segments in document order
     */
    public static Iterator<Segment> getOwnTextSegmentsIterator(Element elem) {
        final List<Segment> results = collectOwnTextSegments(elem);
        for (Segment segment : results) {
            System.out.println(segment);
        }
        return results.iterator();
    }

    /**
     * Returns the text that belongs directly to the given element, i.e. the
     * text of its content without the text of any child element.
     *
     * @param elem the element whose own text is wanted
     * @return the own text segments joined by single spaces, trimmed
     */
    public static String getOwnTextSegmentsString(Element elem) {
        StringBuilder strBuilder = new StringBuilder();
        for (Segment segment : collectOwnTextSegments(elem)) {
            strBuilder.append(segment.toString()).append(' ');
        }
        return strBuilder.toString().trim();
    }

    /**
     * Collects the plain text segments of the element's content that are not
     * located inside a child element.
     *
     * Nesting is tracked by source position instead of by counting start and
     * end tags: counting breaks as soon as a start tag has no end tag (void
     * elements such as br or img, comments, doctype) or an end tag has no
     * start tag, after which all following text would be lost.
     */
    private static List<Segment> collectOwnTextSegments(Element elem) {
        final List<Segment> ownText = new LinkedList<Segment>();
        final Iterator<Segment> it = elem.getContent().getNodeIterator();
        // source offset up to which the nodes belong to a child element
        int childEnd = 0;
        while (it.hasNext()) {
            Segment cur = it.next();
            if (cur instanceof StartTag) {
                // for a tag without end tag the element ends with the tag itself,
                // so text following it is still treated as own text
                childEnd = Math.max(childEnd, ((StartTag) cur).getElement().getEnd());
            } else if (!(cur instanceof Tag) && cur.getBegin() >= childEnd) {
                ownText.add(cur);
            }
        }
        return ownText;
    }
}
package main.java.com.adacom.task;
导入java.util.Iterator;
导入java.util.LinkedList;
导入java.util.List;
导入net.htmlparser.jericho.Element;
导入net.htmlparser.jericho.EndTag;
导入net.htmlparser.jericho.Segment;
导入net.htmlparser.jericho.Source;
导入net.htmlparser.jericho.StartTag;
导入net.htmlparser.jericho.StartTagType;
导入net.htmlparser.jericho.Tag;
导入net.htmlparser.jericho.TagType;
公共类主解析器{
/**
*@param args
*/
公共静态void main(字符串[]args){
String html=“divtextAspanTextApTextspanTextBdivTextB”;
源htmlSource=新源(html);
布尔bodyFound=false;
//在所有元素上循环
for(net.htmlparser.jericho.Element el:htmlSource.getAllegements()){
if(el.getName().equals(“body”)){
bodyFound=true;
}
如果(bodyFound){
TagType TagType=el.getStartTag().getTagType();
if(tagType==StartTagType.NORMAL){
字符串文本=getOwnTextSegmentsString(el);
如果(!text.trim()等于(“”){
int cpos=el.getBegin();
System.out.println(el.getName()+”(“+tagType.toString()+”)行“+htmlSource.getRow(cpos)+”:“+text);
}
}//如果
}//如果
}//为了
}
/**
*此函数未被使用。此处显示此函数仅供参考
*/
公共静态迭代器GetownTextSegmentSiteOrator(Element elem){
最终迭代器it=elem.getContent().getNodeIterator();
最终列表结果=新建LinkedList();
int tagCounter=0;
while(it.hasNext()){
段cur=it.next();
if(当前StartTag实例)
tagCounter++;
else if(当前EndTag实例)
标记计数器--;
如果(!(cur instanceof Tag)&&tagCounter==0){
系统输出打印项次(cur);
结果:添加(cur);
}
}
返回结果。迭代器();
}
公共静态字符串getOwnTextSegmentsString(元素elem){
最终迭代器it=elem.getContent().getNodeIterator();
StringBuilder strBuilder=新StringBuilder();
int tagCounter=0;
while(it.hasNext()){
段cur=it.next();
if(当前StartTag实例)
tagCounter++;
else if(当前EndTag实例)
标记计数器--;
如果(!(cur instanceof Tag)&&tagCounter==0){
strBuilder.append(cur.toString()+“”);
}
}
返回strBuilder.toString().trim();
}
}
下面是一个 JUnit 测试,用于检查预期的输出;另外还有一个基于 Jericho 的 SourceTextExtractor,它以 Jericho 原始的 TextExtractor 源代码为基础,使该 JUnit 测试能够通过。
/**
 * Checks that SourceTextExtractor reports every text node of two sample
 * documents together with its source position.
 *
 * The nested elements of the second document are indented by two spaces;
 * the expected column numbers below (7 for the h5 text, 6 for the p texts)
 * depend on exactly that indentation.
 */
@Test
public void testTextExtract() {
    // https://github.com/paepcke/CorEx/blob/master/src/extraction/HTMLUtils.java
    String htmls[] = {
        "<!DOCTYPE html>\n" + "<html>\n" + "<body>\n" + "\n"
            + "<h1>My First Heading</h1>\n" + "\n"
            + "<p>My first paragraph.</p>\n" + "\n" + "</body>\n" + "</html>",
        "<html>\n"
            + "<body>\n"
            + "\n"
            + "<div id=\"myDiv\" name=\"myDiv\" title=\"Example Div Element\">\n"
            + "  <h5>Subtitle</h5>\n"
            + "  <p>This paragraph would be your content paragraph...</p>\n"
            + "  <p>Here's another content article right here.</p>\n"
            + "</div>" + "\n" + "Text at end of body</body>\n" + "</html>" };
    int expectedSize[] = { 2, 4 };
    String expectedInfo[][] = {
        {
            "line 5 col 5 to line 5 col 21: My First Heading",
            "line 7 col 4 to line 7 col 23: My first paragraph."
        },
        {
            "line 5 col 7 to line 5 col 15: Subtitle",
            "line 6 col 6 to line 6 col 55: This paragraph would be your content paragraph...",
            "line 7 col 6 to line 7 col 48: Here's another content article right here.",
            "line 8 col 7 to line 9 col 20: Text at end of body"
        }
    };
    int i = 0;
    for (String html : htmls) {
        SourceTextExtractor extractor = new SourceTextExtractor();
        List<TextResult> textParts = extractor.extractTextSegments(html);
        // check the size first: indexing expectedInfo with a surplus result would
        // otherwise end in an ArrayIndexOutOfBoundsException instead of a test failure
        assertEquals(expectedSize[i], textParts.size());
        int j = 0;
        for (TextResult textPart : textParts) {
            System.out.println(textPart.getInfo());
            assertTrue(textPart.getInfo().startsWith(expectedInfo[i][j]));
            j++;
        }
        i++;
    }
}
@测试
公共void testTextExtract(){
// https://github.com/paepcke/CorEx/blob/master/src/extraction/HTMLUtils.java
字符串htmls[]={
“\n”+“\n”+“\n”+“\n”
+“我的第一个标题\n”+“\n”
+我的第一段。\n“+”\n“+”\n“+”,
“\n”
+“\n”
+“\n”
+“\n”
+“字幕\n”
+“此段落将是您的内容段落…\n”
+“这里有另一篇内容文章。\n”
+正文末尾的“+”\n“+”文本\n“+”};
int expectedSize[]={2,4};
字符串应为fo[][]={
{
“第5行第5列至第5行第21列:我的第一个标题”,
“第7行第4列至第7行第23列:我的第一段。”
},
{
“第5行第7列至第5行第15列:副标题”,
“第6行第6列至第6行第55列:本段为您的
/**
* TextExtractor that makes source line and col references available
* http://grepcode.com/file_/repo1.maven.org/maven2/net.htmlparser.jericho/jericho-html/3.3/net/htmlparser/jericho/TextExtractor.java/?v=source
*/
public class SourceTextExtractor {
public static class TextResult {
private String text;
private Source root;
private Segment segment;
private int line;
private int col;
/**
* get a textResult
* @param root
* @param segment
*/
public TextResult(Source root,Segment segment) {
this.root=root;
this.segment=segment;
final StringBuilder sb=new StringBuilder(segment.length());
sb.append(segment);
setText(CharacterReference.decodeCollapseWhiteSpace(sb));
int spos = segment.getBegin();
line=root.getRow(spos);
col=root.getColumn(spos);
}
/**
* gets info about this TextResult
* @return
*/
public String getInfo() {
int epos=segment.getEnd();
String result=
" line "+ line+" col "+col+
" to "+
" line "+ root.getRow(epos)+" col "+root.getColumn(epos)+
":"+getText();
return result;
}
/**
* @return the text
*/
public String getText() {
return text;
}
/**
* @param text the text to set
*/
public void setText(String text) {
this.text = text;
}
public int getLine() {
return line;
}
public int getCol() {
return col;
}
}
/**
* extract textSegments from the given html
* @param html
* @return
*/
public List<TextResult> extractTextSegments(String html) {
Source htmlSource=new Source(html);
List<TextResult> result = extractTextSegments(htmlSource);
return result;
}
/**
* get the TextSegments from the given root segment
* @param root
* @return
*/
public List<TextResult> extractTextSegments(Source root) {
List<TextResult> result=new ArrayList<TextResult>();
for (NodeIterator nodeIterator=new NodeIterator(root); nodeIterator.hasNext();) {
Segment segment=nodeIterator.next();
if (segment instanceof Tag) {
final Tag tag=(Tag)segment;
if (tag.getTagType().isServerTag()) {
// elementContainsMarkup should be made into a TagType property one day.
// for the time being assume all server element content is code, although this is not true for some Mason elements.
final boolean elementContainsMarkup=false;
if (!elementContainsMarkup) {
final net.htmlparser.jericho.Element element=tag.getElement();
if (element!=null && element.getEnd()>tag.getEnd()) nodeIterator.skipToPos(element.getEnd());
}
continue;
}
if (tag.getTagType()==StartTagType.NORMAL) {
final StartTag startTag=(StartTag)tag;
if (tag.name==HTMLElementName.SCRIPT || tag.name==HTMLElementName.STYLE || (!HTMLElements.getElementNames().contains(tag.name))) {
nodeIterator.skipToPos(startTag.getElement().getEnd());
continue;
}
}
// Treat both start and end tags not belonging to inline-level elements as whitespace:
if (tag.getName()==HTMLElementName.BR || !HTMLElements.getInlineLevelElementNames().contains(tag.getName())) {
// sb.append(' ');
}
} else {
if (!segment.isWhiteSpace())
result.add(new TextResult(root,segment));
}
}
return result;
}
/**
* extract the text from the given segment
* @param segment
* @return
*/
public String extractText(net.htmlparser.jericho.Segment pSegment) {
// http://grepcode.com/file_/repo1.maven.org/maven2/net.htmlparser.jericho/jericho-html/3.3/net/htmlparser/jericho/TextExtractor.java/?v=source
// this would call the code above
// String result=segment.getTextExtractor().toString();
final StringBuilder sb=new StringBuilder(pSegment.length());
for (NodeIterator nodeIterator=new NodeIterator(pSegment); nodeIterator.hasNext();) {
Segment segment=nodeIterator.next();
if (segment instanceof Tag) {
final Tag tag=(Tag)segment;
if (tag.getTagType().isServerTag()) {
// elementContainsMarkup should be made into a TagType property one day.
// for the time being assume all server element content is code, although this is not true for some Mason elements.
final boolean elementContainsMarkup=false;
if (!elementContainsMarkup) {
final net.htmlparser.jericho.Element element=tag.getElement();
if (element!=null && element.getEnd()>tag.getEnd()) nodeIterator.skipToPos(element.getEnd());
}
continue;
}
if (tag.getTagType()==StartTagType.NORMAL) {
final StartTag startTag=(StartTag)tag;
if (tag.name==HTMLElementName.SCRIPT || tag.name==HTMLElementName.STYLE || (!HTMLElements.getElementNames().contains(tag.name))) {
nodeIterator.skipToPos(startTag.getElement().getEnd());
continue;
}
}
// Treat both start and end tags not belonging to inline-level elements as whitespace:
if (tag.getName()==HTMLElementName.BR || !HTMLElements.getInlineLevelElementNames().contains(tag.getName())) {
sb.append(' ');
}
} else {
sb.append(segment);
}
}
final String result=net.htmlparser.jericho.CharacterReference.decodeCollapseWhiteSpace(sb);
return result;
}
}