如何在 JSoup(Java)中仅显示 HTML 标记?
标签:java, html, eclipse, parsing, jsoup。我正在做一个学校的项目,正试图解析一个 HTML 网页并只显示其中的标记,就像下面的输出一样,不包含结束标记(这份输出是我手工编写的)。以下是我到目前为止的代码,以及目前得到的几行输出。
package Scrapper;
import java.util.LinkedList;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node;
import org.jsoup.select.NodeVisitor;
/**
 * Node visitor that records one entry for every visited node whose name does
 * not begin with "#", together with the depth reported by the traversal, so
 * the collected tags can be printed as an indented outline. Nothing is
 * recorded when a node is left, so closing tags never appear.
 */
class TagVisitor implements NodeVisitor {

    /** A rendered opening tag (node name wrapped in angle brackets) and its depth. */
    public static class TagInfo {
        public String name;
        public int depth;

        TagInfo(String name, int depth) {
            this.name = name;
            this.depth = depth;
        }
    }

    // Entries in the order head() received them.
    private LinkedList<TagInfo> tags = new LinkedList<>();

    /**
     * Records the node as "&lt;name&gt;" unless its name starts with "#".
     *
     * @param node  node handed in by the traversal
     * @param depth depth reported by the traversal for this node
     */
    public void head(Node node, int depth) {
        String nodeName = node.nodeName();
        // NOTE(review): "#"-prefixed names are presumably jsoup's non-element
        // nodes (text, comments, the document itself) - confirm against jsoup docs.
        if (nodeName.startsWith("#")) {
            return;
        }
        tags.add(new TagInfo("<" + nodeName + ">", depth));
    }

    /** Intentionally empty: leaving a node produces no output. */
    public void tail(Node node, int depth) {
        // Nothing to record here.
    }

    /** @return the live list of recorded tags (not a copy) */
    public LinkedList<TagInfo> getTags() {
        return tags;
    }

    /** Prints every recorded tag on its own line, indented two spaces per depth level. */
    public void printTree() {
        for (TagInfo entry : tags) {
            char[] pad = new char[entry.depth * 2];
            for (int i = 0; i < pad.length; i++) {
                pad[i] = ' ';
            }
            System.out.println(new String(pad) + entry.name);
        }
    }
}
/**
 * Entry point: downloads a fixed sample page with jsoup, walks its node tree
 * with a {@code TagVisitor}, and prints the collected tags.
 */
public class MainJsoup {

    /**
     * @param args unused
     * @throws Exception whatever the fetch or traversal throws is propagated as-is
     */
    public static void main(String[] args) throws Exception {
        // Alternative kept from the original for parsing a local file instead:
        //   InputStream stream = new FileInputStream("test.html");
        //   Document doc = Jsoup.parse(stream, "UTF-8", "");
        String pageAddress = "http://csb.stanford.edu/class/public/pages/sykes_webdesign/05_simple.html";

        Document page = Jsoup.connect(pageAddress)
                .userAgent("Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201")
                .timeout(2000)
                .get();

        TagVisitor tagCollector = new TagVisitor();
        page.traverse(tagCollector);
        tagCollector.printTree();
    }
}
package Scrapper;
import java.util.LinkedList;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node;
import org.jsoup.select.NodeVisitor;
class TagVisitor implements NodeVisitor {
public static class TagInfo {
public String name;
public int depth;
TagInfo(String name, int depth) {
this.depth = depth;
this.name = name;
}
}
private LinkedList<TagInfo> tags = new LinkedList<>();
public void head(Node node, int depth) {
String tag = node.nodeName();
if(!tag.startsWith("#")) {
tags.add(new TagInfo('<'+node.nodeName()+'>', depth));
}
}
public void tail(Node node, int depth) {
//Do nothing
}
public LinkedList<TagInfo> getTags() {
return tags;
}
public void printTree() {
for(TagInfo info : tags) {
String indentation = new String(new char[info.depth*2]).replace('\0', ' ');
System.out.println(indentation + info.name);
}
}
}
public class MainJsoup {
public static void main(String[] args) throws Exception {
//InputStream stream = new FileInputStream("test.html");
//Document doc = Jsoup.parse(stream, "UTF-8", "");
String URL ="http://csb.stanford.edu/class/public/pages/sykes_webdesign/05_simple.html";
Document doc = Jsoup
.connect(URL)
.userAgent("Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201")
.timeout(2000)
.get();
TagVisitor visitor = new TagVisitor();
doc.traverse(visitor);
visitor.printTree();
}
}
问题是什么?如何在 JSoup(Java)中只显示 HTML 标记?没问题,祝你玩得开心。
public class ReadWithScanner {
public static void main(String[] args) throws IOException
{
String URL ="http://csb.stanford.edu/class/public/pages/sykes_webdesign/05_simple.html";
Document doc = Jsoup.connect(URL).get();
//Element p = doc.select("p");
//Elements p = doc.getElementsByTag("h6");
Elements p = doc.select("html");
//System.out.println(p);
DoublyLinkedList theList = new DoublyLinkedList();
theList.insert(p); // insert at front
theList.displayTree();
}
package Scrapper;
import java.util.LinkedList;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node;
import org.jsoup.select.NodeVisitor;
/**
 * Collects, during a jsoup traversal, the name and depth of each visited node
 * whose name does not start with "#", and can print them as an indented list
 * of opening tags. No entry is produced for the end of a node.
 */
class TagVisitor implements NodeVisitor {

    /** One collected tag: its bracketed name and the depth supplied by the traversal. */
    public static class TagInfo {
        public String name;
        public int depth;

        TagInfo(String name, int depth) {
            this.name = name;
            this.depth = depth;
        }
    }

    // Collected tags, appended in visiting order.
    private LinkedList<TagInfo> tags = new LinkedList<>();

    /**
     * Appends "&lt;name&gt;" for the given node when its name has no "#" prefix.
     *
     * @param node  node currently being visited
     * @param depth depth value passed in by the traversal
     */
    public void head(Node node, int depth) {
        String label = node.nodeName();
        // NOTE(review): names beginning with "#" look like jsoup's pseudo-nodes
        // (e.g. text/comment) - verify against the jsoup documentation.
        boolean skipped = label.startsWith("#");
        if (!skipped) {
            TagInfo collected = new TagInfo("<" + label + ">", depth);
            tags.add(collected);
        }
    }

    /** No-op: nothing is emitted when the traversal leaves a node. */
    public void tail(Node node, int depth) {
        // Deliberately left blank.
    }

    /** @return the internal list of collected tags (the same instance, not a copy) */
    public LinkedList<TagInfo> getTags() {
        return tags;
    }

    /** Writes each collected tag to standard output, preceded by two spaces per depth level. */
    public void printTree() {
        for (TagInfo tag : tags) {
            char[] spaces = new char[tag.depth * 2];
            java.util.Arrays.fill(spaces, ' ');
            System.out.println(String.valueOf(spaces) + tag.name);
        }
    }
}
/**
 * Program entry point: retrieves a fixed sample page through jsoup, runs a
 * {@code TagVisitor} over the parsed document and prints what it collected.
 */
public class MainJsoup {

    /**
     * @param args unused
     * @throws Exception any failure from the fetch or traversal is propagated unchanged
     */
    public static void main(String[] args) throws Exception {
        // Local-file variant left in by the original author:
        //   InputStream stream = new FileInputStream("test.html");
        //   Document doc = Jsoup.parse(stream, "UTF-8", "");
        String target = "http://csb.stanford.edu/class/public/pages/sykes_webdesign/05_simple.html";
        String agent = "Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201";

        Document fetched = Jsoup.connect(target).userAgent(agent).timeout(2000).get();

        TagVisitor collector = new TagVisitor();
        fetched.traverse(collector);
        collector.printTree();
    }
}