Java Jsoup Html解析查找内部链接数据时出现问题
通常我们在一个 HTML 文件中有许多内部链接。我想解析一个 HTML 文件,以便在一个 Map 中得到页面各标题及其相应的数据。我所做的步骤:
1) 已获取所有内部参考元素
2) 在文档中查找 id=XXX 的元素,其中 XXX 等于该内部链接 href 中 # 后面的名称
(示例页面节选)强制实行互斥:有软件和硬件两种解决方案来强制执行互斥,不同的解决方案如下所示。[硬件解决方案] 在系统内部实现互斥的常用方法是禁用中断。
这是我的密码:
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Fetches an HTML page and extracts, for each internal anchor link
 * ({@code <a href="#id">}), the section heading text together with the
 * paragraph text belonging to that section.
 */
public final class Website {
    // Page to download; set once in the constructor.
    private URL websiteURL;
    // Parsed DOM of the downloaded page.
    private Document httpDoc;
    // heading text -> paragraph texts of that section, in document order.
    LinkedHashMap<String, ArrayList<String>> internalLinks =
            new LinkedHashMap<String, ArrayList<String>>();

    /**
     * Downloads and parses the page at the given URL.
     *
     * @param __websiteURL the page to fetch; must not be null
     * @throws IllegalArgumentException if {@code __websiteURL} is null
     * @throws IOException if the page cannot be read
     */
    public Website(URL __websiteURL) throws MalformedURLException, IOException, Exception {
        if (__websiteURL == null)
            throw new IllegalArgumentException("websiteURL must not be null");
        websiteURL = __websiteURL;
        httpDoc = Jsoup.parse(connect());
        System.out.println("Parsed the http file to Document");
    }

    /**
     * For every internal link whose {@code <a>} tag contains a {@code <span>}
     * child, resolves the element carrying the matching id and collects the
     * text of the {@code <p>} siblings that follow the enclosing heading,
     * stopping at the next heading (h1..h6).
     *
     * The original implementation climbed {@code parent()} until any
     * {@code <p>} matched, which eventually reaches {@code <body>} and selects
     * every paragraph in the file; walking forward through siblings keeps the
     * collected data scoped to a single section.
     */
    public void getDataWithHeadingsTogether() {
        Elements internalLinksElements = httpDoc.select("a[href^=#]");
        for (Element element : internalLinksElements) {
            // Some inline links are bad; keep only those with a <span> child.
            if (element.select("span").isEmpty())
                continue;
            String heading = element.text();
            System.out.println("Text(): " + heading);
            // The href minus '#' is the id of the section anchor.
            String href = element.attr("href").replace("#", "");
            System.out.println(href);
            Element target = httpDoc.getElementById(href);
            if (target == null)
                continue;
            // The id often sits on a <span> inside the heading (e.g. on
            // Wikipedia pages), so climb to the nearest enclosing h1..h6
            // before walking siblings.
            Element section = target;
            while (section != null && !section.tagName().matches("h[1-6]")) {
                section = section.parent();
            }
            if (section == null)
                section = target; // id was not inside a heading; start from it
            ArrayList<String> data = new ArrayList<String>();
            // Collect <p> siblings until the next heading opens a new section.
            for (Element sib = section.nextElementSibling();
                    sib != null && !sib.tagName().matches("h[1-6]");
                    sib = sib.nextElementSibling()) {
                if (sib.tagName().equals("p"))
                    data.add(sib.text());
            }
            internalLinks.put(heading, data);
            System.out.println(data);
        }
    }

    /**
     * Reads the document at {@code websiteURL} into a single String.
     * Falls back to a small built-in HTML snippet when the URL cannot be
     * opened, so the rest of the pipeline can still be exercised offline.
     *
     * @return the raw HTML of the page (line breaks removed)
     * @throws IOException if reading an already-opened stream fails
     */
    @SuppressWarnings("CallToThreadDumpStack")
    public String connect() throws MalformedURLException, IOException {
        BufferedReader reader;
        try {
            // Explicit UTF-8: pre-Java-18 the platform default charset varies.
            reader = new BufferedReader(
                    new InputStreamReader(websiteURL.openStream(), StandardCharsets.UTF_8));
            System.out.println("Got the reader");
        } catch (IOException e) {
            e.printStackTrace();
            System.out.println("Bye");
            // Offline fallback document.
            return "<html><h1>Heading 1</h1><body><h2>Heading 2</h2><p>hello</p></body></html>";
        }
        // StringBuilder avoids O(n^2) String concatenation in the read loop.
        StringBuilder result = new StringBuilder();
        try {
            String inputLine;
            while ((inputLine = reader.readLine()) != null) {
                result.append(inputLine);
            }
        } finally {
            reader.close(); // close even if readLine() throws
        }
        System.out.println("Made the html file");
        return result.toString();
    }

    /**
     * Entry point: configures the HTTP proxy, downloads the page and prints
     * each section heading together with its paragraph data.
     *
     * @param argv all the command line parameters (unused)
     */
    public static void main(String[] argv) throws MalformedURLException, IOException, Exception {
        // Proxy properties only take effect with the "http." prefix; the bare
        // "proxyHost"/"proxyPort" names are ignored by the URL stack.
        System.setProperty("http.proxyHost", "172.16.0.3");
        System.setProperty("http.proxyPort", "8383");
        System.out.println("Sending url");
        // a html file or any url place here ------------------------------------
        URL url = new URL("put a html file here ");
        Website website = new Website(url);
        System.out.println(url.toString());
        System.out.println("++++++++++++++++++++++++++++++++++++++++++++++++");
        website.getDataWithHeadingsTogether();
    }
}
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public final class Website {
private URL websiteURL ;
private Document httpDoc ;
LinkedHashMap<String, ArrayList<String>> internalLinks =
new LinkedHashMap<String, ArrayList<String>>();
public Website(URL __websiteURL) throws MalformedURLException, IOException, Exception{
if(__websiteURL == null)
throw new Exception();
websiteURL = __websiteURL;
httpDoc = Jsoup.parse(connect());
System.out.println("Parsed the http file to Document");
}
/* Here is my function: i first gets all the internal links in internalLinksElements.
I then get the href name of <a ..> tag so that i can search for it in documnet.
*/
public void getDataWithHeadingsTogether(){
Elements internalLinksElements;
internalLinksElements = httpDoc.select("a[href^=#]");
for(Element element : internalLinksElements){
// some inline links were bad. i only those having span as their child.
Elements spanElements = element.select("span");
if(!spanElements.isEmpty()){
System.out.println("Text(): " + element.text()); // this can not give what i want
// ok i get the href tag name that would be the id
String href = element.attr("href") ;
href = href.replace("#", "");
System.out.println(href);
// selecting the element where we have that id.
Element data = httpDoc.getElementById(href);
// got the span
if(data == null)
continue;
Elements children = new Elements();
// problem is here.
while(children.isEmpty()){
// going to its element unless gets some data.
data = data.parent();
System.out.println(data);
children = data.select("p");
}
// its giving me all the data of file. thats bad.
System.out.println(children.text());
}
}
}
/**
*
* @return String Get all the headings of the document.
* @throws MalformedURLException
* @throws IOException
*/
@SuppressWarnings("CallToThreadDumpStack")
public String connect() throws MalformedURLException, IOException{
// Is this thread safe ? url.openStream();
BufferedReader reader = null;
try{
reader = new BufferedReader( new InputStreamReader(websiteURL.openStream()));
System.out.println("Got the reader");
} catch(Exception e){
e.printStackTrace();
System.out.println("Bye");
String html = "<html><h1>Heading 1</h1><body><h2>Heading 2</h2><p>hello</p></body></html>";
return html;
}
String inputLine, result = new String();
while((inputLine = reader.readLine()) != null){
result += inputLine;
}
reader.close();
System.out.println("Made the html file");
return result;
}
/**
*
* @param argv all the command line parameters.
* @throws MalformedURLException
* @throws IOException
*/
public static void main(String[] argv) throws MalformedURLException, IOException, Exception{
System.setProperty("proxyHost", "172.16.0.3");
System.setProperty("proxyPort","8383");
System.out.println("Sending url");
// a html file or any url place here ------------------------------------
URL url = new URL("put a html file here ");
Website website = new Website(url);
System.out.println(url.toString());
System.out.println("++++++++++++++++++++++++++++++++++++++++++++++++");
website.getDataWithHeadingsTogether();
}
}
我认为您需要了解:您正在查找的带 id 的 span 是标题元素(h1–h6)的子元素,而您要存储的数据由该标题之后的同级节点组成。因此,您需要先定位该标题元素,然后依次收集其后的同级节点作为该标题的数据;当同级节点用完,或遇到另一个标题元素时,就停止收集,因为另一个标题表示下一项数据的开始。—— 您是说您的 HTML 文档里有内部锚点(如 a href="#xxx"),每个锚点后面都有一些您想从中取值的标记?如果是,请尽可能贴出实际的 HTML。另外,getElementById(href) 不适用于旧式内部锚点:旧式锚点使用 name 属性,而不是 id 属性。@Paul 请查看我的 html 文件,它是 wikipedia 页面的典型源文件。link[我假设是这个 wikipedia 页面,它有子标题(硬件解决方案和软件解决方案)。你想如何处理这些?@Paul 我只想要 Map。我可以维护另一个列表,它可以告诉层次结构。有没有办法判断一个元素是标题标记还是别的什么?
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
// NOTE(review): this is the code quoted in the answer, kept verbatim.
// Known defects discussed in the thread are marked inline below.
public final class Website {
// Page to download; set once in the constructor.
private URL websiteURL ;
// Parsed DOM of the downloaded page.
private Document httpDoc ;
// heading text -> section data; declared but never populated in this version.
LinkedHashMap<String, ArrayList<String>> internalLinks =
new LinkedHashMap<String, ArrayList<String>>();
// Downloads and parses the page; throws a bare Exception on a null URL.
public Website(URL __websiteURL) throws MalformedURLException, IOException, Exception{
if(__websiteURL == null)
throw new Exception();
websiteURL = __websiteURL;
httpDoc = Jsoup.parse(connect());
System.out.println("Parsed the http file to Document");
}
/* Here is my function: i first gets all the internal links in internalLinksElements.
I then get the href name of <a ..> tag so that i can search for it in documnet.
*/
public void getDataWithHeadingsTogether(){
Elements internalLinksElements;
// All anchors whose href starts with '#', i.e. in-page links.
internalLinksElements = httpDoc.select("a[href^=#]");
for(Element element : internalLinksElements){
// some inline links were bad. i only those having span as their child.
Elements spanElements = element.select("span");
if(!spanElements.isEmpty()){
System.out.println("Text(): " + element.text()); // this can not give what i want
// ok i get the href tag name that would be the id
String href = element.attr("href") ;
href = href.replace("#", "");
System.out.println(href);
// selecting the element where we have that id.
Element data = httpDoc.getElementById(href);
// got the span
if(data == null)
continue;
Elements children = new Elements();
// problem is here.
// NOTE(review): climbing parent() until select("p") matches eventually
// reaches <body>, so this selects every paragraph in the document — the
// bug the question is about. Collecting the heading's following siblings
// (stopping at the next heading) would scope the data to one section.
while(children.isEmpty()){
// going to its element unless gets some data.
data = data.parent();
System.out.println(data);
children = data.select("p");
}
// its giving me all the data of file. thats bad.
System.out.println(children.text());
}
}
}
/**
*
* @return String Get all the headings of the document.
* @throws MalformedURLException
* @throws IOException
*/
@SuppressWarnings("CallToThreadDumpStack")
public String connect() throws MalformedURLException, IOException{
// Is this thread safe ? url.openStream();
BufferedReader reader = null;
try{
// NOTE(review): uses the platform default charset; pages served as UTF-8
// may be mis-decoded. An explicit charset would be safer.
reader = new BufferedReader( new InputStreamReader(websiteURL.openStream()));
System.out.println("Got the reader");
} catch(Exception e){
e.printStackTrace();
System.out.println("Bye");
// Offline fallback document returned when the URL cannot be opened.
String html = "<html><h1>Heading 1</h1><body><h2>Heading 2</h2><p>hello</p></body></html>";
return html;
}
// NOTE(review): String += in a loop is O(n^2); StringBuilder would be linear.
// The reader is also not closed if readLine() throws.
String inputLine, result = new String();
while((inputLine = reader.readLine()) != null){
result += inputLine;
}
reader.close();
System.out.println("Made the html file");
return result;
}
/**
*
* @param argv all the command line parameters.
* @throws MalformedURLException
* @throws IOException
*/
public static void main(String[] argv) throws MalformedURLException, IOException, Exception{
// NOTE(review): the Java URL stack reads "http.proxyHost"/"http.proxyPort";
// the bare names set here are ignored, so the proxy is never applied.
System.setProperty("proxyHost", "172.16.0.3");
System.setProperty("proxyPort","8383");
System.out.println("Sending url");
// a html file or any url place here ------------------------------------
URL url = new URL("put a html file here ");
Website website = new Website(url);
System.out.println(url.toString());
System.out.println("++++++++++++++++++++++++++++++++++++++++++++++++");
website.getDataWithHeadingsTogether();
}
}