Java Jsoup Html解析查找内部链接数据时出现问题
通常我们在一个 HTML 文件中有许多内部链接。我想解析一个 HTML 文件,以便在一个 Map 中得到页面各标题及其相应的数据。我所做的步骤:
1) 已获取所有内部参考元素
2) 在文档中查找 id=XXX 的元素,其中 XXX 等于该内部链接 href 中 # 后面的名称
(示例页面节选)强制实行互斥:有软件和硬件两种解决方案来强制执行互斥,不同的解决方案如下所示。[硬件解决方案] 在系统内部实现互斥的常用方法是禁用中断。
这是我的密码:
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Fetches an HTML page and extracts, for each internal anchor link
 * ({@code <a href="#id">}), the section heading text together with the
 * paragraph text belonging to that section.
 */
public final class Website {
    // Page to download; set once in the constructor.
    private URL websiteURL;
    // Parsed DOM of the downloaded page.
    private Document httpDoc;
    // heading text -> paragraph texts of that section, in document order.
    LinkedHashMap<String, ArrayList<String>> internalLinks =
            new LinkedHashMap<String, ArrayList<String>>();

    /**
     * Downloads and parses the page at the given URL.
     *
     * @param __websiteURL the page to fetch; must not be null
     * @throws IllegalArgumentException if {@code __websiteURL} is null
     * @throws IOException if the page cannot be read
     */
    public Website(URL __websiteURL) throws MalformedURLException, IOException, Exception {
        if (__websiteURL == null)
            throw new IllegalArgumentException("websiteURL must not be null");
        websiteURL = __websiteURL;
        httpDoc = Jsoup.parse(connect());
        System.out.println("Parsed the http file to Document");
    }

    /**
     * For every internal link whose {@code <a>} tag contains a {@code <span>}
     * child, resolves the element carrying the matching id and collects the
     * text of the {@code <p>} siblings that follow the enclosing heading,
     * stopping at the next heading (h1..h6).
     *
     * The original implementation climbed {@code parent()} until any
     * {@code <p>} matched, which eventually reaches {@code <body>} and selects
     * every paragraph in the file; walking forward through siblings keeps the
     * collected data scoped to a single section.
     */
    public void getDataWithHeadingsTogether() {
        Elements internalLinksElements = httpDoc.select("a[href^=#]");
        for (Element element : internalLinksElements) {
            // Some inline links are bad; keep only those with a <span> child.
            if (element.select("span").isEmpty())
                continue;
            String heading = element.text();
            System.out.println("Text(): " + heading);
            // The href minus '#' is the id of the section anchor.
            String href = element.attr("href").replace("#", "");
            System.out.println(href);
            Element target = httpDoc.getElementById(href);
            if (target == null)
                continue;
            // The id often sits on a <span> inside the heading (e.g. on
            // Wikipedia pages), so climb to the nearest enclosing h1..h6
            // before walking siblings.
            Element section = target;
            while (section != null && !section.tagName().matches("h[1-6]")) {
                section = section.parent();
            }
            if (section == null)
                section = target; // id was not inside a heading; start from it
            ArrayList<String> data = new ArrayList<String>();
            // Collect <p> siblings until the next heading opens a new section.
            for (Element sib = section.nextElementSibling();
                    sib != null && !sib.tagName().matches("h[1-6]");
                    sib = sib.nextElementSibling()) {
                if (sib.tagName().equals("p"))
                    data.add(sib.text());
            }
            internalLinks.put(heading, data);
            System.out.println(data);
        }
    }

    /**
     * Reads the document at {@code websiteURL} into a single String.
     * Falls back to a small built-in HTML snippet when the URL cannot be
     * opened, so the rest of the pipeline can still be exercised offline.
     *
     * @return the raw HTML of the page (line breaks removed)
     * @throws IOException if reading an already-opened stream fails
     */
    @SuppressWarnings("CallToThreadDumpStack")
    public String connect() throws MalformedURLException, IOException {
        BufferedReader reader;
        try {
            // Explicit UTF-8: pre-Java-18 the platform default charset varies.
            reader = new BufferedReader(
                    new InputStreamReader(websiteURL.openStream(), StandardCharsets.UTF_8));
            System.out.println("Got the reader");
        } catch (IOException e) {
            e.printStackTrace();
            System.out.println("Bye");
            // Offline fallback document.
            return "<html><h1>Heading 1</h1><body><h2>Heading 2</h2><p>hello</p></body></html>";
        }
        // StringBuilder avoids O(n^2) String concatenation in the read loop.
        StringBuilder result = new StringBuilder();
        try {
            String inputLine;
            while ((inputLine = reader.readLine()) != null) {
                result.append(inputLine);
            }
        } finally {
            reader.close(); // close even if readLine() throws
        }
        System.out.println("Made the html file");
        return result.toString();
    }

    /**
     * Entry point: configures the HTTP proxy, downloads the page and prints
     * each section heading together with its paragraph data.
     *
     * @param argv all the command line parameters (unused)
     */
    public static void main(String[] argv) throws MalformedURLException, IOException, Exception {
        // Proxy properties only take effect with the "http." prefix; the bare
        // "proxyHost"/"proxyPort" names are ignored by the URL stack.
        System.setProperty("http.proxyHost", "172.16.0.3");
        System.setProperty("http.proxyPort", "8383");
        System.out.println("Sending url");
        // a html file or any url place here ------------------------------------
        URL url = new URL("put a html file here ");
        Website website = new Website(url);
        System.out.println(url.toString());
        System.out.println("++++++++++++++++++++++++++++++++++++++++++++++++");
        website.getDataWithHeadingsTogether();
    }
}
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public final class Website {
private URL websiteURL ;
private Document httpDoc ;
LinkedHashMap<String, ArrayList<String>> internalLinks =
new LinkedHashMap<String, ArrayList<String>>();
public Website(URL __websiteURL) throws MalformedURLException, IOException, Exception{
if(__websiteURL == null)
throw new Exception();
websiteURL = __websiteURL;
httpDoc = Jsoup.parse(connect());
System.out.println("Parsed the http file to Document");
}
/* Here is my function: i first gets all the internal links in internalLinksElements.
I then get the href name of <a ..> tag so that i can search for it in documnet.
*/
public void getDataWithHeadingsTogether(){
Elements internalLinksElements;
internalLinksElements = httpDoc.select("a[href^=#]");
for(Element element : internalLinksElements){
// some inline links were bad. i only those having span as their child.
Elements spanElements = element.select("span");
if(!spanElements.isEmpty()){
System.out.println("Text(): " + element.text()); // this can not give what i want
// ok i get the href tag name that would be the id
String href = element.attr("href") ;
href = href.replace("#", "");
System.out.println(href);
// selecting the element where we have that id.
Element data = httpDoc.getElementById(href);
// got the span
if(data == null)
continue;
Elements children = new Elements();
// problem is here.
while(children.isEmpty()){
// going to its element unless gets some data.
data = data.parent();
System.out.println(data);
children = data.select("p");
}
// its giving me all the data of file. thats bad.
System.out.println(children.text());
}
}
}
/**
*
* @return String Get all the headings of the document.
* @throws MalformedURLException
* @throws IOException
*/
@SuppressWarnings("CallToThreadDumpStack")
public String connect() throws MalformedURLException, IOException{
// Is this thread safe ? url.openStream();
BufferedReader reader = null;
try{
reader = new BufferedReader( new InputStreamReader(websiteURL.openStream()));
System.out.println("Got the reader");
} catch(Exception e){
e.printStackTrace();
System.out.println("Bye");
String html = "<html><h1>Heading 1</h1><body><h2>Heading 2</h2><p>hello</p></body></html>";
return html;
}
String inputLine, result = new String();
while((inputLine = reader.readLine()) != null){
result += inputLine;
}
reader.close();
System.out.println("Made the html file");
return result;
}
/**
*
* @param argv all the command line parameters.
* @throws MalformedURLException
* @throws IOException
*/
public static void main(String[] argv) throws MalformedURLException, IOException, Exception{
System.setProperty("proxyHost", "172.16.0.3");
System.setProperty("proxyPort","8383");
System.out.println("Sending url");
// a html file or any url place here ------------------------------------
URL url = new URL("put a html file here ");
Website website = new Website(url);
System.out.println(url.toString());
System.out.println("++++++++++++++++++++++++++++++++++++++++++++++++");
website.getDataWithHeadingsTogether();
}
}
我认为您需要了解:您正在查找的带 id 的 span 是标题元素(h1–h6)的子元素,而您要存储的数据由该标题之后的同级节点组成。因此,您需要先定位该标题元素,然后依次收集其后的同级节点作为该标题的数据;当同级节点用完,或遇到另一个标题元素时,就停止收集,因为另一个标题表示下一项数据的开始。—— 您是说您的 HTML 文档里有内部锚点(如 a href="#xxx"),每个锚点后面都有一些您想从中取值的标记?如果是,请尽可能贴出实际的 HTML。另外,getElementById(href) 不适用于旧式内部锚点:旧式锚点使用 name 属性,而不是 id 属性。@Paul 请查看我的 html 文件,它是 wikipedia 页面的典型源文件。link[我假设是这个 wikipedia 页面,它有子标题(硬件解决方案和软件解决方案)。你想如何处理这些?@Paul 我只想要 Map。我可以维护另一个列表,它可以告诉层次结构。有没有办法判断一个元素是标题标记还是别的什么?
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
// NOTE(review): this is the code quoted in the answer, kept verbatim.
// Known defects discussed in the thread are marked inline below.
public final class Website {
// Page to download; set once in the constructor.
private URL websiteURL ;
// Parsed DOM of the downloaded page.
private Document httpDoc ;
// heading text -> section data; declared but never populated in this version.
LinkedHashMap<String, ArrayList<String>> internalLinks =
new LinkedHashMap<String, ArrayList<String>>();
// Downloads and parses the page; throws a bare Exception on a null URL.
public Website(URL __websiteURL) throws MalformedURLException, IOException, Exception{
if(__websiteURL == null)
throw new Exception();
websiteURL = __websiteURL;
httpDoc = Jsoup.parse(connect());
System.out.println("Parsed the http file to Document");
}
/* Here is my function: i first gets all the internal links in internalLinksElements.
I then get the href name of <a ..> tag so that i can search for it in documnet.
*/
public void getDataWithHeadingsTogether(){
Elements internalLinksElements;
// All anchors whose href starts with '#', i.e. in-page links.
internalLinksElements = httpDoc.select("a[href^=#]");
for(Element element : internalLinksElements){
// some inline links were bad. i only those having span as their child.
Elements spanElements = element.select("span");
if(!spanElements.isEmpty()){
System.out.println("Text(): " + element.text()); // this can not give what i want
// ok i get the href tag name that would be the id
String href = element.attr("href") ;
href = href.replace("#", "");
System.out.println(href);
// selecting the element where we have that id.
Element data = httpDoc.getElementById(href);
// got the span
if(data == null)
continue;
Elements children = new Elements();
// problem is here.
// NOTE(review): climbing parent() until select("p") matches eventually
// reaches <body>, so this selects every paragraph in the document — the
// bug the question is about. Collecting the heading's following siblings
// (stopping at the next heading) would scope the data to one section.
while(children.isEmpty()){
// going to its element unless gets some data.
data = data.parent();
System.out.println(data);
children = data.select("p");
}
// its giving me all the data of file. thats bad.
System.out.println(children.text());
}
}
}
/**
*
* @return String Get all the headings of the document.
* @throws MalformedURLException
* @throws IOException
*/
@SuppressWarnings("CallToThreadDumpStack")
public String connect() throws MalformedURLException, IOException{
// Is this thread safe ? url.openStream();
BufferedReader reader = null;
try{
// NOTE(review): uses the platform default charset; pages served as UTF-8
// may be mis-decoded. An explicit charset would be safer.
reader = new BufferedReader( new InputStreamReader(websiteURL.openStream()));
System.out.println("Got the reader");
} catch(Exception e){
e.printStackTrace();
System.out.println("Bye");
// Offline fallback document returned when the URL cannot be opened.
String html = "<html><h1>Heading 1</h1><body><h2>Heading 2</h2><p>hello</p></body></html>";
return html;
}
// NOTE(review): String += in a loop is O(n^2); StringBuilder would be linear.
// The reader is also not closed if readLine() throws.
String inputLine, result = new String();
while((inputLine = reader.readLine()) != null){
result += inputLine;
}
reader.close();
System.out.println("Made the html file");
return result;
}
/**
*
* @param argv all the command line parameters.
* @throws MalformedURLException
* @throws IOException
*/
public static void main(String[] argv) throws MalformedURLException, IOException, Exception{
// NOTE(review): the Java URL stack reads "http.proxyHost"/"http.proxyPort";
// the bare names set here are ignored, so the proxy is never applied.
System.setProperty("proxyHost", "172.16.0.3");
System.setProperty("proxyPort","8383");
System.out.println("Sending url");
// a html file or any url place here ------------------------------------
URL url = new URL("put a html file here ");
Website website = new Website(url);
System.out.println(url.toString());
System.out.println("++++++++++++++++++++++++++++++++++++++++++++++++");
website.getDataWithHeadingsTogether();
}
}