Warning: file_get_contents(/data/phpspider/zhask/data//catemap/3/html/77.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Java Jsoup Html解析查找内部链接数据时出现问题_Java_Html_Jsoup - Fatal编程技术网

Java Jsoup Html解析查找内部链接数据时出现问题

Java Jsoup Html解析查找内部链接数据时出现问题,java,html,jsoup,Java,Html,Jsoup,通常我们在一个文件中有许多内部链接。我想解析一个html文件,以便在地图中获得页面的标题及其相应的数据 我所做的步骤: 1) 已获取所有内部参考元素 2) 分析了id=XXX的文档,其中XXX==(元素 ] 强制实行相互排斥 有软件和硬件两种解决方案来强制执行互斥。 不同的解决方案如下所示 []硬件 解决 系统内部实现互斥的常用方法 是 使残废 这是我的密码: import java.io.BufferedReader; import java.io.IOException; import

通常我们在一个文件中有许多内部链接。我想解析一个html文件,以便在地图中获得页面的标题及其相应的数据

我所做的步骤:
1) 已获取所有内部参考元素
2) 分析了id=XXX的文档,其中XXX==(元素
  • ] 强制实行相互排斥 有软件和硬件两种解决方案来强制执行互斥。 不同的解决方案如下所示

    []硬件 解决 系统内部实现互斥的常用方法 是 使残废
这是我的密码:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public final class Website {

    /** URL of the page to download and parse. */
    private final URL websiteURL;
    /** DOM produced by jsoup from the downloaded HTML. */
    private final Document httpDoc;
    /** Heading text -> data paragraphs collected under that heading (insertion order kept). */
    LinkedHashMap<String, ArrayList<String>> internalLinks =
            new LinkedHashMap<String, ArrayList<String>>();

    /**
     * Downloads the page at the given URL and parses it into a jsoup Document.
     *
     * @param websiteURL the page to fetch; must not be {@code null}
     * @throws IOException if the page cannot be downloaded
     * @throws IllegalArgumentException if {@code websiteURL} is {@code null}
     */
    public Website(URL websiteURL) throws MalformedURLException, IOException {
        if (websiteURL == null) {
            // Fail fast with a descriptive unchecked exception instead of a bare `new Exception()`.
            throw new IllegalArgumentException("websiteURL must not be null");
        }
        this.websiteURL = websiteURL;
        httpDoc = Jsoup.parse(connect());

        System.out.println("Parsed the http file to Document");
    }

    /**
     * Walks every internal link (an {@code <a>} whose href starts with '#') in the
     * document, resolves the element carrying the linked id, then climbs toward the
     * root until an ancestor containing {@code <p>} elements is found, and prints
     * that paragraph text.
     */
    public void getDataWithHeadingsTogether() {
        Elements internalLinksElements = httpDoc.select("a[href^=#]");
        for (Element element : internalLinksElements) {
            // Some inline links are noise; keep only those with a <span> child.
            Elements spanElements = element.select("span");
            if (spanElements.isEmpty()) {
                continue;
            }
            System.out.println("Text(): " + element.text());
            // The fragment (text after the leading '#') is the id of the link target.
            // Strip only the leading '#' — replace("#","") would also mangle any
            // '#' that legitimately appears inside the id.
            String href = element.attr("href");
            String id = href.startsWith("#") ? href.substring(1) : href;
            System.out.println(id);
            // Select the element carrying that id.
            Element data = httpDoc.getElementById(id);
            if (data == null) {
                continue; // broken anchor: no element in the document has this id
            }
            Elements children = new Elements();
            // Climb ancestors until one contains <p> data.
            while (children.isEmpty()) {
                Element parent = data.parent();
                if (parent == null) {
                    // Reached the document root without finding any <p>.
                    // The original kept calling parent() and threw a NullPointerException here.
                    break;
                }
                data = parent;
                System.out.println(data);
                children = data.select("p");
            }
            System.out.println(children.text());
        }
    }

    /**
     * Downloads the raw HTML of {@code websiteURL}.
     * If the connection cannot be opened, falls back to a small built-in HTML
     * snippet so the caller can still proceed (same best-effort behavior as before).
     *
     * @return the page source as a single string (line breaks preserved)
     * @throws IOException if reading from an already-opened stream fails
     */
    @SuppressWarnings("CallToThreadDumpStack")
    public String connect() throws MalformedURLException, IOException {
        BufferedReader reader;
        try {
            reader = new BufferedReader(new InputStreamReader(websiteURL.openStream()));
            System.out.println("Got the reader");
        } catch (Exception e) {
            // Opening failed: keep the original fallback document instead of aborting.
            e.printStackTrace();
            System.out.println("Bye");
            return "<html><h1>Heading 1</h1><body><h2>Heading 2</h2><p>hello</p></body></html>";
        }
        // try-with-resources closes the reader even if readLine() throws,
        // fixing the resource leak of the original hand-rolled close().
        try (BufferedReader in = reader) {
            StringBuilder result = new StringBuilder(); // O(n) accumulation instead of += on String
            String inputLine;
            while ((inputLine = in.readLine()) != null) {
                // readLine() strips terminators; re-add them so jsoup sees the original layout.
                result.append(inputLine).append('\n');
            }
            System.out.println("Made the html file");
            return result.toString();
        }
    }

    /**
     * Entry point: downloads the configured URL and prints its heading data.
     *
     * @param argv command line parameters (unused)
     * @throws MalformedURLException if the hard-coded URL is malformed
     * @throws IOException if the download fails
     */
    public static void main(String[] argv) throws MalformedURLException, IOException, Exception {
        // The JDK only honors the "http."-prefixed proxy keys; the bare
        // "proxyHost"/"proxyPort" used originally are silently ignored by URL connections.
        System.setProperty("http.proxyHost", "172.16.0.3");
        System.setProperty("http.proxyPort", "8383");
        System.out.println("Sending url");

        // a html file or any url place here ------------------------------------
        URL url = new URL("put a html file here ");
        Website website = new Website(url);

        System.out.println(url.toString());
        System.out.println("++++++++++++++++++++++++++++++++++++++++++++++++");
        website.getDataWithHeadingsTogether();
    }
}
导入java.io.BufferedReader;
导入java.io.IOException;
导入java.io.InputStreamReader;
导入java.net.MalformedURLException;
导入java.net.URL;
导入java.util.ArrayList;
导入java.util.LinkedHashMap;
导入org.jsoup.Jsoup;
导入org.jsoup.nodes.Document;
导入org.jsoup.nodes.Element;
导入org.jsoup.select.Elements;
公开期末考试网站{
私有URL网站URL;
私人文件httpDoc;
LinkedHashMap内部链接=
新建LinkedHashMap();
公共网站(URL\uuu websiteURL)抛出畸形的异常异常,IOException,Exception{
如果(_网站URL==null)
抛出新异常();
websiteURL=uu websiteURL;
httpDoc=Jsoup.parse(connect());
println(“将http文件解析为文档”);
}
/*这是我的函数:我首先获取internalLinksElements中的所有内部链接。
然后我得到标签的href名称，以便在document中搜索它。
*/
public void getDataWithHeadingsTogether(){
元素内部链接选择;
internalLinksElements=httpDoc.select(“a[href^=#]”;
对于(元素:内部链接选择元素){
//一些内联链接不好。我只知道那些有span作为他们孩子的链接。
元素span元素=元素。选择(“span”);
如果(!spanElements.isEmpty()){
System.out.println(“Text():”+element.Text());//这不能满足我的要求
//好的,我得到了href标签名,它将是id
字符串href=element.attr(“href”);
href=href.replace(“#,”);
System.out.println(href);
//选择具有该id的元素。
元素数据=httpDoc.getElementById(href);
//得到了跨度
如果(数据==null)
持续
元素子元素=新元素();
//问题就在这里。
while(children.isEmpty()){
//除非获得一些数据,否则将转到其元素。
data=data.parent();
系统输出打印项次(数据);
子项=数据。选择(“p”);
}
//它给了我文件中的所有数据,这很糟糕。
System.out.println(children.text());
}
}
}
/**
* 
*@return String获取文档的所有标题。
*@抛出错误的DurException
*@抛出异常
*/
@SuppressWarnings(“CallToThreadDumpStack”)
public String connect()引发MalformedURLException,IOException{
//这是线程安全的吗?url.openStream();
BufferedReader reader=null;
试一试{
reader=new BufferedReader(新的InputStreamReader(websiteURL.openStream());
System.out.println(“获得了阅读器”);
}捕获(例外e){
e、 printStackTrace();
System.out.println(“再见”);
String html=“Heading 1Heading 2hello

”; 返回html; } 字符串输入行,结果=新字符串(); 而((inputLine=reader.readLine())!=null){ 结果+=输入线; } reader.close(); System.out.println(“生成html文件”); 返回结果; } /** * *@param argv所有命令行参数。 *@抛出错误的DurException *@抛出异常 */ publicstaticvoidmain(字符串[]argv)引发畸形的DurLexception、IOException、Exception{ System.setProperty(“proxyHost”、“172.16.0.3”); System.setProperty(“proxyPort”、“8383”); System.out.println(“发送url”); //html文件或此处的任何url位置------------------------------------ URL=新URL(“在此处放置html文件”); 网站=新网站(url); System.out.println(url.toString()); System.out.println(“+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++”; website.getDataWithHeadingsTogether(); } }
我认为您需要了解,您正在查找的
是标头元素的子元素,并且您要存储的数据是由该标头的同级组成的


因此,您需要获取
,然后使用来收集作为该
数据的节点。当您的同级数据用完时,或者遇到另一个标头元素时,您需要停止收集数据,因为另一个标头指示下一项数据的开始。

您是说您的HTML文档具有内部锚定,如
,每个锚定后面都有一些您想从中获取一些值的标记?如果是,请尽可能显示一些实际的HTML。另外,使用
getElementById(href)
不应该用作内部锚定,应该使用
name
属性,而不是
id
属性。@Paul请查看我的html文件。它是wikipedia页面的典型源文件。link[我假设是这个wikipedia页面,它有子标题(硬件解决方案和软件解决方案)。你想如何处理这些?@Paul我只想要Map。我可以维护另一个列表,它可以告诉层次结构。有没有办法找到这个元素是标记还是这是什么?
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public final class Website {

    /** URL of the page to download and parse. */
    private final URL websiteURL;
    /** DOM produced by jsoup from the downloaded HTML. */
    private final Document httpDoc;
    /** Heading text -> data paragraphs collected under that heading (insertion order kept). */
    LinkedHashMap<String, ArrayList<String>> internalLinks =
            new LinkedHashMap<String, ArrayList<String>>();

    /**
     * Downloads the page at the given URL and parses it into a jsoup Document.
     *
     * @param websiteURL the page to fetch; must not be {@code null}
     * @throws IOException if the page cannot be downloaded
     * @throws IllegalArgumentException if {@code websiteURL} is {@code null}
     */
    public Website(URL websiteURL) throws MalformedURLException, IOException {
        if (websiteURL == null) {
            // Fail fast with a descriptive unchecked exception instead of a bare `new Exception()`.
            throw new IllegalArgumentException("websiteURL must not be null");
        }
        this.websiteURL = websiteURL;
        httpDoc = Jsoup.parse(connect());

        System.out.println("Parsed the http file to Document");
    }

    /**
     * Walks every internal link (an {@code <a>} whose href starts with '#') in the
     * document, resolves the element carrying the linked id, then climbs toward the
     * root until an ancestor containing {@code <p>} elements is found, and prints
     * that paragraph text.
     */
    public void getDataWithHeadingsTogether() {
        Elements internalLinksElements = httpDoc.select("a[href^=#]");
        for (Element element : internalLinksElements) {
            // Some inline links are noise; keep only those with a <span> child.
            Elements spanElements = element.select("span");
            if (spanElements.isEmpty()) {
                continue;
            }
            System.out.println("Text(): " + element.text());
            // The fragment (text after the leading '#') is the id of the link target.
            // Strip only the leading '#' — replace("#","") would also mangle any
            // '#' that legitimately appears inside the id.
            String href = element.attr("href");
            String id = href.startsWith("#") ? href.substring(1) : href;
            System.out.println(id);
            // Select the element carrying that id.
            Element data = httpDoc.getElementById(id);
            if (data == null) {
                continue; // broken anchor: no element in the document has this id
            }
            Elements children = new Elements();
            // Climb ancestors until one contains <p> data.
            while (children.isEmpty()) {
                Element parent = data.parent();
                if (parent == null) {
                    // Reached the document root without finding any <p>.
                    // The original kept calling parent() and threw a NullPointerException here.
                    break;
                }
                data = parent;
                System.out.println(data);
                children = data.select("p");
            }
            System.out.println(children.text());
        }
    }

    /**
     * Downloads the raw HTML of {@code websiteURL}.
     * If the connection cannot be opened, falls back to a small built-in HTML
     * snippet so the caller can still proceed (same best-effort behavior as before).
     *
     * @return the page source as a single string (line breaks preserved)
     * @throws IOException if reading from an already-opened stream fails
     */
    @SuppressWarnings("CallToThreadDumpStack")
    public String connect() throws MalformedURLException, IOException {
        BufferedReader reader;
        try {
            reader = new BufferedReader(new InputStreamReader(websiteURL.openStream()));
            System.out.println("Got the reader");
        } catch (Exception e) {
            // Opening failed: keep the original fallback document instead of aborting.
            e.printStackTrace();
            System.out.println("Bye");
            return "<html><h1>Heading 1</h1><body><h2>Heading 2</h2><p>hello</p></body></html>";
        }
        // try-with-resources closes the reader even if readLine() throws,
        // fixing the resource leak of the original hand-rolled close().
        try (BufferedReader in = reader) {
            StringBuilder result = new StringBuilder(); // O(n) accumulation instead of += on String
            String inputLine;
            while ((inputLine = in.readLine()) != null) {
                // readLine() strips terminators; re-add them so jsoup sees the original layout.
                result.append(inputLine).append('\n');
            }
            System.out.println("Made the html file");
            return result.toString();
        }
    }

    /**
     * Entry point: downloads the configured URL and prints its heading data.
     *
     * @param argv command line parameters (unused)
     * @throws MalformedURLException if the hard-coded URL is malformed
     * @throws IOException if the download fails
     */
    public static void main(String[] argv) throws MalformedURLException, IOException, Exception {
        // The JDK only honors the "http."-prefixed proxy keys; the bare
        // "proxyHost"/"proxyPort" used originally are silently ignored by URL connections.
        System.setProperty("http.proxyHost", "172.16.0.3");
        System.setProperty("http.proxyPort", "8383");
        System.out.println("Sending url");

        // a html file or any url place here ------------------------------------
        URL url = new URL("put a html file here ");
        Website website = new Website(url);

        System.out.println(url.toString());
        System.out.println("++++++++++++++++++++++++++++++++++++++++++++++++");
        website.getDataWithHeadingsTogether();
    }
}