Java 如何从网站获取HtmlElements

Java 如何从网站获取HtmlElements,java,jsoup,Java,Jsoup,我正在尝试从网站获取url和html元素。可以从网站获取url和html,但当一个url包含多个元素(如多个输入元素(或)多个textarea元素)时,我只能获取最后一个元素。代码如下所示 GetURLsAndElemens.java public static void main(String[] args) throws FileNotFoundException, IOException, ParseException { Prop

我正在尝试从网站获取url和html元素。url和html元素本身都可以获取到,但当一个url对应的页面包含多个同类元素(例如多个input元素或多个textarea元素)时,我只能获取到最后一个元素。代码如下所示:

GetURLsAndElemens.java

/**
 * Entry point: loads the Selenium resource properties, collects the links found on the
 * page configured under {@code MAIN_URL}, and hands them to
 * {@code GettingHTMLElements.getHTMLElements}.
 *
 * @param args command-line arguments (not used)
 * @throws FileNotFoundException if the properties file does not exist
 * @throws IOException if the properties file cannot be read, or if
 *         {@code getHTMLElements} reports one
 * @throws ParseException declared for callers; NOTE(review): no statement visible in this
 *         method throws it
 */
public static void main(String[] args) throws FileNotFoundException,
                IOException, ParseException {

            Properties properties = new Properties();
            // Close the stream even when load() fails, so the file handle is not leaked.
            FileInputStream propertiesStream = new FileInputStream(
                    "src//io//servicely//ci//plugin//SeleniumResources.properties");
            try {
                properties.load(propertiesStream);
            } finally {
                propertiesStream.close();
            }
            Map<String, String> urls = gettingUrls(properties
                    .getProperty("MAIN_URL"));
            GettingHTMLElements.getHTMLElements(urls);
        }

        /**
         * Fetches {@code mainURL}, passes the parsed page to
         * {@code GettingHTMLElements.getInputElements}, and collects every link on the page.
         *
         * <p>Each {@code href} is resolved against the document's base URI with jsoup's
         * {@code absUrl}, so relative links such as {@code /path}, {@code ../x} or
         * {@code page.html} become valid absolute URLs instead of being glued onto
         * {@code mainURL}. Links that cannot be resolved to an absolute URL are skipped.
         *
         * @param mainURL address of the page to fetch; must include the protocol
         * @return map from absolute link URL to the link's text; if fetching the page fails
         *         with an {@code IOException} the stack trace is printed and the links
         *         gathered so far (none) are returned
         */
        public static Map<String, String> gettingUrls(String mainURL) {
            Map<String, String> urlsList = new HashMap<String, String>();
            try {
                System.out.println("Main URL " + mainURL);

                // need http protocol
                Document doc = Jsoup.connect(mainURL).get();
                GettingHTMLElements.getInputElements(doc, mainURL);

                // get all links
                Elements links = doc.select("a[href]");
                for (Element link : links) {
                    // absUrl returns "" when the href cannot be made absolute.
                    String absoluteUrl = link.absUrl("href");
                    if (absoluteUrl.length() == 0) {
                        continue;
                    }
                    urlsList.put(absoluteUrl, link.text());
                }

            } catch (IOException e) {
                // Best effort: report the failure and return what was collected.
                e.printStackTrace();
            }
            return urlsList;
        }
public static void main(String[] args) throws FileNotFoundException,
        IOException, ParseException {
    Properties properties = new Properties();
    properties
            .load(new FileInputStream(
                    "src//io//servicely//ci//plugin//SeleniumResources.properties"));
    Map<String, String> urls = gettingUrls(properties
            .getProperty("MAIN_URL"));
    GettingHTMLElements.getHTMLElements(urls);
    // .out.println(urls.size());
    // System.out.println(urls);
}
public static Map<String, String> gettingUrls(String mainURL) {
    Document doc = null;
    Map<String, String> urlsList = new HashMap<String, String>();
    try {
        System.out.println("Main URL " + mainURL);
        // need http protocol
        doc = Jsoup.connect(mainURL).get();
        GettingHTMLElements.getInputElements(doc, mainURL);
        // get page title
        // String title = doc.title();
        // System.out.println("title : " + title);
        // get all links
        Elements links = doc.select("a[href]");
        for (Element link : links) {
            // urlsList.clear();
            // get the value from href attribute and adding to list
            if (link.attr("href").contains("http")) {
                urlsList.put(link.attr("href"), link.text());
            } else {
                urlsList.put(mainURL + link.attr("href"), link.text());
            }
            // System.out.println(urlsList);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    // System.out.println("Total urls are "+urlsList.size());
    // System.out.println(urlsList);
    return urlsList;
}
GettingHtmlElements.java

// Shared registry filled by getInputElements: maps the URL key passed to that method to the
// HtmlElements bean built for it. Entries are added on every call and never cleared here.
static Map<String, HtmlElements> urlList = new HashMap<String, HtmlElements>();

    /**
     * Public entry point for scraping: forwards the given URL map unchanged to
     * {@code getElements}.
     *
     * @param urls map whose keys are the page URLs to fetch ({@code getElements} does not
     *             read the values)
     * @throws IOException declared for callers; propagated from {@code getElements}
     */
    public static void getHTMLElements(Map<String, String> urls)
            throws IOException {

        getElements(urls);

    }

    /**
     * Downloads every page whose URL is a key of {@code urls}, hands each parsed document
     * to {@code getInputElements}, and afterwards prints every URL recorded in
     * {@code urlList} together with the input name stored for it.
     *
     * <p>A failure on one page is printed and does not stop the remaining pages.
     *
     * @param urls map whose keys are the page URLs to fetch; the values are not read
     * @throws IOException declared for callers; failures while fetching are caught and
     *         printed inside the loop
     */
    public static void getElements(Map<String, String> urls) throws IOException {
        for (String pageUrl : urls.keySet()) {
            try {
                System.out.println(pageUrl);
                Document page = Jsoup.connect(pageUrl).get();
                getInputElements(page, pageUrl);
            } catch (Exception failure) {
                failure.printStackTrace();
            }
        }

        // Report everything collected so far, one URL / input-name pair per entry.
        for (Map.Entry<String, HtmlElements> recorded : urlList.entrySet()) {
            HtmlElements bean = recorded.getValue();
            System.out.println("url is " + recorded.getKey());
            System.out.println("input name " + bean.getInput_name());
        }
    }

    /**
     * Builds an {@code HtmlElements} bean from the {@code input}, {@code textarea} and
     * {@code form} tags of {@code doc} and stores it in {@code urlList} under
     * {@code entry1}.
     *
     * <p>Known limitation: every tag of the same kind calls the same setters on the same
     * bean, so presumably only the attributes of the last tag of each kind survive.
     * Keeping all of them would require {@code HtmlElements} to hold a collection.
     *
     * @param doc    parsed page to inspect
     * @param entry1 key (the page URL at the visible call sites) under which the bean is stored
     * @return the bean that was just built and stored; the previous code returned the
     *         result of {@code Map.put}, i.e. the old mapping, which is {@code null} the
     *         first time a key is stored
     */
    public static HtmlElements getInputElements(Document doc, String entry1) {

        HtmlElements htmlElements = new HtmlElements();
        Elements inputElements2 = doc.getElementsByTag("input");
        Elements textAreaElements2 = doc.getElementsByTag("textarea");
        Elements formElements3 = doc.getElementsByTag("form");

        for (Element inputElement : inputElements2) {
            htmlElements.setInput_name(inputElement.attr("name"));
            htmlElements.setInput_type(inputElement.attr("type"));
            htmlElements.setInput_class(inputElement.attr("class"));
        }
        for (Element textAreaElement : textAreaElements2) {
            htmlElements.setTextarea_id(textAreaElement.attr("id"));
            htmlElements.setTextarea_name(textAreaElement.attr("name"));
        }
        for (Element formElement : formElements3) {
            htmlElements.setForm_method(formElement.attr("method"));
            htmlElements.setForm_action(formElement.attr("action"));
        }

        // Map.put returns the PREVIOUS value for the key, so store first and return the
        // freshly built bean explicitly.
        urlList.put(entry1, htmlElements);
        return htmlElements;

    }
static Map<String, HtmlElements> urlList = new HashMap<String, HtmlElements>();
public static void getHTMLElements(Map<String, String> urls)
        throws IOException {
    getElements(urls);
}
public static void getElements(Map<String, String> urls) throws IOException {
    for (Map.Entry<String, String> entry1 : urls.entrySet()) {
        try {
            System.out.println(entry1.getKey());
            Document doc = Jsoup.connect(entry1.getKey()).get();
            getInputElements(doc, entry1.getKey());
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }
    Map<String,HtmlElements> list = urlList;
    for(Map.Entry<String,HtmlElements> entry1:list.entrySet())
    {
        HtmlElements ele = entry1.getValue();
        System.out.println("url is "+entry1.getKey());
        System.out.println("input name "+ele.getInput_name());
    }
}
public static HtmlElements getInputElements(Document doc, String entry1) {
    HtmlElements htmlElements = new HtmlElements();
    Elements inputElements2 = doc.getElementsByTag("input");
    Elements textAreaElements2 = doc.getElementsByTag("textarea");
    Elements formElements3 = doc.getElementsByTag("form");
    for (Element inputElement : inputElements2) {
        String key = inputElement.attr("name");
        htmlElements.setInput_name(key);
        String key1 = inputElement.attr("type");
        htmlElements.setInput_type(key1);
        String key2 = inputElement.attr("class");
        htmlElements.setInput_class(key2);
    }
    for (Element inputElement : textAreaElements2) {
        String key = inputElement.attr("id");
        htmlElements.setTextarea_id(key);
        String key1 = inputElement.attr("name");
        htmlElements.setTextarea_name(key1);
    }
    for (Element inputElement : formElements3) {
        String key = inputElement.attr("method");
        htmlElements.setForm_method(key);
        String key1 = inputElement.attr("action");
        htmlElements.setForm_action(key1);
    }
    return urlList.put(entry1, htmlElements);
}

我想把这些元素保存为一个bean。对于每个url,我都能得到该url及其html元素。但是,当一个url包含多个同类元素时,我只能得到最后一个元素。

您使用了一个类
HtmlElements
,据我所知,它不是JSoup的一部分。我不了解它的内部实现,但我猜它是某种html节点列表之类的东西。

但是,您似乎是这样使用这个类的:

HtmlElements htmlElements = new HtmlElements();
htmlElements.setInput_name(key);
这表明htmlElements变量中只存储了一个html元素。这就解释了为什么只保留了最后一个元素——您一直在覆盖同一个实例。

这一点我不能完全确定,因为我不了解
HtmlElements
类。假设
HtmlElement
表示
HtmlElements
中的单个元素,并且
HtmlElements
有一个
add
方法,那么类似下面的写法也许可行:

HtmlElements htmlElements = new HtmlElements();
...
for (Element inputElement : inputElements2) {
  HtmlElement e = new HtmlElement();
  htmlElements.add(e);
  String key = inputElement.attr("name");
  e.setInput_name(key);
}