Java 如何从网站获取HtmlElements
我正在尝试从网站获取 URL 和 HTML 元素。URL 和 HTML 都能取到,但当一个页面包含多个同类元素(例如多个 input 元素或多个 textarea 元素)时,我只能得到最后一个元素。代码如下所示(标签:java、jsoup)。
GetURLsAndElemens.java
/**
 * Entry point: loads the crawler settings, collects the links found on the
 * page named by the {@code MAIN_URL} property, and passes them to
 * {@code GettingHTMLElements.getHTMLElements}.
 *
 * @param args command-line arguments (unused)
 * @throws FileNotFoundException if the properties file does not exist
 * @throws IOException if the properties file cannot be read, or if
 *         {@code getHTMLElements} reports an I/O failure
 * @throws ParseException kept from the original signature; nothing in this
 *         method body throws it directly
 */
public static void main(String[] args) throws FileNotFoundException,
        IOException, ParseException {
    Properties properties = new Properties();
    // try-with-resources closes the stream even when load() throws.
    try (FileInputStream in = new FileInputStream(
            "src//io//servicely//ci//plugin//SeleniumResources.properties")) {
        properties.load(in);
    }
    Map<String, String> urls = gettingUrls(properties.getProperty("MAIN_URL"));
    GettingHTMLElements.getHTMLElements(urls);
}
/**
 * Fetches {@code mainURL}, records that page's form elements through
 * {@code GettingHTMLElements.getInputElements}, and collects every
 * hyperlink found on the page.
 *
 * @param mainURL address of the page to scan; must include the protocol
 * @return map from absolute link URL to the link's text; empty when the
 *         page could not be fetched or contains no http(s) links
 */
public static Map<String, String> gettingUrls(String mainURL) {
    Map<String, String> urlsList = new HashMap<String, String>();
    try {
        System.out.println("Main URL " + mainURL);
        Document doc = Jsoup.connect(mainURL).get();
        GettingHTMLElements.getInputElements(doc, mainURL);
        for (Element link : doc.select("a[href]")) {
            // absUrl() resolves the href against the document's base URI and
            // yields "" when no absolute URL can be formed. This replaces the
            // contains("http") test plus string concatenation, which produced
            // invalid URLs for relative paths and misclassified relative
            // links that merely contained "http" somewhere in them.
            String target = link.absUrl("href");
            // Only http(s) targets can be fetched later, so drop mailto: etc.
            if (target.startsWith("http://") || target.startsWith("https://")) {
                urlsList.put(target, link.text());
            }
        }
    } catch (IOException e) {
        // Best effort: report the failure and return whatever was collected.
        e.printStackTrace();
    }
    return urlsList;
}
publicstaticvoidmain(字符串[]args)抛出FileNotFoundException,
IOException,ParseException{
属性=新属性();
性质
.load(新文件输入流)(
“src//io//servicely//ci//plugin//SeleniumResources.properties”);
映射URL=gettingUrls(属性
.getProperty(“主URL”);
GettingHTMLElements.getHTMLElements(URL);
//.out.println(url.size());
//System.out.println(URL);
}
公共静态映射获取URL(字符串mainURL){
单据单据=空;
Map urlsList=newhashmap();
试一试{
System.out.println(“主URL”+mainURL);
//需要http协议吗
doc=Jsoup.connect(mainURL.get();
GettingHTMLElements.getInputElements(doc,mainURL);
//获取页面标题
//字符串title=doc.title();
//System.out.println(“标题:”+标题);
//获取所有链接
Elements links=doc.select(“a[href]”);
用于(元素链接:链接){
//urlsList.clear();
//从href属性获取值并添加到列表
if(link.attr(“href”).包含(“http”)){
urlsList.put(link.attr(“href”)、link.text();
}否则{
urlsList.put(mainURL+link.attr(“href”)、link.text();
}
//System.out.println(urlsList);
}
}捕获(IOE异常){
e、 printStackTrace();
}
//System.out.println(“总URL为”+urlsList.size());
//System.out.println(urlsList);
返回URL列表;
}
GettingHtmlElements.java
// Shared result store: page URL -> the HtmlElements bean built for that page.
// Written by getInputElements (urlList.put) and iterated by getElements.
static Map<String, HtmlElements> urlList = new HashMap<String, HtmlElements>();
/**
 * Public entry point; forwards the collected links to {@code getElements}.
 *
 * @param urls map whose keys are the page URLs to scan
 * @throws IOException declared by {@code getElements}
 */
public static void getHTMLElements(Map<String, String> urls) throws IOException {
    getElements(urls);
}
/**
 * Downloads every page listed in {@code urls}, extracts its elements with
 * {@code getInputElements}, then prints the URL and stored input name of
 * each entry in {@code urlList}.
 *
 * @param urls map whose keys are the page URLs to fetch; values are not used
 * @throws IOException declared for callers; failures for individual pages
 *         are caught and printed so the remaining pages are still processed
 */
public static void getElements(Map<String, String> urls) throws IOException {
    for (String pageUrl : urls.keySet()) {
        try {
            System.out.println(pageUrl);
            getInputElements(Jsoup.connect(pageUrl).get(), pageUrl);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    // Report what has been stored for each page.
    for (Map.Entry<String, HtmlElements> stored : urlList.entrySet()) {
        System.out.println("url is " + stored.getKey());
        System.out.println("input name " + stored.getValue().getInput_name());
    }
}
/**
 * Builds an {@code HtmlElements} bean from the {@code input},
 * {@code textarea} and {@code form} tags of {@code doc} and stores it in
 * {@code urlList} under {@code entry1}.
 *
 * NOTE(review): each setter is called once per matching tag on the same
 * bean, so when a page has several inputs/textareas/forms only the
 * attributes of the LAST one of each kind remain. Keeping all of them
 * requires HtmlElements to hold collections (or one bean per element);
 * that type is not visible here, so the behavior is left as is.
 *
 * @param doc parsed page to inspect
 * @param entry1 key (the page URL at the visible call sites) under which
 *        the bean is stored
 * @return the bean that was just built and stored. The original returned
 *         the result of {@code Map.put}, i.e. the previous mapping, which
 *         is {@code null} the first time a URL is seen.
 */
public static HtmlElements getInputElements(Document doc, String entry1) {
    HtmlElements htmlElements = new HtmlElements();

    for (Element input : doc.getElementsByTag("input")) {
        htmlElements.setInput_name(input.attr("name"));
        htmlElements.setInput_type(input.attr("type"));
        htmlElements.setInput_class(input.attr("class"));
    }
    for (Element textArea : doc.getElementsByTag("textarea")) {
        htmlElements.setTextarea_id(textArea.attr("id"));
        htmlElements.setTextarea_name(textArea.attr("name"));
    }
    for (Element form : doc.getElementsByTag("form")) {
        htmlElements.setForm_method(form.attr("method"));
        htmlElements.setForm_action(form.attr("action"));
    }

    urlList.put(entry1, htmlElements);
    return htmlElements;
}
静态映射urlList=newHashMap();
公共静态void getHTMLElements(映射URL)
抛出IOException{
获取元素(URL);
}
公共静态void getElements(映射URL)引发IOException{
对于(Map.Entry entry1:url.entrySet()){
试一试{
System.out.println(entry1.getKey());
Document doc=Jsoup.connect(entry1.getKey()).get();
getInputElements(doc,entry1.getKey());
}
捕获(例外e){
e、 printStackTrace();
}
}
映射列表=URL列表;
对于(Map.Entry entry1:list.entrySet())
{
HtmlElements=entry1.getValue();
System.out.println(“url是”+entry1.getKey());
System.out.println(“输入名称”+ele.getInput_name());
}
}
公共静态HtmleElements getInputElements(文档文档,字符串entry1){
HtmlElements HtmlElements=新HtmlElements();
元素inputElements2=doc.getElementsByTag(“输入”);
元素textAreaElements2=doc.getElementsByTag(“textarea”);
元素formElements3=doc.getElementsByTag(“表单”);
对于(元素inputElement:inputElements2){
字符串键=inputElement.attr(“名称”);
htmlElements.setInput\u名称(键);
字符串key1=inputElement.attr(“类型”);
htmlElements.setInput_类型(键1);
字符串key2=inputElement.attr(“类”);
htmlElements.setInput_类(键2);
}
对于(元素inputElement:textAreaElements2){
字符串键=inputElement.attr(“id”);
htmlElements.setTextarea_id(键);
字符串key1=inputElement.attr(“名称”);
htmlElements.setTextarea_名称(键1);
}
for(元素inputElement:formElements3){
字符串键=inputElement.attr(“方法”);
htmlElements.setForm_方法(键);
字符串key1=inputElement.attr(“操作”);
htmlElements.setForm_动作(键1);
}
返回urlist.put(entry1,htmlElements);
}
我想把这些元素保存为 bean。对于每个 URL,我都能得到该 URL 及其 HTML 元素;但是,当页面包含多个元素时,我只得到最后一个元素。

您使用了一个类
HtmlElements
,据我所知,它并不是 jsoup 的一部分。我不了解它的内部实现,但我猜它是某种 HTML 节点列表之类的东西。
但是,您似乎是这样使用这个类的:
HtmlElements htmlElements = new HtmlElements();
htmlElements.setInput_name(key);
这说明 htmlElements 变量中只保存了一个 HTML 元素。这也解释了为什么最终只剩下最后一个元素——循环中一直在覆盖同一个实例。
由于我不了解 HtmlElements 类,这里无法给出确切的写法。假设存在一个 HtmlElement 类,用来表示 HtmlElements 中的单个条目,并且 HtmlElements 有一个 add 方法,那么类似下面的写法也许可行:
HtmlElements htmlElements = new HtmlElements();
...
for (Element inputElement : inputElements2) {
HtmlElement e = new HtmlElement();
htmlElements.add(e);
String key = inputElement.attr("name");
e.setInput_name(key);
}