Java 使用jsoup从html获取URL
我正试图用jsoup获取一个url,以便从该url下载一张图片(image),但由于某些原因它不起作用。我首先试图找到 div class="rg_di" 第一次出现在html文件中的位置,然后获取紧随其后的url(标签:java, android, jsoup):
a href="http://www.google.co.il/imgres?imgurl=http://michellepicker.files.wordpress.com/2011/03/grilled-chicken-mexican-style.jpg&imgrefurl=http://michellepicker.wordpress.com/2011/04/25/grilled-chicken-mexican-style-black-beans-guacamole/&h=522&w=700&tbnid=4hXCtCfljxmJXM:&zoom=1&docid=ajIrwZMUrP5_GM&ei=iVOqVPmDDYrnaJzYgIAM&tbm=isch"
这是html的url:
查看来源:
以下是我尝试的代码:
try
{
    // Download and parse the search-result page.
    doc = Jsoup.connect(url).get();
    // First result container; first() returns null when the HTML has no div.rg_di.
    Element link = doc.select("div.rg_di").first();
    // First anchor inside that container, or null when either lookup finds nothing.
    Element link2 = (link != null) ? link.select("a").first() : null;
    if (link2 != null)
    {
        // Read the href from the anchor itself. The previous code asked the
        // enclosing div for "abs:href"; a div carries no href, so the result was "".
        tmpResult = link2.attr("abs:href");
    }
    else
    {
        Log.e("Error", "No anchor inside div.rg_di found at " + url);
    }
}
catch (Exception e)
{
    // Use the throwable overload: e.getMessage() may be null, and this form
    // also records the stack trace in the log.
    Log.e("Error", "Failed to extract link from " + url, e);
}
完整活动代码:
package com.androidbegin.parselogintutorial;
import com.androidbegin.parselogintutorial.SingleRecipe.urlTask;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.koushikdutta.urlimageviewhelper.sample.UrlImageViewHelperSample;
import com.parse.GetCallback;
import com.parse.ParseException;
import com.parse.ParseObject;
import com.parse.ParseQuery;
import com.parse.ParseUser;
public class Bla extends Activity
{
ImageView iv,bm;
TextView recipeTitle;
String urlForImage = "";
@Override
protected void onCreate(Bundle savedInstanceState)
{
// TODO Auto-generated method stub
super.onCreate(savedInstanceState);
setContentView(R.layout.bla_layout);
new urlTask("grilled mexican chicken").execute("grilled mexican chicken");
//new DownloadImageTask((ImageView)findViewById(R.id.RecipeImage)).execute(urlForImage);
}
public class DownloadImageTask extends AsyncTask<String, Void, Bitmap>
{
ImageView bmImage;
public DownloadImageTask(ImageView bmImage) {
this.bmImage = bmImage;
}
protected Bitmap doInBackground(String... urls)
{
String urldisplay = urls[0];
Bitmap mIcon11 = null;
try
{
InputStream in = new java.net.URL(urldisplay).openStream();
mIcon11 = BitmapFactory.decodeStream(in);
in.close();
}
catch (Exception e)
{
Log.e("Error", e.getMessage());
e.printStackTrace();
}
return mIcon11;
}
protected void onPostExecute(Bitmap result)
{
bmImage.setImageBitmap(result);
}
}
public class urlTask extends AsyncTask<String, Void, String>
{
String str;
public urlTask(String str)
{
this.str = str;
}
String tmpResult = str;
Document doc;
protected String doInBackground(String... urls)
{
String urldisplay = urls[0];
String url = "https://www.google.co.il/search?q=grilled+mexican+chicken&es_sm=93&source=lnms&tbm=isch&sa=X&ei=h1OqVOH6B5bjaqGogvAP&ved=0CAgQ_AUoAQ&biw=1920&bih=955";
WebClient webClient = new WebClient(BrowserVersion.FIREFOX_24); // Chrome not working
HtmlPage page = null;
try
{
page = webClient.getPage(url);
} catch (FailingHttpStatusCodeException e1)
{
// TODO Auto-generated catch block
e1.printStackTrace();
}
catch (MalformedURLException e1)
{
// TODO Auto-generated catch block
e1.printStackTrace();
}
catch (IOException e1)
{
// TODO Auto-generated catch block
e1.printStackTrace();
}
try
{
Document doc = Jsoup.parse(page.asXml());
Elements divs = doc.select(".rg_di");
for(Element div : divs)
{
Element img = div.select("a").get(0);
String link = img.attr("href");
System.out.println(link);
}
}
catch (Exception e)
{
e.printStackTrace();
}
return tmpResult;
}
protected void onPostExecute(String result)
{
result = tmpResult;
urlForImage = tmpResult;
}
}
}
package com.androidbegin.parselogintutorial;
导入com.androidbegin.parselogintutorial.SingleRecipe.urlstask;
导入com.gargoylesoftware.htmlunit.BrowserVersion;
导入com.gargoylesoftware.htmlunit.failinghttpstatuscodecoexception;
导入com.gargoylesoftware.htmlunit.WebClient;
导入com.gargoylesoftware.htmlunit.html.HtmlPage;
导入com.koushikdutta.urlmageviewHelper.sample.urlmageviewHelperSample;
导入com.parse.GetCallback;
导入com.parse.ParseException;
导入com.parse.ParseObject;
导入com.parse.ParseQuery;
导入com.parse.ParseUser;
公共类Bla扩展活动
{
ImageView iv,bm;
文本视图往复;
字符串urlForImage=“”;
@凌驾
创建时受保护的void(Bundle savedInstanceState)
{
//TODO自动生成的方法存根
super.onCreate(savedInstanceState);
setContentView(R.layout.bla_layout);
新urlTask(“墨西哥烤鸡”)。执行(“墨西哥烤鸡”);
//新下载的ImageTask((ImageView)findViewById(R.id.RecipeImage)).execute(urlForImage);
}
公共类DownloadImageTask扩展了AsyncTask
{
图像视图bmImage;
公共下载ImageTask(ImageView bmImage){
this.bmImage=bmImage;
}
受保护位图doInBackground(字符串…URL)
{
字符串urldisplay=url[0];
位图mIcon11=null;
尝试
{
InputStream in=newjava.net.URL(urldisplay.openStream();
mIcon11=BitmapFactory.decodeStream(in);
in.close();
}
捕获(例外e)
{
Log.e(“Error”,e.getMessage());
e、 printStackTrace();
}
返回mIcon11;
}
受保护的void onPostExecute(位图结果)
{
bmImage.setImageBitmap(结果);
}
}
公共类urlTask扩展了AsyncTask
{
字符串str;
公共URL任务(字符串str)
{
this.str=str;
}
字符串tmpResult=str;
文件文件;
受保护的字符串doInBackground(字符串…URL)
{
字符串urldisplay=url[0];
字符串url=”https://www.google.co.il/search?q=grilled+墨西哥+鸡肉&es_sm=93&source=lnms&tbm=isch&sa=X&ei=h1OqVOH6B5bjaqGogvAP&ved=0CAgQ_AUoAQ&biw=1920&bih=955”;
WebClient WebClient=新的WebClient(BrowserVersion.FIREFOX_24);//Chrome不工作
HtmlPage=null;
尝试
{
page=webClient.getPage(url);
}捕获(失败TTPStatusCodeException e1)
{
//TODO自动生成的捕捉块
e1.printStackTrace();
}
捕获(格式错误的异常e1)
{
//TODO自动生成的捕捉块
e1.printStackTrace();
}
捕获(IOE1异常)
{
//TODO自动生成的捕捉块
e1.printStackTrace();
}
尝试
{
Document doc=Jsoup.parse(page.asXml());
Elements divs=doc.select(“.rg_di”);
用于(元素div:divs)
{
元素img=div.select(“a”).get(0);
字符串链接=img.attr(“href”);
System.out.println(链接);
}
}
捕获(例外e)
{
e、 printStackTrace();
}
返回tmpResult;
}
受保护的void onPostExecute(字符串结果)
{
结果=tMPpreslt;
urlForImage=tmpResult;
}
}
}
感谢您的帮助。我编辑了您的代码以消除403错误。不要这样写:
doc = Jsoup.connect(url).get();
写下:
doc = Jsoup.connect(url).userAgent("Mozilla").get();
这似乎是动态生成的。因此,Jsoup获取的html不包含 .rg_di 类
doc.select("div.rg_di").first();
返回null,于是我们得到NullPointerException
jsoup下载的html代码段
<img height="104" src="https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcT-pctOxpuUcdq118aFU3s2miRfUa6Ev8eF-UsxARHV-vbcOUV8byEtt2YT" width="140">
有许多解析动态内容的解决方案
编辑1
我实现了htmlunit来呈现页面
import java.io.IOException;
import java.net.MalformedURLException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
/**
 * Loads an image-search result page with HtmlUnit, parses the rendered markup
 * with jsoup and prints the href of the first anchor inside every
 * {@code .rg_di} element.
 */
public class Main {
    /**
     * @param args unused
     * @throws FailingHttpStatusCodeException propagated from {@code WebClient.getPage}
     * @throws MalformedURLException propagated from {@code WebClient.getPage}
     * @throws IOException propagated from {@code WebClient.getPage}
     */
    public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException {
        String url = "https://www.google.co.il/search?q=grilled+mexican+chicken&es_sm=93&source=lnms&tbm=isch&sa=X&ei=h1OqVOH6B5bjaqGogvAP&ved=0CAgQ_AUoAQ&biw=1920&bih=955";
        WebClient webClient = new WebClient(BrowserVersion.FIREFOX_24); // Chrome not working
        try {
            HtmlPage page = webClient.getPage(url);
            try {
                Document doc = Jsoup.parse(page.asXml());
                Elements divs = doc.select(".rg_di");
                for (Element div : divs) {
                    // first() returns null for a div without an anchor; get(0)
                    // would throw and end the loop at the first such div.
                    Element anchor = div.select("a").first();
                    if (anchor == null) {
                        continue;
                    }
                    String link = anchor.attr("href");
                    System.out.println(link);
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        } finally {
            // Release the windows and resources held by the HtmlUnit client,
            // including when getPage throws.
            webClient.closeAllWindows();
        }
    }
}
htmlunit有自己的html解析api,但我将坚持使用更直观的jsoup
编辑2
如果您的目标是在Android设备上呈现和解析HTML页面,那么HtmlUnit并不是一个好的选择
HtmlUnit使用Android上不可用的Java类。
除此之外,HtmlUnit还使用了许多其他库,其中一些库可能又有自己的依赖关系。因此,尽管HtmlUnit非常棒,但我认为让它在Android上运行可能并不容易
- 你可以尝试一种解决方法。或
- 你可以折磨自己,尝试解决办法(最好不要)。或
- 如果考虑guy的经验,重新设计软件架构会更好:
- 创建呈现网页并对其进行解析的java服务器。HTMLUnit+Jsoup
- 将解析后的数据以JSON格式保存在服务器的文件系统中。Gson
- 创建servlet,在android应用程序请求JSON文件时发送该文件
import java.io.IOException;
import java.net.MalformedURLException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
/**
 * Server-side entry point: renders an image-search result page with HtmlUnit,
 * parses the markup with jsoup and prints the href of the first anchor inside
 * every {@code .rg_di} element.
 */
public class Main {
    /**
     * @param args unused
     * @throws FailingHttpStatusCodeException propagated from {@code WebClient.getPage}
     * @throws MalformedURLException propagated from {@code WebClient.getPage}
     * @throws IOException propagated from {@code WebClient.getPage}
     */
    public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException {
        String url = "https://www.google.co.il/search?q=grilled+mexican+chicken&es_sm=93&source=lnms&tbm=isch&sa=X&ei=h1OqVOH6B5bjaqGogvAP&ved=0CAgQ_AUoAQ&biw=1920&bih=955";
        WebClient webClient = new WebClient(BrowserVersion.FIREFOX_24); // Chrome not working
        try {
            HtmlPage page = webClient.getPage(url);
            try {
                Document doc = Jsoup.parse(page.asXml());
                Elements divs = doc.select(".rg_di");
                for (Element div : divs) {
                    // Skip containers without an anchor instead of letting get(0)
                    // throw and terminate the loop early.
                    Element anchor = div.select("a").first();
                    if (anchor != null) {
                        String link = anchor.attr("href");
                        System.out.println(link);
                    }
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        } finally {
            // Always release the HtmlUnit client, even when getPage fails.
            webClient.closeAllWindows();
        }
    }
}