Java 使用jsoup从html获取URL

Java 使用jsoup从html获取URL,java,android,jsoup,Java,Android,Jsoup,我正试图用jsoup获取一个url,以便从该url下载一个imadge,但由于某些原因它不起作用 我首先试图找到“div class=“rg_di”第一次出现在html文件中的位置, 而不是获取紧随其后的url: a href="http://www.google.co.il/imgres?imgurl=http://michellepicker.files.wordpress.com/2011/03/grilled-chicken-mexican-style.jpg&imgre

我正试图用jsoup获取一个url,以便从该url下载一张图片,但由于某些原因它不起作用

我首先尝试找到 div class="rg_di" 在html文件中第一次出现的位置,然后获取紧随其后的url:

a href="http://www.google.co.il/imgres?imgurl=http://michellepicker.files.wordpress.com/2011/03/grilled-chicken-mexican-style.jpg&imgrefurl=http://michellepicker.wordpress.com/2011/04/25/grilled-chicken-mexican-style-black-beans-guacamole/&h=522&w=700&tbnid=4hXCtCfljxmJXM:&zoom=1&docid=ajIrwZMUrP5_GM&ei=iVOqVPmDDYrnaJzYgIAM&tbm=isch"
这是html的url:

查看来源:

以下是我尝试的代码:

try 
        {
            // Fetch and parse the results page. A browser-like user agent is sent because
            // the request made with jsoup's default agent was answered with HTTP 403.
            doc = Jsoup.connect(url).userAgent("Mozilla").get();

            // first() returns null when nothing matches (for example when the markup is
            // produced by JavaScript and is absent from the raw HTML), so guard each step.
            Element resultDiv = doc.select("div.rg_di").first();
            Element anchor = (resultDiv != null) ? resultDiv.select("a").first() : null;
            if (anchor != null)
            {
                // Read the attribute from the <a> element itself. The previous code asked
                // the enclosing <div> for "abs:href", which is not expected to carry an
                // href and therefore yielded an empty string.
                tmpResult = anchor.attr("abs:href");
            }
            else
            {
                Log.e("Error", "No <a> found inside div.rg_di for " + url);
            }
        } 
        catch (Exception e) 
        {
            // e.getMessage() can be null, and a null message makes Log.e throw; passing
            // the throwable logs both the description and the stack trace.
            Log.e("Error", "Failed to fetch or parse " + url, e);
        }
完整活动代码:

package com.androidbegin.parselogintutorial;

import com.androidbegin.parselogintutorial.SingleRecipe.urlTask;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.koushikdutta.urlimageviewhelper.sample.UrlImageViewHelperSample;
import com.parse.GetCallback;
import com.parse.ParseException;
import com.parse.ParseObject;
import com.parse.ParseQuery;
import com.parse.ParseUser;
public class Bla extends Activity
{
    ImageView iv,bm;
    TextView recipeTitle;
    String urlForImage = "";
    @Override
    protected void onCreate(Bundle savedInstanceState) 
    {
        // TODO Auto-generated method stub
        super.onCreate(savedInstanceState);
        setContentView(R.layout.bla_layout);
        new urlTask("grilled mexican chicken").execute("grilled mexican chicken");
        //new DownloadImageTask((ImageView)findViewById(R.id.RecipeImage)).execute(urlForImage);
    }
    public class DownloadImageTask extends AsyncTask<String, Void, Bitmap> 
    {
        ImageView bmImage;
        public DownloadImageTask(ImageView bmImage) {
            this.bmImage = bmImage;
        }
        protected Bitmap doInBackground(String... urls) 
        {
            String urldisplay = urls[0];
            Bitmap mIcon11 = null;
            try 
            {
                InputStream in = new java.net.URL(urldisplay).openStream();
                mIcon11 = BitmapFactory.decodeStream(in);
                in.close();
            } 
            catch (Exception e) 
            {
                Log.e("Error", e.getMessage());
                e.printStackTrace();
            }
            return mIcon11;
        }
        protected void onPostExecute(Bitmap result) 
        {
            bmImage.setImageBitmap(result);
        }   
    }
    public class urlTask extends AsyncTask<String, Void, String> 
    {
        String str;
        public urlTask(String str)
        {
            this.str = str;
        }
        String tmpResult = str;
        Document doc;
        protected String doInBackground(String... urls) 
        {
            String urldisplay = urls[0];
            String url = "https://www.google.co.il/search?q=grilled+mexican+chicken&es_sm=93&source=lnms&tbm=isch&sa=X&ei=h1OqVOH6B5bjaqGogvAP&ved=0CAgQ_AUoAQ&biw=1920&bih=955";
            WebClient webClient = new WebClient(BrowserVersion.FIREFOX_24); // Chrome not working
            HtmlPage page = null;
            try 
            {
                page = webClient.getPage(url);
            } catch (FailingHttpStatusCodeException e1) 
            {
                // TODO Auto-generated catch block
                e1.printStackTrace();
            }
            catch (MalformedURLException e1) 
            {
                // TODO Auto-generated catch block
                e1.printStackTrace();
            }
            catch (IOException e1) 
            {
                // TODO Auto-generated catch block
                e1.printStackTrace();
            } 
            try 
            {
                Document doc = Jsoup.parse(page.asXml());
                Elements divs = doc.select(".rg_di");
                for(Element div : divs)
                {
                    Element img = div.select("a").get(0);
                    String link  = img.attr("href");
                    System.out.println(link);
                }

            }
            catch (Exception e) 
            {
                 e.printStackTrace();
            }
            return tmpResult;
        }
        protected void onPostExecute(String result) 
        {
            result = tmpResult;
            urlForImage = tmpResult;
        }   
    }
}
package com.androidbegin.parselogintutorial;

import com.androidbegin.parselogintutorial.SingleRecipe.urlTask;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.koushikdutta.urlimageviewhelper.sample.UrlImageViewHelperSample;
import com.parse.GetCallback;
import com.parse.ParseException;
import com.parse.ParseObject;
import com.parse.ParseQuery;
import com.parse.ParseUser;
public class Bla extends Activity
{
    ImageView iv,bm;
    TextView recipeTitle;
    String urlForImage = "";
    @Override
    protected void onCreate(Bundle savedInstanceState) 
    {
        // TODO Auto-generated method stub
        super.onCreate(savedInstanceState);
        setContentView(R.layout.bla_layout);
        new urlTask("grilled mexican chicken").execute("grilled mexican chicken");
        //new DownloadImageTask((ImageView)findViewById(R.id.RecipeImage)).execute(urlForImage);
    }
    public class DownloadImageTask extends AsyncTask<String, Void, Bitmap> 
    {
        ImageView bmImage;
        public DownloadImageTask(ImageView bmImage) {
            this.bmImage = bmImage;
        }
        protected Bitmap doInBackground(String... urls) 
        {
            String urldisplay = urls[0];
            Bitmap mIcon11 = null;
            try 
            {
                InputStream in = new java.net.URL(urldisplay).openStream();
                mIcon11 = BitmapFactory.decodeStream(in);
                in.close();
            } 
            catch (Exception e) 
            {
                Log.e("Error", e.getMessage());
                e.printStackTrace();
            }
            return mIcon11;
        }
        protected void onPostExecute(Bitmap result) 
        {
            bmImage.setImageBitmap(result);
        }   
    }
    public class urlTask extends AsyncTask<String, Void, String> 
    {
        String str;
        public urlTask(String str)
        {
            this.str = str;
        }
        String tmpResult = str;
        Document doc;
        protected String doInBackground(String... urls) 
        {
            String urldisplay = urls[0];
            String url = "https://www.google.co.il/search?q=grilled+mexican+chicken&es_sm=93&source=lnms&tbm=isch&sa=X&ei=h1OqVOH6B5bjaqGogvAP&ved=0CAgQ_AUoAQ&biw=1920&bih=955";
            WebClient webClient = new WebClient(BrowserVersion.FIREFOX_24); // Chrome not working
            HtmlPage page = null;
            try 
            {
                page = webClient.getPage(url);
            } catch (FailingHttpStatusCodeException e1) 
            {
                // TODO Auto-generated catch block
                e1.printStackTrace();
            }
            catch (MalformedURLException e1) 
            {
                // TODO Auto-generated catch block
                e1.printStackTrace();
            }
            catch (IOException e1) 
            {
                // TODO Auto-generated catch block
                e1.printStackTrace();
            } 
            try 
            {
                Document doc = Jsoup.parse(page.asXml());
                Elements divs = doc.select(".rg_di");
                for(Element div : divs)
                {
                    Element img = div.select("a").get(0);
                    String link  = img.attr("href");
                    System.out.println(link);
                }

            }
            catch (Exception e) 
            {
                 e.printStackTrace();
            }
            return tmpResult;
        }
        protected void onPostExecute(String result) 
        {
            result = tmpResult;
            urlForImage = tmpResult;
        }   
    }
}

感谢您的帮助

我编辑了您的代码以消除错误403

与此相反:

doc = Jsoup.connect(url).get();
写下:

doc = Jsoup.connect(url).userAgent("Mozilla").get();

该页面的内容似乎是动态生成的。因此,Jsoup获取的html不包含 .rg_di 元素

doc.select("div.rg_di").first();
返回null,我们得到nullpointerexception

jsoup下载的html代码段

<img height="104" src="https://encrypted-tbn2.gstatic.com/images?q=tbn:ANd9GcT-pctOxpuUcdq118aFU3s2miRfUa6Ev8eF-UsxARHV-vbcOUV8byEtt2YT" width="140">
有许多解析动态内容的解决方案

编辑1 我实现了htmlunit来呈现页面

import java.io.IOException;
import java.net.MalformedURLException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;


/**
 * Renders an image-search results page with HtmlUnit, so that script-generated markup
 * is present, then uses jsoup to print the href of the first link in every
 * ".rg_di" element.
 */
public class Main {
    /**
     * Loads the results page and prints one link per result to standard output.
     *
     * @param args unused
     * @throws FailingHttpStatusCodeException if the server answers with a failing status code
     * @throws MalformedURLException if the search URL is not a valid URL
     * @throws IOException if the page cannot be loaded
     */
    public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException {
        String url = "https://www.google.co.il/search?q=grilled+mexican+chicken&es_sm=93&source=lnms&tbm=isch&sa=X&ei=h1OqVOH6B5bjaqGogvAP&ved=0CAgQ_AUoAQ&biw=1920&bih=955";
        WebClient webClient = new WebClient(BrowserVersion.FIREFOX_24); // Chrome not working
        try {
            HtmlPage page = webClient.getPage(url);
            Document doc = Jsoup.parse(page.asXml());
            Elements divs = doc.select(".rg_di");
            for (Element div : divs) {
                // first() returns null for a result without an anchor; skipping it keeps
                // one odd entry from aborting the remaining results, which get(0) did
                // by throwing into a catch-all.
                Element anchor = div.select("a").first();
                if (anchor == null) {
                    continue;
                }
                System.out.println(anchor.attr("href"));
            }
        } finally {
            // Release the windows and threads held by the headless browser, even when
            // loading or parsing fails.
            webClient.closeAllWindows();
        }
    }
}
htmlunit有自己的html解析api,但我将坚持使用更直观的jsoup

编辑2 如果您的目标是在Android设备上呈现和解析HTML页面,HtmlUnit并不是一个好的选择

HtmlUnit使用Android上不可用的Java类。 除此之外,HtmlUnit还使用了许多其他库,其中一些库可能还有自己的依赖关系。因此,尽管HtmlUnit非常棒,但我认为让它在Android上运行可能并不容易

  • 你可以尝试一种解决方法。或
  • 你可以折磨自己,尝试解决办法(最好不要)。或
  • 如果考虑guy的经验,重新设计软件架构会更好:
  • 创建呈现网页并对其进行解析的java服务器。HTMLUnit+Jsoup
  • 将解析后的数据以JSON格式保存在服务器的文件系统中(可使用Gson)
  • 创建servlet,在android应用程序请求JSON文件时发送该文件

您是否收到错误403?您是否解决了问题?非常感谢您的回答!我试着查看链接中的所有答案。这一切都非常复杂,即使是“httmlunit”也不容易理解如何实现,你有没有可能发布一个工作
import java.io.IOException;
import java.net.MalformedURLException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;


/**
 * Renders an image-search results page with HtmlUnit, so that script-generated markup
 * is present, then uses jsoup to print the href of the first link in every
 * ".rg_di" element.
 */
public class Main {
    /**
     * Loads the results page and prints one link per result to standard output.
     *
     * @param args unused
     * @throws FailingHttpStatusCodeException if the server answers with a failing status code
     * @throws MalformedURLException if the search URL is not a valid URL
     * @throws IOException if the page cannot be loaded
     */
    public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException {
        String url = "https://www.google.co.il/search?q=grilled+mexican+chicken&es_sm=93&source=lnms&tbm=isch&sa=X&ei=h1OqVOH6B5bjaqGogvAP&ved=0CAgQ_AUoAQ&biw=1920&bih=955";
        WebClient webClient = new WebClient(BrowserVersion.FIREFOX_24); // Chrome not working
        try {
            HtmlPage page = webClient.getPage(url);
            Document doc = Jsoup.parse(page.asXml());
            Elements divs = doc.select(".rg_di");
            for (Element div : divs) {
                // first() returns null for a result without an anchor; skipping it keeps
                // one odd entry from aborting the remaining results, which get(0) did
                // by throwing into a catch-all.
                Element anchor = div.select("a").first();
                if (anchor == null) {
                    continue;
                }
                System.out.println(anchor.attr("href"));
            }
        } finally {
            // Release the windows and threads held by the headless browser, even when
            // loading or parsing fails.
            webClient.closeAllWindows();
        }
    }
}