How to create a web crawler in Java?


Hi, I want to create a web crawler in Java that retrieves some data from a web page, such as the title and description, and stores that data in a database.

Take a look at this example:

And at the existing open-source crawlers:

If you want to do it yourself, use the HttpClient that is included in Android.

Example usage of HttpClient (you just have to parse out the links):

import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;

import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;

public class HttpTest {
    public static void main(String... args)
    throws ClientProtocolException, IOException {
        crawlPage("http://www.google.com/");
    }

    /** URLs that have already been crawled, so the same page is never fetched twice. */
    static Set<String> checked = new HashSet<String>();

    private static void crawlPage(String url) throws ClientProtocolException, IOException {

        if (checked.contains(url))
            return;

        checked.add(url);

        System.out.println("Crawling: " + url);

        // Fetch the page that was passed in (not a hard-coded URL).
        HttpClient client = new DefaultHttpClient();
        HttpGet request = new HttpGet(url);
        HttpResponse response = client.execute(request);

        Reader reader = null;
        try {
            reader = new InputStreamReader(response.getEntity().getContent());

            // Collect the href of every <a> tag, then recurse into each absolute link.
            Links links = new Links();
            new ParserDelegator().parse(reader, links, true);

            for (String link : links.list)
                if (link.startsWith("http://"))
                    crawlPage(link);

        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /** Parser callback that records the href attribute of every anchor tag. */
    static class Links extends HTMLEditorKit.ParserCallback {

        List<String> list = new LinkedList<String>();

        public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
            if (t == HTML.Tag.A && a.getAttribute(HTML.Attribute.HREF) != null)
                list.add(a.getAttribute(HTML.Attribute.HREF).toString());
        }
    }
}
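
The example above only discovers and follows links. Since the question also asks about saving the title and description to a database, here is a minimal sketch of that storage step using plain JDBC; the pages table, its columns, and the SQLite connection string are illustrative assumptions, not something from the original answer.

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;

public class PageStore {

    // Assumed schema: pages(url TEXT, title TEXT, description TEXT).
    private static final String INSERT_SQL =
            "INSERT INTO pages (url, title, description) VALUES (?, ?, ?)";

    public static void save(String url, String title, String description) throws SQLException {
        // Hypothetical SQLite database file; use whatever JDBC URL/driver you actually have.
        try (Connection conn = DriverManager.getConnection("jdbc:sqlite:crawler.db");
             PreparedStatement ps = conn.prepareStatement(INSERT_SQL)) {
            ps.setString(1, url);
            ps.setString(2, title);
            ps.setString(3, description);
            ps.executeUpdate();
        }
    }
}

On Android you would normally go through SQLiteOpenHelper rather than a JDBC driver, but the insert itself has the same shape.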

You can use crawler4j. Crawler4j is an open-source Java crawler that provides a simple interface for crawling the web. You can set up a multi-threaded web crawler in a few hours.
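
For illustration only, a minimal crawler4j setup could look roughly like the sketch below. It follows the general shape of the crawler4j README, but the storage folder, seed URL, URL filter, and class name are placeholders, and exact method signatures may differ between crawler4j versions.

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import edu.uci.ics.crawler4j.url.WebURL;

public class MyCrawler extends WebCrawler {

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        // Only follow links that stay on the (placeholder) site being crawled.
        return url.getURL().startsWith("http://www.example.com/");
    }

    @Override
    public void visit(Page page) {
        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData html = (HtmlParseData) page.getParseData();
            // The title and text are available here; this is where you would store them in your database.
            System.out.println(page.getWebURL().getURL() + " -> " + html.getTitle());
        }
    }

    public static void main(String[] args) throws Exception {
        CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder("/tmp/crawler4j");   // placeholder storage folder

        PageFetcher fetcher = new PageFetcher(config);
        RobotstxtServer robots = new RobotstxtServer(new RobotstxtConfig(), fetcher);
        CrawlController controller = new CrawlController(config, fetcher, robots);

        controller.addSeed("http://www.example.com/");    // placeholder seed
        controller.start(MyCrawler.class, 8);             // 8 crawler threads
    }
}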

You can use WebCollector:

A demo based on WebCollector 2.05:

import cn.edu.hfut.dmic.webcollector.crawler.BreadthCrawler;
import cn.edu.hfut.dmic.webcollector.model.Links;
import cn.edu.hfut.dmic.webcollector.model.Page;
import java.util.regex.Pattern;
import org.jsoup.nodes.Document;

/**
 * Crawl news from yahoo news
 *
 * @author hu
 */
public class YahooCrawler extends BreadthCrawler {

    /**
     * @param crawlPath crawlPath is the path of the directory which maintains
     * information of this crawler
     * @param autoParse if autoParse is true, BreadthCrawler will automatically extract
     * links that match the regex rules from each page
     */
    public YahooCrawler(String crawlPath, boolean autoParse) {
        super(crawlPath, autoParse);
        /*start page*/
        this.addSeed("http://news.yahoo.com/");

        /*fetch url like http://news.yahoo.com/xxxxx*/
        this.addRegex("http://news.yahoo.com/.*");
        /*do not fetch urls like http://news.yahoo.com/xxxx/xxx*/
        this.addRegex("-http://news.yahoo.com/.+/.*");
        /*do not fetch jpg|png|gif*/
        this.addRegex("-.*\\.(jpg|png|gif).*");
        /*do not fetch url contains #*/
        this.addRegex("-.*#.*");
    }

    @Override
    public void visit(Page page, Links nextLinks) {
        String url = page.getUrl();
        /*if page is news page*/
        if (Pattern.matches("http://news.yahoo.com/.+html", url)) {
            /*we use jsoup to parse page*/
            Document doc = page.getDoc();

            /*extract title and content of news by css selector*/
            String title = doc.select("h1[class=headline]").first().text();
            String content = doc.select("div[class=body yom-art-content clearfix]").first().text();

            System.out.println("URL:\n" + url);
            System.out.println("title:\n" + title);
            System.out.println("content:\n" + content);

            /*If you want to add more urls to crawl, add them to nextLinks*/
            /*WebCollector automatically filters links that have been fetched before*/
            /*If autoParse is true and a link added to nextLinks does not match the regex rules, it will also be filtered.*/
            // nextLinks.add("http://xxxxxx.com");
        }
    }

    public static void main(String[] args) throws Exception {
        YahooCrawler crawler = new YahooCrawler("crawl", true);
        crawler.setThreads(50);
        crawler.setTopN(100);
        //crawler.setResumable(true);
        /*start crawl with depth of 4*/
        crawler.start(4);
    }

}
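
To compile and run this demo you need the WebCollector 2.05 jar and jsoup on the classpath (page.getDoc() returns a jsoup Document, as the import above shows). Also note that the regex rules passed to addRegex with a leading "-" are exclusion rules, which is why the "do not fetch" patterns above start with a minus sign.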

I like HtmlUnit, but I don't know how well it works on Android… Can you show me how to create a web crawler using HtmlUnit? First I want to parse some data and store it in the db.
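
To sketch an answer to that follow-up: with HtmlUnit the fetch-and-parse step could look roughly like the code below. The URL and the meta-description lookup are placeholders, WebClient is AutoCloseable only in recent HtmlUnit releases, and how well this performs on Android is exactly the open question raised above.

import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.DomElement;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

public class HtmlUnitExample {

    public static void main(String[] args) throws Exception {
        // WebClient implements AutoCloseable in recent HtmlUnit versions.
        try (WebClient webClient = new WebClient()) {
            webClient.getOptions().setJavaScriptEnabled(false); // plain HTML is enough here

            HtmlPage page = webClient.getPage("http://www.example.com/"); // placeholder URL

            String title = page.getTitleText();

            // The <meta name="description"> tag holds the page description, if present.
            DomElement meta = page.getFirstByXPath("//meta[@name='description']");
            String description = (meta == null) ? "" : meta.getAttribute("content");

            System.out.println(title);
            System.out.println(description);
            // From here, hand the values to your database layer (e.g. the JDBC sketch earlier).
        }
    }
}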