Java 从HTML字符串中提取URL_Java_Html_Html Parsing

Java 从HTML字符串中提取URL
java html
Java 从HTML字符串中提取URL,java,html,html-parsing,Java,Html,Html Parsing,我试图从给定的字符串中提取URL，该字符串包含带有HREF标记的HTTP响应。我已经到达链接的开头，但我需要在HREF结束后立即终止字符串。如何做到这一点 public class Extracturl { public static void main(String[] args) throws IOException { // TODO Auto-generated method stub String line; try { String u="h
我试图从给定的
字符串中提取URL，该字符串包含带有HREF标记的HTTP响应。我已经到达链接的开头，但我需要在HREF结束后立即终止字符串。如何做到这一点
/**
 * The asker's attempt: reads a web page line by line and tries to pull out
 * the links that start with {@code href="/wiki}, printing them and appending
 * them to a text file.
 *
 * NOTE(review): as shown, this snippet does not compile on its own —
 * {@code url}, {@code is} and {@code dis} are assigned below but no
 * declaration for them is visible in this listing, and no imports are shown.
 */
public class Extracturl {
/**
 * Downloads the hard-coded page and scans each line for wiki links.
 *
 * @param args command-line arguments; not read by this method
 * @throws IOException declared, although the I/O calls below are also
 *                     wrapped in try/catch blocks
 */
public static void main(String[] args) throws IOException {
    // TODO Auto-generated method stub
    String line;

    try {
        String u="http://en.wikipedia.org/wiki/china";
        String fileName = "e:\\test.txt";
         // Second FileWriter argument 'true' opens the file in append mode.
         BufferedWriter writer = new BufferedWriter(new FileWriter(fileName,true));
        // NOTE(review): url / is / dis are not declared anywhere in this snippet.
        url = new URL(u);
        is = url.openStream();  // throws an IOException
        dis = new DataInputStream(new BufferedInputStream(is));

        // Accumulates every link fragment emitted so far; used below for de-duplication.
        String w=new String();
        // NOTE(review): presumably dis is a DataInputStream, whose readLine() is deprecated — confirm.
        while ((line = dis.readLine()) != null) {


                try {
   // Only consider lines that contain a /wiki href, contain the text '" />',
   // and do not mention "File".
   if(line.contains("href=\"/wiki")&&line.contains("\" />")&& (!line.contains("File")))
                    {   

                    // One-argument substring: takes everything from 'href="/' to the END of the
                    // line, so the text after the closing quote of the href is included as well.
                    if(!w.contains(line.substring(line.indexOf("href=\"/"))))
                    {w=w+line.substring(line.indexOf("href=\"/"));                        
                        System.out.println(line.substring(line.indexOf("href=\"/"))); 
                    // NOTE(review): this writes the whole accumulated string w, not just the new fragment.
                    writer.write(w);
                    writer.newLine();
                    }}
                } catch (IOException e) {
                    e.printStackTrace();
                }
        }
    } catch (MalformedURLException mue) {
         mue.printStackTrace();
    } catch (IOException ioe) {
         ioe.printStackTrace();
    } finally {
        try {
            is.close();

           // NOTE(review): the writer is never closed or flushed — the close call is commented out.
           // writer.close();
        } catch (IOException ioe) {
            // nothing to see here
        }
    }
}

    }

我甚至试过
   w=w+line.substring(line.indexOf("href=\"/"),line.indexOf("\">"));

但这给了我错误
我的目标是获取从页面链接的所有URL。
为此请使用HTML解析器。下面是一个使用Java内置HTML解析器的示例。还有其他选择（例如第三方HTML解析库），但对于基本的HTML处理，这一个做得相当好：
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.LinkedHashSet;
import java.util.Set;

import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;

/**
 * Prints every distinct {@code href} value found on the anchor
 * ({@code <a>}) tags of a web page, using the HTML parser that ships with
 * the JDK ({@link ParserDelegator}) instead of ad-hoc string searching.
 */
public class URLExtractor {

    /**
     * Parser callback that records the {@code href} attribute of each
     * anchor tag, keeping the first occurrence of every value in the order
     * in which the parser reports it.
     */
    private static class HTMLPaserCallBack extends HTMLEditorKit.ParserCallback {

        /** Distinct href values in first-seen order. */
        private final Set<String> urls = new LinkedHashSet<String>();

        /**
         * @return the href values collected so far (the live set, not a copy)
         */
        public Set<String> getUrls() {
            return urls;
        }

        @Override
        public void handleSimpleTag(Tag t, MutableAttributeSet a, int pos) {
            handleTag(t, a, pos);
        }

        @Override
        public void handleStartTag(Tag t, MutableAttributeSet a, int pos) {
            handleTag(t, a, pos);
        }

        /**
         * Records the href of {@code t} when it is an anchor tag that has one.
         *
         * @param t   the tag reported by the parser
         * @param a   the attributes of that tag
         * @param pos position reported by the parser; not used
         */
        private void handleTag(Tag t, MutableAttributeSet a, int pos) {
            if (t != Tag.A) {
                return;
            }
            Object href = a.getAttribute(HTML.Attribute.HREF);
            if (href != null) {
                // Set.add already ignores duplicates, so no contains() check is needed.
                urls.add(href.toString());
            }
        }
    }

    /**
     * Parses HTML from {@code html} and returns the distinct anchor hrefs.
     *
     * @param html source of the HTML text; it is read but not closed here
     * @return the distinct href values in the order they were first seen
     * @throws IOException if reading from {@code html} fails
     */
    static Set<String> extractUrls(java.io.Reader html) throws IOException {
        HTMLPaserCallBack cb = new HTMLPaserCallBack();
        // 'true' tells the parser to ignore any charset directive inside the
        // document; the caller has already chosen how the bytes are decoded.
        new ParserDelegator().parse(html, cb, true);
        return cb.getUrls();
    }

    /**
     * Downloads the hard-coded page and prints each link found on it.
     *
     * @param args command-line arguments; not used
     * @throws IOException declared for compatibility; I/O failures from the
     *                     download are caught and printed below
     */
    public static void main(String[] args) throws IOException {
        InputStream is = null;
        try {
            String u = "http://en.wikipedia.org/wiki/china";
            URL url = new URL(u);
            is = url.openStream(); // throws an IOException
            // Decode explicitly instead of relying on the platform default
            // charset (assumes the page is served as UTF-8 — confirm for other sites).
            BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
            for (String aUrl : extractUrls(reader)) {
                System.out.println("Found URL: " + aUrl);
            }
        } catch (MalformedURLException mue) {
            mue.printStackTrace();
        } catch (IOException ioe) {
            ioe.printStackTrace();
        } finally {
            // 'is' is still null when the URL was malformed or the connection
            // could not be opened; closing it unguarded would throw a
            // NullPointerException and hide the original failure.
            if (is != null) {
                try {
                    is.close();
                } catch (IOException ignored) {
                    // best-effort close; nothing useful can be done here
                }
            }
        }
    }
}

导入java.io.BufferedReader；
导入java.io.IOException；
导入java.io.InputStream；
导入java.io.InputStreamReader；
导入java.net.MalformedURLException；
导入java.net.URL；
导入java.util.LinkedHashSet；
导入java.util.Set；
导入javax.swing.text.MutableAttributeSet；
导入javax.swing.text.html.html；
导入javax.swing.text.html.html.Tag；
导入javax.swing.text.html.HTMLEditorKit；
导入javax.swing.text.html.parser.ParserDelegator；
公共类提取程序{
私有静态类HTMLPaserCallBack扩展了HTMLEditorKit.ParserCallback{
专用设置URL；
公共HtmlPasserCallback（）{
URL=新LinkedHashSet（）；
}
公共集getURL（）{
返回URL；
}
@凌驾
public void handleSimpleTag（标记t、可变属性集a、整数位置）{
把手（t、a、pos）；
}
@凌驾
公共无效handleStartTag（Tag t，可变属性集a，内部位置）{
把手（t、a、pos）；
}
私有void handleTag（标签t、可变属性集a、内部位置）{
if（t==Tag.A）{
Object href=a.getAttribute（HTML.Attribute.href）；
如果（href！=null）{
字符串url=href.toString（）；
如果（！url.contains（url））{
添加（url）；
}
}
}
}
}
公共静态void main（字符串[]args）引发IOException{
InputStream=null；
试一试{
字符串u=”http://en.wikipedia.org/wiki/china";
URL=新URL（u）；
is=url.openStream（）；//引发IOException
HTMLPaserCallBack cb=新的HTMLPaserCallBack（）；
新的ParserDelegator（）.parse（新的BufferedReader（新的InputStreamReader（is）），cb，true）；
for（字符串aUrl:cb.getUrls（））{
System.out.println（“找到的URL:+aUrl”）；
}
}捕获（格式不正确）{
mue.printStackTrace（）；
}捕获（ioe异常ioe）{
ioe.printStackTrace（）；
}最后{
试一试{
is.close（）；
}捕获（ioe异常ioe）{
//这里没什么可看的
}
}
}
}
为此目的使用HTML解析器。@GuillaumePolet 评论不错，但我认为把它写成答案会更好。@GuillaumePolet 在说“不要这样做”之前，请先告诉我们如何用其他方式来做，并试着举个例子。@AndrewThompson 我本来就打算这么做，只是想在回答中更深入地解释这一点，而不仅仅是一句话带过。火车上的网络连接很不稳定，所以我花了一些时间。我按给出的方式执行了……但是没有产生任何输出。@ratankumar 即使是在火车上（而且连接非常差），它在我这里也能正常工作。