如何从Java中的google搜索中排除搜索结果（链接）_Java

如何从Java中的google搜索中排除搜索结果（链接）

java

如何从Java中的google搜索中排除搜索结果（链接）,java,Java,我想筛选出谷歌搜索的所有网站链接。如果我在搜索某个东西，我想获得谷歌向我们展示的所有网站链接首先，我想阅读完整的html内容。之后，我想过滤掉所有重要的URL。例如->如果我把“买鞋”这个词放到谷歌->我想得到像“www.amazon.in/shoes”这样的链接等等如果我开始我的程序，我只会得到几个URL和基于google的网站，比如“google.de/intl/de/options/” PS：我在Chrome和Firefox浏览器中使用相同的查询（“buy+shoes”）检查了页面源代

我想筛选出谷歌搜索的所有网站链接。如果我在搜索某个东西，我想获得谷歌向我们展示的所有网站链接

首先，我想阅读完整的html内容。之后，我想过滤掉所有重要的URL。例如->如果我把“买鞋”这个词放到谷歌->我想得到像“www.amazon.in/shoes”这样的链接等等

如果我开始我的程序，我只会得到几个URL和基于google的网站，比如“google.de/intl/de/options/”

PS：我在Chrome和Firefox浏览器中使用相同的查询（“buy+shoes”）检查了页面源代码，注意到Chrome浏览器提供的内容比Firefox浏览器多得多。我的感觉是，我只得到很少的网站结果，因为java就像Firefox浏览器一样连接，不是吗？我如何获得谷歌显示的所有这些链接

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class findEveryUrl {
public static void main( String[] args ) throws IOException
{

    String gInput = "https://www.google.de/#q=";
    // setKeyWord asks you to enter the keyword into the console
    String fullUrl = gInput + setKeyWord();
    //fullUrl is used for the InputStream and "www." is the string, which is used for splitting
    findAllSubs( fullUrl, "www.");
    //System.out.println("given url: " + fullUrl);
}



/* 
 * @param <T> String type.
 * @param urlString has to be the full Url.
 * @param splitphrase is the String which is used for splitting. 
 * @return void
 */
static void findAllSubs( String urlString, String splitphrase )
{
    try
    {
        URL     url     = new URL( urlString );
        URLConnection yc = url.openConnection();
        BufferedReader in = new BufferedReader(new InputStreamReader(
                yc.getInputStream()));
        String inputLine ;  
        String array[];

        while ((inputLine = in.readLine()) != null){
            inputLine += in.readLine();
            array = inputLine.split(splitphrase);
            arrayToConsol(array);

        }
    }catch (IOException e) {
        e.printStackTrace();
    }

}



/* 
 * urlQuery() asks you for the search keyword for the google query
 * @return returns the keyword, which you wrote into the console
 */
public static String setKeyWord(){
    BufferedReader console = new BufferedReader(new InputStreamReader(System.in));
    System.out.print("Enter a KeyWord: ");
    //googles search engine url

    String keyWord = null;
    try {
        keyWord = console.readLine();
    } catch (IOException e) {
        // shouldn't be happen
        e.printStackTrace();
    }

    return keyWord;
}

public static void arrayToConsol(String[] array){
    for (String item : array) {
        System.out.println(item);
    }
}

public static void searchQueryToConsole(String url) throws IOException{
    URL googleSearch = new URL(url);
    URLConnection yc = googleSearch.openConnection();
    BufferedReader in = new BufferedReader(new InputStreamReader(
            yc.getInputStream()));
    String inputLine;
    while ((inputLine = in.readLine()) != null) 
        System.out.println(inputLine);
    in.close();
}}

导入java.io.BufferedReader；
导入java.io.BufferedWriter；
导入java.io.File；
导入java.io.FileWriter；
导入java.io.IOException；
导入java.io.InputStreamReader；
导入java.net.MalformedURLException；
导入java.net.URL；
导入java.net.URLConnection；
导入java.nio.charset.charset；
导入java.util.Scanner；
导入java.util.regex.Matcher；
导入java.util.regex.Pattern；
公共类findEveryUrl{
公共静态void main（字符串[]args）引发IOException
{
字符串gInput=”https://www.google.de/#q=";
//setKeyWord要求您在控制台中输入关键字
字符串fullUrl=gInput+setKeyWord（）；
//fullUrl用于InputStream，“www.”是用于拆分的字符串
findAllSubs（完整URL，“www.”）；
//System.out.println（“给定url:+fullUrl”）；
}
/* 
*@param字符串类型。
*@param urlString必须是完整的Url。
*@param splitphrase是用于拆分的字符串。
*@返回无效
*/
静态void findAllSubs（字符串urlString、字符串splitphrase）
{
尝试
{
URL=新URL（URL字符串）；
URLConnection yc=url.openConnection（）；
BufferedReader in=新的BufferedReader（新的InputStreamReader(
yc.getInputStream（））；
字符串输入线；
字符串数组[]；
而（（inputLine=in.readLine（））！=null）{
inputLine+=in.readLine（）；
数组=inputLine.split（splitphrase）；
阵列控制台（阵列）；
}
}捕获（IOE异常）{
e、 printStackTrace（）；
}
}
/* 
*urlQuery（）要求您输入google查询的搜索关键字
*@return返回您写入控制台的关键字
*/
公共静态字符串setKeyWord（）{
BufferedReader控制台=新的BufferedReader（新的InputStreamReader（System.in））；
System.out.print（“输入关键字：”）；
//谷歌搜索引擎url
字符串关键字=null；
试一试{
关键字=console.readLine（）；
}捕获（IOE异常）{
//这不应该发生
e、 printStackTrace（）；
}
返回关键字；
}
公共静态void数组控制台（字符串[]数组）{
for（字符串项：数组）{
系统输出打印项次（项）；
}
}
公共静态void searchQueryToConsole（字符串url）引发IOException{
URL googleSearch=新URL（URL）；
URLConnection yc=googleSearch.openConnection（）；
BufferedReader in=新的BufferedReader（新的InputStreamReader(
yc.getInputStream（））；
字符串输入线；
而（（inputLine=in.readLine（））！=null）
系统输出打印LN（输入线）；
in.close（）；
}}

这里有一个简单易行的解决方案

但是，如果您想使用CSS选择器解析其他页面以查找元素，请将其强大的库组合起来

Document doc = Jsoup.connect("http://en.wikipedia.org/").get();
Elements newsHeadlines = doc.select("#mp-itn b a");

感谢Daredesm，您的快速回复=）