Java Jsoup Reddit image scraper over-18 problem

Tags: java, web-scraping, jsoup, reddit

I am developing an image scraper that uses Jsoup to scrape the front page of various subreddits. The problem is that when it tries to scrape an NSFW subreddit, reddit redirects to the over-18 verification page, and the scraper ends up scraping that verification page instead. I'm new to this and I know it's a noob question, but any help would be appreciated because I'm completely lost.

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.Scanner;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class javascraper {
    public static final String USER_AGENT = "<User-Agent: github.com/dabeermasood:v1.2.3 (by /u/swedenotswiss)>";

    public static void main(String[] args) throws MalformedURLException {
        Scanner scan = new Scanner(System.in);
        System.out.println("Where do you want to store the files?");
        String folderpath = scan.next();
        System.out.println("What subreddit do you want to scrape?");
        String subreddit = scan.next();

        // Create the output folder before turning the subreddit name into a URL,
        // so the folder is named after the subreddit rather than the full URL.
        new File(folderpath + "/" + subreddit).mkdir();

        String url = "http://reddit.com/r/" + subreddit;


        try {
            // Fetch the subreddit front page.
            Document doc = Jsoup.connect(url).userAgent(USER_AGENT).timeout(0).get();

            // Get the page title.
            String title = doc.title();
            System.out.println("title : " + title);

            // Get all links on the page.
            Elements links = doc.select("a[href]");

            for (Element link : links) {
                // Get the value of the href attribute and download it if it
                // looks like an image link.
                String checkLink = link.attr("href");
                if (imgCheck(checkLink)) {
                    System.out.println("link : " + link.attr("href"));
                    downloadImages(checkLink, folderpath);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }


    public static boolean imgCheck(String http) {
        // Treat direct image links (and gfycat links) as downloadable images.
        // "jpeg" has no leading period so it also matches ".jpeg".
        return http.contains(".png") || http.contains("gfycat") || http.contains(".jpg")
                || http.contains("jpeg") || http.contains(".gif");
    }



    private static void downloadImages(String src, String folderpath) throws IOException {
        // Extract the file name of the image from the src URL.
        // Strip a trailing slash first so lastIndexOf finds the real name.
        if (src.endsWith("/")) {
            src = src.substring(0, src.length() - 1);
        }
        int indexname = src.lastIndexOf("/");
        String name = src.substring(indexname, src.length());
        System.out.println(name);

        // Open a URL stream, using the same User-Agent as the page request.
        URLConnection connection = (new URL(src)).openConnection();
        connection.setRequestProperty("User-Agent", USER_AGENT);

        try {
            Thread.sleep(2000); // delay to comply with rate limiting
        } catch (InterruptedException e) {
            e.printStackTrace();
        }

        InputStream in = connection.getInputStream();
        OutputStream out = new BufferedOutputStream(new FileOutputStream(folderpath + name));

        for (int b; (b = in.read()) != -1;) {
            out.write(b);
        }

        out.close();
        in.close();
    }



}

I posted an answer showing how to authenticate against a server using Jsoup. Basically, you need to POST your login ID and password, along with any other required form data, to the server with:

Connection.Response res = Jsoup.connect(url).data(...).method(Connection.Method.POST).execute();

and then save the response cookies from the server to keep the session authenticated.

You have to process and submit the authentication page first, and then scrape the next page.
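As a concrete illustration of that approach, here is a minimal sketch of how the over-18 interstitial could be handled with Jsoup by capturing and reusing cookies. The /over18?dest=... endpoint, the over18=yes form field, and the title check are assumptions based on old reddit's consent page and may need adjusting against the page the scraper actually receives; the structure that matters is: execute a request, save res.cookies(), POST the consent form, and resend the original request with .cookies(...).

import java.io.IOException;
import java.net.URLEncoder;
import java.util.HashMap;
import java.util.Map;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class Over18Example {
    private static final String USER_AGENT = "<User-Agent: github.com/dabeermasood:v1.2.3 (by /u/swedenotswiss)>";

    public static Document fetchSubreddit(String subredditUrl) throws IOException {
        // First request: keep the Response so we can see where we landed
        // and which cookies the server set.
        Connection.Response first = Jsoup.connect(subredditUrl)
                .userAgent(USER_AGENT)
                .method(Connection.Method.GET)
                .execute();

        Map<String, String> cookies = new HashMap<>(first.cookies());
        Document doc = first.parse();

        // If we landed on the over-18 interstitial, submit its form.
        // NOTE: the title check, the /over18 endpoint and the "over18=yes"
        // field are assumptions about old reddit's consent form.
        if (doc.title().toLowerCase().contains("over 18")) {
            String dest = URLEncoder.encode(subredditUrl, "UTF-8");
            Connection.Response consent = Jsoup.connect("https://www.reddit.com/over18?dest=" + dest)
                    .userAgent(USER_AGENT)
                    .cookies(cookies)
                    .data("over18", "yes")
                    .method(Connection.Method.POST)
                    .execute();

            // Keep whatever consent cookie the server sent and retry the
            // original subreddit with the now-authenticated session.
            cookies.putAll(consent.cookies());
            doc = Jsoup.connect(subredditUrl)
                    .userAgent(USER_AGENT)
                    .cookies(cookies)
                    .get();
        }
        return doc;
    }
}

The Document returned by fetchSubreddit (a hypothetical helper) can then be fed to the existing link-selection and download code in place of the result of the plain Jsoup.connect(url).get() call.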