Java代理发现机器人

Java代理发现机器人(标签:java、multithreading、performance、list、overhead)——我编写了一个类ProxyFinder,它连接到随机IP:先ping它们,如果有响应,再尝试通过常见的代理端口建立HTTP代理连接。目前它只是连接随机IP,速度相对较快,每小时能发现几个代理。不过,我想用某种方式检查某个IP以前是否已经连接过。起初我尝试把它们保存在一个列表中,但这占用了超过10GB的内存。下面的代码中我尝试了另一种方法:用RandomAccessFile把数据写入缓存文件,但随着已记录的连接增多,为每个连接搜索整个文件的速度变得非常慢。我以尽可能小的格式存储数据,每个IP只需四个字节……

我已经编写了一个类ProxyFinder,它连接到随机IP并首先ping它们,如果它们响应,则尝试通过公共代理端口创建http代理连接

目前,它只是连接到随机IP。这相对较快,每小时能发现几个代理。不过,我想用某种方式检查某个IP以前是否已经连接过。起初,我尝试把它们保存在一个列表中,但这占用了超过10GB的内存。下面的代码中我尝试了另一种方法:用RandomAccessFile把数据写入缓存文件,但随着已记录的连接增多,为每个连接搜索整个文件的速度变得非常慢。

我以尽可能小的格式存储数据,每个IP只需四个字节。尽管如此,这仍需要4*256*256*256*256字节=16GB的内存,或者每次要测试另一个IP时都要搜索一个16GB的文件。

我还尝试创建一个单独的线程来生成IP,对照文件检查它们,然后将它们添加到探测线程可以从中提取的队列中。它也跟不上探针线程

如何快速检查我是否已经连接过某个IP,而又不会慢得离谱或占用大量内存?

package net;

import java.io.File;
import java.io.RandomAccessFile;
import java.net.HttpURLConnection;
import java.net.InetAddress;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.net.URL;
import java.util.Arrays;
import java.util.concurrent.atomic.AtomicInteger;

/**
 *
 * @author Colby
 */
/**
 * Scans random IPv4 addresses for open HTTP proxies.
 *
 * <p>Each worker thread picks an address, checks it with
 * {@link InetAddress#isReachable(int)} and, when the host answers, tries to fetch
 * {@code http://google.com} through it on each candidate proxy port.
 *
 * @author Colby
 */
public class ProxyFinder {

    /** Number of probing threads started by {@link #main(String[])}. */
    private static final int WORKER_COUNT = 500;

    /** Size of the IPv4 address space as a float, used only for the progress percentage. */
    private static final float ADDRESS_SPACE = 256f * 256f * 256f * 256f;

    /** Number of distinct IPv4 addresses (2^32). */
    private static final long ADDRESS_COUNT = 1L << 32;

    /** A progress line is printed every time this many hosts have answered the ping. */
    private static final int PROGRESS_INTERVAL = 10000;

    /**
     * Starts the worker threads; they run until the address source returns {@code null}.
     *
     * @param args the command line arguments (unused)
     */
    public static void main(String[] args) throws Exception {

        // Candidate proxy ports. 3128 used to be listed twice, which probed it twice per host.
        int[] ports = {
            1080, 3128, 8080
        };

        System.out.println("Starting network probe");

        AtomicInteger counter = new AtomicInteger();
        for (int i = 0; i < WORKER_COUNT; i++) {
            new Thread(() -> probeLoop(ports, counter)).start();
        }
    }

    /**
     * Worker loop: draws addresses and probes every host that answers the ping.
     *
     * @param ports proxy ports to try on each reachable host
     * @param counter shared count of hosts that answered the ping
     */
    private static void probeLoop(int[] ports, AtomicInteger counter) {
        while (true) {
            try {
                byte[] addrBytes = randomAddress();//could be getNextAddress also
                if (addrBytes == null) {
                    break;
                }

                InetAddress addr = InetAddress.getByAddress(addrBytes);
                if (!ping(addr)) {
                    continue;
                }

                // The percentage is based on the number of hosts that answered so far.
                float percent = (float) ((counter.get() / ADDRESS_SPACE) * 100F);
                if (counter.incrementAndGet() % PROGRESS_INTERVAL == 0) {
                    System.out.println("Searching " + percent + "% network search");
                }

                for (int port : ports) {
                    tryProxy(addr, port, percent);
                }
            } catch (Exception ignored) {
                // Best effort: a failure on one address must not stop the worker.
            }
        }
    }

    /**
     * Tries to fetch a page through {@code addr:port} as an HTTP proxy and reports success.
     *
     * @param addr host to use as proxy
     * @param port proxy port to try
     * @param percent progress value included in the success message
     */
    private static void tryProxy(InetAddress addr, int port, float percent) {
        HttpURLConnection con = null;
        try {
            Proxy proxy = new Proxy(Proxy.Type.HTTP, new InetSocketAddress(addr, port));

            con = (HttpURLConnection) new URL("http://google.com").openConnection(proxy);

            con.setConnectTimeout(1000);
            con.setReadTimeout(1000);
            con.setRequestMethod("GET");
            con.setRequestProperty("User-Agent", "Mozilla/5.0");

            con.getContent();

            System.out.println("Proxy found!" + addr.getHostAddress() + ":" + port + "  Found at " + percent + "% network search");
        } catch (Exception ignored) {
            // Expected for almost every host: the port is closed or is not an HTTP proxy.
        } finally {
            // Release the connection on failure as well as on success.
            if (con != null) {
                con.disconnect();
            }
        }
    }

    /** File-backed bitmap with one bit per IPv4 address; created on first use. */
    private static RandomAccessFile cache;

    /** Number of addresses handed out by {@link #getNextAddress()} in this run. */
    private static long issuedCount;

    /**
     * Returns a random address that this method has not returned before in this run.
     *
     * <p>Seen addresses are tracked as single bits in a temporary file, so a lookup is one
     * seek plus one read (and one write for a new address) instead of a scan of the whole
     * file. The file is only extended when a bit is written. The method is synchronized
     * because all workers share the same file handle.
     *
     * @return four address bytes, or {@code null} once every address has been issued
     * @throws Exception if the cache file cannot be created, read or written
     */
    private static synchronized byte[] getNextAddress() throws Exception {
        if (cache == null) {
            File backing = File.createTempFile("abc", ".tmp");
            backing.deleteOnExit();
            cache = new RandomAccessFile(backing, "rw");
        }

        while (issuedCount < ADDRESS_COUNT) {
            byte[] candidate = randomAddress();
            long index = addressIndex(candidate);
            long bytePos = index >>> 3;
            int mask = 1 << (int) (index & 7);

            cache.seek(bytePos);
            int stored = cache.read();
            if (stored < 0) {
                // Past the end of the file: nothing in this byte has been marked yet.
                stored = 0;
            }
            if ((stored & mask) != 0) {
                continue; // already issued, draw again
            }

            cache.seek(bytePos);
            cache.write(stored | mask);
            issuedCount++;
            return candidate;
        }
        return null;
    }

    /**
     * Maps four address bytes to their unsigned 32-bit value.
     *
     * @param addr IPv4 address in four bytes, most significant first
     * @return value in the range {@code 0 .. 2^32 - 1}
     */
    private static long addressIndex(byte[] addr) {
        return ((addr[0] & 0xFFL) << 24)
                | ((addr[1] & 0xFFL) << 16)
                | ((addr[2] & 0xFFL) << 8)
                | (addr[3] & 0xFFL);
    }

    /**
     * Generates four random address bytes.
     *
     * @return a new four-byte array
     */
    private static byte[] randomAddress() {
        return new byte[]{(byte) (Math.random() * 256), (byte) (Math.random() * 256), (byte) (Math.random() * 256), (byte) (Math.random() * 256)};
    }

    /**
     * Checks whether the host answers within 500 ms.
     *
     * @param addr host to test
     * @return the result of {@link InetAddress#isReachable(int)}
     * @throws Exception if a network error occurs
     */
    private static boolean ping(InetAddress addr) throws Exception {
        return addr.isReachable(500);
    }
}
包网;
导入java.io.File;
导入java.io.RandomAccessFile;
导入java.net.HttpURLConnection;
导入java.net.InetAddress;
导入java.net.InetSocketAddress;
导入java.net.Proxy;
导入java.net.URL;
导入java.util.array;
导入java.util.concurrent.AtomicInteger;
/**
*
*@作者科尔比
*/
公共类代理查找器{
/**
*@param指定命令行参数
*/
公共静态void main(字符串[]args)引发异常{
int[]端口={
1080, 3128, 3128, 8080
};
System.out.println(“启动网络探测”);
AtomicInteger计数器=新的AtomicInteger();
对于(int i=0;i<500;i++){
新线程(()->{
做{
试一试{
字节[]addrBytes=randomAddress();//也可以是getNextAddress
if(addrBytes==null){
打破
}
InetAddress addr=InetAddress.getByAddress(addrBytes);
如果(ping(addr)){
浮动百分比=(浮动)((counter.get()/(256f*256f*256f*256f))*100F);
if(counter.incrementAndGet()%10000==0){
System.out.println(“搜索”+百分比+%网络搜索”);
}
用于(int端口:端口){
试一试{
Proxy Proxy=新代理(Proxy.Type.HTTP,新的InetSocketAddress(addr,port));
HttpURLConnection con=(HttpURLConnection)新URL(“http://google.com)openConnection(代理);
con.设置连接超时(1000);
con.setReadTimeout(1000);
con.setRequestMethod(“GET”);
con.setRequestProperty(“用户代理”、“Mozilla/5.0”);
con.getContent();
con.disconnect();
System.out.println(“找到代理!”+addr.getHostAddress()+”:“+port+”在“+percent+%network search”中找到);
}捕获(例外e){
}
}
//
//System.out.println(“Ping响应:“+addr.getHostAddress()+”--trunt:“+counter.get()+”Percent:“+Percent+”%);
}否则{
//System.out.println(“Ping响应失败:”+addr.getHostAddress()+“trunt”+counter.incrementAndGet());
}
}捕获(例外e){
//e、 printStackTrace();
}
}虽然(正确);
}).start();
}
}
私有静态文件缓存;
私有静态字节[]getNextAddress()引发异常{
if(缓存==null){
cache=new RandomAccessFile(File.createTempFile(“abc”)、“.tmp”)、“rw”);
}
字节[]检查;
检查文件:
{
字节[]地址=新字节[4];
做{
check=randomAddress();
内部:
{
cache.seek(0);
while(cache.length()-cache.getFilePointer()>0){
cache.readFully(addr);
if(Arrays.equals(check,addr)){
打破内在;
}
}
cache.write(检查);
破坏校验文件;
}
}虽然(正确);
}
退货检查;
}
专用静态字节[]随机地址(){
返回新字节[]{(字节)(Math.random()*256),(字节)(Math.random()*256),(字节)(Math.random()*256),(字节)(Math.random()*256)};
}
私有静态布尔ping(InetAddress addr)引发异常{
返回地址可删除(500);
}
}
另外,如果有人想知道:我已经运行了12个小时,发现了大约50个代理,ping了IP范围的大约2.09664E-4%,即大约120万个IP。以所分配的带宽(0.5Mbps)来说,这个结果还不错。


编辑:我开始认为,存储并检查所有这些IP的开销,可能比在接近搜完整个IP范围时重复连接许多已连接过的IP的开销还要大。

使用MySql和Hibernate这样的数据库,并使用1级和2级缓存

它是w
assuming that no address was already seen
   1.0.0.1 - seen false
   2.0.0.2 - seen false
   2.0.0.1 - seen true, which was wrong and is correctly handled by code below
/**
 * Remembers which IPv4 addresses have been seen, using one bit per address.
 *
 * <p>The address space is split into 256 ranges selected by the first octet. Inside a
 * range the remaining three octets form a 24-bit offset, so every address maps to
 * exactly one bit. The flags can be written to and restored from
 * {@code addresses.bin}.
 */
public class KeepSeenAddresses {

    static final int FILE_BUFFER_SIZE = 81_920;
    static final int RANGES_SIZE = 256;

    /** Addresses per range: the three low octets give 2^24 combinations. */
    static final int RANGE_CAPACITY = 1 << 24;

    // one BitSet per first octet, each holding RANGE_CAPACITY flags
    static BitSet[] ranges;

    // Random(1) is taken only for demonstration purpose, so the second
    // application run will find the same seen addresses from previous run
    static Random random = new Random(1);
    // for normal use it's better to have better randomness
    //static Random random = new Random(System.currentTimeMillis());

    /**
     * Restores or creates the ranges, processes some random addresses and persists
     * the result.
     *
     * @param args unused
     * @throws IOException if the store cannot be read or written
     * @throws ClassNotFoundException if the stored object cannot be deserialized
     */
    public static void main(String[] args)
            throws IOException, ClassNotFoundException {

        if (!readRanges()) {
            initRanges();
        }

        // uncomment this block to see how the edge cases
        // which were mentioned in other comments are handled
        /*
         byte[][] addresses = {
             {1, 0, 0, 1}, 
             {2, 0, 0, 2}, 
             {2, 0, 0, 1},
             {1, 2, 3, 4}, 
             {4, 3, 2, 1}, 
             {(byte)128, 0, 0, 0},
             {(byte)255, (byte)255, (byte)255, (byte)255}
         };
         seenAddress(addresses[0]);
         seenAddress(addresses[1]);
         seenAddress(addresses[3]);
         seenAddress(addresses[5]);
         seenAddress(addresses[6]);
         for (byte[] addressBytes : addresses) {
         System.out.printf("seen %s before: %s%n",
         prettyAddress(addressBytes),
         seenBefore(addressBytes)
         );
         }
         */
        processAddresses();

        persistRanges();
    }

    /**
     * Read the seen addresses from a file.
     *
     * <p>The file is read with Java serialization, so it must only ever be a file
     * written by {@link #persistRanges()}.
     *
     * @return <code>true</code> if the file was found and has the expected
     * number of ranges, otherwise <code>false</code>
     * @throws IOException if the file cannot be read
     * @throws ClassNotFoundException if the stored object cannot be deserialized
     */
    private static boolean readRanges() throws IOException, ClassNotFoundException {
        File rangesStore = new File("addresses.bin");
        if (!rangesStore.exists()) {
            return false;
        }
        System.out.print("found previous rangesStore... ");
        try (ObjectInputStream ois = new ObjectInputStream(
                new BufferedInputStream(
                        new FileInputStream(rangesStore), FILE_BUFFER_SIZE
                )
        )) {
            ranges = (BitSet[]) ois.readObject();
        }
        if (ranges.length != RANGES_SIZE) {
            System.out.printf("wrong size of rangesStore: expected %d"
                    + "  found: %d%n", RANGES_SIZE, ranges.length);
            return false;
        } else {
            System.out.printf("restored ranges: %d%n", ranges.length);
            return true;
        }
    }

    /**
     * Initialize the address ranges array. All address flags start as
     * <code>false</code>, because a new {@link BitSet} has every bit cleared.
     */
    private static void initRanges() {
        System.out.print("initialize new rangesStore... ");
        ranges = new BitSet[RANGES_SIZE];
        for (int i = 0; i < RANGES_SIZE; i++) {
            ranges[i] = new BitSet(RANGE_CAPACITY);
        }
        System.out.printf("initialized ranges: %d%n", RANGES_SIZE);
    }

    /**
     * For demonstration purpose.<br>
     * Generates some random IPv4 addresses. If the address was not seen before
     * the flag for this address will be set to <code>true</code>.
     */
    private static void processAddresses() {
        for (int i = 0; i < 10; i++) {
            byte[] addrBytes = randomAddress();
            boolean seenBefore = seenBefore(addrBytes);
            if (!seenBefore) {
                seenAddress(addrBytes);
            }
            System.out.printf("seen %s before: %s%n",
                    prettyAddress(addrBytes),
                    seenBefore
            );
        }
    }

    /**
     * Persist the address ranges array. The bit data alone is about 512 MiB
     * (256 ranges of 2^24 bits).
     *
     * @throws IOException if the file cannot be written
     */
    private static void persistRanges() throws IOException {
        System.out.print("persist rangesStore... ");
        try (ObjectOutputStream oos = new ObjectOutputStream(
                new BufferedOutputStream(
                        new FileOutputStream("addresses.bin"), FILE_BUFFER_SIZE)
        )) {
            oos.writeObject(ranges);
        }
        System.out.printf("written ranges: %d%n", ranges.length);
    }

    /**
     * Keep a flag which address has been seen already.
     *
     * @param addrBytes IPv4 address in four bytes
     */
    static void seenAddress(byte[] addrBytes) {
        ranges[rangeIndex(addrBytes)].set(rangeOffset(addrBytes));
    }

    /**
     * Check if the passed address was seen before.
     *
     * @param addrBytes IPv4 address in four bytes
     * @return <code>true</code> if the address was seen before, otherwise
     * <code>false</code>
     */
    static boolean seenBefore(byte[] addrBytes) {
        return ranges[rangeIndex(addrBytes)].get(rangeOffset(addrBytes));
    }

    /**
     * Select the range of an address from its first octet.
     *
     * @param addrBytes IPv4 address in four bytes
     * @return index into {@link #ranges}, 0 to 255
     */
    private static int rangeIndex(byte[] addrBytes) {
        return addrBytes[0] & 0xff;
    }

    /**
     * Compute the bit position of an address inside its range.
     *
     * <p>Each octet is masked first, because a byte is signed, and then shifted to
     * its own 8-bit slot, so no two addresses of a range share a position.
     *
     * @param addrBytes IPv4 address in four bytes
     * @return offset from 0 to {@code RANGE_CAPACITY - 1}
     */
    private static int rangeOffset(byte[] addrBytes) {
        return ((addrBytes[1] & 0xff) << 16)
                | ((addrBytes[2] & 0xff) << 8)
                | (addrBytes[3] & 0xff);
    }

    /**
     * Convert the IPv4 address into pretty string.
     *
     * @param addrBytes IPv4 address in four bytes
     * @return the four octets as zero-padded three-digit numbers separated by dots
     */
    static String prettyAddress(byte[] addrBytes) {
        return String.format("%03d.%03d.%03d.%03d",
                (int) addrBytes[0] & 0xff,
                (int) addrBytes[1] & 0xff,
                (int) addrBytes[2] & 0xff,
                (int) addrBytes[3] & 0xff);
    }

    /**
     * Generate a random IPv4 address.
     *
     * @return four bytes of a random generated IPv4 address
     */
    private static byte[] randomAddress() {
        byte[] bytes = new byte[4];
        for (int i = 0; i < bytes.length; i++) {
            bytes[i] = (byte) random.nextInt(256);
        }
        return bytes;
    }
}
/**
 * Seen-flags for addresses whose first octet is 0..127, i.e. whose position from
 * {@code pos} is non-negative. A BitSet index is an int, so one BitSet cannot hold
 * all 2^32 addresses.
 */
static BitSet set = new BitSet();

/** Seen-flags for addresses whose first octet is 128..255 (negative positions). */
private static final BitSet upperHalf = new BitSet();

/**
 * Combines four octets into one 32-bit position.
 *
 * @param i first octet, 0..255
 * @param j second octet, 0..255
 * @param k third octet, 0..255
 * @param m fourth octet, 0..255
 * @return the position; int arithmetic wraps, so the value is negative when
 *     {@code i} is 128 or larger
 */
static int pos(int i, int j, int k, int m) {
    return ((256*256*256) * i) + ((256*256) * j) + (256 * k) + m;
}

/**
 * Returns the flag stored for an address.
 *
 * @param addr IPv4 address in four bytes
 * @return {@code true} if the flag for this address is set
 */
static boolean get(byte[] addr) {
    // bytes are signed: mask each octet before it is widened to int
    int position = pos(addr[0] & 0xff, addr[1] & 0xff, addr[2] & 0xff, addr[3] & 0xff);
    if (position >= 0) {
        return set.get(position);
    }
    // dropping the sign bit leaves a unique non-negative index in the upper half
    return upperHalf.get(position & Integer.MAX_VALUE);
}

/**
 * Stores the flag for an address.
 *
 * @param addr IPv4 address in four bytes
 * @param flag value to store for this address
 */
static void set(byte[] addr, boolean flag) {
    // bytes are signed: mask each octet before it is widened to int
    int position = pos(addr[0] & 0xff, addr[1] & 0xff, addr[2] & 0xff, addr[3] & 0xff);
    if (position >= 0) {
        set.set(position, flag);
    } else {
        // dropping the sign bit leaves a unique non-negative index in the upper half
        upperHalf.set(position & Integer.MAX_VALUE, flag);
    }
}