Java 如何在两个数组中同时查找相同的byte[]-对象？_Java_Multithreading_Concurrency_Hash_Hash Collision

Java 如何在两个数组中同时查找相同的byte[]-对象？

java multithreading concurrency hash

Java 如何在两个数组中同时查找相同的byte[]-对象？,java,multithreading,concurrency,hash,hash-collision,Java,Multithreading,Concurrency,Hash,Hash Collision,我正在尝试对散列实现冲突攻击（我正在学习“密码学”课程）。因此，我有两个哈希数组（=字节序列字节[]），希望找到两个数组中都存在的哈希。经过一些研究和大量思考，我确信单核机器上的最佳解决方案是哈希集（添加第一个数组的所有元素，并通过包含检查第二个数组的元素是否已经存在）但是，我想实现一个并发解决方案，因为我可以访问一台具有8个内核和12GB RAM的机器。我能想到的最好的解决方案是ConcurrentHashSet，它可以通过Collections.newSetFromMap创建（新的Conc

我正在尝试对散列实现冲突攻击（我正在学习“密码学”课程）。因此，我有两个哈希数组（=字节序列

byte[]

），希望找到两个数组中都存在的哈希。经过一些研究和大量思考，我确信单核机器上的最佳解决方案是

HashSet

（添加第一个数组的所有元素，并通过

contains检查第二个数组的元素是否已经存在）
但是，我想实现一个并发解决方案，因为我可以访问一台具有8个内核和12GB RAM的机器。我能想到的最好的解决方案是ConcurrentHashSet，它可以通过Collections.newSetFromMap(new ConcurrentHashMap<>())创建，从而使我的程序并行化。
我认为使用任何形式的HashMap
都是完全浪费时间。我猜您正在计算各种数据的多字节散列，这些已经是散列
了，不需要再对它们执行任何散列
虽然您没有说明，但我猜您的散列是字节
序列。显然，trie（前缀树）或DAWG（有向无环词图）是存储这些序列的理想选择
因此，我建议您实现一个trie/dawg
，并使用它存储第一个数组中的所有哈希值。然后您可以使用所有计算能力并行查找此trie
中第二个数组中的每个元素。无需锁
已添加
下面是我设计的一个简单的Dawg
实现。它似乎很有效
/**
 * A simple 256-way trie over byte sequences.
 *
 * <p>Each node owns one child slot per possible unsigned byte value. A node is
 * marked as a leaf when a stored word ends on it, so a stored word may also be
 * a prefix of another stored word.
 *
 * <p>This class performs no synchronization.
 */
public class Dawg {
  // One child slot per unsigned byte value (0..255).
  Dawg[] children = new Dawg[256];
  // True when a stored word ends at this node.
  boolean isLeaf = false;

  /**
   * Adds a word, creating any missing nodes along its path.
   *
   * @param word the bytes to store; an empty array marks this node itself
   */
  public void add ( byte[] word ) {
    // With grow == true, find never returns null.
    Dawg loc = find ( word, 0, true );
    loc.isLeaf = true;
  }

  /**
   * String form of {@link #add(byte[])}; the word is converted with
   * {@link String#getBytes()}, i.e. the platform default charset.
   *
   * @param word the word to store
   */
  public void add ( String word ) {
    add(word.getBytes());
  }

  /**
   * Tests whether a word was previously added.
   *
   * @param word the bytes to look up
   * @return true only if exactly this byte sequence was added (a mere prefix
   *         of a stored word does not count)
   */
  public boolean contains ( byte [] word ) {
    // No growing allowed: a missing node means the word is absent.
    Dawg d = find ( word, 0, false );
    return d != null && d.isLeaf;
  }

  /**
   * String form of {@link #contains(byte[])}; the word is converted with
   * {@link String#getBytes()}, i.e. the platform default charset.
   *
   * @param word the word to look up
   * @return true if the word was previously added
   */
  public boolean contains ( String word ) {
    return contains(word.getBytes());
  }

  /**
   * Walks from this node along {@code word[i..]}.
   *
   * @param word the bytes to follow
   * @param i    index of the first byte to consume
   * @param grow whether missing nodes should be created
   * @return the node reached after the last byte (this node if there are no
   *         bytes left to consume), or null if a node is missing and
   *         {@code grow} is false
   */
  private Dawg find ( byte [] word, int i, boolean grow ) {
    Dawg node = this;
    for ( int pos = i; pos < word.length; pos++ ) {
      // Java bytes are signed (-128..127); mask to 0..255 so that bytes
      // >= 0x80 do not produce a negative array index.
      int slot = word[pos] & 0xFF;
      Dawg child = node.children[slot];
      if ( child == null ) {
        if ( !grow ) {
          // Not present and not allowed to grow.
          return null;
        }
        // Grow the tree.
        child = new Dawg();
        node.children[slot] = child;
      }
      node = child;
    }
    return node;
  }

  /** Small demonstration: stores a few words and prints membership results. */
  public static void main ( String[] args ) {
    Dawg d = new Dawg();
    d.add("H");
    d.add("Hello");
    d.add("World");
    d.add("Hell");
    System.out.println("Hello is "+(d.contains("Hello")?"in":"out"));
    System.out.println("World is "+(d.contains("World")?"in":"out"));
    System.out.println("Hell is "+(d.contains("Hell")?"in":"out"));
    System.out.println("Hal is "+(d.contains("Hal")?"in":"out"));
    System.out.println("Hel is "+(d.contains("Hel")?"in":"out"));
    System.out.println("H is "+(d.contains("H")?"in":"out"));
  }
}

公共类Dawg{
//我所有的孩子。
Dawg[]子代=新Dawg[256]；
//我是一片叶子。
布尔isLeaf=false；
//添加一个新词。
公共无效添加（字节[]字）{
//找到它的位置，根据需要增长。
Dawg loc=查找（字，0，真）；
loc.isLeaf=真；
}
//字符串形式。
公共空添加（字符串字）{
添加（word.getBytes（））；
}
//如果单词在dawg中，则返回true。
公共布尔包含（字节[]字）{
//找到它的位置，不允许增长。
Dawg d=查找（字，0，假）；
返回d！=null&&d.isLeaf；
}
//字符串形式。
公共布尔包含（字符串字）{
返回包含（word.getBytes（））；
}
//如果需要的话，找到Dawg-根据需要种植树木。
私有Dawg查找（字节[]字，整数i，布尔增长）{
Dawg child=children[word[i]]；
if（child==null）{
//不在场！
如果（成长）{
//种植这棵树。
child=新的Dawg（）；
children[word[i]]=child；
}
}
//找到了吗？
if（child！=null）{
//还有更多要找的吗？
如果（i

已添加
这可能是并发无锁版本的一个良好开端。众所周知，这类代码很难测试，所以我不能保证它一定能工作，但在我看来它应该可以工作
import java.util.concurrent.atomic.AtomicReferenceArray;


/**
 * A lock-free variant of a 256-way trie over byte sequences.
 *
 * <p>Child slots live in an {@link AtomicReferenceArray} and are installed with
 * compare-and-set, so threads racing to create the same child all end up using
 * the single node that won. The leaf flag is volatile so that a word added by
 * one thread becomes visible to lookups running on other threads.
 */
public class LFDawg {
  // One child slot per unsigned byte value (0..255).
  AtomicReferenceArray<LFDawg> children = new AtomicReferenceArray<LFDawg> ( 256 );
  // True when a stored word ends at this node; volatile for cross-thread visibility.
  volatile boolean isLeaf = false;

  /**
   * Adds a word, creating any missing nodes along its path.
   *
   * @param word the bytes to store; an empty array marks this node itself
   */
  public void add ( byte[] word ) {
    // With grow == true, find never returns null.
    LFDawg loc = find( word, 0, true );
    loc.isLeaf = true;
  }

  /**
   * String form of {@link #add(byte[])}; the word is converted with
   * {@link String#getBytes()}, i.e. the platform default charset.
   *
   * @param word the word to store
   */
  public void add ( String word ) {
    add( word.getBytes() );
  }

  /**
   * Tests whether a word was previously added.
   *
   * @param word the bytes to look up
   * @return true only if exactly this byte sequence was added (a mere prefix
   *         of a stored word does not count)
   */
  public boolean contains ( byte[] word ) {
    // No growing allowed: a missing node means the word is absent.
    LFDawg d = find( word, 0, false );
    return d != null && d.isLeaf;
  }

  /**
   * String form of {@link #contains(byte[])}; the word is converted with
   * {@link String#getBytes()}, i.e. the platform default charset.
   *
   * @param word the word to look up
   * @return true if the word was previously added
   */
  public boolean contains ( String word ) {
    return contains( word.getBytes() );
  }

  /**
   * Walks from this node along {@code word[i..]}.
   *
   * @param word the bytes to follow
   * @param i    index of the first byte to consume
   * @param grow whether missing nodes should be created
   * @return the node reached after the last byte (this node if there are no
   *         bytes left to consume), or null if a node is missing and
   *         {@code grow} is false
   */
  private LFDawg find ( byte[] word, int i, boolean grow ) {
    LFDawg node = this;
    for ( int pos = i; pos < word.length; pos++ ) {
      // Java bytes are signed (-128..127); mask to 0..255 so that bytes
      // >= 0x80 do not produce a negative index.
      int slot = word[pos] & 0xFF;
      LFDawg child = node.children.get( slot );
      if ( child == null ) {
        if ( !grow ) {
          // Not present and not allowed to grow.
          return null;
        }
        // Grow the tree.
        LFDawg fresh = new LFDawg();
        if ( node.children.compareAndSet( slot, null, fresh ) ) {
          child = fresh;
        } else {
          // Someone else got there before me. Use the one they set; slots are
          // never cleared by this class, so it is non-null.
          child = node.children.get( slot );
        }
      }
      node = child;
    }
    return node;
  }

  /** Small demonstration: stores a few words and prints membership results. */
  public static void main ( String[] args ) {
    LFDawg d = new LFDawg();
    d.add( "H" );
    d.add( "Hello" );
    d.add( "World" );
    d.add( "Hell" );
    System.out.println( "Hello is " + ( d.contains( "Hello" ) ? "in" : "out" ) );
    System.out.println( "World is " + ( d.contains( "World" ) ? "in" : "out" ) );
    System.out.println( "Hell is " + ( d.contains( "Hell" ) ? "in" : "out" ) );
    System.out.println( "Hal is " + ( d.contains( "Hal" ) ? "in" : "out" ) );
    System.out.println( "Hel is " + ( d.contains( "Hel" ) ? "in" : "out" ) );
    System.out.println( "H is " + ( d.contains( "H" ) ? "in" : "out" ) );
  }
}

导入java.util.concurrent.AtomicReferenceArray；
公共类{
//我所有的孩子。
AtomicReferenceArray子对象=新的AtomicReferenceArray（256）；
//我是一片叶子。
布尔isLeaf=false；
//添加一个新词。
公共无效添加（字节[]字）{
//找到它的位置，根据需要增长。
LFDawg loc=find（字，0，真）；
loc.isLeaf=真；
}
//字符串形式。
公共空添加（字符串字）{
添加（word.getBytes（））；
}
//如果单词在dawg中，则返回true。
公共布尔包含（字节[]字）{
//找到它的位置，不允许增长。
LFDawg d=查找（字，0，假）；
返回d！=null&&d.isLeaf；
}
//字符串形式。
公共布尔包含（字符串字）{
返回包含（word.getBytes（））；
}
//如果需要的话，找到Dawg-根据需要种植树木。
私有LFDawg查找（字节[]字，整数i，布尔增长）{
LFDawg child=children.get（word[i]）；
if（child==null）{
//不在场！
如果（成长）{
//种植这棵树。
child=新的LFDawg（）；
if（！children.compareAndSet（字[i]，空，子级））{
//有人比我先到的，去拿他们放的那个。
child=children.get（word[i]）；
}
}
}
//找到了吗？
if（child！=null）{
//还有更多要找的吗？
如果（i
一种更简单的方法是将第一个数组分成N个相等（或接近相等）的部分（有8个核时，N=8似乎合理）。然后对每个部分以“正常”方式求解：检查第二个数组中的哈希是否出现在该部分中
originalProcess(List<byte[]> list1, List<byte[]> list2) {
   HashSet<byte[]> bigHugeHashOfList1 = new HashSet<byte[]>();
   bigHugeHashOfList1.addAll(list1);
   for (byte[] hash : list2)
      if (bigHugeHashOfList1.contains(hash)
         // do something
}

preprocess(List<byte[]> list1, List<byte[]> list2) {
   List<byte[]>[] splitLists = new ArrayList<byte[]>[8];
   for (int i=0; i<8; i++)
      splitLists[i] = new ArrayList<byte[]>();
   for (byte[] hash : list1) {
      int idx = hash[0]&7; // I'm taking the 3 low order bits, YMMV
      splitLists[idx].add(hash);
      // a minor speedup would be to create the HashSet here instead of in originalProcess()
   }

   // now, using your favorite parallel/concurrency technique,
   // do the equivalent of
   for (int i=0; i<8; i++)
      originalProcess(splitLists[i], list2);
}