在Neo4J中通过全文搜索创建关系

在Neo4J中通过全文搜索创建关系,neo4j,full-text-search,cypher,Neo4j,Full Text Search,Cypher,我构建了一个Neo4j图形数据库,其中包含大约50000个带有标签DIAGNOSE的节点,每个节点都有一个最多50个字符的字符串属性文本。同一个图形数据库包含大约120000个带有标签BASETEXT的节点,每个节点都有一个字符串属性TEXTVALUE,最多包含175000个字符。我的目标是创建一个关系(b:BASETEXT)–[:ASSOCIATED]->(d:DIAGNOSE),以防DIAGNOSE.TEXT包含在BASETEXT.TEXTVALUE中–总共进行大约2.9*10^9次搜索。我

我构建了一个Neo4j图数据库,其中包含大约50000个带有标签DIAGNOSE的节点,每个节点都有一个最多50个字符的字符串属性TEXT。同一个图数据库还包含大约120000个带有标签BASETEXT的节点,每个节点都有一个字符串属性TEXTVALUE,最多包含175000个字符。我的目标是:只要DIAGNOSE.TEXT包含在BASETEXT.TEXTVALUE中,就创建一个关系(b:BASETEXT)-[:ASSOCIATED]->(d:DIAGNOSE)——总共需要进行大约2.9*10^9次搜索。我在Cypher中尝试了以下两种方法:

方法1:

// Method 1: the MATCH names two unrelated patterns, so every BASETEXT node is
// paired with every DIAGNOSE node before the WHERE filter is applied.
match (b:BASETEXT), (d:DIAGNOSE)
// d.TEXT is spliced into the regular expression as-is, and the pattern requires
// a space directly before and directly after it inside TEXTVALUE.
where b.TEXTVALUE =~ (".* " + d.TEXT + " .*")
// merge adds the ASSOCIATED relationship only when this pair does not have one yet.
merge (b) -[:ASSOCIATED]-> (d);
方法2(在每个DIAGNOSE节点和每个BASETEXT节点之间创建一个ASSOCIATED关系;如果TEXT包含在TEXTVALUE中,则将关系属性CONTAINED设为true,否则设为false;最后删除所有CONTAINED=false的ASSOCIATED关系):

上述两种方法都不可行。方法1运行了半小时仍未结束;方法2虽然能够结束,但需要大约60天。关于如何在Neo4j中正确实现文本搜索并解决这个问题(最好是在Cypher中),有什么建议吗?

我还创建了一个

我认为这是一个在cypher中还没有得到很好支持的用例

如果您想尝试一种简单的Cypher变体,它应该可以工作(但速度较慢),请尝试以下方法:

// Batched variant: each execution picks at most 1000 DIAGNOSE nodes that have no
// incoming ASSOCIATED relationship (SKIP 0 skips nothing; LIMIT 1000 caps the batch)
// and links them to every BASETEXT node whose TEXTVALUE matches the regex.
// Presumably meant to be executed repeatedly until it reports no more work.
// NOTE(review): a DIAGNOSE node without any matching BASETEXT never receives a
// relationship, so it still satisfies the WHERE NOT filter on the next run - verify
// that repeated runs actually advance past such nodes.
MATCH (d:DIAGNOSE)
WHERE NOT () -[:ASSOCIATED]-> (d)
WITH d 
SKIP 0 LIMIT 1000
MATCH (b:BASETEXT)
// The pattern requires a space directly before and after d.TEXT inside TEXTVALUE.
WHERE (b.TEXTVALUE =~ (".* " + d.TEXT + " .*"))
CREATE (b) -[:ASSOCIATED]-> (d)
// Number of result rows, i.e. one per (b, d) pair for which a relationship was created.
RETURN count(*);
在Java中执行此操作应该快得多:

/**
 * Links every BASETEXT node to the DIAGNOSE nodes whose TEXT occurs in its TEXTVALUE.
 *
 * <p>Instead of evaluating a regular expression for every node pair, all TEXTVALUE
 * properties are added to a Lucene full-text (legacy) index once, and each DIAGNOSE
 * text is then looked up in that index as a phrase query.
 *
 * <p>NOTE(review): nothing in this class shuts the database down after the test.
 */
public class ConnectIndexTest {
    private static final String PATH = "target/connect.db";
    public static final Label BASETEXT = DynamicLabel.label("BASETEXT");
    public static final Label DIAGNOSE = DynamicLabel.label("DIAGNOSE");
    public static final String TEXTVALUE = "TEXTVALUE";
    public static final String TEXT = "TEXT";
    public static final String INDEX_NAME = "basetext";
    private static final RelationshipType ASSOCIATED = DynamicRelationshipType.withName("ASSOCIATED");
    /** Number of created relationships after which the running transaction is committed. */
    private static final int BATCH_SIZE = 50_000;
    private GraphDatabaseService db;

    /**
     * Creates an in-memory database holding 150,000 BASETEXT nodes ("foo &lt;i&gt; bar") and
     * 75,000 DIAGNOSE nodes (every second value of the same number range).
     */
    @Before
    public void setUp() throws Exception {
        // For an on-disk store use: new GraphDatabaseFactory().newEmbeddedDatabase(PATH)
        db = new TestGraphDatabaseFactory().newImpermanentDatabase();
        try (Transaction tx = db.beginTx()) {
            for (int i = 100_000; i < 250_000; i++) {
                db.createNode(BASETEXT).setProperty(TEXTVALUE, "foo " + i + " bar");
            }
            tx.success();
        }
        try (Transaction tx = db.beginTx()) {
            for (int i = 100_000; i < 250_000; i += 2) {
                db.createNode(DIAGNOSE).setProperty(TEXT, String.valueOf(i));
            }
            tx.success();
        }
    }

    /**
     * Indexes all BASETEXT nodes, connects them to the matching DIAGNOSE nodes and prints
     * the number of relationships created.
     */
    // Sizes mentioned for the target data set: 120k BASETEXT nodes, 50k DIAGNOSE nodes
    @Test
    public void testConnect() throws Exception {
        GlobalGraphOperations ops = GlobalGraphOperations.at(db);
        indexBaseTexts(ops);
        int count = connectDiagnoses(ops);
        System.out.println("count = " + count);
    }

    /** Adds the TEXTVALUE of every BASETEXT node to the full-text index, in one transaction. */
    private void indexBaseTexts(GlobalGraphOperations ops) {
        try (Transaction tx = db.beginTx()) {
            Index<Node> index = db.index().forNodes(INDEX_NAME, LuceneIndexImplementation.FULLTEXT_CONFIG);
            for (Node baseText : ops.getAllNodesWithLabel(BASETEXT)) {
                index.add(baseText, TEXTVALUE, baseText.getProperty(TEXTVALUE));
            }
            tx.success();
        }
    }

    /**
     * Queries the index once per DIAGNOSE node and creates an ASSOCIATED relationship from
     * every hit to that node, committing after each {@link #BATCH_SIZE} relationships.
     *
     * @return the number of relationships created
     */
    private int connectDiagnoses(GlobalGraphOperations ops) {
        int count = 0;
        Transaction tx = db.beginTx();
        try {
            Index<Node> index = db.index().forNodes(INDEX_NAME);
            for (Node diagnose : ops.getAllNodesWithLabel(DIAGNOSE)) {
                String text = (String) diagnose.getProperty(TEXT);
                IndexHits<Node> hits = index.query(TEXTVALUE, toPhraseQuery(text));
                try {
                    for (Node baseText : hits) {
                        baseText.createRelationshipTo(diagnose, ASSOCIATED);
                        // Commit in batches so a single transaction does not hold all changes.
                        if (++count % BATCH_SIZE == 0) {
                            System.out.println("count = " + count);
                            tx.success();
                            tx.close();
                            tx = db.beginTx();
                        }
                    }
                } finally {
                    // Release the search result even when the loop is left by an exception.
                    hits.close();
                }
            }

            tx.success();
        } finally {
            tx.close();
        }
        return count;
    }

    /**
     * Wraps {@code text} in double quotes so that it is searched as one phrase (it may
     * contain spaces). Backslashes and double quotes inside the text are backslash-escaped;
     * left as-is, a double quote would end the phrase early and break the query.
     */
    private static String toPhraseQuery(String text) {
        String escaped = text.replace("\\", "\\\\").replace("\"", "\\\"");
        return "\"" + escaped + "\"";
    }
}
公共类连接扩展测试{
私有静态最终字符串PATH=“target/connect.db”;
公共静态最终标签BASETEXT=DynamicLabel.Label(“BASETEXT”);
公共静态最终标签诊断=DynamicLabel.Label(“诊断”);
公共静态最终字符串TEXTVALUE=“TEXTVALUE”;
公共静态最终字符串TEXT=“TEXT”;
公共静态最终字符串索引\u NAME=“basetext”;
private static final RelationshipType ASSOCIATED=DynamicRelationshipType.withName(“ASSOCIATED”);
私有GraphDatabaseService数据库;
@以前
public void setUp()引发异常{
//db=新GraphDatabaseFactory().newEmbeddedDatabase(路径);
db=新的TestGraphDatabaseFactory()。新的无常数据库();
try(事务tx=db.beginTx()){
对于(inti=100_000;i<250_000;i++)db.createNode(BASETEXT).setProperty(TEXTVALUE,“foo”+i+“bar”);
成功();
}
try(事务tx=db.beginTx()){
对于(int i=100_000;i<250_000;i+=2)db.createNode(DIAGNOSE).setProperty(TEXT,String.valueOf(i));
成功();
}
}
//120k基本文本节点
//50k诊断节点
@试验
public void testConnect()引发异常{
GlobalGraphOperations ops=GlobalGraphOperations.at(db);
try(事务tx=db.beginTx()){
Index Index=db.Index().forNodes(索引名称,LuceneIndexImplementation.FULLTEXT\u配置);
用于(节点baseText:ops.GetAllNodeWithLabel(baseText)){
index.add(baseText、TEXTVALUE、baseText.getProperty(TEXTVALUE));
}
成功();
}
整数计数=0;
事务tx=db.beginTx();
试一试{
Index Index=db.Index().forNodes(索引名称);
用于(节点诊断:ops.getAllNodeWithLabel(诊断)){
String text=(String)diagnose.getProperty(text);
IndexHits hits=index.query(TEXTVALUE,\“”+text+“\”);//在文本包含空格的情况下引用
对于(节点baseText:hits){
baseText.createRelationshipTo(诊断,关联);
//批量交易
如果(++计数%50000==0){
System.out.println(“count=“+count”);
成功();
tx.close();
tx=db.beginTx();
}
}
}
成功();
}最后{
tx.close();
}
System.out.println(“count=“+count”);
}
}

不要使用正则表达式,而应使用Lucene遗留索引进行全文搜索。例如,曾有一次关于文档分类的网络研讨会的录像(原文中的链接已丢失)介绍了这种做法,我想这与您的用例有些相似。
/**
 * Links every BASETEXT node to the DIAGNOSE nodes whose TEXT occurs in its TEXTVALUE.
 *
 * <p>Instead of evaluating a regular expression for every node pair, all TEXTVALUE
 * properties are added to a Lucene full-text (legacy) index once, and each DIAGNOSE
 * text is then looked up in that index as a phrase query.
 *
 * <p>NOTE(review): nothing in this class shuts the database down after the test.
 */
public class ConnectIndexTest {
    private static final String PATH = "target/connect.db";
    public static final Label BASETEXT = DynamicLabel.label("BASETEXT");
    public static final Label DIAGNOSE = DynamicLabel.label("DIAGNOSE");
    public static final String TEXTVALUE = "TEXTVALUE";
    public static final String TEXT = "TEXT";
    public static final String INDEX_NAME = "basetext";
    private static final RelationshipType ASSOCIATED = DynamicRelationshipType.withName("ASSOCIATED");
    /** Number of created relationships after which the running transaction is committed. */
    private static final int BATCH_SIZE = 50_000;
    private GraphDatabaseService db;

    /**
     * Creates an in-memory database holding 150,000 BASETEXT nodes ("foo &lt;i&gt; bar") and
     * 75,000 DIAGNOSE nodes (every second value of the same number range).
     */
    @Before
    public void setUp() throws Exception {
        // For an on-disk store use: new GraphDatabaseFactory().newEmbeddedDatabase(PATH)
        db = new TestGraphDatabaseFactory().newImpermanentDatabase();
        try (Transaction tx = db.beginTx()) {
            for (int i = 100_000; i < 250_000; i++) {
                db.createNode(BASETEXT).setProperty(TEXTVALUE, "foo " + i + " bar");
            }
            tx.success();
        }
        try (Transaction tx = db.beginTx()) {
            for (int i = 100_000; i < 250_000; i += 2) {
                db.createNode(DIAGNOSE).setProperty(TEXT, String.valueOf(i));
            }
            tx.success();
        }
    }

    /**
     * Indexes all BASETEXT nodes, connects them to the matching DIAGNOSE nodes and prints
     * the number of relationships created.
     */
    // Sizes mentioned for the target data set: 120k BASETEXT nodes, 50k DIAGNOSE nodes
    @Test
    public void testConnect() throws Exception {
        GlobalGraphOperations ops = GlobalGraphOperations.at(db);
        indexBaseTexts(ops);
        int count = connectDiagnoses(ops);
        System.out.println("count = " + count);
    }

    /** Adds the TEXTVALUE of every BASETEXT node to the full-text index, in one transaction. */
    private void indexBaseTexts(GlobalGraphOperations ops) {
        try (Transaction tx = db.beginTx()) {
            Index<Node> index = db.index().forNodes(INDEX_NAME, LuceneIndexImplementation.FULLTEXT_CONFIG);
            for (Node baseText : ops.getAllNodesWithLabel(BASETEXT)) {
                index.add(baseText, TEXTVALUE, baseText.getProperty(TEXTVALUE));
            }
            tx.success();
        }
    }

    /**
     * Queries the index once per DIAGNOSE node and creates an ASSOCIATED relationship from
     * every hit to that node, committing after each {@link #BATCH_SIZE} relationships.
     *
     * @return the number of relationships created
     */
    private int connectDiagnoses(GlobalGraphOperations ops) {
        int count = 0;
        Transaction tx = db.beginTx();
        try {
            Index<Node> index = db.index().forNodes(INDEX_NAME);
            for (Node diagnose : ops.getAllNodesWithLabel(DIAGNOSE)) {
                String text = (String) diagnose.getProperty(TEXT);
                IndexHits<Node> hits = index.query(TEXTVALUE, toPhraseQuery(text));
                try {
                    for (Node baseText : hits) {
                        baseText.createRelationshipTo(diagnose, ASSOCIATED);
                        // Commit in batches so a single transaction does not hold all changes.
                        if (++count % BATCH_SIZE == 0) {
                            System.out.println("count = " + count);
                            tx.success();
                            tx.close();
                            tx = db.beginTx();
                        }
                    }
                } finally {
                    // Release the search result even when the loop is left by an exception.
                    hits.close();
                }
            }

            tx.success();
        } finally {
            tx.close();
        }
        return count;
    }

    /**
     * Wraps {@code text} in double quotes so that it is searched as one phrase (it may
     * contain spaces). Backslashes and double quotes inside the text are backslash-escaped;
     * left as-is, a double quote would end the phrase early and break the query.
     */
    private static String toPhraseQuery(String text) {
        String escaped = text.replace("\\", "\\\\").replace("\"", "\\\"");
        return "\"" + escaped + "\"";
    }
}