在Neo4J中通过全文搜索创建关系
标签:neo4j、full-text-search、cypher
我构建了一个Neo4j图形数据库,其中包含大约50000个带有标签DIAGNOSE的节点,每个节点都有一个最多50个字符的字符串属性TEXT。同一个图形数据库还包含大约120000个带有标签BASETEXT的节点,每个节点都有一个字符串属性TEXTVALUE,最多包含175000个字符。我的目标是:当DIAGNOSE.TEXT包含在BASETEXT.TEXTVALUE中时,创建关系(b:BASETEXT)–[:ASSOCIATED]->(d:DIAGNOSE)——总共需要进行大约2.9*10^9次搜索。我在Cypher中尝试了以下两种方法: 方法1:
// Approach 1: pair every BASETEXT node with every DIAGNOSE node (both labels are matched without any join condition).
match (b:BASETEXT), (d:DIAGNOSE)
// Keep pairs where TEXTVALUE matches ".* <TEXT> .*", i.e. d.TEXT surrounded by single spaces; d.TEXT is concatenated into the pattern unescaped.
where b.TEXTVALUE =~ (".* " + d.TEXT + " .*")
// Link each remaining pair with an ASSOCIATED relationship.
merge (b) -[:ASSOCIATED]-> (d);
方法2(在每个诊断节点和每个BASETEXT节点之间创建一个关系,如果文本在TEXTVALUE中,则为包含的关系属性赋值为true,否则赋值为false,最后删除所有关联为.CONTAINED=false的关系):
上述方法都不起作用。方法1运行了半小时仍未结束;方法2能够运行结束,但需要60天。
关于如何在Neo4J中正确实现文本搜索并解决问题(最好是在Cypher中)有什么建议吗?我还创建了一个
我认为这是一个在cypher中还没有得到很好支持的用例
如果您想尝试一种简单的Cypher变体,它应该可以工作(但速度较慢),请尝试以下方法:
// Batched variant: select DIAGNOSE nodes that have no incoming ASSOCIATED relationship yet.
// NOTE(review): a DIAGNOSE node with no matching BASETEXT never gets a relationship, so it would
// presumably be selected again on the next run - confirm whether SKIP has to be advanced between runs.
MATCH (d:DIAGNOSE)
WHERE NOT () -[:ASSOCIATED]-> (d)
WITH d
// Limit one run to at most 1000 DIAGNOSE nodes.
SKIP 0 LIMIT 1000
// For every selected DIAGNOSE node, test all BASETEXT nodes with the same regex as approach 1.
MATCH (b:BASETEXT)
WHERE (b.TEXTVALUE =~ (".* " + d.TEXT + " .*"))
CREATE (b) -[:ASSOCIATED]-> (d)
// One row per created relationship, so this returns how many were created in this run.
RETURN count(*);
在Java中执行此操作应该快得多:
/**
 * Links every BASETEXT node to each DIAGNOSE node whose TEXT occurs in the BASETEXT's TEXTVALUE,
 * using a legacy full-text node index for the lookup instead of a regex comparison of all pairs.
 *
 * <p>NOTE(review): the database created in {@code setUp()} is never shut down inside this class;
 * consider adding a tear-down method.
 */
public class ConnectIndexTest {
    private static final String PATH = "target/connect.db";
    public static final Label BASETEXT = DynamicLabel.label("BASETEXT");
    public static final Label DIAGNOSE = DynamicLabel.label("DIAGNOSE");
    public static final String TEXTVALUE = "TEXTVALUE";
    public static final String TEXT = "TEXT";
    public static final String INDEX_NAME = "basetext";
    private static final RelationshipType ASSOCIATED = DynamicRelationshipType.withName("ASSOCIATED");

    /** Number of created relationships after which the running transaction is committed. */
    private static final int BATCH_SIZE = 50000;

    private GraphDatabaseService db;

    /**
     * Creates an impermanent database and fills it with test data:
     * 150,000 BASETEXT nodes ("foo 100000 bar" .. "foo 249999 bar") and
     * 75,000 DIAGNOSE nodes (every even number in the same range, as a string).
     */
    @Before
    public void setUp() throws Exception {
        // db = new GraphDatabaseFactory().newEmbeddedDatabase(PATH);
        db = new TestGraphDatabaseFactory().newImpermanentDatabase();
        try (Transaction tx = db.beginTx()) {
            for (int i = 100_000; i < 250_000; i++) {
                db.createNode(BASETEXT).setProperty(TEXTVALUE, "foo " + i + " bar");
            }
            tx.success();
        }
        try (Transaction tx = db.beginTx()) {
            for (int i = 100_000; i < 250_000; i += 2) {
                db.createNode(DIAGNOSE).setProperty(TEXT, String.valueOf(i));
            }
            tx.success();
        }
    }

    // The data set this was written for has roughly 120k BASETEXT and 50k DIAGNOSE nodes;
    // setUp() builds a comparable 150k / 75k.

    /**
     * Indexes all BASETEXT nodes in a full-text index, then queries that index once per DIAGNOSE
     * node and creates an ASSOCIATED relationship from every hit to the DIAGNOSE node.
     * Relationship creation is committed in batches of {@link #BATCH_SIZE}.
     */
    @Test
    public void testConnect() throws Exception {
        GlobalGraphOperations ops = GlobalGraphOperations.at(db);

        // Step 1: add the TEXTVALUE of every BASETEXT node to the full-text index.
        try (Transaction tx = db.beginTx()) {
            Index<Node> index = db.index().forNodes(INDEX_NAME, LuceneIndexImplementation.FULLTEXT_CONFIG);
            for (Node baseText : ops.getAllNodesWithLabel(BASETEXT)) {
                index.add(baseText, TEXTVALUE, baseText.getProperty(TEXTVALUE));
            }
            tx.success();
        }

        // Step 2: one index query per DIAGNOSE node; the transaction is swapped every BATCH_SIZE
        // relationships, which is why it cannot be a try-with-resources variable.
        int count = 0;
        Transaction tx = db.beginTx();
        try {
            Index<Node> index = db.index().forNodes(INDEX_NAME);
            for (Node diagnose : ops.getAllNodesWithLabel(DIAGNOSE)) {
                String text = (String) diagnose.getProperty(TEXT);
                IndexHits<Node> hits = index.query(TEXTVALUE, phraseQuery(text));
                try {
                    for (Node baseText : hits) {
                        baseText.createRelationshipTo(diagnose, ASSOCIATED);
                        // batch transaction
                        if (++count % BATCH_SIZE == 0) {
                            System.out.println("count = " + count);
                            tx.success();
                            tx.close();
                            tx = db.beginTx();
                        }
                    }
                } finally {
                    // Release the search result even when the loop above is left by an exception.
                    hits.close();
                }
            }
            tx.success();
        } finally {
            tx.close();
        }
        System.out.println("count = " + count);
    }

    /**
     * Wraps {@code text} in double quotes so it is searched as one phrase (it may contain spaces).
     * Backslashes and double quotes inside the text are backslash-escaped so they cannot end the
     * phrase early or change the query.
     */
    private static String phraseQuery(String text) {
        String escaped = text.replace("\\", "\\\\").replace("\"", "\\\"");
        return "\"" + escaped + "\"";
    }
}
public class ConnectIndexTest {
private static final String PATH = "target/connect.db";
public static final Label BASETEXT = DynamicLabel.label("BASETEXT");
public static final Label DIAGNOSE = DynamicLabel.label("DIAGNOSE");
public static final String TEXTVALUE = "TEXTVALUE";
public static final String TEXT = "TEXT";
public static final String INDEX_NAME = "basetext";
private static final RelationshipType ASSOCIATED = DynamicRelationshipType.withName("ASSOCIATED");
private GraphDatabaseService db;
@Before
public void setUp() throws Exception {
// db = new GraphDatabaseFactory().newEmbeddedDatabase(PATH);
db = new TestGraphDatabaseFactory().newImpermanentDatabase();
try (Transaction tx = db.beginTx()) {
for (int i = 100_000; i < 250_000; i++) db.createNode(BASETEXT).setProperty(TEXTVALUE, "foo " + i + " bar");
tx.success();
}
try (Transaction tx = db.beginTx()) {
for (int i = 100_000; i < 250_000; i += 2) db.createNode(DIAGNOSE).setProperty(TEXT, String.valueOf(i));
tx.success();
}
}
// 120k BASETEXT Nodes
// 50k DIAGNOSE Nodes
@Test
public void testConnect() throws Exception {
GlobalGraphOperations ops = GlobalGraphOperations.at(db);
try (Transaction tx = db.beginTx()) {
Index<Node> index = db.index().forNodes(INDEX_NAME, LuceneIndexImplementation.FULLTEXT_CONFIG);
for (Node baseText : ops.getAllNodesWithLabel(BASETEXT)) {
index.add(baseText, TEXTVALUE, baseText.getProperty(TEXTVALUE));
}
tx.success();
}
int count = 0;
Transaction tx = db.beginTx();
try {
Index<Node> index = db.index().forNodes(INDEX_NAME);
for (Node diagnose : ops.getAllNodesWithLabel(DIAGNOSE)) {
String text = (String) diagnose.getProperty(TEXT);
IndexHits<Node> hits = index.query(TEXTVALUE, "\"" + text + "\"");// quote in case text contains spaces
for (Node baseText : hits) {
baseText.createRelationshipTo(diagnose, ASSOCIATED);
// batch transaction
if (++count % 50000 == 0) {
System.out.println("count = " + count);
tx.success();
tx.close();
tx = db.beginTx();
}
}
}
tx.success();
} finally {
tx.close();
}
System.out.println("count = " + count);
}
}
不要使用正则表达式,而是使用Lucene遗留索引进行全文搜索:例如,在上记录了一次关于文档分类的网络研讨会。我想这与您的用例有些相似。
/**
 * Links every BASETEXT node to each DIAGNOSE node whose TEXT occurs in the BASETEXT's TEXTVALUE,
 * using a legacy full-text node index for the lookup instead of a regex comparison of all pairs.
 *
 * <p>NOTE(review): the database created in {@code setUp()} is never shut down inside this class;
 * consider adding a tear-down method.
 */
public class ConnectIndexTest {
    private static final String PATH = "target/connect.db";
    public static final Label BASETEXT = DynamicLabel.label("BASETEXT");
    public static final Label DIAGNOSE = DynamicLabel.label("DIAGNOSE");
    public static final String TEXTVALUE = "TEXTVALUE";
    public static final String TEXT = "TEXT";
    public static final String INDEX_NAME = "basetext";
    private static final RelationshipType ASSOCIATED = DynamicRelationshipType.withName("ASSOCIATED");

    /** Number of created relationships after which the running transaction is committed. */
    private static final int BATCH_SIZE = 50000;

    private GraphDatabaseService db;

    /**
     * Creates an impermanent database and fills it with test data:
     * 150,000 BASETEXT nodes ("foo 100000 bar" .. "foo 249999 bar") and
     * 75,000 DIAGNOSE nodes (every even number in the same range, as a string).
     */
    @Before
    public void setUp() throws Exception {
        // db = new GraphDatabaseFactory().newEmbeddedDatabase(PATH);
        db = new TestGraphDatabaseFactory().newImpermanentDatabase();
        try (Transaction tx = db.beginTx()) {
            for (int i = 100_000; i < 250_000; i++) {
                db.createNode(BASETEXT).setProperty(TEXTVALUE, "foo " + i + " bar");
            }
            tx.success();
        }
        try (Transaction tx = db.beginTx()) {
            for (int i = 100_000; i < 250_000; i += 2) {
                db.createNode(DIAGNOSE).setProperty(TEXT, String.valueOf(i));
            }
            tx.success();
        }
    }

    // The data set this was written for has roughly 120k BASETEXT and 50k DIAGNOSE nodes;
    // setUp() builds a comparable 150k / 75k.

    /**
     * Indexes all BASETEXT nodes in a full-text index, then queries that index once per DIAGNOSE
     * node and creates an ASSOCIATED relationship from every hit to the DIAGNOSE node.
     * Relationship creation is committed in batches of {@link #BATCH_SIZE}.
     */
    @Test
    public void testConnect() throws Exception {
        GlobalGraphOperations ops = GlobalGraphOperations.at(db);

        // Step 1: add the TEXTVALUE of every BASETEXT node to the full-text index.
        try (Transaction tx = db.beginTx()) {
            Index<Node> index = db.index().forNodes(INDEX_NAME, LuceneIndexImplementation.FULLTEXT_CONFIG);
            for (Node baseText : ops.getAllNodesWithLabel(BASETEXT)) {
                index.add(baseText, TEXTVALUE, baseText.getProperty(TEXTVALUE));
            }
            tx.success();
        }

        // Step 2: one index query per DIAGNOSE node; the transaction is swapped every BATCH_SIZE
        // relationships, which is why it cannot be a try-with-resources variable.
        int count = 0;
        Transaction tx = db.beginTx();
        try {
            Index<Node> index = db.index().forNodes(INDEX_NAME);
            for (Node diagnose : ops.getAllNodesWithLabel(DIAGNOSE)) {
                String text = (String) diagnose.getProperty(TEXT);
                IndexHits<Node> hits = index.query(TEXTVALUE, phraseQuery(text));
                try {
                    for (Node baseText : hits) {
                        baseText.createRelationshipTo(diagnose, ASSOCIATED);
                        // batch transaction
                        if (++count % BATCH_SIZE == 0) {
                            System.out.println("count = " + count);
                            tx.success();
                            tx.close();
                            tx = db.beginTx();
                        }
                    }
                } finally {
                    // Release the search result even when the loop above is left by an exception.
                    hits.close();
                }
            }
            tx.success();
        } finally {
            tx.close();
        }
        System.out.println("count = " + count);
    }

    /**
     * Wraps {@code text} in double quotes so it is searched as one phrase (it may contain spaces).
     * Backslashes and double quotes inside the text are backslash-escaped so they cannot end the
     * phrase early or change the query.
     */
    private static String phraseQuery(String text) {
        String escaped = text.replace("\\", "\\\\").replace("\"", "\\\"");
        return "\"" + escaped + "\"";
    }
}