Python 根据字符串比较展开并删除重复项
标签:python、mongodb、pymongo、aggregation-framework

我有一个 MongoDB(2.6 版)文档,如下所示:
{
"Uniprot": {
"GO": [
"cytoplasm [GO:0005737]",
"nucleolus [GO:0005730]",
"calcium ion binding [GO:0005509]",
"zinc ion binding [GO:0008270]"
],
"GO cc": [
"cytoplasm [GO:0005737]",
"nucleolus [GO:0005730]"
],
"GO bp": [
""
],
"GO mf": [
"calcium ion binding [GO:0005509]",
"zinc ion binding [GO:0008270]"
],
"GO_ID": [
"GO:0005509",
"GO:0005737",
"GO:0005730",
"GO:0008270"
]
}
}
我使用以下代码进行聚合,其中 test_3 是包含上述文档的集合:
# The three comparisons every stage relies on: does the unwound "GO" term
# equal the unwound term of the "GO cc" / "GO mf" / "GO bp" array?
_go_is_cc = {"$eq": ["$Uniprot.GO", "$Uniprot.GO cc"]}
_go_is_mf = {"$eq": ["$Uniprot.GO", "$Uniprot.GO mf"]}
_go_is_bp = {"$eq": ["$Uniprot.GO", "$Uniprot.GO bp"]}


def _by_ontology(cc_value, mf_value, bp_value):
    """Build the nested $cond that picks a value by the matching array.

    The checks run in the order cc, mf, bp; the expression evaluates to
    False when none of the three comparisons holds.
    """
    bp_branch = {"$cond": [_go_is_bp, bp_value, False]}
    mf_branch = {"$cond": [_go_is_mf, mf_value, bp_branch]}
    return {"$cond": [_go_is_cc, cc_value, mf_branch]}


# Output shape: one row with the accession, the matching description, a
# human-readable ontology label and the (unwound) GO identifier.
project = {
    "$project": {
        "_id": False,
        "Uniprot": "$Uniprot.Uniprot",
        "GO Description": _by_ontology(
            "$Uniprot.GO cc",
            "$Uniprot.GO mf",
            "$Uniprot.GO bp",
        ),
        "GO Type": _by_ontology(
            "Cellular component",
            "Molecular function",
            "Biological Process",
        ),
        "GO ID": "$Uniprot.GO_ID",
    }
}

# Keep only the unwound combinations in which the GO term equals at least
# one of the cc / mf / bp terms; prune everything else.
redact = {
    "$redact": {
        "$cond": [
            {"$or": [_go_is_cc, _go_is_mf, _go_is_bp]},
            "$$KEEP",
            "$$PRUNE",
        ]
    }
}

# Stage order matters: the four description arrays are unwound first, the
# mismatching combinations are pruned, and only then is GO_ID unwound.
pipeline = [
    {"$match": {"Uniprot.Uniprot": "P33764"}},
    {"$unwind": "$Uniprot.GO"},
    {"$unwind": "$Uniprot.GO cc"},
    {"$unwind": "$Uniprot.GO bp"},
    {"$unwind": "$Uniprot.GO mf"},
    redact,
    {"$unwind": "$Uniprot.GO_ID"},
    project,
]
d = test_3.aggregate(pipeline)
输出为:
{u'GO ID': u'GO:0005509', u'GO Type': u'Cellular component', u'GO Description': u'cytoplasm [GO:0005737]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005737', u'GO Type': u'Cellular component', u'GO Description': u'cytoplasm [GO:0005737]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005730', u'GO Type': u'Cellular component', u'GO Description': u'cytoplasm [GO:0005737]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0008270', u'GO Type': u'Cellular component', u'GO Description': u'cytoplasm [GO:0005737]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005509', u'GO Type': u'Cellular component', u'GO Description': u'cytoplasm [GO:0005737]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005737', u'GO Type': u'Cellular component', u'GO Description': u'cytoplasm [GO:0005737]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005730', u'GO Type': u'Cellular component', u'GO Description': u'cytoplasm [GO:0005737]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0008270', u'GO Type': u'Cellular component', u'GO Description': u'cytoplasm [GO:0005737]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005509', u'GO Type': u'Cellular component', u'GO Description': u'nucleolus [GO:0005730]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005737', u'GO Type': u'Cellular component', u'GO Description': u'nucleolus [GO:0005730]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005730', u'GO Type': u'Cellular component', u'GO Description': u'nucleolus [GO:0005730]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0008270', u'GO Type': u'Cellular component', u'GO Description': u'nucleolus [GO:0005730]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005509', u'GO Type': u'Cellular component', u'GO Description': u'nucleolus [GO:0005730]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005737', u'GO Type': u'Cellular component', u'GO Description': u'nucleolus [GO:0005730]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005730', u'GO Type': u'Cellular component', u'GO Description': u'nucleolus [GO:0005730]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0008270', u'GO Type': u'Cellular component', u'GO Description': u'nucleolus [GO:0005730]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005509', u'GO Type': u'Molecular function', u'GO Description': u'calcium ion binding [GO:0005509]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005737', u'GO Type': u'Molecular function', u'GO Description': u'calcium ion binding [GO:0005509]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005730', u'GO Type': u'Molecular function', u'GO Description': u'calcium ion binding [GO:0005509]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0008270', u'GO Type': u'Molecular function', u'GO Description': u'calcium ion binding [GO:0005509]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005509', u'GO Type': u'Molecular function', u'GO Description': u'calcium ion binding [GO:0005509]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005737', u'GO Type': u'Molecular function', u'GO Description': u'calcium ion binding [GO:0005509]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005730', u'GO Type': u'Molecular function', u'GO Description': u'calcium ion binding [GO:0005509]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0008270', u'GO Type': u'Molecular function', u'GO Description': u'calcium ion binding [GO:0005509]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005509', u'GO Type': u'Molecular function', u'GO Description': u'zinc ion binding [GO:0008270]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005737', u'GO Type': u'Molecular function', u'GO Description': u'zinc ion binding [GO:0008270]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005730', u'GO Type': u'Molecular function', u'GO Description': u'zinc ion binding [GO:0008270]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0008270', u'GO Type': u'Molecular function', u'GO Description': u'zinc ion binding [GO:0008270]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005509', u'GO Type': u'Molecular function', u'GO Description': u'zinc ion binding [GO:0008270]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005737', u'GO Type': u'Molecular function', u'GO Description': u'zinc ion binding [GO:0008270]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005730', u'GO Type': u'Molecular function', u'GO Description': u'zinc ion binding [GO:0008270]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0008270', u'GO Type': u'Molecular function', u'GO Description': u'zinc ion binding [GO:0008270]', u'Uniprot': u'P33764'}
现在,我使用以下方法修改输出:
# Post-process the aggregation result ``d``: keep only the rows whose
# "GO ID" is the identifier embedded at the end of "GO Description"
# (e.g. "cytoplasm [GO:0005737]"), strip that bracketed suffix from the
# description, and drop duplicate rows while preserving first-seen order.
b = []
seen = set()
for i in d:
    # Split "name [GO:xxxxxxx]" on the last " [" instead of slicing a fixed
    # 13 characters, so the check does not depend on the identifier width.
    name, _, tail = i["GO Description"].rpartition(" [")
    # ``tail`` is "GO:xxxxxxx]"; drop the closing bracket before comparing.
    if tail[:-1] != i["GO ID"]:
        continue
    entry = i.copy()
    entry["GO Description"] = name
    # Hash-based duplicate check instead of scanning ``b`` for every row.
    # Assumes every field value is hashable (scalars after the $unwind
    # stages) -- TODO confirm if the projection ever returns arrays.
    key = frozenset(entry.items())
    if key not in seen:
        seen.add(key)
        b.append(entry)
for i in b:
    # Parenthesised form prints the same text on Python 2 and Python 3.
    print(i)
要获得我的预期输出:
{u'GO ID': u'GO:0005737', u'GO Type': u'Cellular component', u'GO Description': u'cytoplasm', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005730', u'GO Type': u'Cellular component', u'GO Description': u'nucleolus', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005509', u'GO Type': u'Molecular function', u'GO Description': u'calcium ion binding', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0008270', u'GO Type': u'Molecular function', u'GO Description': u'zinc ion binding', u'Uniprot': u'P33764'}
但是,这种方式速度很慢,我希望在mongodb中完成,而不需要在python中进行进一步处理。我怎么做?
我注意到的几点:由于同时对 GO 和 GO_ID 做了 $unwind,聚合会产生重复的行(这就是代码中要用 `if entry not in b` 检查的原因);另外,我需要在 Python 中检查 GO ID 是否出现在 GO Description 中,因为我找不到使用 $search 或 $text 来实现的方法。

评论:你能告诉我们预期的输出吗?每当有人提这样的问题时,给出预期输出有助于更快地得到答案。
评论:@chridam @csharpcoder 我已经编辑了这个问题。