Python 根据字符串比较展开并删除重复项

Python 根据字符串比较展开并删除重复项,python,mongodb,pymongo,aggregation-framework,Python,Mongodb,Pymongo,Aggregation Framework,我有一个mongodb(2.6版)文档,如下所示: { "Uniprot": { "GO": [ "cytoplasm [GO:0005737]", "nucleolus [GO:0005730]", "calcium ion binding [GO:0005509]", "zinc ion binding [GO:0008270]" ], "G

我有一个mongodb(2.6版)文档,如下所示:

{
    "Uniprot": {
        "GO": [
            "cytoplasm [GO:0005737]",
            "nucleolus [GO:0005730]",
            "calcium ion binding [GO:0005509]",
            "zinc ion binding [GO:0008270]"
        ],
        "GO cc": [
            "cytoplasm [GO:0005737]",
            "nucleolus [GO:0005730]"
        ],
        "GO bp": [
            ""
        ],
        "GO mf": [
            "calcium ion binding [GO:0005509]",
            "zinc ion binding [GO:0008270]"
        ],
        "GO_ID": [
            "GO:0005509",
            "GO:0005737",
            "GO:0005730",
            "GO:0008270"
        ]
    }
}
我使用以下代码,其中 test_3 是包含上述文档的集合:

# Final $project stage: flattens each unwound document into four output
# fields. "_id" is suppressed.
project = {"$project":{"_id": False,
               "Uniprot": "$Uniprot.Uniprot",
               # "GO Description": the first of "GO cc", "GO mf", "GO bp"
               # (checked in that order) whose unwound value equals the
               # unwound "GO" value; False when none of them matches.
               "GO Description": {
                    "$cond":[
                    {"$eq": ["$Uniprot.GO", "$Uniprot.GO cc"]},
                    "$Uniprot.GO cc",
                        {"$cond":[
                        {"$eq": ["$Uniprot.GO", "$Uniprot.GO mf"]},
                        "$Uniprot.GO mf",
                            {"$cond":[
                            {"$eq": ["$Uniprot.GO", "$Uniprot.GO bp"]},
                            "$Uniprot.GO bp", False]}]}]},
               # "GO Type": same three comparisons in the same order, but
               # yielding a human-readable category label instead of the value.
               "GO Type": {"$cond":[
                    {"$eq": ["$Uniprot.GO", "$Uniprot.GO cc"]},
                    "Cellular component",
                        {"$cond":[
                        {"$eq": ["$Uniprot.GO", "$Uniprot.GO mf"]},
                        "Molecular function",
                            {"$cond":[
                            {"$eq": ["$Uniprot.GO", "$Uniprot.GO bp"]},
                            "Biological Process", False]}]}]},
               # Passed through unchanged; nothing here relates it to the
               # identifier embedded in "GO Description".
               "GO ID":"$Uniprot.GO_ID"}}
# $redact stage: keep a document only when its unwound "GO" value equals its
# unwound "GO cc", "GO mf" or "GO bp" value; prune every other combination.
redact = {"$redact":{"$cond":[{"$or":[
                            {"$eq":["$Uniprot.GO", "$Uniprot.GO cc"]},
                            {"$eq":["$Uniprot.GO", "$Uniprot.GO mf"]},
                            {"$eq":["$Uniprot.GO", "$Uniprot.GO bp"]}]},
                                            "$$KEEP", "$$PRUNE"]}}
# Pipeline: select one protein, unwind the four description arrays (each
# $unwind emits one document per array element, so together they produce
# every combination of elements), then drop the combinations rejected by
# `redact`.
d = test_3.aggregate([{"$match":{"Uniprot.Uniprot": "P33764"}},
                      {"$unwind":"$Uniprot.GO"},
                      {"$unwind":"$Uniprot.GO cc"},
                      {"$unwind":"$Uniprot.GO bp"},
                      {"$unwind":"$Uniprot.GO mf"},
                      redact,
                      # "GO_ID" is unwound after the redact, so every kept
                      # document is repeated once per GO_ID element; no stage
                      # filters on whether the ID matches the description.
                      {"$unwind":"$Uniprot.GO_ID"},
                      project
                      ])
输出为:

{u'GO ID': u'GO:0005509', u'GO Type': u'Cellular component', u'GO Description': u'cytoplasm [GO:0005737]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005737', u'GO Type': u'Cellular component', u'GO Description': u'cytoplasm [GO:0005737]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005730', u'GO Type': u'Cellular component', u'GO Description': u'cytoplasm [GO:0005737]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0008270', u'GO Type': u'Cellular component', u'GO Description': u'cytoplasm [GO:0005737]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005509', u'GO Type': u'Cellular component', u'GO Description': u'cytoplasm [GO:0005737]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005737', u'GO Type': u'Cellular component', u'GO Description': u'cytoplasm [GO:0005737]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005730', u'GO Type': u'Cellular component', u'GO Description': u'cytoplasm [GO:0005737]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0008270', u'GO Type': u'Cellular component', u'GO Description': u'cytoplasm [GO:0005737]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005509', u'GO Type': u'Cellular component', u'GO Description': u'nucleolus [GO:0005730]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005737', u'GO Type': u'Cellular component', u'GO Description': u'nucleolus [GO:0005730]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005730', u'GO Type': u'Cellular component', u'GO Description': u'nucleolus [GO:0005730]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0008270', u'GO Type': u'Cellular component', u'GO Description': u'nucleolus [GO:0005730]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005509', u'GO Type': u'Cellular component', u'GO Description': u'nucleolus [GO:0005730]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005737', u'GO Type': u'Cellular component', u'GO Description': u'nucleolus [GO:0005730]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005730', u'GO Type': u'Cellular component', u'GO Description': u'nucleolus [GO:0005730]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0008270', u'GO Type': u'Cellular component', u'GO Description': u'nucleolus [GO:0005730]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005509', u'GO Type': u'Molecular function', u'GO Description': u'calcium ion binding [GO:0005509]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005737', u'GO Type': u'Molecular function', u'GO Description': u'calcium ion binding [GO:0005509]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005730', u'GO Type': u'Molecular function', u'GO Description': u'calcium ion binding [GO:0005509]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0008270', u'GO Type': u'Molecular function', u'GO Description': u'calcium ion binding [GO:0005509]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005509', u'GO Type': u'Molecular function', u'GO Description': u'calcium ion binding [GO:0005509]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005737', u'GO Type': u'Molecular function', u'GO Description': u'calcium ion binding [GO:0005509]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005730', u'GO Type': u'Molecular function', u'GO Description': u'calcium ion binding [GO:0005509]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0008270', u'GO Type': u'Molecular function', u'GO Description': u'calcium ion binding [GO:0005509]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005509', u'GO Type': u'Molecular function', u'GO Description': u'zinc ion binding [GO:0008270]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005737', u'GO Type': u'Molecular function', u'GO Description': u'zinc ion binding [GO:0008270]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005730', u'GO Type': u'Molecular function', u'GO Description': u'zinc ion binding [GO:0008270]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0008270', u'GO Type': u'Molecular function', u'GO Description': u'zinc ion binding [GO:0008270]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005509', u'GO Type': u'Molecular function', u'GO Description': u'zinc ion binding [GO:0008270]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005737', u'GO Type': u'Molecular function', u'GO Description': u'zinc ion binding [GO:0008270]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005730', u'GO Type': u'Molecular function', u'GO Description': u'zinc ion binding [GO:0008270]', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0008270', u'GO Type': u'Molecular function', u'GO Description': u'zinc ion binding [GO:0008270]', u'Uniprot': u'P33764'}
现在,我使用以下方法修改输出:

# Client-side clean-up of the aggregation rows in `d`:
#   1. keep only rows whose "GO ID" is the identifier embedded at the end of
#      "GO Description";
#   2. strip that trailing " [GO:xxxxxxx]" tag from the description;
#   3. drop duplicate rows while preserving first-seen order.
# The result is collected in `b` and printed one row per line.
SUFFIX_LEN = 13  # len(" [GO:0005737]"): one space plus the bracketed GO id
b = []
seen = set()  # fingerprints of the entries already appended to b
for i in d:
    description = i["GO Description"]
    go = description[-SUFFIX_LEN:]
    # go[2:-1] drops the leading " [" and the trailing "]", leaving
    # "GO:xxxxxxx" for comparison with the row's own "GO ID".
    if i["GO ID"] != go[2:-1]:
        continue
    entry = i.copy()
    entry["GO Description"] = description[:-SUFFIX_LEN]
    # A set lookup replaces the original `entry not in b` test, which compared
    # the entry against every kept dict and made the loop quadratic.
    # NOTE: requires hashable field values; that holds for the rows shown
    # here, where every value is a string.
    fingerprint = frozenset(entry.items())
    if fingerprint not in seen:
        seen.add(fingerprint)
        b.append(entry)
for i in b:
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(i)
以得到我预期的输出:

{u'GO ID': u'GO:0005737', u'GO Type': u'Cellular component', u'GO Description': u'cytoplasm', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005730', u'GO Type': u'Cellular component', u'GO Description': u'nucleolus', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0005509', u'GO Type': u'Molecular function', u'GO Description': u'calcium ion binding', u'Uniprot': u'P33764'}
{u'GO ID': u'GO:0008270', u'GO Type': u'Molecular function', u'GO Description': u'zinc ion binding', u'Uniprot': u'P33764'}
但是,这种方式速度很慢,我希望在mongodb中完成,而不需要在python中进行进一步处理。我怎么做?
我注意到的几点:由于对 GO 和 GO_ID 进行了展开($unwind),聚合会产生重复的行(这就是为什么要检查条目是否已经在 b 中的原因);另外,我需要检查 GO ID 是否包含在 GO Description 中,因为我没有找到使用 $search / $text 的方法。

你能告诉我们预期的输出吗?每当有人提出这样的问题时,给出预期输出将帮助您更快获得答案。@chridam csharpcoder 我已经编辑了这个问题。