Python 3.x 如何使用pymongo在MogoDB中加速带条件的插入过程_Python 3.x_Mongodb_Pymongo_Insert Update

Python 3.x 如何使用pymongo在MogoDB中加速带条件的插入过程

python-3.x mongodb

Python 3.x 如何使用pymongo在MogoDB中加速带条件的插入过程,python-3.x,mongodb,pymongo,insert-update,Python 3.x,Mongodb,Pymongo,Insert Update,我有一个数据框（以.csv格式），其中包含有关工作合同的信息。每一行代表一个特定的合同（更确切地说是“激活”），在每一列中包含有关该合同的信息，如工人id、开始日期、雇主id等。我在这里仅提供以下几行中的一个示例。原始样本由超过1700万行组成，每个合同包含更多的信息。通过pymongo在我的MongoDB集合中插入数据时，我希望每个人都有一个特定的文档（即工人id），其中包含合同信息的数组。注意：某些数据不会在同一工人id（例如性别、出生）上的条目之间更改我想下面的例子可以阐明我的目标 #

我有一个数据框（以.csv格式），其中包含有关工作合同的信息。每一行代表一个特定的合同（更确切地说是“激活”），在每一列中包含有关该合同的信息，如工人id、开始日期、雇主id等。我在这里仅提供以下几行中的一个示例。原始样本由超过1700万行组成，每个合同包含更多的信息。通过pymongo在我的MongoDB集合中插入数据时，我希望每个人都有一个特定的文档（即工人id），其中包含合同信息的数组。注意：某些数据不会在同一工人id（例如性别、出生）上的条目之间更改

我想下面的例子可以阐明我的目标

# Libraries

import datetime
import numpy as np
import pandas as pd
from pandas import DataFrame
from pymongo import MongoClient

# Create example DataFrame
data = { 'worker_id': [ 1234, 4556, 4556, 1234, 5578, 9987 ],
         'birth': [ '1990-02-27', '1970-01-21', '1970-01-21', '1990-02-27', '1968-07-05', '1990-05-05' ],
         'gender': [ 'F', 'M', 'M', 'F', 'X', 'F' ],
         'employer_id': [ 5567, 7789, 7789, 7789, 4321, 2234 ],
         'start': [ '2012-09-14', '2011-12-31', '2010-12-31', '2009-10-31', '2015-04-28', '2008-01-01' ],
         'end': [ '2012-10-14', '2012-01-01', '2011-05-25', '2010-10-31', '2018-01-01', 'NaT' ],
         'contract': [ 'A.01.00', 'A.02.00', 'A.04.02', 'A.01.00', 'A.02.00', 'A.05.00' ]
       }

df = pd.DataFrame(data, columns = [ 'worker_id', 'birth', 'gender', 'employer_id', 'start', 'end', 'contract' ])

# Sort by 'worker_id' and 'start'
df.sort_values(['worker_id', 'start'], ascending = [True, True], inplace = True)

# Change data type for 'worker_id' and 'employer_id'
df['worker_id'] = df['worker_id'].astype(np.int64)
df['employer_id'] = df['employer_id'].astype(np.int64)

# Connect to MongoDB
client = MongoClient('localhost', 27017)
db = client['example_db']
col = db['example_collection']

for index, row in df.iterrows():
    
    dt_birth = None if str(row['birth']) == 'NaT' else datetime.datetime.strptime(str(row['birth']), '%Y-%m-%d')
    dt_start = None if str(row['start']) == 'NaT' else datetime.datetime.strptime(str(row['start']), '%Y-%m-%d') 
    dt_end = None if str(row['end']) == 'NaT' else datetime.datetime.strptime(str(row['end']), '%Y-%m-%d') 

    # Check whether a document with the 'worker_id' does already exist
    find = col.find_one( { "worker_id" : row['worker_id'] } )  
    
    if find == None: 
        
        # If 'worker_id' is not already in the collection, a new document is generated
        col.insert_one( {
            "worker_id" : row['worker_id'],
            "birth" : dt_birth, 
            "gender" : str(row['gender']),
            "activations" : [ {
                "employer_id" : row['employer_id'],
                "start" : dt_start,
                "end" : dt_end,
                "contract" : str(row['contract']),
                "coef" : 6.5, # It will be the same for each object in 'activations'
                "ord_id" : np.float64(1) # The first one has value 1, then it will increment
                } ] 
            } )

    else:
        
        # Identify the last 'ord_id' in 'activations'
        pipeline = [
            { "$match": { 
                "worker_id": row['worker_id'] } },
            { "$project": { 
                "_id": 0,
                "last_activation": { "$arrayElemAt": [ "$activations.ord_id", 0 ] } 
                } } 
            ]
        
        last_ord_id = DataFrame(list(col.aggregate(pipeline)))
        
        # For each row with the 'worker_id' already in the db, 
        # the function pushes the new information in the array 'activations'
        col.update_one(    
            { "worker_id" : row['worker_id'] },
            { "$push": { "activations":
            {
            "$each": [ {
                "employer_id" : row['employer_id'],
                "start" : dt_start,
                "end" : dt_end,
                "contract" : str(row['contract']),
                "coef" : 6.5,
                "ord_id" : last_ord_id.loc[0,'last_activation'] + 1 # Increment 'ord_id' by one
            } ],
            "$position": 0 # Place as first object in 'activations'
            } } } )
            
    col.create_index("worker_id") # Set 'worker_id' as index

最终，MongoDB中的数据库将如下所示：

{
    "_id" : ObjectId("5f690c7e6267ee26f8b84034"),
    "worker_id" : 9987,
    "birth" : ISODate("1990-05-05T02:00:00.000+02:00"),
    "gender" : "F",
    "activations" : [
        {
            "employer_id" : 2234,
            "start" : ISODate("2008-01-01T01:00:00.000+01:00"),
            "end" : null,
            "contract" : "A.05.00",
            "coef" : 6.5,
            "ord_id" : 1
        }
    ]
},
{
    "_id" : ObjectId("5f690c7e6267ee26f8b84033"),
    "worker_id" : 5578,
    "birth" : ISODate("1968-07-05T02:00:00.000+02:00"),
    "gender" : "X",
    "activations" : [
        {
            "employer_id" : 4321,
            "start" : ISODate("2015-04-28T02:00:00.000+02:00"),
            "end" : ISODate("2018-01-01T01:00:00.000+01:00"),
            "contract" : "A.02.00",
            "coef" : 6.5,
            "ord_id" : 1
        }
    ]
},
{
    "_id" : ObjectId("5f690c7e6267ee26f8b84032"),
    "worker_id" : 4556,
    "birth" : ISODate("1970-01-21T01:00:00.000+01:00"),
    "gender" : "M",
    "activations" : [
        {
            "employer_id" : 7789,
            "start" : ISODate("2011-12-31T01:00:00.000+01:00"),
            "end" : ISODate("2012-01-01T01:00:00.000+01:00"),
            "contract" : "A.02.00",
            "coef" : 6.5,
            "ord_id" : 2
        },
        {
            "employer_id" : 7789,
            "start" : ISODate("2010-12-31T01:00:00.000+01:00"),
            "end" : ISODate("2011-05-25T02:00:00.000+02:00"),
            "contract" : "A.04.02",
            "coef" : 6.5,
            "ord_id" : 1
        }
    ]
},
{
    "_id" : ObjectId("5f690c7e6267ee26f8b84031"),
    "worker_id" : 1234,
    "birth" : ISODate("1990-02-27T01:00:00.000+01:00"),
    "gender" : "F",
    "activations" : [
        {
            "employer_id" : 5567,
            "start" : ISODate("2012-09-14T02:00:00.000+02:00"),
            "end" : ISODate("2012-10-14T02:00:00.000+02:00"),
            "contract" : "A.01.00",
            "coef" : 6.5,
            "ord_id" : 2
        },
        {
            "employer_id" : 7789,
            "start" : ISODate("2009-10-31T01:00:00.000+01:00"),
            "end" : ISODate("2010-10-31T02:00:00.000+02:00"),
            "contract" : "A.01.00",
            "coef" : 6.5,
            "ord_id" : 1
        }
    ]
}

我想知道我写的过程是否能以某种方式实现并加快。不幸的是，我对Python和MongoDB的了解有限。提前感谢您提出的任何可能的建议