Python 3.x MongoDB-查询以获取每个项目的最新报价

Python 3.x MongoDB-查询以获取每个项目的最新报价,python-3.x,mongodb,pymongo,Python 3.x,Mongodb,Pymongo,我试着效仿 我们有一个数百万股票行情和价格的数据库。通常需要获取每个股票代码的最后一行(最近一行)。在大型数据库中,我们在ticker和下面使用的createdDateTime字段上有复合索引 因此,问题分为两部分: 高效运行以最小化IO和运行时间的最佳索引是什么 我在下面执行的聚合查询返回0行以上的测试数据。它应该返回两行,每行都有一个股票代码的最新时间 在我急于创建可复制样本的过程中,我有几个错误: 我将CreatedDataTimeIsoDateTime添加到四个测试行中的每一行 在最后

我试着效仿

我们有一个数百万股票行情和价格的数据库。通常需要获取每个股票代码的最后一行(最近一行)。在大型数据库中,我们在ticker和下面使用的createdDateTime字段上有复合索引

因此,问题分为两部分:

  • 高效运行以最小化IO和运行时间的最佳索引是什么
  • 我在下面执行的聚合查询对上述测试数据返回了 0 行。它应该返回两行:每个股票代码一行,且是该代码时间最新的那一行

  • 在我急于创建可复制样本的过程中,我有几个错误:

  • 我将 createdDateTimeIsoDateTime 字段添加到了四个测试行中的每一行

  • 在最后的打印语句中,我必须使用 ticker_id 而不是 ticker,否则会引发 KeyError

  • 下面是更正后的代码,我仍在进行一些测试和质量保证,以确定它是否正确:

    import requests
    import sys
    import traceback
    import pprint
    import json
    import bson
    from datetime import datetime as datetime1
    import datetime
    from time import time
    import time as time2
    import configHandler
    #import boto3
    import pymongo
    from pymongo import MongoClient, UpdateOne
    from pymongo.errors import BulkWriteError
    from datetime import datetime
    import datetime as datetime1
    
    # --------------------------------------------------------------------------
    # Start the wall-clock timer, load the project configuration, and open the
    # MongoDB collection that the rest of the script reads from and writes to.
    # --------------------------------------------------------------------------
    startTime = time()
    startDateNowFmt = datetime1.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    config_dict, config_user_dict = configHandler.getConfigVariables()

    print("Start TestMongoDBQuerySpeedTuningAggregate, DateTime=", startDateNowFmt, sep="")

    # The connection string and the database name are both read from config_dict.
    cluster = MongoClient(config_dict['MONGODB_CONNECTION_STRING'])
    db = cluster[config_dict['MONGODB_CLUSTER']]
    # Alternative collection name left commented out by the author:
    # "ProdPolygonIOQuotes".
    dbCollectionName = "TestQuotesAggregate"
    dbCollection = db[dbCollectionName]
    
    # Seed the collection with four sample quotes: two tickers ("Test1" and
    # "Test2"), each with one row at minute 31 and one at minute 32.
    #
    # NOTE: insert_many() runs every time doInsert is True, so set it to False
    # after the first run; otherwise each run adds another copy of these rows.
    doInsert = True

    if doInsert:

        row1 = {'ticker': 'Test1',
                'timestampIsoDateTime': '2020-09-29T15:31:15',
                'createdDateTimeIsoDateTime': '2020-09-29T15:31:15',
                'createdDateTimeYear': 2020,
                'createdDateTimeMonth': 9,
                'createdDateTimeDay': 29,
                'createdDateTimeHour': 15,
                'createdDateTimeMinute': 31,
                'todaysChangePerc': -11,
                'minuteClose': 100}

        row2 = {'ticker': 'Test1',
                'timestampIsoDateTime': '2020-09-29T15:32:15',
                'createdDateTimeIsoDateTime': '2020-09-29T15:32:15',
                'createdDateTimeYear': 2020,
                'createdDateTimeMonth': 9,
                'createdDateTimeDay': 29,
                'createdDateTimeHour': 15,
                'createdDateTimeMinute': 32,
                'todaysChangePerc': -11.1,
                'minuteClose': 99}

        row3 = {'ticker': 'Test2',
                'timestampIsoDateTime': '2020-09-29T15:31:15',
                'createdDateTimeIsoDateTime': '2020-09-29T15:31:15',
                'createdDateTimeYear': 2020,
                'createdDateTimeMonth': 9,
                'createdDateTimeDay': 29,
                'createdDateTimeHour': 15,
                'createdDateTimeMinute': 31,
                'todaysChangePerc': -12,
                'minuteClose': 200}

        # The created time must agree with this row's timestamp and minute
        # fields (15:32). It previously repeated row3's 15:31:15, which made
        # the two "Test2" rows indistinguishable by created time.
        row4 = {'ticker': 'Test2',
                'timestampIsoDateTime': '2020-09-29T15:32:15',
                'createdDateTimeIsoDateTime': '2020-09-29T15:32:15',
                'createdDateTimeYear': 2020,
                'createdDateTimeMonth': 9,
                'createdDateTimeDay': 29,
                'createdDateTimeHour': 15,
                'createdDateTimeMinute': 32,
                'todaysChangePerc': -12.1,
                'minuteClose': 195}

        insert_rows = [row1, row2, row3, row4]
        dbCollection.insert_many(insert_rows)
    
    
    
    # Dump every stored document so the aggregation output can be checked by eye.
    print("Before aggregation - show the data we have to work with")
    docs1 = dbCollection.find({})
    shown_fields = ('ticker', 'createdDateTimeIsoDateTime', 'minuteClose', 'todaysChangePerc')
    for doc in docs1:
        # One space-separated line per document, fields in the order above.
        print(*[doc[field] for field in shown_fields])
    
    # Latest matching quote per ticker.
    #
    # $last only yields "the most recent document" when documents reach $group
    # in a defined order, so the pipeline sorts by ticker and timestamp first.
    # timestampIsoDateTime is also carried through $group and $project so that
    # the final $sort has a field to sort on (it was previously dropped, which
    # made that sort a no-op).
    docs = dbCollection.aggregate([
                {'$match': {
                          '$and': [
                                   {'todaysChangePerc': {'$lt': -10}},
                                   {'createdDateTimeYear': 2020},
                                   {'createdDateTimeMonth': 9},
                                   {'createdDateTimeDay': 29},
                                   {'createdDateTimeHour': 15},
                                   {'createdDateTimeMinute': {"$gt": 1}}
                          ]
                }},
                # Ascending order, so the last document per ticker is the newest.
                {'$sort': {
                        'ticker': 1,
                        'timestampIsoDateTime': 1
                }},
                {'$group': {
                        '_id': '$ticker',
                        'temp_data': {'$last': '$createdDateTimeIsoDateTime'},
                        'minuteClose': {'$last': '$minuteClose'},
                        'todaysChangePerc': {'$last': '$todaysChangePerc'},
                        'timestampIsoDateTime': {'$last': '$timestampIsoDateTime'}
                }},
                # The result loop reads 'ticker_id', so keep that output name.
                {'$project': {
                         'ticker_id': '$_id',
                         'minuteClose': '$minuteClose',
                         'todaysChangePerc': '$todaysChangePerc',
                         'timestampIsoDateTime': '$timestampIsoDateTime'
                }},
                # Newest quotes first in the final listing.
                {'$sort': {
                        'timestampIsoDateTime': -1
                }}
            ])
    
    
    # Print each aggregated row, then a short run summary (row count, start and
    # end wall-clock times, elapsed seconds).
    # Debug aids the author left disabled: pprint.pprint(docs.explain()) and
    # pprint.pprint(docs).
    countDocs = 0  # stays 0 when the aggregation returns nothing
    print("After aggregation - show the data we have to work with")
    for countDocs, doc in enumerate(docs, start=1):
        print(doc['ticker_id'], doc['minuteClose'], doc['todaysChangePerc'])

    endTime = time()
    elapsedTime = endTime - startTime
    endDateNowFmt = datetime1.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print("\n")
    print("Count Docs:", countDocs)
    print("Server Start DateTime=", startDateNowFmt, sep="")
    print("Server End   DateTime=", endDateNowFmt, sep="")
    print("ElapsedTime=", elapsedTime, " seconds", sep="")
    

    第二个改进:希望有人能帮忙验证我在 $match 和 $group 之间新增的 $sort 是否正确

    为了使$last起作用,需要在使用$last语句之前对行进行排序。最后,我可以按股票代码对最终数据列表进行排序

    # Revised pipeline, written as named stages:
    #   1. keep only the quotes matching the change/date filters,
    #   2. order them by ticker and timestamp so $last sees the newest row last,
    #   3. collapse to one document per ticker,
    #   4. rename _id to ticker and keep the carried fields,
    #   5. list the result by ticker.
    match_stage = {'$match': {'$and': [
        {'todaysChangePerc': {'$lt': -10}},
        {'createdDateTimeYear': 2020},
        {'createdDateTimeMonth': 9},
        {'createdDateTimeDay': 29},
        {'createdDateTimeHour': 15},
        {'createdDateTimeMinute': {"$gt": 1}},
    ]}}
    presort_stage = {'$sort': {'ticker': 1, 'timestampIsoDateTime': 1}}
    group_stage = {'$group': {
        '_id': '$ticker',
        'temp_data': {'$last': '$createdDateTimeIsoDateTime'},
        'minuteClose': {'$last': '$minuteClose'},
        'todaysChangePerc': {'$last': '$todaysChangePerc'},
        'timestampIsoDateTime': {'$last': '$timestampIsoDateTime'},
    }}
    project_stage = {'$project': {
        'ticker': '$_id',
        'minuteClose': '$minuteClose',
        'todaysChangePerc': '$todaysChangePerc',
        'timestampIsoDateTime': '$timestampIsoDateTime',
    }}
    final_sort_stage = {'$sort': {'ticker': 1}}

    docs = dbCollection.aggregate(
        [match_stage, presort_stage, group_stage, project_stage, final_sort_stage]
    )
    
    # Same revised pipeline, with the $group and $project specs built from the
    # list of fields that are carried through unchanged.
    carried_fields = ('minuteClose', 'todaysChangePerc', 'timestampIsoDateTime')

    # One document per ticker; every accumulator takes the last value seen.
    group_spec = {'_id': '$ticker',
                  'temp_data': {'$last': '$createdDateTimeIsoDateTime'}}
    group_spec.update((name, {'$last': '$' + name}) for name in carried_fields)

    # Expose the group key as 'ticker' and pass the carried fields through.
    project_spec = {'ticker': '$_id'}
    project_spec.update((name, '$' + name) for name in carried_fields)

    latest_per_ticker = [
        {'$match': {'$and': [
            {'todaysChangePerc': {'$lt': -10}},
            {'createdDateTimeYear': 2020},
            {'createdDateTimeMonth': 9},
            {'createdDateTimeDay': 29},
            {'createdDateTimeHour': 15},
            {'createdDateTimeMinute': {"$gt": 1}},
        ]}},
        # Ascending by ticker then timestamp, ahead of the $last accumulators.
        {'$sort': {'ticker': 1, 'timestampIsoDateTime': 1}},
        {'$group': group_spec},
        {'$project': project_spec},
        {'$sort': {'ticker': 1}},
    ]
    docs = dbCollection.aggregate(latest_per_ticker)