Python 3.x MongoDB-查询以获取每个项目的最新报价
标签:python-3.x、mongodb、pymongo
我试着效仿……(原文此处的参考链接缺失)。我们有一个包含数百万条股票行情和价格的数据库,通常需要获取每个股票代码的最后一行(即最近的一行)。在大型数据库中,我们在 ticker 和下面使用的 createdDateTime 字段上建有复合索引。因此,问题分为两部分:
(1)为了最小化 IO 和运行时间,使查询高效运行的最佳索引是什么?
(2)我在下面执行的聚合查询对上面的测试数据返回 0 行,而它应该返回两行,每行对应一个股票代码的最新时间。
在我急于创建可复现样本的过程中,我犯了几个错误:我将 createdDateTimeIsoDateTime 添加到了四个测试行中的每一行;在最后……(原文在此处截断)
在我急于创建可复制样本的过程中,我有几个错误:
import requests
import sys
import traceback
import pprint
import json
import bson
from datetime import datetime as datetime1
import datetime
from time import time
import time as time2
import configHandler
#import boto3
import pymongo
from pymongo import MongoClient, UpdateOne
from pymongo.errors import BulkWriteError
from datetime import datetime
import datetime as datetime1
##########################################################################################
# Capture the start of the run twice: epoch seconds for the elapsed-time
# calculation, and a formatted local timestamp for display.
startTime = time()
startDateNowFmt = datetime1.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# configHandler is project-local; presumably it returns a (settings,
# user settings) pair of dicts -- verify against configHandler itself.
config_dict, config_user_dict = configHandler.getConfigVariables()
print("Start TestMongoDBQuerySpeedTuningAggregate, DateTime={}".format(startDateNowFmt))

# Connect using the connection string and database name from the config.
cluster = MongoClient(config_dict['MONGODB_CONNECTION_STRING'])
db = cluster[config_dict['MONGODB_CLUSTER']]

# The script works on the test collection; "ProdPolygonIOQuotes" was the
# alternative collection name left commented out in the original.
dbCollectionName = "TestQuotesAggregate"
dbCollection = db[dbCollectionName]
# NOTE: only needed on the first run. Every run with doInsert = True appends
# the four test rows again, so set it to False once the data exists.
doInsert = True


def _make_test_quote(ticker, minute, todays_change_perc, minute_close):
    """Build one test quote document stamped 2020-09-29 15:<minute>:15.

    Both ISO timestamp fields and the split-out date-part fields are derived
    from the same ``minute`` so they can never disagree with each other.

    Args:
        ticker: Ticker symbol stored in the ``ticker`` field.
        minute: Minute of the hour (0-59) used for every time-related field.
        todays_change_perc: Value stored in ``todaysChangePerc``.
        minute_close: Value stored in ``minuteClose``.

    Returns:
        A new dict ready to be passed to ``insert_many``.
    """
    iso_stamp = "2020-09-29T15:{:02d}:15".format(minute)
    return {
        'ticker': ticker,
        'timestampIsoDateTime': iso_stamp,
        'createdDateTimeIsoDateTime': iso_stamp,
        'createdDateTimeYear': 2020,
        'createdDateTimeMonth': 9,
        'createdDateTimeDay': 29,
        'createdDateTimeHour': 15,
        'createdDateTimeMinute': minute,
        'todaysChangePerc': todays_change_perc,
        'minuteClose': minute_close,
    }


if doInsert:
    # Two tickers with two consecutive minutes each; the 15:32 row is the
    # "latest" one the aggregation is expected to return for each ticker.
    insert_rows = [
        _make_test_quote('Test1', 31, -11, 100),
        _make_test_quote('Test1', 32, -11.1, 99),
        _make_test_quote('Test2', 31, -12, 200),
        _make_test_quote('Test2', 32, -12.1, 195),
    ]
    dbCollection.insert_many(insert_rows)
print("Before aggregation - show the data we have to work with")
# Dump the same four fields of every document in the collection so the
# aggregation output below can be compared against the raw rows.
shown_fields = ('ticker', 'createdDateTimeIsoDateTime', 'minuteClose', 'todaysChangePerc')
for quote in dbCollection.find({}):
    print(*(quote[field] for field in shown_fields))
# Latest matching quote per ticker.
#
# $last returns the value from the last document that reaches $group for
# each key, so the rows must be ordered before grouping; without the $sort
# below "last" is whatever order the server happens to produce.
# The ISO strings in the test rows share one fixed format, so sorting them
# as strings is chronological -- assumes real data uses that same format.
docs = dbCollection.aggregate([
    # Several keys in one $match document are ANDed implicitly.
    {'$match': {
        'todaysChangePerc': {'$lt': -10},
        'createdDateTimeYear': 2020,
        'createdDateTimeMonth': 9,
        'createdDateTimeDay': 29,
        'createdDateTimeHour': 15,
        'createdDateTimeMinute': {'$gt': 1},
    }},
    # Oldest to newest within each ticker, so $last picks the newest row.
    {'$sort': {
        'ticker': 1,
        'timestampIsoDateTime': 1,
    }},
    {'$group': {
        '_id': '$ticker',
        'minuteClose': {'$last': '$minuteClose'},
        'todaysChangePerc': {'$last': '$todaysChangePerc'},
        # Carried through so the final $sort has a field to sort on.
        'timestampIsoDateTime': {'$last': '$timestampIsoDateTime'},
    }},
    {'$project': {
        'ticker_id': '$_id',
        'minuteClose': '$minuteClose',
        'todaysChangePerc': '$todaysChangePerc',
        'timestampIsoDateTime': '$timestampIsoDateTime',
    }},
    # Newest quotes first; ticker_id breaks ties so output order is stable.
    {'$sort': {
        'timestampIsoDateTime': -1,
        'ticker_id': 1,
    }},
])
print("After aggregation - show the data we have to work with")
# Print one line per aggregated document; countDocs ends up holding the
# number of documents returned (0 when the cursor is empty).
countDocs = 0
for countDocs, doc in enumerate(docs, start=1):
    print(doc['ticker_id'], doc['minuteClose'], doc['todaysChangePerc'])

# Timing summary: elapsed seconds since startTime plus formatted start/end.
endTime = time()
elapsedTime = endTime - startTime
endDateNowFmt = datetime1.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print("\n")
print("Count Docs:", countDocs)
print("Server Start DateTime={}".format(startDateNowFmt))
print("Server End DateTime={}".format(endDateNowFmt))
print("ElapsedTime={} seconds".format(elapsedTime))
第二个改进(希望有人能验证我在 $match 和 $group 之间新增的排序是否正确):为了使 $last 起作用,需要在使用 $last 语句之前先对行进行排序。最后,我可以按股票代码对最终的数据列表进行排序。
# Stage 1: keep only the rows for the hour of interest whose change is
# below -10.
match_stage = {'$match': {
    '$and': [
        {'todaysChangePerc': {'$lt': -10}},
        {'createdDateTimeYear': 2020},
        {'createdDateTimeMonth': 9},
        {'createdDateTimeDay': 29},
        {'createdDateTimeHour': 15},
        {'createdDateTimeMinute': {"$gt": 1}},
    ]
}}

# Stage 2: order rows oldest-to-newest inside each ticker so that the $last
# accumulators in the next stage pick up the most recent row.
pre_group_sort = {'$sort': {'ticker': 1, 'timestampIsoDateTime': 1}}

# Stage 3: collapse to one document per ticker, taking each field from the
# last (newest) row of the group.
group_stage = {'$group': {
    '_id': '$ticker',
    'temp_data': {'$last': '$createdDateTimeIsoDateTime'},
    'minuteClose': {'$last': '$minuteClose'},
    'todaysChangePerc': {'$last': '$todaysChangePerc'},
    'timestampIsoDateTime': {'$last': '$timestampIsoDateTime'},
}}

# Stage 4: expose the group key as 'ticker' and keep the reported fields.
project_stage = {'$project': {
    'ticker': '$_id',
    'minuteClose': '$minuteClose',
    'todaysChangePerc': '$todaysChangePerc',
    'timestampIsoDateTime': '$timestampIsoDateTime',
}}

# Stage 5: present the final list alphabetically by ticker.
final_sort = {'$sort': {'ticker': 1}}

docs = dbCollection.aggregate(
    [match_stage, pre_group_sort, group_stage, project_stage, final_sort]
)
# Fields reported per ticker; each is taken from the newest row of its group
# and passed through the projection under the same name.
latest_fields = ('minuteClose', 'todaysChangePerc', 'timestampIsoDateTime')

# One output document per ticker, every accumulator using $last.
group_spec = {
    '_id': '$ticker',
    'temp_data': {'$last': '$createdDateTimeIsoDateTime'},
}
group_spec.update((name, {'$last': '$' + name}) for name in latest_fields)

# Rename the group key to 'ticker' and keep the reported fields.
project_spec = {'ticker': '$_id'}
project_spec.update((name, '$' + name) for name in latest_fields)

pipeline = [
    # Filter to the hour of interest with a change below -10.
    {'$match': {'$and': [
        {'todaysChangePerc': {'$lt': -10}},
        {'createdDateTimeYear': 2020},
        {'createdDateTimeMonth': 9},
        {'createdDateTimeDay': 29},
        {'createdDateTimeHour': 15},
        {'createdDateTimeMinute': {"$gt": 1}},
    ]}},
    # $last depends on arrival order, so order rows within each ticker first.
    {'$sort': {'ticker': 1, 'timestampIsoDateTime': 1}},
    {'$group': group_spec},
    {'$project': project_spec},
    # Final presentation order: by ticker.
    {'$sort': {'ticker': 1}},
]
docs = dbCollection.aggregate(pipeline)