
Python: find the minimum and maximum in a second DataFrame between two dates given in a first DataFrame


I have two dummy DataFrames:

import numpy as np
import pandas as pd

np.random.seed(12345)

df1 = pd.DataFrame({'name': ['A']*4 + ['B']*4,
                    'start_date': pd.to_datetime(['2000-03-15', '2000-06-12', '2000-09-01', '2001-01-17',
                                                  '2000-03-19', '2000-06-14', '2000-09-14', '2001-01-22']),
                    'end_date': pd.to_datetime(['2000-06-12', '2000-09-01', '2001-01-17', '2001-03-19',
                                                '2000-06-14', '2000-09-14', '2001-01-22', '2001-02-01'])})

date = pd.date_range('2000-01-01', '2002-01-01')
name = ['A']*len(date) + ['B']*len(date)
date = date.append(date)
low = np.random.rand(len(date))
high = low + np.random.rand(len(date))
df2 = pd.DataFrame({'name': name, 'date': date, 'low': low, 'high': high})
For each row in df1 I have a name, a start_date, and an end_date.

I want to find the maximum of high and the minimum of low in df2 among the rows with the same name whose dates fall between start_date and end_date.

Below is my current solution:

df1 = df1.set_index('name')
df2 = df2.set_index(['name', 'date'])
df2 = df2.sort_index()
df1['max'] = -1
df1['min'] = -1
for name in df1.index.unique():
    df = df2.loc[name]  # all rows of df2 for this name, indexed by (sorted) date
    tmphigh = []
    tmplow = []
    for (_, start_date, end_date, _, _) in df1.loc[name].itertuples(name=None):
        # searchsorted on the sorted date index gives a half-open slice [start, end)
        newdf = df.iloc[df.index.searchsorted(start_date): df.index.searchsorted(end_date)]
        tmphigh.append(newdf.high.max())
        tmplow.append(newdf.low.min())
    df1.loc[[name], ['max']] = tmphigh
    df1.loc[[name], ['min']] = tmplow

However, applying this to over a million rows still takes quite a long time, and I wonder whether there is a faster way.

[EDIT]: Thanks to Pramote Kuacharoen, I was able to adapt some of his code and achieve a 6x speedup over my existing code.

The reason for splitting the work into a loop is that I found that generating df2[name] inside the apply function caused a significant increase in computation time.

So I compute it separately, which likely reduces the number of function calls needed to extract all the values under a given name in df2.

I would be glad if someone can suggest a better approach than mine, but for now this is good enough for me.

Below is my current solution:

from tqdm import tqdm

df1a = df1.groupby('name')
df2a = df2.groupby('name')
mergedf = df1
mergedf['maximum'] = -1
mergedf['minimum'] = -1

def get_min_max(row):
    # df2x is the per-name slice, hoisted out of apply; its index must be
    # the sorted dates so searchsorted can locate the date range
    dfx = df2x.iloc[df2x.index.searchsorted(row['start_date']): df2x.index.searchsorted(row['end_date'])]
    maximum = dfx['high'].max()
    minimum = dfx['low'].min()
    return pd.Series({'maximum': maximum, 'minimum': minimum})

for name, df in tqdm(df1a):
    df2x = df2a.get_group(name)  # extract the group once per name, not once per row
    mergedf.loc[[name], ['maximum', 'minimum']] = df.apply(get_min_max, axis=1)
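
For contrast, here is a minimal sketch of the slower pattern described above, where the group extraction runs inside apply once per row. This is only an illustration of the difference, not code from the original post:

def get_min_max_slow(row):
    # The group lookup runs for every row of df1 -- this is the call
    # that blew up the runtime when it lived inside apply
    df2x = df2a.get_group(row.name)  # row.name is the index label, i.e. the 'name' value
    dfx = df2x.iloc[df2x.index.searchsorted(row['start_date']):
                    df2x.index.searchsorted(row['end_date'])]
    return pd.Series({'maximum': dfx['high'].max(), 'minimum': dfx['low'].min()})

slow = df1.apply(get_min_max_slow, axis=1)  # one group lookup per row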
Try finding the minimum and maximum in one call. That may save some time:

def find_min_max(row):
    dfx = df2[df2['name'] == row['name']].loc[row['start_date']:row['end_date'], ['high', 'low']]
    maximum = dfx['high'].max()
    minimum = dfx['low'].min()
    return pd.Series({'maximum': maximum, 'minimum': minimum})

df1.merge(df1.apply(find_min_max, axis=1), left_index=True, right_index=True)
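
Note that for the .loc date slice above to work, df2 needs to be indexed by date; a minimal setup sketch under that assumption:

# Assumed layout for the snippet above: df2 indexed by (sorted) date so
# that .loc[start_date:end_date] slices rows by date range
df2 = df2.set_index('date').sort_index()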
Try this: multiprocessing with shared memory. Save it in a .py file and run it from the command line; it should be much faster. I set n_workers to 4, but you can change it.

import numpy as np
import pandas as pd
from multiprocessing.shared_memory import SharedMemory
from concurrent.futures import ProcessPoolExecutor, as_completed


def find_min_max(name, data_info):

    shm_name, shape, dtype = data_info[0]
    shm1 = SharedMemory(shm_name)
    np1 = np.recarray(shape=shape, dtype=dtype, buf=shm1.buf)

    shm_name, shape, dtype = data_info[1]
    shm2 = SharedMemory(shm_name)
    np2 = np.recarray(shape=shape, dtype=dtype, buf=shm2.buf)

    data1 = np1[np1['name'] == name]
    data2 = np2[np2['name'] == name]

    for rec in data1:
        idx1 = np.searchsorted(data2['date'], rec['start_date'])
        idx2 = np.searchsorted(data2['date'], rec['end_date'])
        data = data2[idx1:idx2]
        np1[rec['index']]['maximum'] = data['high'].max()
        np1[rec['index']]['minimum'] = data['low'].min()


def main():

    np.random.seed(12345)

    df1 = pd.DataFrame({'name':  ['A']*4+['B']*4,
                        'start_date':   pd.to_datetime(['2000-03-15', '2000-06-12', '2000-09-01', '2001-01-17', '2000-03-19', '2000-06-14', '2000-09-14', '2001-01-22']),
                        'end_date': pd.to_datetime(['2000-06-12', '2000-09-01', '2001-01-17', '2001-03-19', '2000-06-14', '2000-09-14', '2001-01-22', '2001-02-01'])})

    date = pd.date_range('2000-01-01', '2002-01-01')
    name = ['A']*len(date)+['B']*len(date)
    date = date.append(date)
    low = np.random.rand(len(date))
    high = low+np.random.rand(len(date))
    df2 = pd.DataFrame({'name': name, 'date': date, 'low': low, 'high': high})

    df1 = df1.sort_values('name')
    df2 = df2.sort_values(['name', 'date'])
    df1['maximum'] = -1.0
    df1['minimum'] = -1.0

    np1 = df1.to_records(column_dtypes={
        'name': '|S20', 'start_date': '<M8[ns]', 'end_date': '<M8[ns]'})
    np2 = df2.to_records(column_dtypes={
        'name': '|S20', 'date': '<M8[ns]', 'low': '<f8', 'high': '<f8'})

    names = [str.encode(name) for name in df1['name'].unique()]
    del df1
    del df2

    shm1 = SharedMemory(name='d1', create=True, size=np1.nbytes)
    shm2 = SharedMemory(name='d2', create=True, size=np2.nbytes)

    shm1_np_array = np.recarray(
        shape=np1.shape, dtype=np1.dtype, buf=shm1.buf)
    np.copyto(shm1_np_array, np1)
    shm2_np_array = np.recarray(
        shape=np2.shape, dtype=np2.dtype, buf=shm2.buf)
    np.copyto(shm2_np_array, np2)

    data_info = [
        (shm1.name, np1.shape, np1.dtype),
        (shm2.name, np2.shape, np2.dtype)
    ]

    del np1
    del np2

    # Set number of workers
    n_workers = 4

    with ProcessPoolExecutor(n_workers) as exe:
        fs = [exe.submit(find_min_max, name, data_info)
              for name in names]
        for _ in as_completed(fs):
            pass

    print(shm1_np_array)

    shm1.close()
    shm2.close()
    shm1.unlink()
    shm2.unlink()


if __name__ == "__main__":
    main()
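
If you want the filled results back in pandas, here is a hedged sketch (run inside main() after the pool finishes and before shm1.unlink(); df_out is a hypothetical name):

# Sketch: copy the filled record array back into a DataFrame. 'name'
# round-trips as fixed-width bytes (|S20), so decode it back to str.
df_out = pd.DataFrame(shm1_np_array)
df_out['name'] = df_out['name'].str.decode('utf-8')
print(df_out[['name', 'start_date', 'end_date', 'maximum', 'minimum']])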
Since performance is your problem, I think dask will help a lot here:

import pandas as pd
import numpy as np
import dask.dataframe as dd
Create the dask DataFrame:

ddf1 = dd.from_pandas(df1, npartitions=5) # You can change 5 to higher
The logic function:

def get_high_low(name, start, end):
    mask = df2['name'].eq(name) & df2['date'].between(start, end)
    low = df2.loc[mask]['low'].min()
    high = df2.loc[mask]['high'].max()
    return {'name': name, 'start_date': start, 'end_date': end, 'max': high, 'min': low}
Collect the resulting data into a new DataFrame:

result = ddf1.apply(lambda x: get_high_low(x['name'], x['start_date'], x['end_date']), axis=1, meta=(None, 'object')).compute()
df4 = pd.DataFrame(result.tolist())
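
If the computation seems to hang on larger data (as discussed in the comments below), dask's progress bar shows whether tasks are actually advancing; register it before calling .compute():

# Optional diagnostics: print a progress bar while .compute() runs
from dask.diagnostics import ProgressBar
ProgressBar().register()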

Sorry, this method actually seems to be slower than my original loop… The loop I wrote finishes within an hour, but I have been running the apply method for over an hour and it still has not finished. Not sure whether I did something wrong… but I think it may be because apply calls the mask for every row, whereas my loop hoists that operation out.

How big is your data?

df1 has 500k rows and df2 has 37 million rows.

Try running df1.apply(find_min_max, axis=1) and see how long it takes. Other options you might consider: partition the data, use multiprocessing.

Thanks for introducing dask; it seems to fit my purpose. In this case, though, I could not get your answer to work on my dataset. I replaced testing with get_high_low in the solution, and I added diagnostics (from dask.diagnostics import ProgressBar; ProgressBar().register()), but it does not make progress on any task.

Are you getting an error? My solution runs without errors on the example you provided.

It works fine when I reduce df1 to 1k rows and df2 to 1 million rows, but larger data leads to a similar increase in the time needed, or simply no progress.

That sounds like a hardware problem; your PC cannot handle files that large even with multiprocessing.