Python:使用 for 循环遍历 DataFrame 的列效率低下
我有每个单元格和日期的降水数据(1800行和15k列)Python 对于循环通过列,数据帧效率低下,python,pandas,dataframe,for-loop,if-statement,Python,Pandas,Dataframe,For Loop,If Statement,我有每个单元格和日期的降水数据(1800行和15k列) 486335 486336 486337 2019-07-03 13:35:54.445 0 2 22 2019-07-04 13:35:54.445 0 1 1 2019-07-05 13:35:54.445 16 8 22 2019-07-06 13:35:54.445
486335 486336 486337
2019-07-03 13:35:54.445 0 2 22
2019-07-04 13:35:54.445 0 1 1
2019-07-05 13:35:54.445 16 8 22
2019-07-06 13:35:54.445 0 0 0
2019-07-07 13:35:54.445 0 11 0
我想找出达到特定降雨量(>15mm)的日期,并计算该事件发生之后降雨量较少的天数。 您可以避免逐行迭代,因为它无法很好地扩展到大型 DataFrame。下面是一种不同的方法,但不确定它在您的完整 DataFrame 上是否更高效:
# For every cell (column of ``df``) find the periods that start on a day with
# more than 15 mm of precipitation and continue through the following days
# until a day with more than 0.11 mm occurs.  One row per period is collected
# into ``resul``.
# NOTE(review): assumes ``df`` has an unnamed DatetimeIndex, so that
# ``reset_index()`` yields a datetime column called 'index' -- confirm.
periods = []
for cell in df.columns:
    sub = pd.DataFrame({'amount': df[cell].values}, index=df.index)
    # Classify each day (pd.cut bins are right-inclusive by default):
    #   amount <= 0.11       -> NaN (dry day)
    #   0.11 < amount <= 15  -> 0   (rain, but not an event)
    #   amount > 15          -> 1   (event)
    # ``float`` replaces the ``np.float`` alias, which was removed from NumPy.
    sub['flag'] = pd.cut(sub['amount'], [0.11, 15, np.inf],
                         labels=[0, 1]).astype(float)
    # Number the events 1, 2, 3, ... so that each one gets its own group id.
    is_event = sub['flag'] > 0
    sub.loc[is_event, 'flag'] = sub.loc[is_event, 'flag'].cumsum()
    # Propagate the id over the dry (NaN) days that follow.  Assigning the
    # result back avoids a chained in-place call, which does not reliably
    # update ``sub`` when pandas copy-on-write is active.
    sub['flag'] = sub['flag'].ffill()
    x = sub[sub['flag'] > 0].reset_index().groupby('flag').agg(
        {'index': ['min', 'max'], 'amount': 'sum'})
    x.columns = ['start', 'end', 'amount']
    x['period_range'] = (x['end'] - x['start']).dt.days + 1
    x['cell'] = cell
    # The original discarded the result of reindex(), making it a no-op.
    # Keep 'amount' so that no previously available column is lost.
    x = x.reindex(
        columns=['start', 'end', 'period_range', 'amount', 'cell'])
    periods.append(x)
if periods:
    resul = pd.concat(periods).reset_index(drop=True)
else:
    # pd.concat raises ValueError on an empty list (``df`` without columns).
    resul = pd.DataFrame(
        columns=['start', 'end', 'period_range', 'amount', 'cell'])
由于我没有完整的数据集,无法确定时间具体耗费在哪里,但我猜是因为获取时段时的索引访问,以及在循环内执行的排序操作。 下面的代码在逻辑上应与您的代码等效,只做了少量改动:
# Scan every cell (column of ``df``) day by day and record each period that
# starts with a heavy-rain event (>= pp_min) and lasts until the next day
# with at least pp_max_1 of precipitation.
pp_min = 15.0     # mm; minimum precipitation that starts a period
pp_max_1 = 0.11   # mm; a day with at least this much rain ends the period
dry_periods = []
for counter_columns, column in enumerate(df.columns, 1):
    # Reset the state for every cell: otherwise a period that is still open
    # at the end of one column would continue into the next column.
    count = False        # True while a period is open
    duration = 0         # days in the open period, including the event day
    pp_sum = 0.0         # precipitation in the open period, incl. the event
    event_amount = 0.0   # precipitation on the event day
    start_period = None
    for period, y in df[column].items():
        if not count:
            if y >= pp_min:
                # Heavy rain: open a new period on this day.
                count = True
                duration = 1
                start_period = period
                event_amount = y
                pp_sum = y
        elif y >= pp_max_1:
            # Relevant rain closes the period.  Since pp_min > pp_max_1 the
            # additional ``y >= pp_min`` test of the original was redundant.
            dry_periods.append({
                "start_period": start_period,
                "end_period": period,
                "period_range": duration,
                "period_amount": pp_sum,
                "event_amount": event_amount,
                "cell": column})
            count = False
            duration = 0
            pp_sum = 0.0
        else:
            # Dry day inside an open period.  The original tested ``pp``
            # here, a variable that was never updated from its initial 0.
            duration += 1
            pp_sum += y
    # A period that is still open at the end of the column is not recorded.
    print("column :",counter_columns, "finished")
dry_periods.sort(key=lambda record: record['period_range'])
print(dry_periods)
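关于上面代码中的条件 `elif count and (y >= pp_min or y >= pp_max_1):`:我觉得它很可疑,但这只是照搬了您程序中的条件。如果可以的话,也许可以删掉其中一个比较,因为我猜 pp_min > pp_max_1(此时 y >= pp_min 必然意味着 y >= pp_max_1)。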
# For every cell (column of ``df``) find the periods that start on a day with
# more than 15 mm of precipitation and continue through the following days
# until a day with more than 0.11 mm occurs.  One row per period is collected
# into ``resul``.
# NOTE(review): assumes ``df`` has an unnamed DatetimeIndex, so that
# ``reset_index()`` yields a datetime column called 'index' -- confirm.
periods = []
for cell in df.columns:
    sub = pd.DataFrame({'amount': df[cell].values}, index=df.index)
    # Classify each day (pd.cut bins are right-inclusive by default):
    #   amount <= 0.11       -> NaN (dry day)
    #   0.11 < amount <= 15  -> 0   (rain, but not an event)
    #   amount > 15          -> 1   (event)
    # ``float`` replaces the ``np.float`` alias, which was removed from NumPy.
    sub['flag'] = pd.cut(sub['amount'], [0.11, 15, np.inf],
                         labels=[0, 1]).astype(float)
    # Number the events 1, 2, 3, ... so that each one gets its own group id.
    is_event = sub['flag'] > 0
    sub.loc[is_event, 'flag'] = sub.loc[is_event, 'flag'].cumsum()
    # Propagate the id over the dry (NaN) days that follow.  Assigning the
    # result back avoids a chained in-place call, which does not reliably
    # update ``sub`` when pandas copy-on-write is active.
    sub['flag'] = sub['flag'].ffill()
    x = sub[sub['flag'] > 0].reset_index().groupby('flag').agg(
        {'index': ['min', 'max'], 'amount': 'sum'})
    x.columns = ['start', 'end', 'amount']
    x['period_range'] = (x['end'] - x['start']).dt.days + 1
    x['cell'] = cell
    # The original discarded the result of reindex(), making it a no-op.
    # Keep 'amount' so that no previously available column is lost.
    x = x.reindex(
        columns=['start', 'end', 'period_range', 'amount', 'cell'])
    periods.append(x)
if periods:
    resul = pd.concat(periods).reset_index(drop=True)
else:
    # pd.concat raises ValueError on an empty list (``df`` without columns).
    resul = pd.DataFrame(
        columns=['start', 'end', 'period_range', 'amount', 'cell'])
# Scan every cell (column of ``df``) day by day and record each period that
# starts with a heavy-rain event (>= pp_min) and lasts until the next day
# with at least pp_max_1 of precipitation.
pp_min = 15.0     # mm; minimum precipitation that starts a period
pp_max_1 = 0.11   # mm; a day with at least this much rain ends the period
dry_periods = []
for counter_columns, column in enumerate(df.columns, 1):
    # Reset the state for every cell: otherwise a period that is still open
    # at the end of one column would continue into the next column.
    count = False        # True while a period is open
    duration = 0         # days in the open period, including the event day
    pp_sum = 0.0         # precipitation in the open period, incl. the event
    event_amount = 0.0   # precipitation on the event day
    start_period = None
    for period, y in df[column].items():
        if not count:
            if y >= pp_min:
                # Heavy rain: open a new period on this day.
                count = True
                duration = 1
                start_period = period
                event_amount = y
                pp_sum = y
        elif y >= pp_max_1:
            # Relevant rain closes the period.  Since pp_min > pp_max_1 the
            # additional ``y >= pp_min`` test of the original was redundant.
            dry_periods.append({
                "start_period": start_period,
                "end_period": period,
                "period_range": duration,
                "period_amount": pp_sum,
                "event_amount": event_amount,
                "cell": column})
            count = False
            duration = 0
            pp_sum = 0.0
        else:
            # Dry day inside an open period.  The original tested ``pp``
            # here, a variable that was never updated from its initial 0.
            duration += 1
            pp_sum += y
    # A period that is still open at the end of the column is not recorded.
    print("column :",counter_columns, "finished")
dry_periods.sort(key=lambda record: record['period_range'])
print(dry_periods)
elif count and (y >= pp_min or y >= pp_max_1):