Python 如何用熊猫编写迭代公式?
我正在处理数据帧以生成迭代值 例如:Python 如何用熊猫编写迭代公式?,python,python-3.x,pandas,numpy,dataframe,Python,Python 3.x,Pandas,Numpy,Dataframe,我正在处理数据帧以生成迭代值 例如: import pandas as pd import numpy as np df = pd.DataFrame([[1,1970,np.nan,np.nan],[1,1971,np.nan,np.nan],[1,1972,np.nan,0.081],[1,1973,np.nan,0.222],[1,1974,np.nan,0], [1,1975,np.nan,0],[1,1976,np.nan,0],[1,1977,np.nan,0],[2,1970,n
import pandas as pd
import numpy as np
df = pd.DataFrame([[1,1970,np.nan,np.nan],[1,1971,np.nan,np.nan],[1,1972,np.nan,0.081],[1,1973,np.nan,0.222],[1,1974,np.nan,0],
[1,1975,np.nan,0],[1,1976,np.nan,0],[1,1977,np.nan,0],[2,1970,np.nan,np.nan],[2,1971,np.nan,np.nan],[2,1972,np.nan,0.081],[2,1973,np.nan,0.222],[2,1974,np.nan,0],
[2,1975,np.nan,0],[2,1976,np.nan,0],[2,1977,np.nan,0]],columns=['id','t','y','x'])
import pandas as pd
import numpy as np
df = pd.DataFrame([[1,1970,np.nan,np.nan],[1,1971,np.nan,np.nan],[1,1972,np.nan,0.081],[1,1974,np.nan,0],
[1,1975,np.nan,0],[1,1976,np.nan,0],[1,1977,np.nan,0],[2,1970,np.nan,np.nan],[2,1971,np.nan,np.nan],[2,1972,np.nan,0.081],[2,1973,np.nan,0.222],[2,1974,np.nan,0],
[2,1975,np.nan,0],[2,1976,np.nan,0],[2,1977,np.nan,0]],columns=['id','t','y','x'])
迭代公式为:
y_t = (1 - 0.5) * y_{t-1} + x_t
其中,y\u 0
是组时间(1/0.6)
内第一个未缺失的X观察值:
对于第一组,第一个非缺失的X
值为0.081
,因此y\u 0=0.081/0.6=0.135
我还有一个问题。如果原始数据帧是不平衡的面板。对于第1组,数据框中没有1973年。对于缺失年份观测,该年中的所有变量均缺失
例如:
import pandas as pd
import numpy as np
df = pd.DataFrame([[1,1970,np.nan,np.nan],[1,1971,np.nan,np.nan],[1,1972,np.nan,0.081],[1,1973,np.nan,0.222],[1,1974,np.nan,0],
[1,1975,np.nan,0],[1,1976,np.nan,0],[1,1977,np.nan,0],[2,1970,np.nan,np.nan],[2,1971,np.nan,np.nan],[2,1972,np.nan,0.081],[2,1973,np.nan,0.222],[2,1974,np.nan,0],
[2,1975,np.nan,0],[2,1976,np.nan,0],[2,1977,np.nan,0]],columns=['id','t','y','x'])
import pandas as pd
import numpy as np
df = pd.DataFrame([[1,1970,np.nan,np.nan],[1,1971,np.nan,np.nan],[1,1972,np.nan,0.081],[1,1974,np.nan,0],
[1,1975,np.nan,0],[1,1976,np.nan,0],[1,1977,np.nan,0],[2,1970,np.nan,np.nan],[2,1971,np.nan,np.nan],[2,1972,np.nan,0.081],[2,1973,np.nan,0.222],[2,1974,np.nan,0],
[2,1975,np.nan,0],[2,1976,np.nan,0],[2,1977,np.nan,0]],columns=['id','t','y','x'])
所需输出为:
id t y x
1 1970 nan nan
1 1971 nan nan
1 1972 0.135 0.081
1 1974 nan 0
1 1975 nan 0
1 1976 nan 0
1 1977 nan 0
2 1970 nan nan
2 1971 nan nan
2 1972 0.135 0.081
2 1973 0.2895 0.222
2 1974 0.14475 0
2 1975 0.072375 0
2 1976 0.0361875 0
2 1977 0.01809375 0
我从blutab的函数中修改了apply函数,但它不起作用
def rolling_apply(group):
y = []
first_index = group.index[0]
idx=pd.date_range(start=group.index[0], end=group.last_valid_index(), freq='Y')
group=group.reindex(idx)
first_valid_index = group.x.first_valid_index().year - first_index.year
for index, x in enumerate(group.x):
if index < first_valid_index:
y.append(np.nan)
elif index == first_valid_index:
y.append( x/0.6)
else:
temp = (1-0.5)*y[-1] + x
y.append(temp)
group.y = y
#group=group.reset_index()
group=group[group['id'].notnull()]
return group
df = pd.DataFrame([[1,1970,np.nan,np.nan],[1,1971,np.nan,np.nan],[1,1972,np.nan,0.081],[1,1974,np.nan,0],
[1,1975,np.nan,0],[1,1976,np.nan,0],[1,1977,np.nan,0],[2,1970,np.nan,np.nan],[2,1971,np.nan,np.nan],[2,1972,np.nan,0.081],[2,1973,np.nan,0.222],[2,1974,np.nan,0],
[2,1975,np.nan,0],[2,1976,np.nan,0],[2,1977,np.nan,0]],columns=['id','t','y','x'])
df['year']=df['t']
df['month']=12
df['day']=31
df['date']=pd.to_datetime(df[['year','month','day']])
df=df.set_index(df['date'])
df['y'] = df.groupby(df.id).apply(rolling_apply).set_index(df['date']).y
def滚动应用(组):
y=[]
第一个索引=组索引[0]
idx=pd.date\u range(start=group.index[0],end=group.last\u valid\u index(),freq='Y')
组=组。重新索引(idx)
first\u valid\u index=组x.first\u valid\u index().年-first\u index.year
对于索引,枚举中的x(组x):
如果索引<第一个有效索引:
y、 追加(np.nan)
elif索引==第一个有效索引:
y、 附加(x/0.6)
其他:
温度=(1-0.5)*y[-1]+x
y、 附加(临时)
组y=y
#组=组。重置索引()
group=group[group['id'].notnull()
返回组
df=pd.DataFrame([[11970,np.nan,np.nan],[11971,np.nan,np.nan],[11972,np.nan,0.081],[11974,np.nan,0],
[11975,np.nan,0],[11976,np.nan,0],[11977,np.nan,0],[21970,np.nan,np.nan],[21971,np.nan,np.nan],[21972,np.nan,0.081],[21973,np.nan,0.222],[21974,np.nan,0],
[21975,np.nan,0],[21976,np.nan,0],[21977,np.nan,0]],列=['id','t','y','x'])
df['year']=df['t']
df['月]=12
df[‘天’]=31
df['date']=pd.to_datetime(df['year','month','day']]
df=df.set_索引(df['date'])
df['y']=df.groupby(df.id).apply(滚动应用).set_索引(df['date']).y
非常感谢。要进行滚动应用,可以使用pandas.groupby().apply()。在应用程序中,可以使用循环对每个组进行计算
def rolling_apply(group):
y = []
first_index = group.index[0]
first_valid_index = group.x.first_valid_index() - first_index
for index, x in enumerate(group.x):
if index < first_valid_index:
y.append(np.nan)
elif index == first_valid_index:
y.append( x/0.6)
else:
temp = (1-0.5)*y[-1] + x
y.append(temp)
group.y = y
return group
df['y'] = df.groupby(df.id).apply(rolling_apply).y
def滚动应用(组):
y=[]
第一个索引=组索引[0]
first\u valid\u index=group.x.first\u valid\u index()-first\u index
对于索引,枚举中的x(组x):
如果索引<第一个有效索引:
y、 追加(np.nan)
elif索引==第一个有效索引:
y、 附加(x/0.6)
其他:
温度=(1-0.5)*y[-1]+x
y、 附加(临时)
组y=y
返回组
df['y']=df.groupby(df.id).apply(滚动应用).y
您所说的“y_0是组时间(1/0.6)内第一个未缺失的观察值”是什么意思?您正在对数据帧进行分组吗?如果是,是id栏吗?@Roy2012,谢谢您的回复。对于第一组,第一个未缺失的X值为0.081,因此y_0=0.081/0.6=0.135。到小组中“t”的时候了。请在OP中解释你自己的努力。@siamaksafari,当然,我的代码来了。它对我很有效。我还有一个问题。如果小组不平衡,也就是说,我们在小组内有时间间隔。所以在你的函数中,你不能使用y[-1],如何解决这个问题?也许你可以先添加缺少的日期,然后应用上面的公式:让我知道这是否有帮助-或者请在缺少年份的地方发布一个输入和输出示例