Python 3.x 条形图的分组和重采样:
我有一个数据框,它记录了不同年份中几个不同位置的浓度,具有很高的时间频率(基本上,你的错误来自于这个不明确的索引,你传递的是一列连续的浮点值,用于按行选择索引,目前是日期时间类型Python 3.x 条形图的分组和重采样:,python-3.x,pandas,dataframe,pandas-groupby,timeserieschart,Python 3.x,Pandas,Dataframe,Pandas Groupby,Timeserieschart,我有一个数据框,它记录了不同年份中几个不同位置的浓度,具有很高的时间频率(基本上,你的错误来自于这个不明确的索引,你传递的是一列连续的浮点值,用于按行选择索引,目前是日期时间类型 df_new[df_new['value']] # INDEXING DATETIME USING FLOAT VALUES ... df_month[df_month['value']] # COLUMN value DOES NOT EXIST 您可能打算在重采样期间选择列值(从其
df_new[df_new['value']] # INDEXING DATETIME USING FLOAT VALUES
...
df_month[df_month['value']] # COLUMN value DOES NOT EXIST
您可能打算在重采样期间选择列值(从其他列中选择)
Unnamed: 0 location value \
date location value
2017-10-21 08:45:00+05:30 8335 M 339.3
2017-08-18 17:45:00+05:30 8344 M 45.1
2017-11-08 13:15:00+05:30 8347 L 594.4
2017-10-21 13:15:00+05:30 8659 N 189.9
2017-08-18 15:45:00+05:30 8662 N 46.5
diurnal = df_new['value'].resample('12h')
diurnal.mean()[diurnal.count() >= 9]
daily_mean = diurnal_mean.resample('d').mean()
df_month = daily_mean.resample('m').mean() # REMOVE value BEING UNDERLYING SERIES
df_yearly = df_month.resample('y')
但是,上面没有任何位置可以保留打印位置。因此,使用groupby(pd.Grouper(…)
用随机、可复制的数据证明: 数据
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
np.random.seed(242020)
random_df = pd.DataFrame({'date': (np.random.choice(pd.date_range('2017-01-01', '2019-12-31'), 5000) +
pd.to_timedelta(np.random.randint(60*60, 60*60*24, 5000), unit='s')),
'location': np.random.choice(list("KLM"), 5000),
'value': np.random.uniform(10, 1000, 5000)
})
聚合
loc_list = list("KLM")
# NEW DATA FRAME WITH DATA FILTERING
df = (random_df.set_index(random_df['date'])
.assign(Year = lambda x: x['date'].dt.year,
location = lambda x: x['location'].where(x["location"] != "mm", "M"))
.query('(location == @loc_list) and (value >= 2 and value <= 400)')
)
# 12h AGGREGATION
diurnal = (df_new.groupby(["location", pd.Grouper(freq='12h')])["value"]
.agg(["count", "mean"])
.reset_index().set_index(['date'])
.query("count >= 2")
)
# d, m, y AGGREGATION
daily_mean = diurnal.groupby(["location", pd.Grouper(freq='d')])["mean"].mean()
df_month = diurnal.groupby(["location", pd.Grouper(freq='m')])["mean"].mean()
df_yearly = (diurnal.groupby(["location", pd.Grouper(freq='y')])["mean"].mean()
.reset_index()
.assign(Year = lambda x: x["date"].dt.year)
)
print(df_yearly)
# location date mean Year
# 0 K 2017-12-31 188.984592 2017
# 1 K 2018-12-31 199.521702 2018
# 2 K 2019-12-31 216.497268 2019
# 3 L 2017-12-31 214.347873 2017
# 4 L 2018-12-31 199.232711 2018
# 5 L 2019-12-31 177.689221 2019
# 6 M 2017-12-31 222.412711 2017
# 7 M 2018-12-31 241.597977 2018
# 8 M 2019-12-31 215.554228 2019
请发布a的
df
数据,然后理想地显示所需的结果。这是df
之前还是之后?作为测试,请尝试完全运行您发布的内容(数据+代码)在空的Python环境中,确保它复制错误或不希望的结果。这是一个虚拟数据帧,显示了我要绘制的内容;值列理想情况下应该包括最终重采样的质量控制浓度。
# AGGREGATE TO KEEP LOCATION AND 12h
diurnal = (df_new.groupby(["location", pd.Grouper(freq='12h')])["value"]
.agg(["count", "mean"])
.reset_index().set_index(['date'])
)
# FILTER
diurnal_sub = diurnal[diurnal["count"] >= 9]
# MULTIPLE DATE TIME LEVEL MEANS
daily_mean = diurnal_sub.groupby(["location", pd.Grouper(freq='d')])["mean"].mean()
df_month = diurnal_sub.groupby(["location", pd.Grouper(freq='m')])["mean"].mean()
df_yearly = diurnal_sub.groupby(["location", pd.Grouper(freq='y')])["mean"].mean()
print(df_yearly)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
np.random.seed(242020)
random_df = pd.DataFrame({'date': (np.random.choice(pd.date_range('2017-01-01', '2019-12-31'), 5000) +
pd.to_timedelta(np.random.randint(60*60, 60*60*24, 5000), unit='s')),
'location': np.random.choice(list("KLM"), 5000),
'value': np.random.uniform(10, 1000, 5000)
})
loc_list = list("KLM")
# NEW DATA FRAME WITH DATA FILTERING
df = (random_df.set_index(random_df['date'])
.assign(Year = lambda x: x['date'].dt.year,
location = lambda x: x['location'].where(x["location"] != "mm", "M"))
.query('(location == @loc_list) and (value >= 2 and value <= 400)')
)
# 12h AGGREGATION
diurnal = (df_new.groupby(["location", pd.Grouper(freq='12h')])["value"]
.agg(["count", "mean"])
.reset_index().set_index(['date'])
.query("count >= 2")
)
# d, m, y AGGREGATION
daily_mean = diurnal.groupby(["location", pd.Grouper(freq='d')])["mean"].mean()
df_month = diurnal.groupby(["location", pd.Grouper(freq='m')])["mean"].mean()
df_yearly = (diurnal.groupby(["location", pd.Grouper(freq='y')])["mean"].mean()
.reset_index()
.assign(Year = lambda x: x["date"].dt.year)
)
print(df_yearly)
# location date mean Year
# 0 K 2017-12-31 188.984592 2017
# 1 K 2018-12-31 199.521702 2018
# 2 K 2019-12-31 216.497268 2019
# 3 L 2017-12-31 214.347873 2017
# 4 L 2018-12-31 199.232711 2018
# 5 L 2019-12-31 177.689221 2019
# 6 M 2017-12-31 222.412711 2017
# 7 M 2018-12-31 241.597977 2018
# 8 M 2019-12-31 215.554228 2019
sns.set()
fig, axs = plt.subplots(figsize=(12,5))
sns.barplot(x='location', y='mean', hue='Year', data= df_yearly, ax=axs)
plt.title("Location Value Yearly Aggregation", weight="bold", size=16)
plt.show()
plt.clf()
plt.close()