Python 根据日期将数据一分为二
我想按年份将我的数据分成两半。也就是说,对于下面的样本数据,我需要得到两个独立的数据框:一个包含每年(按日期)前 50% 的行,另一个包含其余的行。附加条件是:这 50% 需要以“LG”列为基础,在每个 LG 分组内分别计算。有人能帮我吗?
标签:python, pandas, numpy
样本数据:
import pandas as pd
import numpy as np
# Sample frame: a group label ('LG'), a date string ('Date') and the year ('Year').
df = pd.DataFrame(
    {'LG' : ('AR1', 'AR1', 'AR1', 'AR1', 'AR1', 'AR1', 'PO1', 'PO1', 'AR1', 'AR1', 'PO1', 'PO1'),
     'Date': ('2011-1-1', '2011-3-1', '2011-4-1', '2011-2-1', '2012-1-1', '2012-2-1', '2012-1-1', '2012-2-1', '2013-1-1', '2013-2-1', '2013-1-1', '2013-2-1'),
     'Year': (2011, 2011, 2011, 2011, 2012, 2012, 2012, 2012, 2013, 2013, 2013, 2013)})
# pd.to_datetime returns a new Series and leaves df untouched, so the result
# has to be assigned back for 'Date' to actually become a datetime column.
df['Date'] = pd.to_datetime(df['Date'])
Date LG Year
0 2011-01-01 AR1 2011
1 2011-03-01 AR1 2011
2 2011-04-01 AR1 2011
3 2011-02-01 AR1 2011
4 2012-01-01 AR1 2012
5 2012-02-01 AR1 2012
6 2012-01-01 PO1 2012
7 2012-02-01 PO1 2012
8 2013-01-01 AR1 2013
9 2013-02-01 AR1 2013
10 2013-01-01 PO1 2013
11 2013-02-01 PO1 2013
# Split every ('Year', 'LG') group into its chronologically first half (`top`)
# and the remainder (`bot`). A row belongs to the first half when its 0-based
# position inside the date-ordered group is below 50% of the group size, so
# for an odd-sized group the extra row goes to `top`.
idx = ['Year', 'LG']

# Helper frame on a fresh 0..n-1 index: the group keys plus a parsed date.
# Parsing with pd.to_datetime makes the ordering chronological even when
# 'Date' holds strings such as '2011-10-1', which would otherwise sort
# before '2011-2-1'. df itself is not modified.
df1 = df[idx].reset_index(drop=True)
df1['when'] = pd.to_datetime(df['Date']).to_numpy()
df1 = df1.sort_values(idx + ['when'])

# build a grouper over the date-ordered rows
group_by = df1.groupby(idx)
position = group_by.cumcount()
# transform('size') yields one value per row, so no index re-alignment between
# a per-group result and the per-row frame is needed
df1['g_size'] = group_by['when'].transform('size')

# sort_index() restores the original row order, which keeps the mask
# positionally aligned with df
top_half = (position / df1['g_size']).sort_index().to_numpy() < 0.5

# build new data frames
top = df.loc[top_half]
bot = df.loc[~top_half]

print(df)
print('-- top')
print(top)
print('-- bot')
print(bot)
print('--')
Date LG Year
0 2011-1-1 AR1 2011
1 2011-3-1 AR1 2011
2 2011-4-1 AR1 2011
3 2011-2-1 AR1 2011
4 2012-1-1 AR1 2012
5 2012-2-1 AR1 2012
6 2012-1-1 PO1 2012
7 2012-2-1 PO1 2012
8 2013-1-1 AR1 2013
9 2013-2-1 AR1 2013
10 2013-1-1 PO1 2013
11 2013-2-1 PO1 2013
-- top
Year LG Date
0 2011 AR1 2011-1-1
1 2011 AR1 2011-2-1
2 2012 AR1 2012-1-1
3 2012 PO1 2012-1-1
4 2013 AR1 2013-1-1
5 2013 PO1 2013-1-1
-- bot
Year LG Date
0 2011 AR1 2011-3-1
1 2011 AR1 2011-4-1
2 2012 AR1 2012-2-1
3 2012 PO1 2012-2-1
4 2013 AR1 2013-2-1
5 2013 PO1 2013-2-1
# Sample frame: a group label ('LG'), a date string ('Date') and the year ('Year').
df = pd.DataFrame({
    'LG': ('AR1', 'AR1', 'AR1', 'AR1', 'AR1', 'AR1',
           'PO1', 'PO1', 'AR1', 'AR1', 'PO1', 'PO1'),
    'Date': ('2011-1-1', '2011-3-1', '2011-4-1', '2011-2-1', '2012-1-1',
             '2012-2-1', '2012-1-1', '2012-2-1', '2013-1-1', '2013-2-1',
             '2013-1-1', '2013-2-1'),
    'Year': (2011, 2011, 2011, 2011, 2012, 2012, 2012, 2012, 2013,
             2013, 2013, 2013)
})
# pd.to_datetime returns a new Series and leaves df untouched, so the result
# has to be assigned back for 'Date' to actually become a datetime column.
df['Date'] = pd.to_datetime(df['Date'])
df:
import pandas as pd
import numpy as np
# Sample frame: a group label ('LG'), a date string ('Date') and the year ('Year').
df = pd.DataFrame(
    {'LG' : ('AR1', 'AR1', 'AR1', 'AR1', 'AR1', 'AR1', 'PO1', 'PO1', 'AR1', 'AR1', 'PO1', 'PO1'),
     'Date': ('2011-1-1', '2011-3-1', '2011-4-1', '2011-2-1', '2012-1-1', '2012-2-1', '2012-1-1', '2012-2-1', '2013-1-1', '2013-2-1', '2013-1-1', '2013-2-1'),
     'Year': (2011, 2011, 2011, 2011, 2012, 2012, 2012, 2012, 2013, 2013, 2013, 2013)})
# pd.to_datetime returns a new Series and leaves df untouched, so the result
# has to be assigned back for 'Date' to actually become a datetime column.
df['Date'] = pd.to_datetime(df['Date'])
Date LG Year
0 2011-01-01 AR1 2011
1 2011-03-01 AR1 2011
2 2011-04-01 AR1 2011
3 2011-02-01 AR1 2011
4 2012-01-01 AR1 2012
5 2012-02-01 AR1 2012
6 2012-01-01 PO1 2012
7 2012-02-01 PO1 2012
8 2013-01-01 AR1 2013
9 2013-02-01 AR1 2013
10 2013-01-01 PO1 2013
11 2013-02-01 PO1 2013
# Split every ('Year', 'LG') group into its chronologically first half (`top`)
# and the remainder (`bot`). A row belongs to the first half when its 0-based
# position inside the date-ordered group is below 50% of the group size, so
# for an odd-sized group the extra row goes to `top`.
idx = ['Year', 'LG']

# Helper frame on a fresh 0..n-1 index: the group keys plus a parsed date.
# Parsing with pd.to_datetime makes the ordering chronological even when
# 'Date' holds strings such as '2011-10-1', which would otherwise sort
# before '2011-2-1'. df itself is not modified.
df1 = df[idx].reset_index(drop=True)
df1['when'] = pd.to_datetime(df['Date']).to_numpy()
df1 = df1.sort_values(idx + ['when'])

# build a grouper over the date-ordered rows
group_by = df1.groupby(idx)
position = group_by.cumcount()
# transform('size') yields one value per row, so no index re-alignment between
# a per-group result and the per-row frame is needed
df1['g_size'] = group_by['when'].transform('size')

# sort_index() restores the original row order, which keeps the mask
# positionally aligned with df
top_half = (position / df1['g_size']).sort_index().to_numpy() < 0.5

# build new data frames
top = df.loc[top_half]
bot = df.loc[~top_half]

print(df)
print('-- top')
print(top)
print('-- bot')
print(bot)
print('--')
Date LG Year
0 2011-1-1 AR1 2011
1 2011-3-1 AR1 2011
2 2011-4-1 AR1 2011
3 2011-2-1 AR1 2011
4 2012-1-1 AR1 2012
5 2012-2-1 AR1 2012
6 2012-1-1 PO1 2012
7 2012-2-1 PO1 2012
8 2013-1-1 AR1 2013
9 2013-2-1 AR1 2013
10 2013-1-1 PO1 2013
11 2013-2-1 PO1 2013
-- top
Year LG Date
0 2011 AR1 2011-1-1
1 2011 AR1 2011-2-1
2 2012 AR1 2012-1-1
3 2012 PO1 2012-1-1
4 2013 AR1 2013-1-1
5 2013 PO1 2013-1-1
-- bot
Year LG Date
0 2011 AR1 2011-3-1
1 2011 AR1 2011-4-1
2 2012 AR1 2012-2-1
3 2012 PO1 2012-2-1
4 2013 AR1 2013-2-1
5 2013 PO1 2013-2-1
# Sample frame: a group label ('LG'), a date string ('Date') and the year ('Year').
df = pd.DataFrame({
    'LG': ('AR1', 'AR1', 'AR1', 'AR1', 'AR1', 'AR1',
           'PO1', 'PO1', 'AR1', 'AR1', 'PO1', 'PO1'),
    'Date': ('2011-1-1', '2011-3-1', '2011-4-1', '2011-2-1', '2012-1-1',
             '2012-2-1', '2012-1-1', '2012-2-1', '2013-1-1', '2013-2-1',
             '2013-1-1', '2013-2-1'),
    'Year': (2011, 2011, 2011, 2011, 2012, 2012, 2012, 2012, 2013,
             2013, 2013, 2013)
})
# pd.to_datetime returns a new Series and leaves df untouched, so the result
# has to be assigned back for 'Date' to actually become a datetime column.
df['Date'] = pd.to_datetime(df['Date'])
在按 `Year`(年份)和 `LG` 进行分组后,将数据框一分为二。基本思想是:在每个组内,找出位置小于该组大小 50% 的那些行。
代码:
import pandas as pd
import numpy as np
# Sample frame: a group label ('LG'), a date string ('Date') and the year ('Year').
df = pd.DataFrame(
    {'LG' : ('AR1', 'AR1', 'AR1', 'AR1', 'AR1', 'AR1', 'PO1', 'PO1', 'AR1', 'AR1', 'PO1', 'PO1'),
     'Date': ('2011-1-1', '2011-3-1', '2011-4-1', '2011-2-1', '2012-1-1', '2012-2-1', '2012-1-1', '2012-2-1', '2013-1-1', '2013-2-1', '2013-1-1', '2013-2-1'),
     'Year': (2011, 2011, 2011, 2011, 2012, 2012, 2012, 2012, 2013, 2013, 2013, 2013)})
# pd.to_datetime returns a new Series and leaves df untouched, so the result
# has to be assigned back for 'Date' to actually become a datetime column.
df['Date'] = pd.to_datetime(df['Date'])
Date LG Year
0 2011-01-01 AR1 2011
1 2011-03-01 AR1 2011
2 2011-04-01 AR1 2011
3 2011-02-01 AR1 2011
4 2012-01-01 AR1 2012
5 2012-02-01 AR1 2012
6 2012-01-01 PO1 2012
7 2012-02-01 PO1 2012
8 2013-01-01 AR1 2013
9 2013-02-01 AR1 2013
10 2013-01-01 PO1 2013
11 2013-02-01 PO1 2013
# Split every ('Year', 'LG') group into its chronologically first half (`top`)
# and the remainder (`bot`). A row belongs to the first half when its 0-based
# position inside the date-ordered group is below 50% of the group size, so
# for an odd-sized group the extra row goes to `top`.
idx = ['Year', 'LG']

# Helper frame on a fresh 0..n-1 index: the group keys plus a parsed date.
# Parsing with pd.to_datetime makes the ordering chronological even when
# 'Date' holds strings such as '2011-10-1', which would otherwise sort
# before '2011-2-1'. df itself is not modified.
df1 = df[idx].reset_index(drop=True)
df1['when'] = pd.to_datetime(df['Date']).to_numpy()
df1 = df1.sort_values(idx + ['when'])

# build a grouper over the date-ordered rows
group_by = df1.groupby(idx)
position = group_by.cumcount()
# transform('size') yields one value per row, so no index re-alignment between
# a per-group result and the per-row frame is needed
df1['g_size'] = group_by['when'].transform('size')

# sort_index() restores the original row order, which keeps the mask
# positionally aligned with df
top_half = (position / df1['g_size']).sort_index().to_numpy() < 0.5

# build new data frames
top = df.loc[top_half]
bot = df.loc[~top_half]

print(df)
print('-- top')
print(top)
print('-- bot')
print(bot)
print('--')
Date LG Year
0 2011-1-1 AR1 2011
1 2011-3-1 AR1 2011
2 2011-4-1 AR1 2011
3 2011-2-1 AR1 2011
4 2012-1-1 AR1 2012
5 2012-2-1 AR1 2012
6 2012-1-1 PO1 2012
7 2012-2-1 PO1 2012
8 2013-1-1 AR1 2013
9 2013-2-1 AR1 2013
10 2013-1-1 PO1 2013
11 2013-2-1 PO1 2013
-- top
Year LG Date
0 2011 AR1 2011-1-1
1 2011 AR1 2011-2-1
2 2012 AR1 2012-1-1
3 2012 PO1 2012-1-1
4 2013 AR1 2013-1-1
5 2013 PO1 2013-1-1
-- bot
Year LG Date
0 2011 AR1 2011-3-1
1 2011 AR1 2011-4-1
2 2012 AR1 2012-2-1
3 2012 PO1 2012-2-1
4 2013 AR1 2013-2-1
5 2013 PO1 2013-2-1
# Sample frame: a group label ('LG'), a date string ('Date') and the year ('Year').
df = pd.DataFrame({
    'LG': ('AR1', 'AR1', 'AR1', 'AR1', 'AR1', 'AR1',
           'PO1', 'PO1', 'AR1', 'AR1', 'PO1', 'PO1'),
    'Date': ('2011-1-1', '2011-3-1', '2011-4-1', '2011-2-1', '2012-1-1',
             '2012-2-1', '2012-1-1', '2012-2-1', '2013-1-1', '2013-2-1',
             '2013-1-1', '2013-2-1'),
    'Year': (2011, 2011, 2011, 2011, 2012, 2012, 2012, 2012, 2013,
             2013, 2013, 2013)
})
# pd.to_datetime returns a new Series and leaves df untouched, so the result
# has to be assigned back for 'Date' to actually become a datetime column.
df['Date'] = pd.to_datetime(df['Date'])
排序结果:
import pandas as pd
import numpy as np
# Sample frame: a group label ('LG'), a date string ('Date') and the year ('Year').
df = pd.DataFrame(
    {'LG' : ('AR1', 'AR1', 'AR1', 'AR1', 'AR1', 'AR1', 'PO1', 'PO1', 'AR1', 'AR1', 'PO1', 'PO1'),
     'Date': ('2011-1-1', '2011-3-1', '2011-4-1', '2011-2-1', '2012-1-1', '2012-2-1', '2012-1-1', '2012-2-1', '2013-1-1', '2013-2-1', '2013-1-1', '2013-2-1'),
     'Year': (2011, 2011, 2011, 2011, 2012, 2012, 2012, 2012, 2013, 2013, 2013, 2013)})
# pd.to_datetime returns a new Series and leaves df untouched, so the result
# has to be assigned back for 'Date' to actually become a datetime column.
df['Date'] = pd.to_datetime(df['Date'])
Date LG Year
0 2011-01-01 AR1 2011
1 2011-03-01 AR1 2011
2 2011-04-01 AR1 2011
3 2011-02-01 AR1 2011
4 2012-01-01 AR1 2012
5 2012-02-01 AR1 2012
6 2012-01-01 PO1 2012
7 2012-02-01 PO1 2012
8 2013-01-01 AR1 2013
9 2013-02-01 AR1 2013
10 2013-01-01 PO1 2013
11 2013-02-01 PO1 2013
# Split every ('Year', 'LG') group into its chronologically first half (`top`)
# and the remainder (`bot`). A row belongs to the first half when its 0-based
# position inside the date-ordered group is below 50% of the group size, so
# for an odd-sized group the extra row goes to `top`.
idx = ['Year', 'LG']

# Helper frame on a fresh 0..n-1 index: the group keys plus a parsed date.
# Parsing with pd.to_datetime makes the ordering chronological even when
# 'Date' holds strings such as '2011-10-1', which would otherwise sort
# before '2011-2-1'. df itself is not modified.
df1 = df[idx].reset_index(drop=True)
df1['when'] = pd.to_datetime(df['Date']).to_numpy()
df1 = df1.sort_values(idx + ['when'])

# build a grouper over the date-ordered rows
group_by = df1.groupby(idx)
position = group_by.cumcount()
# transform('size') yields one value per row, so no index re-alignment between
# a per-group result and the per-row frame is needed
df1['g_size'] = group_by['when'].transform('size')

# sort_index() restores the original row order, which keeps the mask
# positionally aligned with df
top_half = (position / df1['g_size']).sort_index().to_numpy() < 0.5

# build new data frames
top = df.loc[top_half]
bot = df.loc[~top_half]

print(df)
print('-- top')
print(top)
print('-- bot')
print(bot)
print('--')
Date LG Year
0 2011-1-1 AR1 2011
1 2011-3-1 AR1 2011
2 2011-4-1 AR1 2011
3 2011-2-1 AR1 2011
4 2012-1-1 AR1 2012
5 2012-2-1 AR1 2012
6 2012-1-1 PO1 2012
7 2012-2-1 PO1 2012
8 2013-1-1 AR1 2013
9 2013-2-1 AR1 2013
10 2013-1-1 PO1 2013
11 2013-2-1 PO1 2013
-- top
Year LG Date
0 2011 AR1 2011-1-1
1 2011 AR1 2011-2-1
2 2012 AR1 2012-1-1
3 2012 PO1 2012-1-1
4 2013 AR1 2013-1-1
5 2013 PO1 2013-1-1
-- bot
Year LG Date
0 2011 AR1 2011-3-1
1 2011 AR1 2011-4-1
2 2012 AR1 2012-2-1
3 2012 PO1 2012-2-1
4 2013 AR1 2013-2-1
5 2013 PO1 2013-2-1
# Sample frame: a group label ('LG'), a date string ('Date') and the year ('Year').
df = pd.DataFrame({
    'LG': ('AR1', 'AR1', 'AR1', 'AR1', 'AR1', 'AR1',
           'PO1', 'PO1', 'AR1', 'AR1', 'PO1', 'PO1'),
    'Date': ('2011-1-1', '2011-3-1', '2011-4-1', '2011-2-1', '2012-1-1',
             '2012-2-1', '2012-1-1', '2012-2-1', '2013-1-1', '2013-2-1',
             '2013-1-1', '2013-2-1'),
    'Year': (2011, 2011, 2011, 2011, 2012, 2012, 2012, 2012, 2013,
             2013, 2013, 2013)
})
# pd.to_datetime returns a new Series and leaves df untouched, so the result
# has to be assigned back for 'Date' to actually become a datetime column.
df['Date'] = pd.to_datetime(df['Date'])
测试数据:
import pandas as pd
import numpy as np
# Sample frame: a group label ('LG'), a date string ('Date') and the year ('Year').
df = pd.DataFrame(
    {'LG' : ('AR1', 'AR1', 'AR1', 'AR1', 'AR1', 'AR1', 'PO1', 'PO1', 'AR1', 'AR1', 'PO1', 'PO1'),
     'Date': ('2011-1-1', '2011-3-1', '2011-4-1', '2011-2-1', '2012-1-1', '2012-2-1', '2012-1-1', '2012-2-1', '2013-1-1', '2013-2-1', '2013-1-1', '2013-2-1'),
     'Year': (2011, 2011, 2011, 2011, 2012, 2012, 2012, 2012, 2013, 2013, 2013, 2013)})
# pd.to_datetime returns a new Series and leaves df untouched, so the result
# has to be assigned back for 'Date' to actually become a datetime column.
df['Date'] = pd.to_datetime(df['Date'])
Date LG Year
0 2011-01-01 AR1 2011
1 2011-03-01 AR1 2011
2 2011-04-01 AR1 2011
3 2011-02-01 AR1 2011
4 2012-01-01 AR1 2012
5 2012-02-01 AR1 2012
6 2012-01-01 PO1 2012
7 2012-02-01 PO1 2012
8 2013-01-01 AR1 2013
9 2013-02-01 AR1 2013
10 2013-01-01 PO1 2013
11 2013-02-01 PO1 2013
# Split every ('Year', 'LG') group into its chronologically first half (`top`)
# and the remainder (`bot`). A row belongs to the first half when its 0-based
# position inside the date-ordered group is below 50% of the group size, so
# for an odd-sized group the extra row goes to `top`.
idx = ['Year', 'LG']

# Helper frame on a fresh 0..n-1 index: the group keys plus a parsed date.
# Parsing with pd.to_datetime makes the ordering chronological even when
# 'Date' holds strings such as '2011-10-1', which would otherwise sort
# before '2011-2-1'. df itself is not modified.
df1 = df[idx].reset_index(drop=True)
df1['when'] = pd.to_datetime(df['Date']).to_numpy()
df1 = df1.sort_values(idx + ['when'])

# build a grouper over the date-ordered rows
group_by = df1.groupby(idx)
position = group_by.cumcount()
# transform('size') yields one value per row, so no index re-alignment between
# a per-group result and the per-row frame is needed
df1['g_size'] = group_by['when'].transform('size')

# sort_index() restores the original row order, which keeps the mask
# positionally aligned with df
top_half = (position / df1['g_size']).sort_index().to_numpy() < 0.5

# build new data frames
top = df.loc[top_half]
bot = df.loc[~top_half]

print(df)
print('-- top')
print(top)
print('-- bot')
print(bot)
print('--')
Date LG Year
0 2011-1-1 AR1 2011
1 2011-3-1 AR1 2011
2 2011-4-1 AR1 2011
3 2011-2-1 AR1 2011
4 2012-1-1 AR1 2012
5 2012-2-1 AR1 2012
6 2012-1-1 PO1 2012
7 2012-2-1 PO1 2012
8 2013-1-1 AR1 2013
9 2013-2-1 AR1 2013
10 2013-1-1 PO1 2013
11 2013-2-1 PO1 2013
-- top
Year LG Date
0 2011 AR1 2011-1-1
1 2011 AR1 2011-2-1
2 2012 AR1 2012-1-1
3 2012 PO1 2012-1-1
4 2013 AR1 2013-1-1
5 2013 PO1 2013-1-1
-- bot
Year LG Date
0 2011 AR1 2011-3-1
1 2011 AR1 2011-4-1
2 2012 AR1 2012-2-1
3 2012 PO1 2012-2-1
4 2013 AR1 2013-2-1
5 2013 PO1 2013-2-1
# Sample frame: a group label ('LG'), a date string ('Date') and the year ('Year').
df = pd.DataFrame({
    'LG': ('AR1', 'AR1', 'AR1', 'AR1', 'AR1', 'AR1',
           'PO1', 'PO1', 'AR1', 'AR1', 'PO1', 'PO1'),
    'Date': ('2011-1-1', '2011-3-1', '2011-4-1', '2011-2-1', '2012-1-1',
             '2012-2-1', '2012-1-1', '2012-2-1', '2013-1-1', '2013-2-1',
             '2013-1-1', '2013-2-1'),
    'Year': (2011, 2011, 2011, 2011, 2012, 2012, 2012, 2012, 2013,
             2013, 2013, 2013)
})
# pd.to_datetime returns a new Series and leaves df untouched, so the result
# has to be assigned back for 'Date' to actually become a datetime column.
df['Date'] = pd.to_datetime(df['Date'])
`df['Date'].apply(pd.to_datetime)` 是 `pd.to_datetime(df['Date'])` 的一种较慢的写法。
(已根据您的评论进行编辑。)
评论:
- 谢谢,不过还有一个问题:2011 年的 AR1 没有被正确拆分。2011-2-1 在“底部”组,而 2011-3-1 在“顶部”组。为什么?
- 哦,你是想先排序?你没有给出任何示例输出,所以我没有做这个假设。让我改一下……
- 是的,是我的错。我指的是按日期取每年的前 50%,而不是按表中显示的行顺序。
- 谢谢。现在看起来怎么样?