Python 根据日期将数据一分为二

Python 根据日期将数据一分为二,python,pandas,numpy,Python,Pandas,Numpy,我想按年份将我的数据分成两半。因此,在我下面的样本数据中,我需要得到两个独立的数据框,一个是每年前50%的数据框,另一个是另一半的数据框。附加条件是,50%需要以“LG”列为基础 有人能帮我吗 样本数据: import pandas as pd import numpy as np df = pd.DataFrame( {'LG' : ('AR1', 'AR1', 'AR1', 'AR1', 'AR1', 'AR1', 'PO1', 'PO1', 'AR1', 'AR1', 'PO1

我想按年份将我的数据分成两半。因此,对于下面的样本数据,我需要得到两个独立的数据框:一个包含每年(按日期排序)前 50% 的行,另一个包含其余的行。附加条件是,这 50% 需要在每个“LG”分组内分别计算。

有人能帮我吗

样本数据:

import pandas as pd
import numpy as np

# Sample data from the question: one row per (LG, Date) observation, with the
# calendar year duplicated in its own 'Year' column.
df = pd.DataFrame({
    'LG': ('AR1', 'AR1', 'AR1', 'AR1', 'AR1', 'AR1',
           'PO1', 'PO1', 'AR1', 'AR1', 'PO1', 'PO1'),
    'Date': ('2011-1-1', '2011-3-1', '2011-4-1', '2011-2-1', '2012-1-1',
             '2012-2-1', '2012-1-1', '2012-2-1', '2013-1-1', '2013-2-1',
             '2013-1-1', '2013-2-1'),
    'Year': (2011, 2011, 2011, 2011, 2012, 2012, 2012, 2012, 2013,
             2013, 2013, 2013),
})

# pd.to_datetime returns a new Series rather than converting in place, so the
# result has to be assigned back for 'Date' to actually become datetime64.
df['Date'] = pd.to_datetime(df['Date'])
         Date   LG  Year
0  2011-01-01  AR1  2011
1  2011-03-01  AR1  2011
2  2011-04-01  AR1  2011
3  2011-02-01  AR1  2011
4  2012-01-01  AR1  2012
5  2012-02-01  AR1  2012
6  2012-01-01  PO1  2012
7  2012-02-01  PO1  2012
8  2013-01-01  AR1  2013
9  2013-02-01  AR1  2013
10 2013-01-01  PO1  2013
11 2013-02-01  PO1  2013
# Split every ('Year', 'LG') group into its chronologically first half (`top`)
# and the remaining rows (`bot`).
idx = ['Year', 'LG']

# Order the rows by date inside each group first; otherwise "first half" would
# depend on the input row order. The sort key is parsed with pd.to_datetime so
# that string dates compare chronologically ('2011-10-1' after '2011-2-1');
# the 'Date' column itself is left exactly as it was.
ordered = (
    df.assign(_sort_date=pd.to_datetime(df['Date']))
    .sort_values(idx + ['_sort_date'], kind='mergesort')  # stable sort: ties keep input order
    .drop(columns='_sort_date')
)

# build a grouper over the ordered rows
group_by = ordered.groupby(idx)

# Size of the group each row belongs to, one value per row. transform('size')
# is used instead of assigning group_by.size() to a re-indexed frame, because
# size() returns a DataFrame (not an alignable Series) under as_index=False
# on current pandas.
g_size = group_by['Date'].transform('size')

# find the rows in the top half of respective group: a row is in the top half
# when its 0-based position within the group is below 50% of the group size
top_half = (group_by.cumcount().values / g_size.values) < 0.5

# build new data frames, renumbered from 0 like the printed result
top = ordered.loc[top_half].reset_index(drop=True)
bot = ordered.loc[~top_half].reset_index(drop=True)
print(df)
print('-- top')
print(top)
print('-- bot')
print(bot)
print('--')
        Date   LG  Year
0   2011-1-1  AR1  2011
1   2011-3-1  AR1  2011
2   2011-4-1  AR1  2011
3   2011-2-1  AR1  2011
4   2012-1-1  AR1  2012
5   2012-2-1  AR1  2012
6   2012-1-1  PO1  2012
7   2012-2-1  PO1  2012
8   2013-1-1  AR1  2013
9   2013-2-1  AR1  2013
10  2013-1-1  PO1  2013
11  2013-2-1  PO1  2013
-- top
   Year   LG      Date
0  2011  AR1  2011-1-1
1  2011  AR1  2011-2-1
2  2012  AR1  2012-1-1
3  2012  PO1  2012-1-1
4  2013  AR1  2013-1-1
5  2013  PO1  2013-1-1
-- bot
   Year   LG      Date
0  2011  AR1  2011-3-1
1  2011  AR1  2011-4-1
2  2012  AR1  2012-2-1
3  2012  PO1  2012-2-1
4  2013  AR1  2013-2-1
5  2013  PO1  2013-2-1
# Test data used by the answer: one row per (LG, Date) observation, with the
# calendar year duplicated in its own 'Year' column.
df = pd.DataFrame({
    'LG': ('AR1', 'AR1', 'AR1', 'AR1', 'AR1', 'AR1',
           'PO1', 'PO1', 'AR1', 'AR1', 'PO1', 'PO1'),
    'Date': ('2011-1-1', '2011-3-1', '2011-4-1', '2011-2-1', '2012-1-1',
             '2012-2-1', '2012-1-1', '2012-2-1', '2013-1-1', '2013-2-1',
             '2013-1-1', '2013-2-1'),
    'Year': (2011, 2011, 2011, 2011, 2012, 2012, 2012, 2012, 2013,
             2013, 2013, 2013),
})
# pd.to_datetime returns a new Series rather than converting in place, so the
# result has to be assigned back for 'Date' to actually become datetime64.
df['Date'] = pd.to_datetime(df['Date'])
df:

import pandas as pd
import numpy as np

# Sample data from the question: one row per (LG, Date) observation, with the
# calendar year duplicated in its own 'Year' column.
df = pd.DataFrame({
    'LG': ('AR1', 'AR1', 'AR1', 'AR1', 'AR1', 'AR1',
           'PO1', 'PO1', 'AR1', 'AR1', 'PO1', 'PO1'),
    'Date': ('2011-1-1', '2011-3-1', '2011-4-1', '2011-2-1', '2012-1-1',
             '2012-2-1', '2012-1-1', '2012-2-1', '2013-1-1', '2013-2-1',
             '2013-1-1', '2013-2-1'),
    'Year': (2011, 2011, 2011, 2011, 2012, 2012, 2012, 2012, 2013,
             2013, 2013, 2013),
})

# pd.to_datetime returns a new Series rather than converting in place, so the
# result has to be assigned back for 'Date' to actually become datetime64.
df['Date'] = pd.to_datetime(df['Date'])
         Date   LG  Year
0  2011-01-01  AR1  2011
1  2011-03-01  AR1  2011
2  2011-04-01  AR1  2011
3  2011-02-01  AR1  2011
4  2012-01-01  AR1  2012
5  2012-02-01  AR1  2012
6  2012-01-01  PO1  2012
7  2012-02-01  PO1  2012
8  2013-01-01  AR1  2013
9  2013-02-01  AR1  2013
10 2013-01-01  PO1  2013
11 2013-02-01  PO1  2013
# Split every ('Year', 'LG') group into its chronologically first half (`top`)
# and the remaining rows (`bot`).
idx = ['Year', 'LG']

# Order the rows by date inside each group first; otherwise "first half" would
# depend on the input row order. The sort key is parsed with pd.to_datetime so
# that string dates compare chronologically ('2011-10-1' after '2011-2-1');
# the 'Date' column itself is left exactly as it was.
ordered = (
    df.assign(_sort_date=pd.to_datetime(df['Date']))
    .sort_values(idx + ['_sort_date'], kind='mergesort')  # stable sort: ties keep input order
    .drop(columns='_sort_date')
)

# build a grouper over the ordered rows
group_by = ordered.groupby(idx)

# Size of the group each row belongs to, one value per row. transform('size')
# is used instead of assigning group_by.size() to a re-indexed frame, because
# size() returns a DataFrame (not an alignable Series) under as_index=False
# on current pandas.
g_size = group_by['Date'].transform('size')

# find the rows in the top half of respective group: a row is in the top half
# when its 0-based position within the group is below 50% of the group size
top_half = (group_by.cumcount().values / g_size.values) < 0.5

# build new data frames, renumbered from 0 like the printed result
top = ordered.loc[top_half].reset_index(drop=True)
bot = ordered.loc[~top_half].reset_index(drop=True)
print(df)
print('-- top')
print(top)
print('-- bot')
print(bot)
print('--')
        Date   LG  Year
0   2011-1-1  AR1  2011
1   2011-3-1  AR1  2011
2   2011-4-1  AR1  2011
3   2011-2-1  AR1  2011
4   2012-1-1  AR1  2012
5   2012-2-1  AR1  2012
6   2012-1-1  PO1  2012
7   2012-2-1  PO1  2012
8   2013-1-1  AR1  2013
9   2013-2-1  AR1  2013
10  2013-1-1  PO1  2013
11  2013-2-1  PO1  2013
-- top
   Year   LG      Date
0  2011  AR1  2011-1-1
1  2011  AR1  2011-2-1
2  2012  AR1  2012-1-1
3  2012  PO1  2012-1-1
4  2013  AR1  2013-1-1
5  2013  PO1  2013-1-1
-- bot
   Year   LG      Date
0  2011  AR1  2011-3-1
1  2011  AR1  2011-4-1
2  2012  AR1  2012-2-1
3  2012  PO1  2012-2-1
4  2013  AR1  2013-2-1
5  2013  PO1  2013-2-1
# Test data used by the answer: one row per (LG, Date) observation, with the
# calendar year duplicated in its own 'Year' column.
df = pd.DataFrame({
    'LG': ('AR1', 'AR1', 'AR1', 'AR1', 'AR1', 'AR1',
           'PO1', 'PO1', 'AR1', 'AR1', 'PO1', 'PO1'),
    'Date': ('2011-1-1', '2011-3-1', '2011-4-1', '2011-2-1', '2012-1-1',
             '2012-2-1', '2012-1-1', '2012-2-1', '2013-1-1', '2013-2-1',
             '2013-1-1', '2013-2-1'),
    'Year': (2011, 2011, 2011, 2011, 2012, 2012, 2012, 2012, 2013,
             2013, 2013, 2013),
})
# pd.to_datetime returns a new Series rather than converting in place, so the
# result has to be assigned back for 'Date' to actually become datetime64.
df['Date'] = pd.to_datetime(df['Date'])

在按 `Year` 和 `LG` 分组后,将数据框一分为二。基本思想是:在每个组内,找出组内位置小于组大小 50% 的那些行。

代码:

import pandas as pd
import numpy as np

# Sample data from the question: one row per (LG, Date) observation, with the
# calendar year duplicated in its own 'Year' column.
df = pd.DataFrame({
    'LG': ('AR1', 'AR1', 'AR1', 'AR1', 'AR1', 'AR1',
           'PO1', 'PO1', 'AR1', 'AR1', 'PO1', 'PO1'),
    'Date': ('2011-1-1', '2011-3-1', '2011-4-1', '2011-2-1', '2012-1-1',
             '2012-2-1', '2012-1-1', '2012-2-1', '2013-1-1', '2013-2-1',
             '2013-1-1', '2013-2-1'),
    'Year': (2011, 2011, 2011, 2011, 2012, 2012, 2012, 2012, 2013,
             2013, 2013, 2013),
})

# pd.to_datetime returns a new Series rather than converting in place, so the
# result has to be assigned back for 'Date' to actually become datetime64.
df['Date'] = pd.to_datetime(df['Date'])
         Date   LG  Year
0  2011-01-01  AR1  2011
1  2011-03-01  AR1  2011
2  2011-04-01  AR1  2011
3  2011-02-01  AR1  2011
4  2012-01-01  AR1  2012
5  2012-02-01  AR1  2012
6  2012-01-01  PO1  2012
7  2012-02-01  PO1  2012
8  2013-01-01  AR1  2013
9  2013-02-01  AR1  2013
10 2013-01-01  PO1  2013
11 2013-02-01  PO1  2013
# Split every ('Year', 'LG') group into its chronologically first half (`top`)
# and the remaining rows (`bot`).
idx = ['Year', 'LG']

# Order the rows by date inside each group first; otherwise "first half" would
# depend on the input row order. The sort key is parsed with pd.to_datetime so
# that string dates compare chronologically ('2011-10-1' after '2011-2-1');
# the 'Date' column itself is left exactly as it was.
ordered = (
    df.assign(_sort_date=pd.to_datetime(df['Date']))
    .sort_values(idx + ['_sort_date'], kind='mergesort')  # stable sort: ties keep input order
    .drop(columns='_sort_date')
)

# build a grouper over the ordered rows
group_by = ordered.groupby(idx)

# Size of the group each row belongs to, one value per row. transform('size')
# is used instead of assigning group_by.size() to a re-indexed frame, because
# size() returns a DataFrame (not an alignable Series) under as_index=False
# on current pandas.
g_size = group_by['Date'].transform('size')

# find the rows in the top half of respective group: a row is in the top half
# when its 0-based position within the group is below 50% of the group size
top_half = (group_by.cumcount().values / g_size.values) < 0.5

# build new data frames, renumbered from 0 like the printed result
top = ordered.loc[top_half].reset_index(drop=True)
bot = ordered.loc[~top_half].reset_index(drop=True)
print(df)
print('-- top')
print(top)
print('-- bot')
print(bot)
print('--')
        Date   LG  Year
0   2011-1-1  AR1  2011
1   2011-3-1  AR1  2011
2   2011-4-1  AR1  2011
3   2011-2-1  AR1  2011
4   2012-1-1  AR1  2012
5   2012-2-1  AR1  2012
6   2012-1-1  PO1  2012
7   2012-2-1  PO1  2012
8   2013-1-1  AR1  2013
9   2013-2-1  AR1  2013
10  2013-1-1  PO1  2013
11  2013-2-1  PO1  2013
-- top
   Year   LG      Date
0  2011  AR1  2011-1-1
1  2011  AR1  2011-2-1
2  2012  AR1  2012-1-1
3  2012  PO1  2012-1-1
4  2013  AR1  2013-1-1
5  2013  PO1  2013-1-1
-- bot
   Year   LG      Date
0  2011  AR1  2011-3-1
1  2011  AR1  2011-4-1
2  2012  AR1  2012-2-1
3  2012  PO1  2012-2-1
4  2013  AR1  2013-2-1
5  2013  PO1  2013-2-1
# Test data used by the answer: one row per (LG, Date) observation, with the
# calendar year duplicated in its own 'Year' column.
df = pd.DataFrame({
    'LG': ('AR1', 'AR1', 'AR1', 'AR1', 'AR1', 'AR1',
           'PO1', 'PO1', 'AR1', 'AR1', 'PO1', 'PO1'),
    'Date': ('2011-1-1', '2011-3-1', '2011-4-1', '2011-2-1', '2012-1-1',
             '2012-2-1', '2012-1-1', '2012-2-1', '2013-1-1', '2013-2-1',
             '2013-1-1', '2013-2-1'),
    'Year': (2011, 2011, 2011, 2011, 2012, 2012, 2012, 2012, 2013,
             2013, 2013, 2013),
})
# pd.to_datetime returns a new Series rather than converting in place, so the
# result has to be assigned back for 'Date' to actually become datetime64.
df['Date'] = pd.to_datetime(df['Date'])
排序结果:

import pandas as pd
import numpy as np

# Sample data from the question: one row per (LG, Date) observation, with the
# calendar year duplicated in its own 'Year' column.
df = pd.DataFrame({
    'LG': ('AR1', 'AR1', 'AR1', 'AR1', 'AR1', 'AR1',
           'PO1', 'PO1', 'AR1', 'AR1', 'PO1', 'PO1'),
    'Date': ('2011-1-1', '2011-3-1', '2011-4-1', '2011-2-1', '2012-1-1',
             '2012-2-1', '2012-1-1', '2012-2-1', '2013-1-1', '2013-2-1',
             '2013-1-1', '2013-2-1'),
    'Year': (2011, 2011, 2011, 2011, 2012, 2012, 2012, 2012, 2013,
             2013, 2013, 2013),
})

# pd.to_datetime returns a new Series rather than converting in place, so the
# result has to be assigned back for 'Date' to actually become datetime64.
df['Date'] = pd.to_datetime(df['Date'])
         Date   LG  Year
0  2011-01-01  AR1  2011
1  2011-03-01  AR1  2011
2  2011-04-01  AR1  2011
3  2011-02-01  AR1  2011
4  2012-01-01  AR1  2012
5  2012-02-01  AR1  2012
6  2012-01-01  PO1  2012
7  2012-02-01  PO1  2012
8  2013-01-01  AR1  2013
9  2013-02-01  AR1  2013
10 2013-01-01  PO1  2013
11 2013-02-01  PO1  2013
# Split every ('Year', 'LG') group into its chronologically first half (`top`)
# and the remaining rows (`bot`).
idx = ['Year', 'LG']

# Order the rows by date inside each group first; otherwise "first half" would
# depend on the input row order. The sort key is parsed with pd.to_datetime so
# that string dates compare chronologically ('2011-10-1' after '2011-2-1');
# the 'Date' column itself is left exactly as it was.
ordered = (
    df.assign(_sort_date=pd.to_datetime(df['Date']))
    .sort_values(idx + ['_sort_date'], kind='mergesort')  # stable sort: ties keep input order
    .drop(columns='_sort_date')
)

# build a grouper over the ordered rows
group_by = ordered.groupby(idx)

# Size of the group each row belongs to, one value per row. transform('size')
# is used instead of assigning group_by.size() to a re-indexed frame, because
# size() returns a DataFrame (not an alignable Series) under as_index=False
# on current pandas.
g_size = group_by['Date'].transform('size')

# find the rows in the top half of respective group: a row is in the top half
# when its 0-based position within the group is below 50% of the group size
top_half = (group_by.cumcount().values / g_size.values) < 0.5

# build new data frames, renumbered from 0 like the printed result
top = ordered.loc[top_half].reset_index(drop=True)
bot = ordered.loc[~top_half].reset_index(drop=True)
print(df)
print('-- top')
print(top)
print('-- bot')
print(bot)
print('--')
        Date   LG  Year
0   2011-1-1  AR1  2011
1   2011-3-1  AR1  2011
2   2011-4-1  AR1  2011
3   2011-2-1  AR1  2011
4   2012-1-1  AR1  2012
5   2012-2-1  AR1  2012
6   2012-1-1  PO1  2012
7   2012-2-1  PO1  2012
8   2013-1-1  AR1  2013
9   2013-2-1  AR1  2013
10  2013-1-1  PO1  2013
11  2013-2-1  PO1  2013
-- top
   Year   LG      Date
0  2011  AR1  2011-1-1
1  2011  AR1  2011-2-1
2  2012  AR1  2012-1-1
3  2012  PO1  2012-1-1
4  2013  AR1  2013-1-1
5  2013  PO1  2013-1-1
-- bot
   Year   LG      Date
0  2011  AR1  2011-3-1
1  2011  AR1  2011-4-1
2  2012  AR1  2012-2-1
3  2012  PO1  2012-2-1
4  2013  AR1  2013-2-1
5  2013  PO1  2013-2-1
# Test data used by the answer: one row per (LG, Date) observation, with the
# calendar year duplicated in its own 'Year' column.
df = pd.DataFrame({
    'LG': ('AR1', 'AR1', 'AR1', 'AR1', 'AR1', 'AR1',
           'PO1', 'PO1', 'AR1', 'AR1', 'PO1', 'PO1'),
    'Date': ('2011-1-1', '2011-3-1', '2011-4-1', '2011-2-1', '2012-1-1',
             '2012-2-1', '2012-1-1', '2012-2-1', '2013-1-1', '2013-2-1',
             '2013-1-1', '2013-2-1'),
    'Year': (2011, 2011, 2011, 2011, 2012, 2012, 2012, 2012, 2013,
             2013, 2013, 2013),
})
# pd.to_datetime returns a new Series rather than converting in place, so the
# result has to be assigned back for 'Date' to actually become datetime64.
df['Date'] = pd.to_datetime(df['Date'])
测试数据:

import pandas as pd
import numpy as np

# Sample data from the question: one row per (LG, Date) observation, with the
# calendar year duplicated in its own 'Year' column.
df = pd.DataFrame({
    'LG': ('AR1', 'AR1', 'AR1', 'AR1', 'AR1', 'AR1',
           'PO1', 'PO1', 'AR1', 'AR1', 'PO1', 'PO1'),
    'Date': ('2011-1-1', '2011-3-1', '2011-4-1', '2011-2-1', '2012-1-1',
             '2012-2-1', '2012-1-1', '2012-2-1', '2013-1-1', '2013-2-1',
             '2013-1-1', '2013-2-1'),
    'Year': (2011, 2011, 2011, 2011, 2012, 2012, 2012, 2012, 2013,
             2013, 2013, 2013),
})

# pd.to_datetime returns a new Series rather than converting in place, so the
# result has to be assigned back for 'Date' to actually become datetime64.
df['Date'] = pd.to_datetime(df['Date'])
         Date   LG  Year
0  2011-01-01  AR1  2011
1  2011-03-01  AR1  2011
2  2011-04-01  AR1  2011
3  2011-02-01  AR1  2011
4  2012-01-01  AR1  2012
5  2012-02-01  AR1  2012
6  2012-01-01  PO1  2012
7  2012-02-01  PO1  2012
8  2013-01-01  AR1  2013
9  2013-02-01  AR1  2013
10 2013-01-01  PO1  2013
11 2013-02-01  PO1  2013
# Split every ('Year', 'LG') group into its chronologically first half (`top`)
# and the remaining rows (`bot`).
idx = ['Year', 'LG']

# Order the rows by date inside each group first; otherwise "first half" would
# depend on the input row order. The sort key is parsed with pd.to_datetime so
# that string dates compare chronologically ('2011-10-1' after '2011-2-1');
# the 'Date' column itself is left exactly as it was.
ordered = (
    df.assign(_sort_date=pd.to_datetime(df['Date']))
    .sort_values(idx + ['_sort_date'], kind='mergesort')  # stable sort: ties keep input order
    .drop(columns='_sort_date')
)

# build a grouper over the ordered rows
group_by = ordered.groupby(idx)

# Size of the group each row belongs to, one value per row. transform('size')
# is used instead of assigning group_by.size() to a re-indexed frame, because
# size() returns a DataFrame (not an alignable Series) under as_index=False
# on current pandas.
g_size = group_by['Date'].transform('size')

# find the rows in the top half of respective group: a row is in the top half
# when its 0-based position within the group is below 50% of the group size
top_half = (group_by.cumcount().values / g_size.values) < 0.5

# build new data frames, renumbered from 0 like the printed result
top = ordered.loc[top_half].reset_index(drop=True)
bot = ordered.loc[~top_half].reset_index(drop=True)
print(df)
print('-- top')
print(top)
print('-- bot')
print(bot)
print('--')
        Date   LG  Year
0   2011-1-1  AR1  2011
1   2011-3-1  AR1  2011
2   2011-4-1  AR1  2011
3   2011-2-1  AR1  2011
4   2012-1-1  AR1  2012
5   2012-2-1  AR1  2012
6   2012-1-1  PO1  2012
7   2012-2-1  PO1  2012
8   2013-1-1  AR1  2013
9   2013-2-1  AR1  2013
10  2013-1-1  PO1  2013
11  2013-2-1  PO1  2013
-- top
   Year   LG      Date
0  2011  AR1  2011-1-1
1  2011  AR1  2011-2-1
2  2012  AR1  2012-1-1
3  2012  PO1  2012-1-1
4  2013  AR1  2013-1-1
5  2013  PO1  2013-1-1
-- bot
   Year   LG      Date
0  2011  AR1  2011-3-1
1  2011  AR1  2011-4-1
2  2012  AR1  2012-2-1
3  2012  PO1  2012-2-1
4  2013  AR1  2013-2-1
5  2013  PO1  2013-2-1
# Test data used by the answer: one row per (LG, Date) observation, with the
# calendar year duplicated in its own 'Year' column.
df = pd.DataFrame({
    'LG': ('AR1', 'AR1', 'AR1', 'AR1', 'AR1', 'AR1',
           'PO1', 'PO1', 'AR1', 'AR1', 'PO1', 'PO1'),
    'Date': ('2011-1-1', '2011-3-1', '2011-4-1', '2011-2-1', '2012-1-1',
             '2012-2-1', '2012-1-1', '2012-2-1', '2013-1-1', '2013-2-1',
             '2013-1-1', '2013-2-1'),
    'Year': (2011, 2011, 2011, 2011, 2012, 2012, 2012, 2012, 2013,
             2013, 2013, 2013),
})
# pd.to_datetime returns a new Series rather than converting in place, so the
# result has to be assigned back for 'Date' to actually become datetime64.
df['Date'] = pd.to_datetime(df['Date'])

评论:

- `df['Date'].apply(pd.to_datetime)` 是 `pd.to_datetime(df['Date'])` 的一种较慢的写法。
- 已根据您的评论进行编辑。
- 谢谢,不过还有一个问题:2011 年的 AR1 没有被正确拆分。2011-2-1 被分到了“底部”组,而 2011-3-1 被分到了“顶部”组。为什么?
- 哦,您是想先按日期排序?您没有给出任何示例输出,所以我没有做这个假设。让我改一下……
- 是的,是我的错。我指的是按日期取每年的前 50%,而不是按样本数据中所示的行顺序。
- 谢谢。现在这样看起来怎么样?