Python 仅对频繁值进行独热编码(one-hot encoding)

Python 仅对频繁值进行一次热编码,python,pandas,dictionary,dataframe,conditional,Python,Pandas,Dictionary,Dataframe,Conditional,我希望对一个列进行一次热编码,但只针对那些非常频繁的列。所有低于阈值T的都将归入自己的类别 我的策略是创建一个“名称”->“频率”的字典。然后将频率转换为字符串。如果字符串不常见,则应将其替换为一些描述性字符串。我最好有两个区域/阈值:“不太常见”和“罕见”之类的 这是我目前的尝试。我把它分成几行,仅供调试参考。第三行不行。我在Python 3.6中使用conda tmp = df["name"].groupby(df["name"]) tmp = tmp.agg(['count']) tmp[

我希望对某一列进行独热编码(one-hot encoding),但只针对那些出现非常频繁的值。所有出现频率低于阈值T的值都将归入一个单独的类别

我的策略是创建一个“名称”->“频率”的字典。然后将频率转换为字符串。如果字符串不常见,则应将其替换为一些描述性字符串。我最好有两个区域/阈值:“不太常见”和“罕见”之类的

这是我目前的尝试。我把它分成几行,仅供调试参考。第三行不行。我在Python 3.6中使用conda

# Group the "name" column by its own values, one group per distinct name.
tmp = df["name"].groupby(df["name"])
# Aggregate each group with 'count'; the result is indexed below via tmp["count"].
tmp = tmp.agg(['count'])
# NOTE(review): the condition compares the whole tmp["count"] column instead of
# the per-element value x, so the conditional expression is asked for the truth
# value of a Series. This is the line the author reports as not working.
tmp['count'] = tmp["count"].apply(lambda x: "Uncommon" if tmp["count"] < 1000.0 else str(x) )
labelDict = tmp.to_dict()
#some code?
# NOTE(review): columnName is not defined in this snippet; presumably 'name'.
df[columnName].replace(labelDict, inplace=True)
pd.get_dummies(df, columns=['name'])
一些示例输入(还有其他列): name = a,a,a,a,b,b,b,c,c,d

这就变成

name | count
a | 4
b | 3
c | 2
d | 1

Let's say T is =<2
dict:
a->4, b->3, c->"Uncommon", d->"Uncommon"

Remap dict to use the original values if name is numeric:
a->"a", b->"b", c->"Uncommon", d->"Uncommon"

As one hot:
date | id | name_a | name_b | name_Uncommon 
...  | ...|  1     | 0      | 0
...

我承认我找到了一个相关的解决方案,但不清楚如何修改它以满足我的需要。问题是,您不能先在值为{a,b,c,…}的“第一”列上执行独热编码,然后再在值同样为{a,b,c,…}的“第二”列上执行独热编码,并按值来命名生成的列,否则会产生列名冲突

考虑示例数据帧 df

# Seed NumPy's global RNG so the sampled names are reproducible.
np.random.seed([3,1415])
# Draw 1000 names from the letters a-j with linearly decreasing
# probabilities (10/55 for 'a' down to 1/55 for 'j').
df = pd.DataFrame(dict(
        name=np.random.choice(
            list('abcdefghij'), 1000,
            p=np.arange(10, 0, -1) / 55
        )
    ))
# Cut-off on the number of occurrences of a name.
threshold = 60
# Number of occurrences of each distinct name.
counts = df.name.value_counts()
counts

a    197
b    166
c    139
d    119
f    107
e    105
g     72
h     53
i     27
j     15
Name: name, dtype: int64
然后使用 replace 和 pd.get_dummies

# Labels whose count does not exceed the threshold.
repl = counts[counts <= threshold].index
# Collapse those labels into the single value 'uncommon', then one-hot encode.
print(pd.get_dummies(df.name.replace(repl, 'uncommon')))

     a  b  c  d  e  f  g  uncommon
0    0  0  1  0  0  0  0         0
1    0  0  1  0  0  0  0         0
2    0  0  1  0  0  0  0         0
3    0  0  1  0  0  0  0         0
4    0  0  1  0  0  0  0         0
5    1  0  0  0  0  0  0         0
6    0  0  0  0  0  0  1         0
7    0  0  0  0  0  1  0         0
8    0  0  0  0  0  1  0         0
9    0  0  0  0  0  1  0         0
10   0  0  0  0  0  0  0         1
11   0  0  0  0  0  0  1         0
12   0  0  0  0  0  0  1         0
13   0  0  0  0  0  0  0         1
14   0  0  0  0  1  0  0         0
15   1  0  0  0  0  0  0         0
16   1  0  0  0  0  0  0         0
17   0  1  0  0  0  0  0         0

这是我提出的解决方案。1)获得各个值的频率。2)分别按阈值1和阈值2过滤出频率低于阈值的值,并取得它们的索引。3)利用集合的差集来区分“不常见”和“罕见”。4)用“不常见”/“罕见”替换相应的标签。5)使用 get_dummies 进行独热编码

def onehot2(df, threshold_uncommon, threshold_rare, column, prefix, normalize=False):
    """One-hot encode ``column``, pooling infrequent values into two buckets.

    Values whose frequency is below ``threshold_rare`` are relabelled
    ``'rare'``. Values whose frequency is below ``threshold_uncommon`` but
    not below ``threshold_rare`` are relabelled ``'uncommon'``. All other
    values keep their own label.

    Args:
        df: Input DataFrame. It is left untouched; a copy is encoded.
        threshold_uncommon: Exclusive upper bound on the frequency of
            values pooled as ``'uncommon'``.
        threshold_rare: Exclusive upper bound on the frequency of values
            pooled as ``'rare'``.
        column: Name of the column to encode.
        prefix: Prefix handed to ``pd.get_dummies`` for the new columns.
        normalize: Forwarded to ``value_counts``; when true the frequencies
            are relative, so the thresholds must be fractions as well.

    Returns:
        A new DataFrame in which ``column`` is replaced by indicator
        columns, including a NaN indicator (``dummy_na=True``).
    """
    frequencies = df[column].value_counts(sort=False, normalize=normalize)
    rare_labels = frequencies[frequencies < threshold_rare].index
    # Anything already classified as rare must not also count as uncommon.
    uncommon_labels = frequencies[frequencies < threshold_uncommon].index.difference(rare_labels)

    # Both masks are computed on the original values so that the first
    # relabelling cannot influence the second one.
    original = df[column]
    relabelled = original.mask(original.isin(uncommon_labels), 'uncommon')
    relabelled = relabelled.mask(original.isin(rare_labels), 'rare')

    # Work on a copy: assigning into ``df`` would change the caller's frame.
    result = df.copy()
    result[column] = relabelled
    return pd.get_dummies(result, columns=[column], prefix=prefix, dummy_na=True)

def onehot(df, threshold, column, prefix, normalize=False):
    """One-hot encode ``column``, pooling infrequent values as ``'uncommon'``.

    Args:
        df: Input DataFrame. It is left untouched; a copy is encoded.
        threshold: Values whose frequency is strictly below this are
            relabelled ``'uncommon'`` before encoding.
        column: Name of the column to encode.
        prefix: Prefix handed to ``pd.get_dummies`` for the new columns.
        normalize: Forwarded to ``value_counts``; when true the frequencies
            are relative, so ``threshold`` must be a fraction as well.

    Returns:
        A new DataFrame in which ``column`` is replaced by indicator
        columns, including a NaN indicator (``dummy_na=True``).
    """
    frequencies = df[column].value_counts(sort=False, normalize=normalize)
    infrequent = frequencies[frequencies < threshold].index

    # Work on a copy: assigning into ``df`` would change the caller's frame.
    result = df.copy()
    if len(infrequent) > 0:
        values = result[column]
        result[column] = values.mask(values.isin(infrequent), 'uncommon')
    return pd.get_dummies(result, columns=[column], prefix=prefix, dummy_na=True)
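def onehot2(df, threshold_uncommon, threshold_rare, column, prefix, normalize=False):
    freqencies = df[column].value_counts( sort=False, normalize=normalize)
    idx1 = freqencies[freqencies < threshold_uncommon].index
    idx2 = freqencies[freqencies < threshold_rare].index
    idx1 = idx1.difference(idx2)
    tmp = df
    tmp[column] = df[column].replace(idx1, 'uncommon') if idx1.shape[0] > 0 else df
    tmp[column] = tmp[column].replace(idx2, 'rare') if idx2.shape[0] > 0 else tmp
    d = pd.get_dummies(tmp, columns=[column], prefix=prefix, dummy_na=True)#
    return d

def onehot(df, threshold, column, prefix, normalize=False):
    freqencies = df[column].value_counts( sort=False, normalize=normalize)
    idx = freqencies[freqencies < threshold].index
    tmp = df
    if idx.shape[0] > 0:
        tmp[column] = df[column].replace(idx, 'uncommon')
    else:
        tmp = df
    d = pd.get_dummies(tmp, columns=[column], prefix=prefix, dummy_na=True)#
    return d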
在第二个函数中,我使用了1个阈值,而不是2个阈值

# Labels whose count does not exceed the threshold.
repl = counts[counts <= threshold].index
# Collapse those labels into the single value 'uncommon', then one-hot encode.
print(pd.get_dummies(df.name.replace(repl, 'uncommon')))

     a  b  c  d  e  f  g  uncommon
0    0  0  1  0  0  0  0         0
1    0  0  1  0  0  0  0         0
2    0  0  1  0  0  0  0         0
3    0  0  1  0  0  0  0         0
4    0  0  1  0  0  0  0         0
5    1  0  0  0  0  0  0         0
6    0  0  0  0  0  0  1         0
7    0  0  0  0  0  1  0         0
8    0  0  0  0  0  1  0         0
9    0  0  0  0  0  1  0         0
10   0  0  0  0  0  0  0         1
11   0  0  0  0  0  0  1         0
12   0  0  0  0  0  0  1         0
13   0  0  0  0  0  0  0         1
14   0  0  0  0  1  0  0         0
15   1  0  0  0  0  0  0         0
16   1  0  0  0  0  0  0         0
17   0  1  0  0  0  0  0         0
def onehot2(df, threshold_uncommon, threshold_rare, column, prefix, normalize=False):
    """One-hot encode ``column``, pooling infrequent values into two buckets.

    Values whose frequency is below ``threshold_rare`` are relabelled
    ``'rare'``. Values whose frequency is below ``threshold_uncommon`` but
    not below ``threshold_rare`` are relabelled ``'uncommon'``. All other
    values keep their own label.

    Args:
        df: Input DataFrame. It is left untouched; a copy is encoded.
        threshold_uncommon: Exclusive upper bound on the frequency of
            values pooled as ``'uncommon'``.
        threshold_rare: Exclusive upper bound on the frequency of values
            pooled as ``'rare'``.
        column: Name of the column to encode.
        prefix: Prefix handed to ``pd.get_dummies`` for the new columns.
        normalize: Forwarded to ``value_counts``; when true the frequencies
            are relative, so the thresholds must be fractions as well.

    Returns:
        A new DataFrame in which ``column`` is replaced by indicator
        columns, including a NaN indicator (``dummy_na=True``).
    """
    frequencies = df[column].value_counts(sort=False, normalize=normalize)
    rare_labels = frequencies[frequencies < threshold_rare].index
    # Anything already classified as rare must not also count as uncommon.
    uncommon_labels = frequencies[frequencies < threshold_uncommon].index.difference(rare_labels)

    # Both masks are computed on the original values so that the first
    # relabelling cannot influence the second one.
    original = df[column]
    relabelled = original.mask(original.isin(uncommon_labels), 'uncommon')
    relabelled = relabelled.mask(original.isin(rare_labels), 'rare')

    # Work on a copy: assigning into ``df`` would change the caller's frame.
    result = df.copy()
    result[column] = relabelled
    return pd.get_dummies(result, columns=[column], prefix=prefix, dummy_na=True)

def onehot(df, threshold, column, prefix, normalize=False):
    """One-hot encode ``column``, pooling infrequent values as ``'uncommon'``.

    Args:
        df: Input DataFrame. It is left untouched; a copy is encoded.
        threshold: Values whose frequency is strictly below this are
            relabelled ``'uncommon'`` before encoding.
        column: Name of the column to encode.
        prefix: Prefix handed to ``pd.get_dummies`` for the new columns.
        normalize: Forwarded to ``value_counts``; when true the frequencies
            are relative, so ``threshold`` must be a fraction as well.

    Returns:
        A new DataFrame in which ``column`` is replaced by indicator
        columns, including a NaN indicator (``dummy_na=True``).
    """
    frequencies = df[column].value_counts(sort=False, normalize=normalize)
    infrequent = frequencies[frequencies < threshold].index

    # Work on a copy: assigning into ``df`` would change the caller's frame.
    result = df.copy()
    if len(infrequent) > 0:
        values = result[column]
        result[column] = values.mask(values.isin(infrequent), 'uncommon')
    return pd.get_dummies(result, columns=[column], prefix=prefix, dummy_na=True)