Python中计算熵的最快方法
在我的项目中,我需要多次计算0-1向量的熵。这是我的代码:Python中计算熵的最快方法,python,numpy,entropy,Python,Numpy,Entropy,在我的项目中,我需要多次计算0-1向量的熵。这是我的代码: def entropy(labels): """ Computes entropy of 0-1 vector. """ n_labels = len(labels) if n_labels <= 1: return 0 counts = np.bincount(labels) probs = counts[np.nonzero(counts)] / n_labels
def entropy(labels):
    """Normalized Shannon entropy of a 0-1 vector.

    `labels` must be a sequence of small non-negative ints (a
    np.bincount requirement). The raw entropy is divided by
    log(n_classes), so the result lies in [0, 1]. Returns 0 for
    inputs with fewer than two elements or fewer than two classes.
    """
    n = len(labels)
    if n <= 1:
        return 0
    # Count occurrences per label value and keep only non-empty bins.
    bin_counts = np.bincount(labels)
    nonzero = bin_counts[bin_counts > 0]
    n_classes = len(nonzero)
    if n_classes <= 1:
        return 0
    p = nonzero / n
    return -np.sum(p * np.log(p)) / np.log(n_classes)
def熵(标签):
“”“计算0-1向量的熵。”“”
n_标签=len(标签)
如果n_标签 <= 1,则返回0。遵循unutbu的建议,我将创建一个纯python实现
def entropy2(labels):
    """Entropy of a label distribution, normalized by the class count.

    `labels` must be a sequence of small non-negative ints (a
    np.bincount requirement). Uses log base n_classes, so the result
    lies in [0, 1]. Returns 0 for fewer than two elements or classes.
    """
    n_labels = len(labels)
    if n_labels <= 1:
        return 0
    counts = np.bincount(labels)
    # BUG FIX: drop empty bins before taking logs — bincount produces a
    # zero count for any unused value, and log(0) raises ValueError.
    probs = counts[counts > 0] / n_labels
    n_classes = len(probs)
    if n_classes <= 1:
        return 0
    ent = 0.
    # Compute standard entropy.
    for p in probs:
        # BUG FIX: math.log takes the base as a *positional* argument;
        # log(p, base=n_classes) raised TypeError.
        ent -= p * log(p, n_classes)
    return ent
def entropy(A, axis=None):
    """Computes the Shannon entropy (base 2) of the elements of A.

    Assumes A is an array-like of small non-negative ints whose max
    value is approximately the number of unique values present
    (a np.bincount requirement).

    >>> a = [0, 1]
    >>> entropy(a)
    1.0
    >>> A = np.c_[a, a]
    >>> entropy(A)
    1.0
    >>> A # doctest: +NORMALIZE_WHITESPACE
    array([[0, 0], [1, 1]])
    >>> entropy(A, axis=0) # doctest: +NORMALIZE_WHITESPACE
    array([1., 1.])
    >>> entropy(A, axis=1) # doctest: +NORMALIZE_WHITESPACE
    array([[0.], [0.]])
    >>> entropy([0, 0, 0])
    0.0
    >>> entropy([])
    0.0
    >>> entropy([5])
    0.0
    """
    if A is None or len(A) < 2:
        return 0.
    A = np.asarray(A)
    if axis is None:
        A = A.flatten()
        counts = np.bincount(A)  # needs small, non-negative ints
        counts = counts[counts > 0]
        if len(counts) == 1:
            return 0.  # avoid returning -0.0 to prevent weird doctests
        probs = counts / float(A.size)
        return -np.sum(probs * np.log2(probs))
    elif axis == 0:
        # BUG FIX: on Python 3 map() returns an iterator, and
        # np.array(<iterator>) builds a useless 0-d object array.
        # A list comprehension materializes the per-column entropies.
        return np.array([entropy(col) for col in A.T])
    elif axis == 1:
        return np.array([entropy(row) for row in A]).reshape((-1, 1))
    else:
        raise ValueError("unsupported axis: {}".format(axis))
def entropy2(标签):
“”“计算标签分布的熵。”“”
n_标签=len(标签)
如果n_labels <= 1,则返回0。以上答案是好的,但是如果您需要一个可以沿不同轴运行的版本,这里有一个可行的实现
def entropy2(labels):
    """Entropy of a label distribution, normalized by the class count.

    `labels` must be a sequence of small non-negative ints (a
    np.bincount requirement). Uses log base n_classes, so the result
    lies in [0, 1]. Returns 0 for fewer than two elements or classes.
    """
    n_labels = len(labels)
    if n_labels <= 1:
        return 0
    counts = np.bincount(labels)
    # BUG FIX: drop empty bins before taking logs — bincount produces a
    # zero count for any unused value, and log(0) raises ValueError.
    probs = counts[counts > 0] / n_labels
    n_classes = len(probs)
    if n_classes <= 1:
        return 0
    ent = 0.
    # Compute standard entropy.
    for p in probs:
        # BUG FIX: math.log takes the base as a *positional* argument;
        # log(p, base=n_classes) raised TypeError.
        ent -= p * log(p, n_classes)
    return ent
def entropy(A, axis=None):
    """Computes the Shannon entropy (base 2) of the elements of A.

    Assumes A is an array-like of small non-negative ints whose max
    value is approximately the number of unique values present
    (a np.bincount requirement).

    >>> a = [0, 1]
    >>> entropy(a)
    1.0
    >>> A = np.c_[a, a]
    >>> entropy(A)
    1.0
    >>> A # doctest: +NORMALIZE_WHITESPACE
    array([[0, 0], [1, 1]])
    >>> entropy(A, axis=0) # doctest: +NORMALIZE_WHITESPACE
    array([1., 1.])
    >>> entropy(A, axis=1) # doctest: +NORMALIZE_WHITESPACE
    array([[0.], [0.]])
    >>> entropy([0, 0, 0])
    0.0
    >>> entropy([])
    0.0
    >>> entropy([5])
    0.0
    """
    if A is None or len(A) < 2:
        return 0.
    A = np.asarray(A)
    if axis is None:
        A = A.flatten()
        counts = np.bincount(A)  # needs small, non-negative ints
        counts = counts[counts > 0]
        if len(counts) == 1:
            return 0.  # avoid returning -0.0 to prevent weird doctests
        probs = counts / float(A.size)
        return -np.sum(probs * np.log2(probs))
    elif axis == 0:
        # BUG FIX: on Python 3 map() returns an iterator, and
        # np.array(<iterator>) builds a useless 0-d object array.
        # A list comprehension materializes the per-column entropies.
        return np.array([entropy(col) for col in A.T])
    elif axis == 1:
        return np.array([entropy(row) for row in A]).reshape((-1, 1))
    else:
        raise ValueError("unsupported axis: {}".format(axis))
def熵(A,轴=无):
“”“计算A元素的香农熵。假设A为
类似于非负整数的数组,其最大值约为
存在的唯一值的数目。
>>>a=[0,1]
>>>熵(a)
1
>>>A=np.c_uA[A,A]
>>>熵(A)
1
>>>A#doctest:+规范化空白
数组([[0,0],[1,1]])
>>>熵(A,轴=0)#doctest:+规范化_空格
数组([1,1.]))
>>>熵(A,轴=1)#doctest:+规范化_空格
数组([[0.],[0.]])
>>>熵([0,0,0])
0
>>>熵([])
0
>>>熵([5])
0
"""
如果A为无或len(A)<2:
返回0。
A=np.asarray(A)
如果轴为无:
A=A.展平()
counts=np。bincount(A)#需要小的非负整数
计数=计数[计数>0]
如果len(计数)==1:
返回0避免返回-0.0以防止奇怪的doctest
probs=计数/浮动(A.尺寸)
返回-np.和(probs*np.log2(probs))
elif轴==0:
熵=映射(λ列:熵(列),A.T)
返回np.数组(熵)
elif轴==1:
熵=映射(λ行:熵(行),A)
返回np.数组(熵)。重塑(-1,1))
其他:
raise VALUERROR(“不支持的轴:{}”。格式(轴))
一个不依赖于numpy的答案:
import math
from collections import Counter
def eta(data, unit='natural'):
    """Shannon entropy of `data`, with no numpy dependency.

    unit: 'shannon' (bits, base 2), 'natural' (nats, base e, default)
    or 'hartley' (base 10). Returns 0 for fewer than two elements.
    """
    base = {
        'shannon': 2.,
        'natural': math.exp(1),
        'hartley': 10.,
    }
    if len(data) <= 1:
        return 0
    # IDIOM FIX: Counter(data) tallies in one C-level pass instead of a
    # manual `for d in data: counts[d] += 1` loop.
    counts = Counter(data)
    n = len(data)
    ent = 0
    for c in counts.values():
        p = float(c) / n
        if p > 0.:
            ent -= p * math.log(p, base[unit])
    return ent
@Jarad提供的答案也建议了时间安排。为此目的:
# Micro-benchmark of the pure-python eta() with timeit.
# NOTE(review): assumes `timeit` is imported and eta() is defined in the
# __main__ module — neither is shown adjacent to this snippet.
repeat_number = 1000000
e = timeit.repeat(
stmt='''eta(labels)''',
setup='''labels=[1,3,5,2,3,5,3,2,1,3,4,5];from __main__ import eta''',
repeat=3,
number=repeat_number)
Timeit结果:(我认为这比最好的numpy方法快约4倍)
使用pd.Series
和scipy.stats
数据,计算给定量的熵非常简单:
import pandas as pd
import scipy.stats
def ent(data):
    """Calculates entropy of the passed `pd.Series` (natural log)."""
    # Occurrence count of each distinct value; scipy normalizes these
    # counts into probabilities internally, so no explicit division.
    value_counts = data.value_counts()
    return scipy.stats.entropy(value_counts)
注意:scipy.stats
将标准化提供的数据,因此无需显式执行,也就是说,传递计数数组效果很好。我最喜欢的熵函数如下:
def entropy(labels):
    """Shannon entropy (bits) of a list of hashable labels."""
    # PERF FIX: the original built {x: labels.count(x) ...} which calls
    # list.count once per element — accidental O(n^2). Counter tallies
    # everything in a single pass. Local import keeps this snippet
    # self-contained.
    from collections import Counter
    n = len(labels)
    probs = np.array([c / n for c in Counter(labels).values()])
    return -probs.dot(np.log2(probs))
我仍然在寻找一种更好的方法来避免dict->values->list->np.array转换。如果我找到它,我会再次发表评论。@Sanjeet-Gupta答案很好,但可以浓缩。这个问题是专门询问“最快”的方式,但我只看到一个答案的时间,所以我将发布一个使用scipy和numpy与原始海报entropy2答案的比较,稍作修改
四种不同的方法:scipy/numpy,numpy/math,pandas/numpy,numpy
import numpy as np
from scipy.stats import entropy
from math import log, e
import pandas as pd
import timeit
def entropy1(labels, base=None):
    """Entropy via scipy: unique-value counts fed to scipy.stats.entropy,
    which normalizes them to probabilities itself."""
    _, counts = np.unique(labels, return_counts=True)
    return entropy(counts, base=base)
def entropy2(labels, base=None):
    """Computes entropy of label distribution.

    Base defaults to e (nats). Returns 0 for fewer than two labels
    or fewer than two distinct classes.
    """
    n_labels = len(labels)
    if n_labels <= 1:
        return 0
    # np.unique counts are all >= 1, so probs never contains zeros.
    _, counts = np.unique(labels, return_counts=True)
    probs = counts / n_labels
    if np.count_nonzero(probs) <= 1:
        return 0
    if base is None:
        base = e
    # Accumulate -p*log_base(p) over every class probability.
    ent = 0.
    for p in probs:
        ent -= p * log(p, base)
    return ent
def entropy3(labels, base=None):
    """Entropy via pandas: value_counts(normalize=True) yields the
    empirical probabilities directly. Base defaults to e."""
    probs = pd.Series(labels).value_counts(normalize=True, sort=False)
    log_base = np.log(e if base is None else base)
    return -(probs * np.log(probs) / log_base).sum()
def entropy4(labels, base=None):
    """Entropy via numpy only: unique-value counts normalized to
    probabilities, summed as -p*log(p)/log(base). Base defaults to e."""
    _, counts = np.unique(labels, return_counts=True)
    probs = counts / counts.sum()
    log_base = np.log(e if base is None else base)
    return -(probs * np.log(probs) / log_base).sum()
时间结果:
# for loop to print out results of timeit
# NOTE(review): a, b, c, d are the timeit.repeat() result lists for the
# four implementations (entropy1..entropy4); they are defined elsewhere
# and not visible in this snippet.
for approach,timeit_results in zip(['scipy/numpy', 'numpy/math', 'pandas/numpy', 'numpy'], [a,b,c,d]):
print('Method: {}, Avg.: {:.6f}'.format(approach, np.array(timeit_results).mean()))
Method: scipy/numpy, Avg.: 63.315312
Method: numpy/math, Avg.: 49.256894
Method: pandas/numpy, Avg.: 884.644023
Method: numpy, Avg.: 60.026938
获胜者:numpy/math(entropy2)
还值得注意的是,上面的entropy2
函数可以处理数字和文本数据。例如:entropy2(列表('abcdefabacdebcab'))
。原始海报的答案是从2013年开始的,它有一个特定的binning ints用例,但它不适用于文本。均匀分布的数据(高熵):
逐步计算香农熵:
import collections
import math
# Step-by-step Shannon entropy of the sequence `s`.
# NOTE(review): `s` must be defined earlier (not shown in this chunk);
# the inline sample outputs (1/256 probabilities, total 8.0 bits)
# suggest a 256-byte sequence containing each value exactly once.
# calculate probability for each byte as number of occurrences / array length
probabilities = [n_x/len(s) for x,n_x in collections.Counter(s).items()]
# [0.00390625, 0.00390625, 0.00390625, ...]
# calculate per-character entropy fractions
e_x = [-p_x*math.log(p_x,2) for p_x in probabilities]
# [0.03125, 0.03125, 0.03125, ...]
# sum fractions to obtain Shannon entropy
entropy = sum(e_x)
>>> entropy
8.0
一行(假设导入集合
):
适当的功能:
import collections
import math
def H(s):
    """Shannon entropy (bits per symbol) of the sequence `s`."""
    n = len(s)
    # One term -p*log2(p) per distinct symbol, summed.
    frequencies = collections.Counter(s).values()
    return sum(-(c / n) * math.log(c / n, 2) for c in frequencies)
测试用例-英文文本取自:
我的做法如下:
# Entropy of a label list via scipy: pass the Counter value counts;
# scipy.stats.entropy normalizes counts to probabilities internally.
labels = [0, 0, 1, 1]
from collections import Counter
from scipy import stats
stats.entropy(list(Counter(labels).values()), base=2)
该方法通过允许装箱扩展了其他解决方案。例如,bin=None
(默认值)不会将x
装箱,并将为x
的每个元素计算经验概率,而bin=256
在计算经验概率之前将x
分为256个箱子
import numpy as np
def entropy(x, bins=None):
    """Shannon entropy (bits) of the numpy array `x`.

    With bins=None (default), each distinct value of `x` is its own
    category via np.bincount (so `x` must hold small non-negative
    ints); an integer `bins` first histograms `x` into that many bins.
    """
    n = x.shape[0]
    if bins is None:
        counts = np.bincount(x)
    else:
        counts, _ = np.histogram(x, bins=bins)
    # Keep only occupied bins so log2 never sees a zero.
    p = counts[counts > 0] / n
    return -np.dot(p, np.log2(p))
双熵不会是计算熵的最快方法,但它是严格的,并且以一种定义良好的方式建立在香农熵的基础上。它已经在包括图像相关应用在内的各个领域进行了测试。
它是在Github上用Python实现的。标签的典型长度是多少?
?长度不是固定的。了解标签的典型值将有助于基准测试。如果标签
太短,纯python实现实际上可能比使用NumPy更快。请确认,这个问题是关于离散(二进制)随机变量的熵的?而不是一个连续r.v.的微分熵,“基”应该设置为类的数量吗?我认为自然日志是标准的(以及您在原始问题中使用的)。很好,使用集合。计数器会更好。在python2中,标签。计数(x)/len(标签)
应该是标签。计数(x)/float(len(标签))
您使用的数组太小,测试基本上没有用处。您实际上只是在测量各种接口的呼叫开销。此页面上有一个“添加另一个应答”按钮。请随意提供更好的答案。使用这段代码,我也得到了答案的时间(“一个也不依赖numpy的答案…”)——它是方法:eta,平均值:10.461799
。正如有人建议的,我想知道你是否真的在这里测试呼叫开销。最好是将结果的时间取最小值,而不是平均值。请参阅。为什么需要probs=[p>0时probs中的p表示p.]?因为我在五行之后进行测试,我怀疑我根本不需要它:)编辑。加上一个表示无新依赖项,可以使用counts=计数器(数据),而不是循环数据的字符?虽然这可能回答了问题,只有代码的答案通常被视为低质量。提供更多关于为什么会提高这个答案质量的描述和上下文。
import collections
import math
def H(s):
    """Shannon entropy (bits per symbol) of the sequence `s`."""
    n = len(s)
    # One term -p*log2(p) per distinct symbol, summed.
    frequencies = collections.Counter(s).values()
    return sum(-(c / n) * math.log(c / n, 2) for c in frequencies)
>>> H(range(0,256))
8.0
>>> H(range(0,64))
6.0
>>> H(range(0,128))
7.0
>>> H([0,1])
1.0
>>> H('Standard English text usually falls somewhere between 3.5 and 5')
4.228788210509104
# Entropy (bits) of a label list: counts [2, 2] -> probabilities
# [0.5, 0.5] -> 1.0 bit. scipy normalizes the counts internally.
labels = [0, 0, 1, 1]
from collections import Counter
from scipy import stats
stats.entropy(list(Counter(labels).values()), base=2)
from collections import Counter
from scipy import stats
labels = [0.9, 0.09, 0.1]
# BUG FIX: entropy must be computed from the occurrence *counts*
# (Counter .values()), not from the label values themselves (.keys()).
# Passing the keys treated the raw labels as an unnormalized
# distribution, giving a meaningless result.
stats.entropy(list(Counter(labels).values()), base=2)
def entropy(base, prob_a, prob_b):
    """Two-outcome Shannon entropy -(a*log_b(a) + b*log_b(b)).

    `base` is the logarithm base; `prob_a` and `prob_b` are the two
    outcome probabilities (both must be > 0). Returns a single-element
    list for backward compatibility with the original signature.
    """
    import math
    # BUG FIX: the original immediately overwrote the caller's `base`
    # with 2, silently ignoring the argument.
    h = -(prob_a * math.log(prob_a, base) + prob_b * math.log(prob_b, base))
    return [h]
import numpy as np
def entropy(x, bins=None):
    """Shannon entropy (bits) of the numpy array `x`.

    With bins=None (default), each distinct value of `x` is its own
    category via np.bincount (so `x` must hold small non-negative
    ints); an integer `bins` first histograms `x` into that many bins.
    """
    n = x.shape[0]
    if bins is None:
        counts = np.bincount(x)
    else:
        counts, _ = np.histogram(x, bins=bins)
    # Keep only occupied bins so log2 never sees a zero.
    p = counts[counts > 0] / n
    return -np.dot(p, np.log2(p))