Python 如何使用numpy random.choice为每行创建二维数组?
我正在尝试创建一个二维数组(它是一个六列多行的数组),使用numpy随机选择,每行(而不是整个数组)的唯一值在1到50之间Python 如何使用numpy random.choice为每行创建二维数组?,python,arrays,numpy,Python,Arrays,Numpy,我正在尝试创建一个二维数组(它是一个六列多行的数组),使用numpy随机选择,每行(而不是整个数组)的唯一值在1到50之间 np.sort(np.random.choice(np.arange(1,50),size=(100,6),replace=False)) 但这会引起一个错误 ValueError: Cannot take a larger sample than population when 'replace=False' 有没有可能用一个没有回路的单衬套来实现这一点 编辑 好
np.sort(np.random.choice(np.arange(1,50),size=(100,6),replace=False))
但这会引起一个错误
ValueError: Cannot take a larger sample than population when 'replace=False'
有没有可能用一个没有回路的单衬套来实现这一点
编辑
好的,我知道答案了
这些是jupyter%时间cellmagic的结果
#@James' solution
np.stack([np.random.choice(np.arange(1,50),size=6,replace=False) for i in range(1_000_000)])
Wall time: 25.1 s
#@Divakar's solution
np.random.rand(1_000_000, 50).argpartition(6,axis=1)[:,:6]+1
Wall time: 1.36 s
#@CoryKramer's solution
np.array([np.random.choice(np.arange(1, 50), size=6, replace=False) for _ in range(1_000_000)])
Wall time: 25.5 s
我在@Paul Panzer的解决方案中更改了np.empty和np.random.randint的数据类型,因为它在我的电脑上不起作用
3.6.0 |Anaconda custom (64-bit)| (default, Dec 23 2016, 11:57:41) [MSC v.1900 64 bit (AMD64)]
最快的是
def pp(n):
draw = np.empty((n, 6), dtype=np.int64)
# generating random numbers is expensive, so draw a large one and
# make six out of one
draw[:, 0] = np.random.randint(0, 50*49*48*47*46*45, (n,),dtype=np.uint64)
draw[:, 1:] = np.arange(50, 45, -1)
draw = np.floor_divide.accumulate(draw, axis=-1)
draw[:, :-1] -= draw[:, 1:] * np.arange(50, 45, -1)
# map the shorter ranges (:49, :48, :47) to the non-occupied
# positions; this amounts to incrementing for each number on the
# left that is not larger. the nasty bit: if due to incrementing
# new numbers on the left are "overtaken" then for them we also
# need to increment.
for i in range(1, 6):
coll = np.sum(draw[:, :i] <= draw[:, i, None], axis=-1)
collidx = np.flatnonzero(coll)
if collidx.size == 0:
continue
coll = coll[collidx]
tot = coll
while True:
draw[collidx, i] += coll
coll = np.sum(draw[collidx, :i] <= draw[collidx, i, None], axis=-1)
relidx = np.flatnonzero(coll > tot)
if relidx.size == 0:
break
coll, tot = coll[relidx]-tot[relidx], coll[relidx]
collidx = collidx[relidx]
return draw + 1
#@Paul Panzer' solution
pp(1_000_000)
Wall time: 557 ms
def pp(n):
draw=np.empty((n,6),dtype=np.int64)
#生成随机数是昂贵的,所以画一个大的,然后
#六取一
draw[:,0]=np.random.randint(0,50*49*48*47*46*45,(n,),dtype=np.uint64)
画图[:,1:]=np.arange(50,45,-1)
绘制=np.地板\分割.累积(绘制,轴=-1)
draw[:,:-1]-=draw[:,1:]*np.arange(50,45,-1)
#将较短的范围(:49,:48,:47)映射到未占用区域
#职位;这相当于对数据集上的每个数字递增
#左边那个不大。讨厌的一点:如果是由于递增
#左边的新数字被“超越”,那么对于它们,我们也
#需要增加。
对于范围(1,6)内的i:
coll=np.sum(draw[:,:i]这不是纯粹的numpy
,但您可以将您的解决方案封装在列表中
>>> rows = 10
>>> cols = 6
>>> np.array([np.random.choice(np.arange(1, 50), size=cols, replace=False) for _ in range(rows)])
array([[ 9, 10, 21, 33, 34, 15],
[48, 46, 36, 7, 37, 45],
[21, 15, 5, 9, 31, 26],
[48, 24, 30, 18, 47, 23],
[22, 31, 19, 32, 3, 33],
[35, 44, 15, 46, 20, 43],
[11, 37, 44, 6, 16, 35],
[42, 49, 41, 28, 12, 19],
[19, 6, 32, 3, 1, 22],
[29, 33, 42, 5, 30, 43]])
您可以自己创建每一行,然后堆叠它们
np.stack([np.random.choice(np.arange(1,50),size=6,replace=False) for i in range(100)])
下面是一个使用rand+argsort/argpartition
技巧的向量化方法-
样本运行-
In [41]: rows = 10
In [42]: np.random.rand(rows, 50).argpartition(6,axis=1)[:,:6]+1
Out[42]:
array([[ 1, 9, 3, 26, 14, 44],
[32, 20, 27, 13, 25, 45],
[40, 12, 47, 16, 10, 29],
[ 6, 36, 32, 16, 18, 4],
[42, 46, 24, 9, 1, 31],
[15, 25, 47, 42, 34, 24],
[ 7, 16, 49, 31, 40, 20],
[28, 17, 47, 36, 8, 44],
[ 7, 42, 14, 4, 17, 35],
[39, 19, 37, 7, 8, 36]])
只是为了证明随机性-
In [56]: rows = 1000000
In [57]: out = np.random.rand(rows, 50).argpartition(6,axis=1)[:,:6]+1
In [58]: np.bincount(out.ravel())[1:]
Out[58]:
array([120048, 120026, 119942, 119838, 119885, 119669, 119965, 119491,
120280, 120108, 120293, 119399, 119917, 119974, 120195, 119796,
119887, 119505, 120235, 119857, 119499, 120560, 119891, 119693,
120081, 120369, 120011, 119714, 120218, 120581, 120111, 119867,
119791, 120265, 120457, 120048, 119813, 119702, 120266, 120445,
120016, 120190, 119576, 119737, 120153, 120215, 120144, 120196,
120218, 119863])
对一百万行数据的计时-
In [43]: rows = 1000000
In [44]: %timeit np.random.rand(rows, 50).argpartition(6,axis=1)[:,:6]+1
1 loop, best of 3: 1.07 s per loop
这是一个建设性的方法,第一个(50个选项),第二个(49个选项)等等。对于大型集合,它非常有竞争力(表中的pp):
包括benchmarking.Algo在内的代码有点复杂,因为到自由点的映射很复杂:
import numpy as np
import types
from timeit import timeit
def f_pp(n):
draw = np.empty((n, 6), dtype=int)
# generating random numbers is expensive, so draw a large one and
# make six out of one
draw[:, 0] = np.random.randint(0, 50*49*48*47*46*45, (n,))
draw[:, 1:] = np.arange(50, 45, -1)
draw = np.floor_divide.accumulate(draw, axis=-1)
draw[:, :-1] -= draw[:, 1:] * np.arange(50, 45, -1)
# map the shorter ranges (:49, :48, :47) to the non-occupied
# positions; this amounts to incrementing for each number on the
# left that is not larger. the nasty bit: if due to incrementing
# new numbers on the left are "overtaken" then for them we also
# need to increment.
for i in range(1, 6):
coll = np.sum(draw[:, :i] <= draw[:, i, None], axis=-1)
collidx = np.flatnonzero(coll)
if collidx.size == 0:
continue
coll = coll[collidx]
tot = coll
while True:
draw[collidx, i] += coll
coll = np.sum(draw[collidx, :i] <= draw[collidx, i, None], axis=-1)
relidx = np.flatnonzero(coll > tot)
if relidx.size == 0:
break
coll, tot = coll[relidx]-tot[relidx], coll[relidx]
collidx = collidx[relidx]
return draw + 1
def check_result(draw, name):
print(name[2:], ' checking plausibility...')
import scipy.stats
assert all(len(set(row)) == 6 for row in draw)
assert len(set(draw.ravel())) == 50
print(' var (exp obs)', scipy.stats.uniform(0.5, 50).var(), draw.var())
print(' mean (exp obs)', scipy.stats.uniform(0.5, 50).mean(), draw.mean())
def f_Divakar(n):
return np.random.rand(n, 50).argpartition(6,axis=1)[:,:6]+1
def f_James(n):
return np.stack([np.random.choice(np.arange(1,51),size=6,replace=False) for i in range(n)])
def f_CK(n):
return np.array([np.random.choice(np.arange(1, 51), size=6, replace=False) for _ in range(n)])
for n in (10, 1_000, 1_000_000):
print(f'n = {n}')
for name, func in list(globals().items()):
if not name.startswith('f_') or not isinstance(func, types.FunctionType):
continue
try:
print("{:16s}{:16.8f} ms".format(name[2:], timeit(
'f(n)', globals={'f':func, 'n':n}, number=10)*100))
except:
print("{:16s} apparently failed".format(name[2:]))
if(n >= 10000):
for name, func in list(globals().items()):
if name.startswith('f_') and isinstance(func, types.FunctionType):
check_result(func(n), name)
将numpy导入为np
导入类型
从timeit导入timeit
def f_pp(n):
draw=np.empty((n,6),dtype=int)
#生成随机数是昂贵的,所以画一个大的,然后
#六取一
draw[:,0]=np.random.randint(0,50*49*48*47*46*45,(n,)
画图[:,1:]=np.arange(50,45,-1)
绘制=np.地板\分割.累积(绘制,轴=-1)
draw[:,:-1]-=draw[:,1:]*np.arange(50,45,-1)
#将较短的范围(:49,:48,:47)映射到未占用区域
#位置;这相当于每个位置上的数字递增
#左边那个不算大。讨厌的一点:如果是因为递增
#左边的新数字被“超越”,那么对于它们,我们也
#需要增加。
对于范围(1,6)内的i:
coll=np.和(draw[:,:i]=10000):
对于名称,列表中的func(globals().items()):
如果name.startswith('f_')和isinstance(func,types.FunctionType):
检查结果(函数(n),名称)
我更喜欢使用np.stack
,原因如下:(来自vstack
文档)此函数仍然支持向后兼容,但您应该更喜欢np.concatenate或np.stack。np.stack函数是在NumPy 1.10中添加的。
Ah.我不知道。samples=瓶颈.argpartition(np.random.rand(rows,50),6,1)[:,:6])
可能更快,我还没有检查过这个,你能不能让它更通用一点?
# n = 10
# pp 0.18564210 ms
# Divakar 0.01960790 ms
# James 0.20074140 ms
# CK 0.17823420 ms
# n = 1000
# pp 0.80046050 ms
# Divakar 1.31817130 ms
# James 18.93511460 ms
# CK 20.83670820 ms
# n = 1000000
# pp 655.32905590 ms
# Divakar 1352.44713990 ms
# James 18471.08987370 ms
# CK 18369.79808050 ms
# pp checking plausibility...
# var (exp obs) 208.333333333 208.363840259
# mean (exp obs) 25.5 25.5064865
# Divakar checking plausibility...
# var (exp obs) 208.333333333 208.21113972
# mean (exp obs) 25.5 25.499471
# James checking plausibility...
# var (exp obs) 208.333333333 208.313436938
# mean (exp obs) 25.5 25.4979035
# CK checking plausibility...
# var (exp obs) 208.333333333 208.169585249
# mean (exp obs) 25.5 25.49
import numpy as np
import types
from timeit import timeit
def f_pp(n):
draw = np.empty((n, 6), dtype=int)
# generating random numbers is expensive, so draw a large one and
# make six out of one
draw[:, 0] = np.random.randint(0, 50*49*48*47*46*45, (n,))
draw[:, 1:] = np.arange(50, 45, -1)
draw = np.floor_divide.accumulate(draw, axis=-1)
draw[:, :-1] -= draw[:, 1:] * np.arange(50, 45, -1)
# map the shorter ranges (:49, :48, :47) to the non-occupied
# positions; this amounts to incrementing for each number on the
# left that is not larger. the nasty bit: if due to incrementing
# new numbers on the left are "overtaken" then for them we also
# need to increment.
for i in range(1, 6):
coll = np.sum(draw[:, :i] <= draw[:, i, None], axis=-1)
collidx = np.flatnonzero(coll)
if collidx.size == 0:
continue
coll = coll[collidx]
tot = coll
while True:
draw[collidx, i] += coll
coll = np.sum(draw[collidx, :i] <= draw[collidx, i, None], axis=-1)
relidx = np.flatnonzero(coll > tot)
if relidx.size == 0:
break
coll, tot = coll[relidx]-tot[relidx], coll[relidx]
collidx = collidx[relidx]
return draw + 1
def check_result(draw, name):
print(name[2:], ' checking plausibility...')
import scipy.stats
assert all(len(set(row)) == 6 for row in draw)
assert len(set(draw.ravel())) == 50
print(' var (exp obs)', scipy.stats.uniform(0.5, 50).var(), draw.var())
print(' mean (exp obs)', scipy.stats.uniform(0.5, 50).mean(), draw.mean())
def f_Divakar(n):
return np.random.rand(n, 50).argpartition(6,axis=1)[:,:6]+1
def f_James(n):
return np.stack([np.random.choice(np.arange(1,51),size=6,replace=False) for i in range(n)])
def f_CK(n):
return np.array([np.random.choice(np.arange(1, 51), size=6, replace=False) for _ in range(n)])
for n in (10, 1_000, 1_000_000):
print(f'n = {n}')
for name, func in list(globals().items()):
if not name.startswith('f_') or not isinstance(func, types.FunctionType):
continue
try:
print("{:16s}{:16.8f} ms".format(name[2:], timeit(
'f(n)', globals={'f':func, 'n':n}, number=10)*100))
except:
print("{:16s} apparently failed".format(name[2:]))
if(n >= 10000):
for name, func in list(globals().items()):
if name.startswith('f_') and isinstance(func, types.FunctionType):
check_result(func(n), name)