Python 如何使用numpy random.choice为每行创建二维数组？_Python_Arrays_Numpy

Python 如何使用numpy random.choice为每行创建二维数组？

python arrays numpy

Python 如何使用numpy random.choice为每行创建二维数组？,python,arrays,numpy,Python,Arrays,Numpy,我正在尝试创建一个二维数组（它是一个六列多行的数组），使用numpy随机选择，每行（而不是整个数组）的唯一值在1到50之间 np.sort(np.random.choice(np.arange(1,50),size=(100,6),replace=False)) 但这会引起一个错误 ValueError: Cannot take a larger sample than population when 'replace=False' 有没有可能用一个没有回路的单衬套来实现这一点编辑好

我正在尝试创建一个二维数组（它是一个六列多行的数组），使用numpy随机选择，每行（而不是整个数组）的唯一值在1到50之间

np.sort(np.random.choice(np.arange(1,50),size=(100,6),replace=False))

但这会引起一个错误

ValueError: Cannot take a larger sample than population when 'replace=False'

有没有可能用一个没有回路的单衬套来实现这一点

编辑

好的，我知道答案了

这些是jupyter%时间cellmagic的结果

#@James' solution
np.stack([np.random.choice(np.arange(1,50),size=6,replace=False) for i in range(1_000_000)])
Wall time: 25.1 s



#@Divakar's solution
np.random.rand(1_000_000, 50).argpartition(6,axis=1)[:,:6]+1
Wall time: 1.36 s



#@CoryKramer's solution
np.array([np.random.choice(np.arange(1, 50), size=6, replace=False) for _ in range(1_000_000)])
Wall time: 25.5 s

我在@Paul Panzer的解决方案中更改了np.empty和np.random.randint的数据类型，因为它在我的电脑上不起作用

3.6.0 |Anaconda custom (64-bit)| (default, Dec 23 2016, 11:57:41) [MSC v.1900 64 bit (AMD64)]
最快的是

def pp(n): draw = np.empty((n, 6), dtype=np.int64) # generating random numbers is expensive, so draw a large one and # make six out of one draw[:, 0] = np.random.randint(0, 50*49*48*47*46*45, (n,),dtype=np.uint64) draw[:, 1:] = np.arange(50, 45, -1) draw = np.floor_divide.accumulate(draw, axis=-1) draw[:, :-1] -= draw[:, 1:] * np.arange(50, 45, -1) # map the shorter ranges (:49, :48, :47) to the non-occupied # positions; this amounts to incrementing for each number on the # left that is not larger. the nasty bit: if due to incrementing # new numbers on the left are "overtaken" then for them we also # need to increment. for i in range(1, 6): coll = np.sum(draw[:, :i] <= draw[:, i, None], axis=-1) collidx = np.flatnonzero(coll) if collidx.size == 0: continue coll = coll[collidx] tot = coll while True: draw[collidx, i] += coll coll = np.sum(draw[collidx, :i] <= draw[collidx, i, None], axis=-1) relidx = np.flatnonzero(coll > tot) if relidx.size == 0: break coll, tot = coll[relidx]-tot[relidx], coll[relidx] collidx = collidx[relidx] return draw + 1 #@Paul Panzer' solution pp(1_000_000) Wall time: 557 ms

def pp（n）： draw=np.empty（（n，6），dtype=np.int64） #生成随机数是昂贵的，所以画一个大的，然后 #六取一 draw[：，0]=np.random.randint（0,50*49*48*47*46*45，（n，），dtype=np.uint64）画图[：，1:]=np.arange（50,45，-1）绘制=np.地板\分割.累积（绘制，轴=-1） draw[：，：-1]-=draw[：，1:]*np.arange（50,45，-1） #将较短的范围（：49，：48，：47）映射到未占用区域 #职位；这相当于对数据集上的每个数字递增 #左边那个不大。讨厌的一点：如果是由于递增 #左边的新数字被“超越”，那么对于它们，我们也 #需要增加。对于范围（1,6）内的i： coll=np.sum（draw[：，：i]这不是纯粹的numpy ，但您可以将您的解决方案封装在列表中 >>> rows = 10 >>> cols = 6 >>> np.array([np.random.choice(np.arange(1, 50), size=cols, replace=False) for _ in range(rows)]) array([[ 9, 10, 21, 33, 34, 15], [48, 46, 36, 7, 37, 45], [21, 15, 5, 9, 31, 26], [48, 24, 30, 18, 47, 23], [22, 31, 19, 32, 3, 33], [35, 44, 15, 46, 20, 43], [11, 37, 44, 6, 16, 35], [42, 49, 41, 28, 12, 19], [19, 6, 32, 3, 1, 22], [29, 33, 42, 5, 30, 43]]) 您可以自己创建每一行，然后堆叠它们 np.stack([np.random.choice(np.arange(1,50),size=6,replace=False) for i in range(100)]) 下面是一个使用rand+argsort/argpartition 技巧的向量化方法- 样本运行- In [41]: rows = 10 In [42]: np.random.rand(rows, 50).argpartition(6,axis=1)[:,:6]+1 Out[42]: array([[ 1, 9, 3, 26, 14, 44], [32, 20, 27, 13, 25, 45], [40, 12, 47, 16, 10, 29], [ 6, 36, 32, 16, 18, 4], [42, 46, 24, 9, 1, 31], [15, 25, 47, 42, 34, 24], [ 7, 16, 49, 31, 40, 20], [28, 17, 47, 36, 8, 44], [ 7, 42, 14, 4, 17, 35], [39, 19, 37, 7, 8, 36]]) 只是为了证明随机性- In [56]: rows = 1000000 In [57]: out = np.random.rand(rows, 50).argpartition(6,axis=1)[:,:6]+1 In [58]: np.bincount(out.ravel())[1:] Out[58]: array([120048, 120026, 119942, 119838, 119885, 119669, 119965, 119491, 120280, 120108, 120293, 119399, 119917, 119974, 120195, 119796, 119887, 119505, 120235, 119857, 119499, 120560, 119891, 119693, 120081, 120369, 120011, 119714, 120218, 120581, 120111, 119867, 119791, 120265, 120457, 120048, 119813, 119702, 120266, 120445, 120016, 120190, 119576, 119737, 120153, 120215, 120144, 120196, 120218, 119863]) 对一百万行数据的计时- In [43]: rows = 1000000 In [44]: %timeit np.random.rand(rows, 50).argpartition(6,axis=1)[:,:6]+1 1 loop, best of 3: 1.07 s per loop 这是一个建设性的方法，第一个（50个选项），第二个（49个选项）等等。对于大型集合，它非常有竞争力（表中的pp）：包括benchmarking.Algo在内的代码有点复杂，因为到自由点的映射很复杂： import numpy as np import types from timeit import timeit def f_pp(n): draw = np.empty((n, 6), dtype=int) # generating random numbers is expensive, so draw a large one and # make six out of one draw[:, 0] = np.random.randint(0, 50*49*48*47*46*45, (n,)) draw[:, 1:] = np.arange(50, 45, -1) draw = np.floor_divide.accumulate(draw, axis=-1) draw[:, :-1] -= draw[:, 1:] * np.arange(50, 45, -1) # map the shorter ranges (:49, :48, :47) to the non-occupied # positions; this amounts to incrementing for each number on the # left that is not larger. the nasty bit: if due to incrementing # new numbers on the left are "overtaken" then for them we also # need to increment. for i in range(1, 6): coll = np.sum(draw[:, :i] <= draw[:, i, None], axis=-1) collidx = np.flatnonzero(coll) if collidx.size == 0: continue coll = coll[collidx] tot = coll while True: draw[collidx, i] += coll coll = np.sum(draw[collidx, :i] <= draw[collidx, i, None], axis=-1) relidx = np.flatnonzero(coll > tot) if relidx.size == 0: break coll, tot = coll[relidx]-tot[relidx], coll[relidx] collidx = collidx[relidx] return draw + 1 def check_result(draw, name): print(name[2:], ' checking plausibility...') import scipy.stats assert all(len(set(row)) == 6 for row in draw) assert len(set(draw.ravel())) == 50 print(' var (exp obs)', scipy.stats.uniform(0.5, 50).var(), draw.var()) print(' mean (exp obs)', scipy.stats.uniform(0.5, 50).mean(), draw.mean()) def f_Divakar(n): return np.random.rand(n, 50).argpartition(6,axis=1)[:,:6]+1 def f_James(n): return np.stack([np.random.choice(np.arange(1,51),size=6,replace=False) for i in range(n)]) def f_CK(n): return np.array([np.random.choice(np.arange(1, 51), size=6, replace=False) for _ in range(n)]) for n in (10, 1_000, 1_000_000): print(f'n = {n}') for name, func in list(globals().items()): if not name.startswith('f_') or not isinstance(func, types.FunctionType): continue try: print("{:16s}{:16.8f} ms".format(name[2:], timeit( 'f(n)', globals={'f':func, 'n':n}, number=10)*100)) except: print("{:16s} apparently failed".format(name[2:])) if(n >= 10000): for name, func in list(globals().items()): if name.startswith('f_') and isinstance(func, types.FunctionType): check_result(func(n), name) 将numpy导入为np 导入类型从timeit导入timeit def f_pp（n）： draw=np.empty（（n，6），dtype=int） #生成随机数是昂贵的，所以画一个大的，然后 #六取一 draw[：，0]=np.random.randint（0,50*49*48*47*46*45，（n，）画图[：，1:]=np.arange（50,45，-1）绘制=np.地板\分割.累积（绘制，轴=-1） draw[：，：-1]-=draw[：，1:]*np.arange（50,45，-1） #将较短的范围（：49，：48，：47）映射到未占用区域 #位置；这相当于每个位置上的数字递增 #左边那个不算大。讨厌的一点：如果是因为递增 #左边的新数字被“超越”，那么对于它们，我们也 #需要增加。对于范围（1,6）内的i： coll=np.和（draw[：，：i]=10000）：对于名称，列表中的func（globals（）.items（））：如果name.startswith（'f_'）和isinstance（func，types.FunctionType）：检查结果（函数（n），名称）我更喜欢使用np.stack ，原因如下：（来自vstack 文档）此函数仍然支持向后兼容，但您应该更喜欢np.concatenate或np.stack。np.stack函数是在NumPy 1.10中添加的。 Ah.我不知道。samples=瓶颈.argpartition（np.random.rand（rows，50），6，1）[：，：6]）可能更快，我还没有检查过这个，你能不能让它更通用一点？ # n = 10 # pp 0.18564210 ms # Divakar 0.01960790 ms # James 0.20074140 ms # CK 0.17823420 ms # n = 1000 # pp 0.80046050 ms # Divakar 1.31817130 ms # James 18.93511460 ms # CK 20.83670820 ms # n = 1000000 # pp 655.32905590 ms # Divakar 1352.44713990 ms # James 18471.08987370 ms # CK 18369.79808050 ms # pp checking plausibility... # var (exp obs) 208.333333333 208.363840259 # mean (exp obs) 25.5 25.5064865 # Divakar checking plausibility... # var (exp obs) 208.333333333 208.21113972 # mean (exp obs) 25.5 25.499471 # James checking plausibility... # var (exp obs) 208.333333333 208.313436938 # mean (exp obs) 25.5 25.4979035 # CK checking plausibility... # var (exp obs) 208.333333333 208.169585249 # mean (exp obs) 25.5 25.49 import numpy as np import types from timeit import timeit def f_pp(n): draw = np.empty((n, 6), dtype=int) # generating random numbers is expensive, so draw a large one and # make six out of one draw[:, 0] = np.random.randint(0, 50*49*48*47*46*45, (n,)) draw[:, 1:] = np.arange(50, 45, -1) draw = np.floor_divide.accumulate(draw, axis=-1) draw[:, :-1] -= draw[:, 1:] * np.arange(50, 45, -1) # map the shorter ranges (:49, :48, :47) to the non-occupied # positions; this amounts to incrementing for each number on the # left that is not larger. the nasty bit: if due to incrementing # new numbers on the left are "overtaken" then for them we also # need to increment. for i in range(1, 6): coll = np.sum(draw[:, :i] <= draw[:, i, None], axis=-1) collidx = np.flatnonzero(coll) if collidx.size == 0: continue coll = coll[collidx] tot = coll while True: draw[collidx, i] += coll coll = np.sum(draw[collidx, :i] <= draw[collidx, i, None], axis=-1) relidx = np.flatnonzero(coll > tot) if relidx.size == 0: break coll, tot = coll[relidx]-tot[relidx], coll[relidx] collidx = collidx[relidx] return draw + 1 def check_result(draw, name): print(name[2:], ' checking plausibility...') import scipy.stats assert all(len(set(row)) == 6 for row in draw) assert len(set(draw.ravel())) == 50 print(' var (exp obs)', scipy.stats.uniform(0.5, 50).var(), draw.var()) print(' mean (exp obs)', scipy.stats.uniform(0.5, 50).mean(), draw.mean()) def f_Divakar(n): return np.random.rand(n, 50).argpartition(6,axis=1)[:,:6]+1 def f_James(n): return np.stack([np.random.choice(np.arange(1,51),size=6,replace=False) for i in range(n)]) def f_CK(n): return np.array([np.random.choice(np.arange(1, 51), size=6, replace=False) for _ in range(n)]) for n in (10, 1_000, 1_000_000): print(f'n = {n}') for name, func in list(globals().items()): if not name.startswith('f_') or not isinstance(func, types.FunctionType): continue try: print("{:16s}{:16.8f} ms".format(name[2:], timeit( 'f(n)', globals={'f':func, 'n':n}, number=10)*100)) except: print("{:16s} apparently failed".format(name[2:])) if(n >= 10000): for name, func in list(globals().items()): if name.startswith('f_') and isinstance(func, types.FunctionType): check_result(func(n), name)