Algorithm 无替换采样算法?
我试图测试一个特定的数据集群偶然发生的可能性。一种稳健的方法是蒙特卡罗模拟,在这种方法中,数据和组之间的关联被随机重新分配大量次(例如10000次),并且使用聚类度量将实际数据与模拟进行比较,以确定p值 我已经完成了大部分工作,指针将分组映射到数据元素,因此我计划随机重新分配指针到数据。问题是:什么是无需替换即可进行采样的快速方法,以便在复制数据集中随机重新分配每个指针 例如(这些数据只是一个简化的示例): 数据(n=12个值)-A组:0.1,0.2,0.4/B组:0.5,0.6,0.8/C组:0.4,0.5/D组:0.2,0.2,0.3,0.5 对于每个复制数据集,我将具有相同的集群大小(A=3、B=3、C=2、D=4)和数据值,但会将这些值重新分配给集群 为此,我可以生成范围为1-12的随机数,分配A组的第一个元素,然后生成范围为1-11的随机数,分配A组的第二个元素,依此类推。指针重新分配速度很快,我已经预先分配了所有数据结构,但是不替换的采样似乎是一个可能已经解决了很多次的问题Algorithm 无替换采样算法?,algorithm,statistics,pseudocode,Algorithm,Statistics,Pseudocode,我试图测试一个特定的数据集群偶然发生的可能性。一种稳健的方法是蒙特卡罗模拟,在这种方法中,数据和组之间的关联被随机重新分配大量次(例如10000次),并且使用聚类度量将实际数据与模拟进行比较,以确定p值 我已经完成了大部分工作,指针将分组映射到数据元素,因此我计划随机重新分配指针到数据。问题是:什么是无需替换即可进行采样的快速方法,以便在复制数据集中随机重新分配每个指针 例如(这些数据只是一个简化的示例): 数据(n=12个值)-A组:0.1,0.2,0.4/B组:0.5,0.6,0.8/C组:
最好给出逻辑或伪代码。请参阅我对这个问题的回答,同样的逻辑应该可以完成您想要做的事情。以下是基于 Knuth《计算机程序设计艺术》第 2 卷《半数值算法》(Seminumerical Algorithms)第 3.4.2 节算法 S 的无放回抽样代码:
// Selection sampling without replacement, using the variable names of
// Knuth's Algorithm S (Seminumerical Algorithms, section 3.4.2).
//
// Scans the indices 0, 1, 2, ... once; index t is selected when
// (N - t) * u < (n - m), i.e. with probability (n - m) / (N - t).
// Selected indices are written to samples[0 .. sampleSize-1] in increasing
// order.
//
// Preconditions (not enforced by throwing, to keep the original contract):
//   * 0 <= sampleSize <= populationSize
//   * GetUniform() returns a value u with 0 <= u < 1
// If sampleSize > populationSize, only the first populationSize slots are
// filled (with 0 .. populationSize-1); no out-of-range index is produced.
void SampleWithoutReplacement
(
    int populationSize,    // size of set sampling from
    int sampleSize,        // size of each sample
    vector<int> & samples  // output, zero-offset indices to selected items
)
{
    // Use Knuth's variable names
    const int n = sampleSize;
    const int N = populationSize;

    // The original wrote samples[m] unconditionally, which is an
    // out-of-bounds write when the caller passes a vector shorter than
    // sampleSize. Grow it if needed; never shrink a larger caller buffer.
    if (n > 0 && samples.size() < static_cast<vector<int>::size_type>(n))
    {
        samples.resize(n);
    }

    int t = 0; // total input records dealt with
    int m = 0; // number of items selected so far

    // For valid input, t < N always holds while m < n (once N - t == n - m
    // every remaining record is selected), so the extra bound changes nothing
    // there. It only stops the scan from running past the population when
    // the preconditions are violated.
    while (m < n && t < N)
    {
        const double u = GetUniform(); // call a uniform(0,1) random number generator

        if ((N - t) * u < n - m)
        {
            // Record t is selected.
            samples[m] = t;
            m++;
        }
        // Selected or skipped, record t has now been dealt with.
        t++;
    }
}
void samplewithout replacement
(
int populationSize,//从
int sampleSize,//每个样本的大小
向量和样本//输出,所选项目的零偏移标记
)
{
//使用Knuth的变量名
int&n=抽样;
int&N=人口规模;
int t=0;//处理的输入记录总数
int m=0;//到目前为止选择的项目数
双u;
而(m=N-m)
{
t++;
}
其他的
{
样本[m]=t;
t++;m++;
}
}
}
Jeffrey Scott Vitter 在论文《An Efficient Algorithm for Sequential Random Sampling》(ACM Transactions on Mathematical Software,13(1),1987 年 3 月,第 58–67 页)中提出了一种更高效但也更复杂的方法。 此处描述了另一种无放回抽样算法。它与 John D. Cook 在其回答中给出的、源自 Knuth 的算法相似,但前提假设不同:总体规模未知,而样本可以保存在内存中。该算法被称为“Knuth 算法 S”。 引用 Rosetta Code 文章:
基于.
的C++工作代码#包括
#包括
double GetUniform()
{
静态标准::默认值\u随机\u引擎re;
静态标准:均匀实分布距离(0,1);
返回距离(re);
}
//约翰·D·库克,https://stackoverflow.com/a/311716/15485
无效样品,无需更换
(
int populationSize,//从
int sampleSize,//每个样本的大小
标准::向量和样本//输出,所选项目的零偏移标记
)
{
//使用Knuth的变量名
int&n=抽样;
int&N=人口规模;
int t=0;//处理的输入记录总数
int m=0;//到目前为止选择的项目数
双u;
而(m=N-m)
{
t++;
}
其他的
{
样本[m]=t;
t++;m++;
}
}
}
#包括
int main(int,char**)
{
const size_t sz=10;
标准:向量样本(sz);
不更换样品(10*sz,sz,样品);
对于(大小i=0;i
import random

iterator uniqueRandomValuesBelow*(N, M: int): int =
  ## Yields a total of M unique random values i with 0 <= i < N, in
  ## increasing order. These indices can be used to construct e.g. a random
  ## sample without replacement.
  ##
  ## Requires M <= N. The candidates 0, 1, 2, ... are scanned one by one, so
  ## the number of iterations grows with N, not with M.
  # doAssert (rather than assert) so the precondition is still checked when
  # assertions are compiled out; with M > N the scan would otherwise emit
  # indices >= N.
  doAssert M <= N, "sample size M must not exceed population size N"
  var t = 0 # total input records dealt with
  var m = 0 # number of items selected so far
  while m < M:
    let u = rand(1.0) # call a uniform(0,1) random number generator
    let remaining = N - t # candidates not yet looked at (initially N)
    let needed = M - m    # how many of them must still be picked (initially M)
    # Candidate t is picked with probability needed / remaining:
    #   - the larger `needed`, the larger the probability of a pick
    #   - for remaining == needed every remaining candidate is picked
    #   - for remaining >> needed a pick requires a very small u
    #
    # example: remaining = 7, needed = 5
    #   => pick with prob 5/7; assume the draw fails
    #   => t += 1           => remaining = 6
    #   => pick with prob 5/6; assume the draw succeeds
    #   => t += 1, m += 1   => remaining = 5, needed = 4
    #   => pick with prob 4/5; assume the draw fails
    #   => t += 1           => remaining = 4
    #   => pick with prob 4/4, i.e. with certainty from now on
    #      (next steps are 3/3, 2/2, ...)
    #
    # The `remaining > needed` guard forces the pick once every remaining
    # candidate is required, even if the generator were to return exactly
    # 1.0; for u < 1.0 it does not change the outcome.
    # Nim has no implicit int -> float conversion, hence the toFloat calls.
    if remaining > needed and remaining.toFloat * u >= needed.toFloat:
      # no pick -- happens mainly for remaining >> needed and/or high u
      t += 1
    else:
      # pick t -- happens when `needed` gets large and/or u is low
      yield t # this is where we output an index, can be used to sample
      t += 1
      m += 1
# example use: print 5 distinct random indices drawn from 0 ..< 100
for i in uniqueRandomValuesBelow(100, 5):
  echo i
迭代器uniqueryandomvalues下面*(N,M:int)=
##返回总共M个唯一的随机值i和0,我们将抽取100%
#-对于(N-t)>>(M-M),我们必须得到一个非常小的u
#
#示例:(N-t)=7,(M-M)=5
#=>我们用prob 5/7绘制下一个
#假设抽签失败
#=>t+=1=>(N-t)=6
#=>我们用prob 5/6绘制下一个
#假设抽签成功
#=>t+=1,m+=1=>(N-t)=5,(m-m)=4
#=>我们用prob 4/5绘制下一个
#假设抽签失败
#=>t+=1=>(N-t)=4
#=>我们用prob 4/4绘制下一个,即。,
#从现在起,我们将肯定地得出结论
#(在接下来的步骤中,我们得到prob 3/3,2/2,…)
如果(N-t)*u>=(M-M).toFloat:#这本质上是一个P=(M-M)/(N-t)的平局
#无牵引——主要发生在(N-t)>>(M-M)和/或高u
t+=1
其他:
#draw t——当(M-M)变大和/或u变低时发生
收益率t#这是我们输出指数的地方,可用于取样
t+=1
m+=1
#示例使用
对于以下(100,5)中的UniquerandomValues中的i:
回声一号
当总体规模远大于样本规模时,上述算法的效率会变低,因为它们需要逐个扫描总体中的元素,时间复杂度为 O(N),其中 N 是总体规模(而不是样本规模)。
import random

iterator uniqueRandomValuesBelow*(N, M: int): int =
  ## Yields a total of M unique random values i with 0 <= i < N, in
  ## increasing order. These indices can be used to construct e.g. a random
  ## sample without replacement.
  ##
  ## Requires M <= N. The candidates 0, 1, 2, ... are scanned one by one, so
  ## the number of iterations grows with N, not with M.
  # doAssert (rather than assert) so the precondition is still checked when
  # assertions are compiled out; with M > N the scan would otherwise emit
  # indices >= N.
  doAssert M <= N, "sample size M must not exceed population size N"
  var t = 0 # total input records dealt with
  var m = 0 # number of items selected so far
  while m < M:
    let u = rand(1.0) # call a uniform(0,1) random number generator
    let remaining = N - t # candidates not yet looked at (initially N)
    let needed = M - m    # how many of them must still be picked (initially M)
    # Candidate t is picked with probability needed / remaining:
    #   - the larger `needed`, the larger the probability of a pick
    #   - for remaining == needed every remaining candidate is picked
    #   - for remaining >> needed a pick requires a very small u
    #
    # example: remaining = 7, needed = 5
    #   => pick with prob 5/7; assume the draw fails
    #   => t += 1           => remaining = 6
    #   => pick with prob 5/6; assume the draw succeeds
    #   => t += 1, m += 1   => remaining = 5, needed = 4
    #   => pick with prob 4/5; assume the draw fails
    #   => t += 1           => remaining = 4
    #   => pick with prob 4/4, i.e. with certainty from now on
    #      (next steps are 3/3, 2/2, ...)
    #
    # The `remaining > needed` guard forces the pick once every remaining
    # candidate is required, even if the generator were to return exactly
    # 1.0; for u < 1.0 it does not change the outcome.
    # Nim has no implicit int -> float conversion, hence the toFloat calls.
    if remaining > needed and remaining.toFloat * u >= needed.toFloat:
      # no pick -- happens mainly for remaining >> needed and/or high u
      t += 1
    else:
      # pick t -- happens when `needed` gets large and/or u is low
      yield t # this is where we output an index, can be used to sample
      t += 1
      m += 1
# example use: print 5 distinct random indices drawn from 0 ..< 100
for i in uniqueRandomValuesBelow(100, 5):
  echo i
# The Tree growing algorithm for uniform sampling without replacement
# by Pavel Ruzankin
quicksample = function (n,size)
# Draws `size` distinct values from 1..n (sampling without replacement).
#
# n    - the number of items to choose from
# size - the sample size (coerced with as.integer); must satisfy
#        0 <= size <= n
#
# Returns a vector of length `size` holding the sampled values in the order
# they were drawn; an empty integer vector when size is 0.
# Stops with an error when size is NA, negative, or greater than n.
#
# Each new draw picks a rank among the values not yet taken
# (sample(n-j+1, 1)) and then walks a binary tree built over the earlier
# draws to turn that rank into an actual value. The tree is stored in the
# parallel vectors leftv / rightv (child indices, 0 = no child), with ordn
# holding a per-vertex offset that is accumulated along the walk.
{
  s <- as.integer(size)
  if (is.na(s) || s < 0L) {
    stop("Sample size must be a non-negative integer")
  }
  if (s > n) {
    stop("Sample size is greater than the number of items to choose from")
  }
  # The root vertex below is always created, so without this guard a request
  # for zero items would come back with one element.
  if (s == 0L) {
    return(integer(0))
  }

  # upv=integer(s) #level up edge is pointing to
  leftv <- integer(s)   # left edge is pointing to; must be filled with zeros
  rightv <- integer(s)  # right edge is pointing to; must be filled with zeros
  samp <- integer(s)    # the sample
  ordn <- integer(s)    # relative ordinal number

  ordn[1L] <- 1L        # initial value for the root vertex
  samp[1L] <- sample(n, 1L)

  if (s > 1L) for (j in 2L:s) {
    curn <- sample(n - j + 1L, 1L)  # current number sampled
    curordn <- 0L                   # current ordinal number
    v <- 1L                         # current vertex
    from <- 1L                      # how we got here: 0 - by left edge, 1 - by right edge
    repeat {
      curordn <- curordn + ordn[v]
      if (curn + curordn > samp[v]) {  # going down by the right edge
        if (from == 0L) {
          ordn[v] <- ordn[v] - 1L
        }
        if (rightv[v] != 0L) {
          v <- rightv[v]
          from <- 1L
        } else {  # creating a new vertex
          samp[j] <- curn + curordn
          ordn[j] <- 1L
          # upv[j]=v
          rightv[v] <- j
          break
        }
      } else {  # going down by the left edge
        if (from == 1L) {
          ordn[v] <- ordn[v] + 1L
        }
        if (leftv[v] != 0L) {
          v <- leftv[v]
          from <- 0L
        } else {  # creating a new vertex
          samp[j] <- curn + curordn - 1L
          ordn[j] <- -1L
          # upv[j]=v
          leftv[v] <- j
          break
        }
      }
    }
  }
  return(samp)
}