python莫名其妙地缩短了滑动窗口分析每次迭代的步长
我正在开发一个程序,该程序通过一系列滑动窗口对一条染色体上的Tajima's D进行估计。染色体本身也被分成许多不同的区域,有(希望)功能意义。滑动窗口分析由我的脚本在每个区域上执行 在程序开始时,我定义了滑动窗口的大小以及从一个窗口移动到下一个窗口的步骤的大小。我导入一个包含每个不同染色体区域坐标的文件,然后导入另一个包含我正在处理的所有SNP数据的文件(这是逐行读取的,因为它是一个大文件)。该程序循环浏览染色体位置列表。对于每个位置,它生成用于分析的步骤和窗口索引,将SNP数据划分为输出文件(与步骤对应),计算每个步骤文件的关键统计信息,并结合这些统计信息来估计每个窗口的Tajima D 该程序适用于SNP数据的小文件。它也适用于第一个染色体断点的第一次迭代。然而,对于大文件的SNP数据,当程序迭代每个染色体区域时,分析中的步长会莫名其妙地减小。对于第一个染色体区域,步长是2500个核苷酸(这是假定的)。然而,第二个染色体片段的步长是1966,第三个染色体片段的步长是732 如果有人对为什么会出现这种情况有任何建议,请让我知道。我尤其感到困惑,因为这个程序似乎适用于小文件,但不适用于大文件 我的代码如下:python莫名其妙地缩短了滑动窗口分析每次迭代的步长,python,memory,iteration,short,sliding-window,Python,Memory,Iteration,Short,Sliding Window,我正在开发一个程序,该程序通过一系列滑动窗口对一条染色体上的Tajima's D进行估计。染色体本身也被分成许多不同的区域,有(希望)功能意义。滑动窗口分析由我的脚本在每个区域上执行 在程序开始时,我定义了滑动窗口的大小以及从一个窗口移动到下一个窗口的步骤的大小。我导入一个包含每个不同染色体区域坐标的文件,然后导入另一个包含我正在处理的所有SNP数据的文件(这是逐行读取的,因为它是一个大文件)。该程序循环浏览染色体位置列表。对于每个位置,它生成用于分析的步骤和窗口索引,将SNP数据划分为输出文件
import sys
import math
import fileinput
import shlex
import string
windowSize = int(500)
stepSize = int(250)
n = int(50) #number of individuals in the anaysis
SNP_file = open("SNPs-1.txt",'r')
SNP_file.readline()
breakpoints = open("C:/Users/gwilymh/Desktop/Python/Breakpoint coordinates.txt", 'r')
breakpoints = list(breakpoints)
numSegments = len(breakpoints)
# Open a file to store the Tajima's D results:
outputFile = open("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/Tajima's D estimates.txt", 'a')
outputFile.write(str("segmentNumber\tchrSegmentName\tsegmentStart\tsegmentStop\twindowNumber\twindowStart\twindowStop\tWindowSize\tnSNPs\tS\tD\n"))
#Calculating parameters a1, a2, b1, b2, c1 and c2
numPairwiseComparisons=n*((n-1)/2)
b1=(n+1)/(3*(n-1))
b2=(2*(n**2+n+3))/(9*n*(n-1))
num=list(range(1,n)) # n-1 values as a list
i=0
a1=0
for i in num:
a1=a1+(1/i)
i=i+1
j=0
a2=0
for j in num:
a2=a2+(1/j**2)
j=j+1
c1=(b1/a1)-(1/a1**2)
c2=(1/(a1**2+a2))*(b2 - ((n+2)/(a1*n))+ (a2/a1**2) )
counter6=0
#For each segment, assign a number and identify the start and stop coodrinates and the segment name
for counter6 in range(counter6,numSegments):
segment = shlex.shlex(breakpoints[counter6],posix = True)
segment.whitespace += '\t'
segment.whitespace_split = True
segment = list(segment)
segmentName = segment[0]
segmentNumber = int(counter6+1)
segmentStartPos = int(segment[1])
segmentStopPos = int(segment[2])
outputFile1 = open((("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_Count of SNPs and mismatches per step.txt")%(str(segmentNumber),str(segmentName))), 'a')
#Make output files to index the lcoations of each window within each segment
windowFileIndex = open((("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_windowFileIndex.txt")%(str(segmentNumber),str(segmentName))), 'a')
k = segmentStartPos - 1
windowNumber = 0
while (k+1) <=segmentStopPos:
windowStart = k+1
windowNumber = windowNumber+1
windowStop = k + windowSize
if windowStop > segmentStopPos:
windowStop = segmentStopPos
windowFileIndex.write(("%s\t%s\t%s\n")%(str(windowNumber),str(windowStart),str(windowStop)))
k=k+stepSize
windowFileIndex.close()
# Make output files for each step to export the corresponding SNP data into + an index of these output files
stepFileIndex = open((("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_stepFileIndex.txt")%(str(segmentNumber),str(segmentName))), 'a')
i = segmentStartPos-1
stepNumber = 0
while (i+1) <= segmentStopPos:
stepStart = i+1
stepNumber = stepNumber+1
stepStop = i+stepSize
if stepStop > segmentStopPos:
stepStop = segmentStopPos
stepFile = open((("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_step_%s.txt")%(str(segmentNumber),str(segmentName),str(stepNumber))), 'a')
stepFileIndex.write(("%s\t%s\t%s\n")%(str(stepNumber),str(stepStart),str(stepStop)))
i=i+stepSize
stepFile.close()
stepFileIndex.close()
# Open the index file for each step in current chromosomal segment
stepFileIndex = open((("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_stepFileIndex.txt")%(str(segmentNumber),str(segmentName))), 'r')
stepFileIndex = list(stepFileIndex)
numSteps = len(stepFileIndex)
while 1:
currentSNP = SNP_file.readline()
if not currentSNP: break
currentSNP = shlex.shlex(currentSNP,posix=True)
currentSNP.whitespace += '\t'
currentSNP.whitespace_split = True
currentSNP = list(currentSNP)
SNPlocation = int(currentSNP[0])
if SNPlocation > segmentStopPos:break
stepIndexBin = int(((SNPlocation-segmentStartPos-1)/stepSize)+1)
#print(SNPlocation, stepIndexBin)
writeFile = open((("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_step_%s.txt")%(str(segmentNumber),str(segmentName),str(stepIndexBin))), 'a')
writeFile.write((("%s\n")%(str(currentSNP[:]))))
writeFile.close()
counter3=0
for counter3 in range(counter3,numSteps):
# open up each step in the list of steps across the chromosomal segment:
L=shlex.shlex(stepFileIndex[counter3],posix=True)
L.whitespace += '\t'
L.whitespace_split = True
L=list(L)
#print(L)
stepNumber = int(L[0])
stepStart = int(L[1])
stepStop = int(L[2])
stepSize = int(stepStop-(stepStart-1))
#Now open the file of SNPs corresponding with the window in question and convert it into a list:
currentStepFile = open(("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_step_%s.txt")%(str(segmentNumber),str(segmentName),str(counter3+1)),'r')
currentStepFile = list(currentStepFile)
nSNPsInCurrentStepFile = len(currentStepFile)
print("number of SNPs in this step is:", nSNPsInCurrentStepFile)
#print(currentStepFile)
if nSNPsInCurrentStepFile == 0:
mismatchesPerSiteList = [0]
else:
# For each line of the file, estimate the per site parameters relevent to Tajima's D
mismatchesPerSiteList = list()
counter4=0
for counter4 in range(counter4,nSNPsInCurrentStepFile):
CountA=0
CountG=0
CountC=0
CountT=0
x = counter4
lineOfData = currentStepFile[x]
counter5=0
for counter5 in range(0,len(lineOfData)):
if lineOfData[counter5]==("A" or "a"): CountA=CountA+1
elif lineOfData[counter5]==("G" or "g"): CountG=CountG+1
elif lineOfData[counter5]==("C" or "c"): CountC=CountC+1
elif lineOfData[counter5]==("T" or "t"): CountT=CountT+1
else: continue
AxG=CountA*CountG
AxC=CountA*CountC
AxT=CountA*CountT
GxC=CountG*CountC
GxT=CountG*CountT
CxT=CountC*CountT
NumberMismatches = AxG+AxC+AxT+GxC+GxT+CxT
mismatchesPerSiteList=mismatchesPerSiteList+[NumberMismatches]
outputFile1.write(str(("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n")%(segmentNumber, segmentName,stepNumber,stepStart,stepStop,stepSize,nSNPsInCurrentStepFile,sum(mismatchesPerSiteList))))
outputFile1.close()
windowFileIndex = open((("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_windowFileIndex.txt")%(str(segmentNumber),str(segmentName))), 'r')
windowFileIndex = list(windowFileIndex)
numberOfWindows = len(windowFileIndex)
stepData = open((("C:/Users/gwilymh/Desktop/Python/Sliding Window Analyses-2/%s_%s_Count of SNPs and mismatches per step.txt")%(str(segmentNumber),str(segmentName))), 'r')
stepData = list(stepData)
numberOfSteps = len(stepData)
counter = 0
for counter in range(counter, numberOfWindows):
window = shlex.shlex(windowFileIndex[counter], posix = True)
window.whitespace += "\t"
window.whitespace_split = True
window = list(window)
windowNumber = int(window[0])
firstCoordinateInCurrentWindow = int(window[1])
lastCoordinateInCurrentWindow = int(window[2])
currentWindowSize = lastCoordinateInCurrentWindow - firstCoordinateInCurrentWindow +1
nSNPsInThisWindow = 0
nMismatchesInThisWindow = 0
counter2 = 0
for counter2 in range(counter2,numberOfSteps):
step = shlex.shlex(stepData[counter2], posix=True)
step.whitespace += "\t"
step.whitespace_split = True
step = list(step)
lastCoordinateInCurrentStep = int(step[4])
if lastCoordinateInCurrentStep < firstCoordinateInCurrentWindow: continue
elif lastCoordinateInCurrentStep <= lastCoordinateInCurrentWindow:
nSNPsInThisStep = int(step[6])
nMismatchesInThisStep = int(step[7])
nSNPsInThisWindow = nSNPsInThisWindow + nSNPsInThisStep
nMismatchesInThisWindow = nMismatchesInThisWindow + nMismatchesInThisStep
elif lastCoordinateInCurrentStep > lastCoordinateInCurrentWindow: break
if nSNPsInThisWindow ==0 :
S = 0
D = 0
else:
S = nSNPsInThisWindow/currentWindowSize
pi = nMismatchesInThisWindow/(currentWindowSize*numPairwiseComparisons)
print(nSNPsInThisWindow,nMismatchesInThisWindow,currentWindowSize,S,pi)
D = (pi-(S/a1))/math.sqrt(c1*S + c2*S*(S-1/currentWindowSize))
outputFile.write(str(("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n")%(segmentNumber,segmentName,segmentStartPos,segmentStopPos,windowNumber,firstCoordinateInCurrentWindow,lastCoordinateInCurrentWindow,currentWindowSize,nSNPsInThisWindow,S,D)))
导入系统
输入数学
导入文件输入
导入shlex
导入字符串
WindowsSize=int(500)
步长=整数(250)
n=int(50)#分析中的个体数
SNP_file=open(“SNPs-1.txt”,“r”)
SNP_file.readline()文件
断点=打开(“C:/Users/gwilymh/Desktop/Python/Breakpoint coordinates.txt”,“r”)
断点=列表(断点)
numSegments=len(断点)
#打开一个文件以存储Tajima的D结果:
outputFile=open(“C:/Users/gwilymh/Desktop/Python/Sliding Window analysis-2/Tajima's D estimaties.txt,'a')
write(str(“segmentNumber\tchrSegmentName\tsegmentStart\tsegmentStop\twindowNumber\twindowStart\twindowStop\tWindowSize\tnSNPs\tS\tD\n”))
#计算参数a1、a2、b1、b2、c1和c2
numPairWiseComparations=n*((n-1)/2)
b1=(n+1)/(3*(n-1))
b2=(2*(n**2+n+3))/(9*n*(n-1))
num=列表(范围(1,n))#n-1值作为列表
i=0
a1=0
对于num中的i:
a1=a1+(1/i)
i=i+1
j=0
a2=0
对于j in num:
a2=a2+(1/j**2)
j=j+1
c1=(b1/a1)-(1/a1**2)
c2=(1/(a1**2+a2))*(b2-((n+2)/(a1*n))+(a2/a1**2))
计数器6=0
#为每个段分配一个编号,并标识开始和停止坐标以及段名称
对于范围内的计数器6(计数器6,numSegments):
segment=shlex.shlex(断点[counter6],posix=True)
segment.whitespace+='\t'
segment.whitespace\u split=True
段=列表(段)
segmentName=段[0]
分段编号=整数(计数器6+1)
segmentStartPos=int(段[1])
segmentStopPos=int(段[2])
outputFile1=open((((((((“C:/Users/gwillymh/Desktop/Python/Slide Window Analysis-2/%s_%s_每个步骤的SNP和不匹配计数.txt”))%(str(segmentNumber),str(segmentName)),“a”)
#生成输出文件以索引每个段中每个窗口的lcoations
windowFileIndex=open(((((((“C:/Users/gwilymh/Desktop/Python/Sliding Window Analysis-2/%s_%s_windowFileIndex.txt”))%(str(segmentNumber),str(segmentName)),“a”)
k=分段起始时间-1
windowNumber=0
而(k+1)段为:
windowStop=Windows操作系统
windowFileIndex.write((%s\t%s\t%s\n”)%(str(windowNumber)、str(WindowsStart)、str(windowStop)))
k=k+步长
windowFileIndex.close()
#为每个步骤制作输出文件,将相应的SNP数据导出到这些输出文件的索引中
stepFileIndex=open((((((((“C:/Users/gwilymh/Desktop/Python/slidewindow analysis-2/%s_%s_ustepfileindex.txt”))%(str(segmentNumber),str(segmentName)),“a”)
i=分段StartPos-1
步数=0
而(i+1)节段:
步骤停止=分段停止
stepFile=open(((((((“C:/Users/gwilymh/Desktop/Python/slidewindow analysiss-2/%s_%s_ustep_%s.txt”))%(str(segmentNumber)、str(segmentName)、str(stepNumber))、a)
stepFileIndex.write((%s\t%s\t%s\n”)%(str(stepNumber)、str(stepStart)、str(stepStop)))
i=i+步长
stepFile.close()
stepFileIndex.close()
#打开当前染色体片段中每个步骤的索引文件
stepFileIndex=open(((((((“C:/Users/gwilymh/Desktop/Python/slidewindow analysis-2/%s_%s_ustepfileindex.txt”))%(str(segmentNumber),str(segmentName)),'r')
stepFileIndex=列表(stepFileIndex)
numSteps=len(stepFileIndex)
而1:
currentSNP=SNP_file.readline()
如果不是当前SNP:中断
currentSNP=shlex.shlex(currentSNP,posix=True)
currentSNP.whitespace+='\t'
currentSNP.whitespace\u split=True
currentSNP=列表(currentSNP)
SNPlocation=int(当前snp[0])
如果SNPlocation>segmentStopPos:break
stepIndexBin=int((SNPlocation-segmentStartPos-1)/步长)+1)
#打印(单倍定位,单步索引)
writeFile=open(((((((“C:/Users/gwilymh/Desktop/Python/slidewindow analysiss-2/%s_%s_ustep_%s.txt”))%(str(segmentNumber)、str(segmentName)、str(stepIndexBin))、'a')
writeFile.write(((“%s\n”)%(str(currentSNP[:]))
writeFile.close()
计数器3=0
对于范围内的计数器3(计数器3,微步):
#打开染色体片段步骤列表中的每个步骤:
L=shlex.shlex(stepFileIndex[counter3],posix=True)
L.空格+='\t'
L.空格_split=True
L=列表(L)
#印刷品(L)
stepStart = int(L[1])
stepStop = int(L[2])
stepSize = int(stepStop-(stepStart-1))