Bash 使用awk从两列值创建两个数组,查找差异和差异之和,并输出数据

Bash 使用awk从两列值创建两个数组,查找差异和差异之和,并输出数据,bash,csv,awk,bioinformatics,Bash,Csv,Awk,Bioinformatics,我有一个包含以下字段的文件(右侧有一个示例值): 这是该文件的缩短版本: 0 ENST00000371026 chr1 - 67051161 67163158 67051161,67060631,67065090,67066082,67071855,67072261,67073896,67075980,67078739,67085754,67100417,67109640,67113051,67129424,67131499,67143471,67162932, 67052451,6706078

我有一个包含以下字段的文件(右侧有一个示例值):

这是该文件的缩短版本:

0 ENST00000371026 chr1 - 67051161 67163158 67051161,67060631,67065090,67066082,67071855,67072261,67073896,67075980,67078739,67085754,67100417,67109640,67113051,67129424,67131499,67143471,67162932, 67052451,67060788,67065317,67066181,67071977,67072419,67074048,67076067,67078942,67085949,67100573,67109780,67113208,67129537,67131684,67143646,67163158, ENSG00000152763 0,2,0,0,1,2,0,0,1,1,1,2,1,2,0,2,0, uc009waw.1,uc009wax.1,uc001dcx.1,
0 ENST00000371023 chr1 - 67075869 67163055 67075869,67078739,67085754,67100417,67109640,67113051,67129424,67131499,67143471,67162932, 67076067,67078942,67085949,67100573,67109780,67113208,67129537,67131684,67143646,67163055, ENSG00000152763 0,1,1,1,2,1,2,0,2,0, uc001dcy.1
0 ENST00000395250 chr1 - 67075991 67163158 67075991,67076022,67078739,67085754,67100417,67109640,67113051,67129424,67131499,67143471,67162932, 67076018,67076067,67078942,67085949,67100573,67109780,67113208,67129537,67131684,67143646,67163158, ENSG00000152763 0,0,1,1,1,2,0,-1,-1,-1,-1, n/a
我需要计算外显子开始和结束的差值,例如:

hg18.ensGene.exonStarts    67051161,67060631,67065090,67066082,67071855,67072261,67073896,67075980,67078739,67085754,67100417,67109640,67113051,67129424,67131499,67143471,67162932,
hg18.ensGene.exonEnds    67052451,67060788,67065317,67066181,67071977,67072419,67074048,67076067,67078942,67085949,67100573,67109780,67113208,67129537,67131684,67143646,67163158,
区别:

1290,157,227,99,122,158,152,87,203,195,156,140,157,113,185,175,226
总和(hg18.ensGene.exonLenSum):

3842

我希望输出具有以下字段:

hg18.ensGene.name
hg18.ensGene.name2
hg18.ensGene.exonLenSum
例如:

ENST00000371026 ENSG00000152763 3842
我想为输入文件中的所有行使用一个awk脚本。我该怎么做?这对于计算外显子长度非常有用,例如RPKM(每千碱基外显子模型每百万映射读取的读取数)计算。

so ross$awk-f gene.awk gene.dat
so ross$ awk -f gene.awk gene.dat
ENST00000371026 ENSG00000152763 3842
ENST00000371023 ENSG00000152763 1645
ENST00000395250 ENSG00000152763 1622
so ross$ cat gene.awk
# gene.awk - print "name name2 exonLenSum" for every non-empty input line.
#
# Fields read from each whitespace-separated record:
#   $2  name   (first output column)
#   $7  comma-separated exon start coordinates
#   $8  comma-separated exon end coordinates
#   $9  name2  (second output column)
# The third output column is the sum of (end - start) over all exon pairs.
/./ {
  name = $2
  name2 = $9
  sc = split($7, sa, ",")
  ec = split($8, ea, ",")
  if (sc != ec) {
    # Diagnostics go to stderr so they never mix with the data on stdout.
    # The record is skipped: a sum over unpaired coordinates is meaningless.
    print "starts != ends ", name, name2, sc, ec > "/dev/stderr"
    next
  }
  diffsum = 0
  for (i = 1; i <= sc; ++i) {
    # A trailing comma makes split() return one final empty element;
    # it contributes 0 - 0 and therefore does not change the sum.
    diffsum += ea[i] - sa[i]
  }
  print name, name2, diffsum
}
ENST0000371026 ENG00000152763 3842 ENST0000371023 ENG00000152763 1645 ENST0000395250 ENSG0000152763 1622 所以ross$cat gene.awk /./ { 姓名=$2 名称2=9美元 s=7美元 e=8美元 sc=拆分(s,sa,“,”) ec=拆分(e,ea,“,”) 如果(sc!=ec){ 打印“开始!=结束”、名称、名称2、sc、ec } diffsum=0 对于(i=1;i
so ross$ awk -f gene.awk gene.dat
ENST00000371026 ENSG00000152763 3842
ENST00000371023 ENSG00000152763 1645
ENST00000395250 ENSG00000152763 1622
so ross$ cat gene.awk
/./ {
  name = $2
  name2 = $9
  s = $7
  e = $8
  sc = split(s, sa, ",")
  ec = split(e, ea, ",")
  if (sc != ec) {
    print "starts != ends ", name, name2, sc, ec
  }
  diffsum = 0
  for(i = 1; i <= sc; ++i) {
    diffsum += ea[i] - sa[i]
  }
  print name, name2, diffsum
}

使用UCSC mysql匿名服务器:

# Query name, name2, exonStarts and exonEnds for every ensGene row on the
# public UCSC MySQL server and print "name<TAB>name2<TAB>total exon length".
mysql -N -h genome-mysql.cse.ucsc.edu -A -u genome -D hg18 \
  -e 'select name,name2,exonStarts,exonEnds from ensGene' \
  | awk -F '\t' '{
      # mysql writes tab-separated columns when piped, so FS must be a tab.
      n = split($3, starts, ",")
      split($4, ends, ",")
      size = 0
      # A trailing comma yields one empty pair, which adds int(0) = 0.
      for (i = 1; i <= n; ++i) size += int(ends[i] - starts[i])
      printf("%s\t%s\t%d\n", $1, $2, size)
    }'

使用UCSC mysql匿名服务器:

# Query name, name2, exonStarts and exonEnds for every ensGene row on the
# public UCSC MySQL server and print "name<TAB>name2<TAB>total exon length".
mysql -N -h genome-mysql.cse.ucsc.edu -A -u genome -D hg18 \
  -e 'select name,name2,exonStarts,exonEnds from ensGene' \
  | awk -F '\t' '{
      # mysql writes tab-separated columns when piped, so FS must be a tab.
      n = split($3, starts, ",")
      split($4, ends, ",")
      size = 0
      # A trailing comma yields one empty pair, which adds int(0) = 0.
      for (i = 1; i <= n; ++i) size += int(ends[i] - starts[i])
      printf("%s\t%s\t%d\n", $1, $2, size)
    }'

从 grep 开始。例如:
grep "exonStarts\|exonEnds" filename.txt
FYI:是问此类问题的另一个好地方 :-)
@Dave Jarivs:谢谢你的评论,但我想是我提问的措辞让你误解了——这个文件的布局实际上并不适合用 grep 来解决这个问题。
从 grep 开始。例如:
grep "exonStarts\|exonEnds" filename.txt
FYI:是问这类问题的另一个好地方 :-)
@戴夫·贾里夫斯:谢谢你的评论,但我想是我提问的措辞让你误解了——这个文件的布局实际上并不适合用 grep 来解决这个问题。
# Query name, name2, exonStarts and exonEnds for every ensGene row on the
# public UCSC MySQL server and print "name<TAB>name2<TAB>total exon length".
mysql -N -h genome-mysql.cse.ucsc.edu -A -u genome -D hg18 \
  -e 'select name,name2,exonStarts,exonEnds from ensGene' \
  | awk -F '\t' '{
      # mysql writes tab-separated columns when piped, so FS must be a tab.
      n = split($3, starts, ",")
      split($4, ends, ",")
      size = 0
      # A trailing comma yields one empty pair, which adds int(0) = 0.
      for (i = 1; i <= n; ++i) size += int(ends[i] - starts[i])
      printf("%s\t%s\t%d\n", $1, $2, size)
    }'
ENST00000404059 ENSG00000219789 632
ENST00000326632 ENSG00000146556 1583
ENST00000408384 ENSG00000221311 138
ENST00000409575 ENSG00000222003 1187
ENST00000409981 ENSG00000222027 1187
ENST00000359752 ENSG00000197490 126
ENST00000379479 ENSG00000205292 873
ENST00000326183 ENSG00000177693 918
ENST00000407826 ENSG00000219467 2820
ENST00000405199 ENSG00000220902 1231
(...)