Javascript 通过CSV文件解析以转换为JSON格式文件
我从excel电子表格中提取了以下CSV文件。为了提供一些可能有帮助的背景信息,本文讨论了AGI编号(将其视为蛋白质标识符)、这些蛋白质标识符的未修改肽序列,然后是对未修改序列进行修改的修改肽序列、这些修改的索引/索引,然后是重复肽的组合光谱计数。文本文件名为MASP.GlycoModReader.txt,信息格式如下:Javascript 通过CSV文件解析以转换为JSON格式文件,javascript,python,scripting,Javascript,Python,Scripting,我从excel电子表格中提取了以下CSV文件。为了提供一些可能有帮助的背景信息,本文讨论了AGI编号(将其视为蛋白质标识符)、这些蛋白质标识符的未修改肽序列,然后是对未修改序列进行修改的修改肽序列、这些修改的索引/索引,然后是重复肽的组合光谱计数。文本文件名为MASP.GlycoModReader.txt,信息格式如下: AGI,UnMd Peptide (M) = x,Mod Peptide (oM) = Ox,Index/Indeces of Modification,counts,Comb
AGI,UnMd Peptide (M) = x,Mod Peptide (oM) = Ox,Index/Indeces of Modification,counts,Combined
Spectral count for repeated Peptides
AT1G56070.1,NMSVIAHVDHGKSTLTDSLVAAAGIIAQEVAGDVR,NoMSVIAHVDHGKSTLTDSLVAAAGIIAQEVAGDVR,2,17
AT1G56070.1,LYMEARPMEEGLAEAIDDGR,LYoMEARPoMEEGLAEAIDDGR,"3, 9",1
AT1G56070.1,EAMTPLSEFEDKL,EAoMTPLSEFEDKL,3,7
AT1G56070.1,LYMEARPMEEGLAEAIDDGR,LYoMEARPoMEEGLAEAIDDGR,"3, 9",2
AT1G56070.1,EGPLAEENMR,EGPLAEENoMR,9,2
AT1G56070.1,DLQDDFMGGAEIIK,DLQDDFoMGGAEIIK,7,1
AT1G56070.1,{"peptides": [{"sequence": "NMSVIAHVDHGKSTLTDSLVAAAGIIAQEVAGDVR", "mod_sequence":
"NoMSVIAHVDHGKSTLTDSLVAAAGIIAQEVAGDVR" , "mod_indeces": 2, "spectral_count": 17}, {"sequence":
"LYMEARPMEEGLAEAIDDGR" , "mod_sequence": "LYoMEARPoMEEGLAEAIDDGR", "mod_indeces": [3, 9],
"spectral_count": 3}, {"sequence": "EAMTPLSEFEDKL" , "mod_sequence": "EAoMTPLSEFEDKL",
"mod_indeces": [3,9], "spectral_count": 7}, {"sequence": "EGPLAEENMR", "mod_sequence":
"EGPLAEENoMR", "mod_indeces": 9, "spectral_count": 2}, {"sequence": "DLQDDFMGGAEIIK",
"mod_sequence": "DLQDDFoMGGAEIIK", "mod_indeces": [7], "spectral_count": 1}]}
提取上述内容后需要生成的输出文件格式如下:
AGI,UnMd Peptide (M) = x,Mod Peptide (oM) = Ox,Index/Indeces of Modification,counts,Combined
Spectral count for repeated Peptides
AT1G56070.1,NMSVIAHVDHGKSTLTDSLVAAAGIIAQEVAGDVR,NoMSVIAHVDHGKSTLTDSLVAAAGIIAQEVAGDVR,2,17
AT1G56070.1,LYMEARPMEEGLAEAIDDGR,LYoMEARPoMEEGLAEAIDDGR,"3, 9",1
AT1G56070.1,EAMTPLSEFEDKL,EAoMTPLSEFEDKL,3,7
AT1G56070.1,LYMEARPMEEGLAEAIDDGR,LYoMEARPoMEEGLAEAIDDGR,"3, 9",2
AT1G56070.1,EGPLAEENMR,EGPLAEENoMR,9,2
AT1G56070.1,DLQDDFMGGAEIIK,DLQDDFoMGGAEIIK,7,1
AT1G56070.1,{"peptides": [{"sequence": "NMSVIAHVDHGKSTLTDSLVAAAGIIAQEVAGDVR", "mod_sequence":
"NoMSVIAHVDHGKSTLTDSLVAAAGIIAQEVAGDVR" , "mod_indeces": 2, "spectral_count": 17}, {"sequence":
"LYMEARPMEEGLAEAIDDGR" , "mod_sequence": "LYoMEARPoMEEGLAEAIDDGR", "mod_indeces": [3, 9],
"spectral_count": 3}, {"sequence": "EAMTPLSEFEDKL" , "mod_sequence": "EAoMTPLSEFEDKL",
"mod_indeces": [3,9], "spectral_count": 7}, {"sequence": "EGPLAEENMR", "mod_sequence":
"EGPLAEENoMR", "mod_indeces": 9, "spectral_count": 2}, {"sequence": "DLQDDFMGGAEIIK",
"mod_sequence": "DLQDDFoMGGAEIIK", "mod_indeces": [7], "spectral_count": 1}]}
我在下面提供了我的解决方案:如果有人用另一种语言有更好的解决方案,或者可以分析我的解决方案,让我知道是否有更有效的方法来实现这一点,那么请在下面进行评论。多谢各位
#!/usr/bin/env node
var fs = require('fs');
var csv = require('csv');
var data ="proteins.csv";
/* Uses csv nodejs module to parse the proteins.csv file.
* Parses the csv file row by row and updates the peptide_arr.
* For new entries creates a peptide object, for similar entries it updates the
* counts in the peptide object with the same AGI#.
* Uses a peptide object to store protein ID AGI#, and the associated data.
* Writes all formatted peptide objects to a txt file - output.txt.
*/
// Tracks current row
var x = 0;
// An array of peptide objects stores the information from the csv file
var peptide_arr = [];
// csv module reads row by row from data
csv()
.from(data)
.to('debug.csv')
.transform(function(row, index) {
// For the first entry push a new peptide object with the AGI# (row[0])
if(x == 0) {
// cur is the current peptide read into row by csv module
Peptide cur = new Peptide( row[0] );
// Add the assoicated data from row (1-5) to cur
cur.data.peptides.push({
"sequence" : row[1];
"mod_sequence" : row[2];
if(row[5]){
"mod_indeces" : "[" + row[3] + ", " + row[4] + "]";
"spectral_count" : row[5];
} else {
"mod_indeces" : row[3];
"spectral_count" : row[4];
}
});
// Add the current peptide to the array
peptide_arr.push(cur);
}
// Move to the next row
x++;
});
// Loop through peptide_arr and append output with each peptide's AGI# and its data
String output = "";
for(var peptide in peptide_arr)
{
output = output + peptide.toString()
}
// Write the output to output.txt
fs.writeFile("output.txt", output);
/* Peptide Object :
* - id:AGI#
* - data: JSON Array associated
*/
function Peptide(id) // this is the actual function that does the ID retrieving and data
// storage
{
this.id = id;
this.data = {
peptides: []
};
}
/* Peptide methods :
* - toJson : Returns the properly formatted string
*/
Peptide.prototype = {
toString: function(){
return this.id + "," + JSON.stringify(this.data, null, " ") + "/n"
}
};
编辑说明:似乎当我运行我发布的这个解决方案时,我得到了一个内存泄漏错误;它无限运行,但不产生任何实质性的、可读的输出。如果有人愿意帮助评估为什么会发生这种情况,那就太好了。您的版本有效吗?看起来你只创建过一个肽对象。另外,“if(row[5])”语句在做什么?在您的示例数据中,始终有5个元素。而且,mod_indes总是应该是一个列表,对吗?因为在您的示例中,输出文件mod_indeces不是第一行中的列表。总之,以下是我在python中得出的结论:
import csv
import json
data = {}
with open('proteins.csv','rb') as f:
reader = csv.reader(f)
for row in reader:
name = row[0]
sequence = row[1]
mod_sequence = row[2]
mod_indeces = map(int,row[3].split(', '))
spectral_count = int(row[4])
peptide = {'sequence':sequence,'mod_sequence':mod_sequence,
'mod_indeces':mod_indeces,'spectral_count':spectral_count}
if name in data:
data[name]['peptides'].append(peptide)
else:
data[name] = {'peptides':[peptide]}
f.close()
f = open('output.txt','wb')
for protein in data:
f.write(protein)
f.write(',')
f.write(json.dumps(data[protein]))
f.write('\n')
f.close()
如果您在windows上,并且希望以纯文本形式查看文件,则可能需要将“\n”替换为“\r\n”或os.linesep
如果要跳过某些行(如果有标题或其他内容),可以执行以下操作:
import csv
import json
data = {}
rows_to_skip = 1
rows_read = 0
with open('proteins.csv','rb') as f:
reader = csv.reader(f)
for row in reader:
if rows_read >= rows_to_skip:
name = row[0]
sequence = row[1]
mod_sequence = row[2]
mod_indeces = map(int,row[3].split(', '))
spectral_count = int(row[4])
peptide = {'sequence':sequence,'mod_sequence':mod_sequence,
'mod_indeces':mod_indeces,'spectral_count':spectral_count}
if name in data:
data[name]['peptides'].append(peptide)
else:
data[name] = {'peptides':[peptide]}
rows_read += 1
f.close()
f = open('output.txt','wb')
for protein in data:
f.write(protein)
f.write(',')
f.write(json.dumps(data[protein]))
f.write('\n')
f.close()
如果您希望字典的键按特定顺序排列,可以使用orderedDict而不是默认dict。只需将肽行替换为以下内容:
peptide = OrderedDict([('sequence',sequence),
('mod_sequence',mod_sequence),
('mod_indeces',mod_indeces),
('spectral_count',spectral_count)])
现在秩序得以维持。也就是说,sequence
后面跟着mod\u sequence
后面跟着mod\u indeces
后面跟着spectral\u count
。要更改顺序,只需更改OrderedDict中元素的顺序
请注意,为了能够使用OrderedICT,您还必须从集合导入OrderedICT添加
。这可能应该在代码审阅中,而不是在代码审阅中。谢谢Matthew!我以Python格式保存了您的脚本,并从Mac OS X上的终端运行了它。我收到了以下错误,这可能是我在运行它时遇到的,但我将以任何方式发布它:回溯(最近一次调用):File“/Users/zsyed/PythonPeptideJSON.py”,第8行,对于读卡器中的行:_csv.Error:在未加引号的字段中看到新行字符-是否需要以通用新行模式打开文件?感谢您对我的程序的反馈。我会考虑你所说的话,并试着对Range进行检查,我没有那个问题。人们说,以“rU”模式打开文件似乎可以解决这个问题,所以不妨试一试。太棒了,这很有效,但我不幸地遇到了另一个错误:\。很抱歉打扰您: