perl根据索引操作文件_Perl - Fatal编程技术网

perl根据索引操作文件

perl

perl根据索引操作文件,perl,Perl,我正在研究一些基因组数据，我有2个文件-> 文件1 A1 110 A1 15 20 A2 2 11 A2 13 16 文件2 >A1 CTattatttatcgcacctacgtcaatactacggaacatactacta AAGTGTTAATATATGCTTGTAGCATAATATATATATGAAT >A2 GTCTGCACAGCGCTTTCCACACAGACATACAAATTCCACA AACCCCCCTCCCGCTTCTGGCCACACAGCATTAACACATCTGC caaacca

我正在研究一些基因组数据，我有2个文件->

文件1

A1 1 10 A1 15 20 A2 2 11 A2 13 16 文件2

>A1 CTattatttatcgcacctacgtcaatactacggaacatactacta AAGTGTTAATATATGCTTGTAGCATAATATATATATGAAT >A2 GTCTGCACAGCGCTTTCCACACAGACATACAAATTCCACA AACCCCCCTCCCGCTTCTGGCCACACAGCATTAACACATCTGC caaaccaaaaccataccaaccataccaaccataccataccataccataccaaccataccaattcaat 在文件1中，第2列和第3列表示文件2中的索引。所以我想知道，如果file1的column1中的字符与file2中后跟符号（>）的字符相匹配，那么从file2的下一行开始，根据file1的col2和col3中的索引返回子字符串。（对不起，我知道它很复杂）这是欲望输出->

输出

>A1#1:10 CTATTATTTA >A1#15:20 ACCTA >A2#2:11 TCTGCACAGC >A2#13:16 GCTT >A1#1:10 CTATTATTTA >A1#15:20 ACCTA >A2#2:11 TCTGCACAGC >A2#13:16 GCTT 我知道如果我只有一个字符串，我可以很容易地取出子字符串->

# Print one FASTA-style record per row of the index file named in $ARGV[0].
#
# Each row is expected to hold three whitespace-separated columns:
# a name, a 1-based start position and a 1-based inclusive stop position.
# The fragment is cut from the hard-coded $string below.
@ARGV or die "No input file specified";
open my $first, '<', $ARGV[0] or die "Unable to open input file: $!";
my $string = "GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCAT";
while (my $line = <$first>) {
    # split ' ' ignores leading whitespace and the trailing newline, so an
    # indented row still yields the name in column 0.
    my @cols = split ' ', $line;

    # Blank or incomplete rows carry no usable range; skip them instead of
    # printing an empty record.
    next if @cols < 3;

    my $co       = $cols[1] - 1;        # substr offsets are 0-based
    my $length   = $cols[2] - $co;      # inclusive stop => stop - start + 1
    my $fragment = substr $string, $co, $length;
    print ">", $cols[0], "#", $cols[1], ":", $cols[2], "\n", $fragment, "\n";
}
close $first;

@ARGV或die“未指定输入文件”；
首先打开$first，“以散列方式加载文件2，使用A1、A2。。。作为键，DNA序列作为值。通过这种方式，您可以轻松获得DNA序列。
以散列方式加载文件2，使用A1、A2。。。作为键，DNA序列作为值。这样你就可以很容易地得到DNA序列。我不确定它们是一条连续的线还是一条独立的线。
我现在把它设置为连续的
基本上，将第二个文件作为主文件读取。
然后，您可以根据需要处理任意多的索引文件
您可以使用数组散列来帮助索引。
push @{$index{$key}}, [$start,$stop];
use strict;
# Driver: load the master sequence file once, then for every index file given
# on the command line print a ">key#start:stop" header followed by the
# corresponding substring of that key's sequence.
my $master_file = "dna_master.txt";

# At least one index file is required. Test the argument count directly:
# $#ARGV is -1 (true) with no arguments but is also true for two or more,
# which would wrongly reject several index files.
unless (@ARGV) {
    print "Usage: $0 [filename(s)]\n";
    exit 1;
}

my %Data = read_master($master_file);

foreach my $index_file (@ARGV) {
    my %Index = read_index($index_file);
    foreach my $key (sort keys %Index) {
        my $sequence = $Data{$key};

        # An index key with no matching ">key" record has nothing to cut
        # from; report it instead of printing an empty fragment.
        unless (defined $sequence) {
            warn "No sequence named '$key' in $master_file; skipping\n";
            next;
        }

        foreach my $range (@{ $Index{$key} }) {
            my ($start, $stop) = @$range;
            print ">$key#$start:$stop\n";
            my $pos   = $start - 1;            # 1-based start => 0-based offset
            my $count = $stop - $start + 1;    # stop is inclusive
            print substr($sequence, $pos, $count) . "\n";
        }
    }
}

# Read a text file and return its non-blank lines, trimmed.
#
# Parameters:
#   $file - path of the file to read.
# Returns:
#   A list of the file's lines in order, each with its newline and any
#   leading/trailing whitespace removed; lines that are empty after
#   trimming are omitted.
# Dies:
#   With "Error: cannot open <file>" plus the OS error when the file
#   cannot be opened.
sub read_file {
    my $file = shift;
    my @lines;

    # Three-argument open with a lexical handle: the name is used verbatim
    # (no mode characters or surrounding whitespace are interpreted) and no
    # global FILE handle is shared between calls.
    open(my $fh, '<', $file) or die "Error: cannot open $file\n$!";
    while (my $line = <$fh>) {
        chomp $line;                  # remove newline
        $line =~ s/^\s+|\s+$//g;      # strip lead/trail whitespace
        next if $line eq '';          # skip blanks
        push @lines, $line;
    }
    close $fh;
    return @lines;
}

# Build the range table from an index file.
#
# Parameters:
#   $file - path of the index file; it is read through read_file().
# Returns:
#   A hash (as a flat list) mapping the first whitespace-separated field of
#   each line to an array reference of [second field, third field] pairs,
#   in the order the lines were read.
sub read_index {
    my ($file) = @_;
    my %index;
    for my $row (read_file($file)) {
        my ($key, $start, $stop) = split /\s+/, $row;
        push @{ $index{$key} }, [ $start, $stop ];
    }
    return %index;
}

# Load the master sequence file into a hash of concatenated sequences.
#
# Parameters:
#   $file - path of the master file; it is read through read_file().
# Returns:
#   A hash (as a flat list) mapping each header name (the word characters
#   following a leading ">") to the concatenation of all non-header lines
#   that follow it, up to the next header.
sub read_master {
    my $file = shift;
    my %master;
    my $key;
    foreach my $line (read_file($file)) {
        if ($line =~ m{^>(\w+)}) {
            $key = $1;
        }
        elsif (defined $key) {
            $master{$key} .= $line;
        }
        else {
            # No header has been seen yet, so there is no name to file
            # this line under; skip it rather than use an undefined key.
            warn "Skipping sequence data before the first '>' header in $file\n";
        }
        # NOTE(review): a line starting with ">" whose next character is not
        # a word character does not match the header pattern and is treated
        # as sequence text for the current key.
    }
    return %master;
}

使用严格；
我的$master\u file=“dna\u master.txt”；
如果（$#ARGV）{
打印“用法：$0[文件名]\n”；
出口1；
}
我的%Data=读取主文件（$master\u文件）；
foreach我的$index_文件（@ARGV）{
我的%Index=读取索引（$Index\u文件）；
foreach my$键（排序键%索引）{
foreach my$i（@{$Index{$key}}）{
我的（$start，$stop）=@$i；
打印“>$key#$start:$stop\n”；
my$pos=$start-1；
我的$count=$stop-$start+1；
打印substr（$Data{$key}，$pos，$count）。“\n”；
}
}
}
子读取文件{
我的$file=shift；
我的@行；
打开（文件，$FILE）或死亡“错误：无法打开$FILE\n$！”；
while（）{
chomp；#删除换行符
s/（^\s+|\s+$）//g；#去除前导/尾迹空白
下一个if/^$/；#跳过空格
按@行，$\；
}
关闭文件；
返回@行；
}
子读取索引{
我的$file=shift；
my@lines=读取文件（$file）；
我的%索引；
foreach（@行）{
我的（$key，$start，$stop）=拆分/\s+/；
推送{$index{$key}，[$start，$stop]；
}
返回%索引；
}
副读主机{
我的$file=shift；
我的主人；
我的$key；
my@lines=读取文件（$file）；
foreach（@行）{
如果（m{^>（\w+）}{$key=$1}
else{$master{$key}.=$\u0}
}
返回%master；
}
我不确定它们是一条连续线还是一条单独的线。
我现在把它设置为连续的
基本上，将第二个文件作为主文件读取。
然后，您可以根据需要处理任意多的索引文件
您可以使用数组散列来帮助索引。
push @{$index{$key}}, [$start,$stop];
use strict;
# Driver: load the master sequence file once, then for every index file given
# on the command line print a ">key#start:stop" header followed by the
# corresponding substring of that key's sequence.
my $master_file = "dna_master.txt";

# At least one index file is required. Test the argument count directly:
# $#ARGV is -1 (true) with no arguments but is also true for two or more,
# which would wrongly reject several index files.
unless (@ARGV) {
    print "Usage: $0 [filename(s)]\n";
    exit 1;
}

my %Data = read_master($master_file);

foreach my $index_file (@ARGV) {
    my %Index = read_index($index_file);
    foreach my $key (sort keys %Index) {
        my $sequence = $Data{$key};

        # An index key with no matching ">key" record has nothing to cut
        # from; report it instead of printing an empty fragment.
        unless (defined $sequence) {
            warn "No sequence named '$key' in $master_file; skipping\n";
            next;
        }

        foreach my $range (@{ $Index{$key} }) {
            my ($start, $stop) = @$range;
            print ">$key#$start:$stop\n";
            my $pos   = $start - 1;            # 1-based start => 0-based offset
            my $count = $stop - $start + 1;    # stop is inclusive
            print substr($sequence, $pos, $count) . "\n";
        }
    }
}

# Read a text file and return its non-blank lines, trimmed.
#
# Parameters:
#   $file - path of the file to read.
# Returns:
#   A list of the file's lines in order, each with its newline and any
#   leading/trailing whitespace removed; lines that are empty after
#   trimming are omitted.
# Dies:
#   With "Error: cannot open <file>" plus the OS error when the file
#   cannot be opened.
sub read_file {
    my $file = shift;
    my @lines;

    # Three-argument open with a lexical handle: the name is used verbatim
    # (no mode characters or surrounding whitespace are interpreted) and no
    # global FILE handle is shared between calls.
    open(my $fh, '<', $file) or die "Error: cannot open $file\n$!";
    while (my $line = <$fh>) {
        chomp $line;                  # remove newline
        $line =~ s/^\s+|\s+$//g;      # strip lead/trail whitespace
        next if $line eq '';          # skip blanks
        push @lines, $line;
    }
    close $fh;
    return @lines;
}

# Build the range table from an index file.
#
# Parameters:
#   $file - path of the index file; it is read through read_file().
# Returns:
#   A hash (as a flat list) mapping the first whitespace-separated field of
#   each line to an array reference of [second field, third field] pairs,
#   in the order the lines were read.
sub read_index {
    my ($file) = @_;
    my %index;
    for my $row (read_file($file)) {
        my ($key, $start, $stop) = split /\s+/, $row;
        push @{ $index{$key} }, [ $start, $stop ];
    }
    return %index;
}

# Load the master sequence file into a hash of concatenated sequences.
#
# Parameters:
#   $file - path of the master file; it is read through read_file().
# Returns:
#   A hash (as a flat list) mapping each header name (the word characters
#   following a leading ">") to the concatenation of all non-header lines
#   that follow it, up to the next header.
sub read_master {
    my $file = shift;
    my %master;
    my $key;
    foreach my $line (read_file($file)) {
        if ($line =~ m{^>(\w+)}) {
            $key = $1;
        }
        elsif (defined $key) {
            $master{$key} .= $line;
        }
        else {
            # No header has been seen yet, so there is no name to file
            # this line under; skip it rather than use an undefined key.
            warn "Skipping sequence data before the first '>' header in $file\n";
        }
        # NOTE(review): a line starting with ">" whose next character is not
        # a word character does not match the header pattern and is treated
        # as sequence text for the current key.
    }
    return %master;
}

使用严格；
我的$master\u file=“dna\u master.txt”；
如果（$#ARGV）{
打印“用法：$0[文件名]\n”；
出口1；
}
我的%Data=读取主文件（$master\u文件）；
foreach我的$index_文件（@ARGV）{
我的%Index=读取索引（$Index\u文件）；
foreach my$键（排序键%索引）{
foreach my$i（@{$Index{$key}}）{
我的（$start，$stop）=@$i；
打印“>$key#$start:$stop\n”；
my$pos=$start-1；
我的$count=$stop-$start+1；
打印substr（$Data{$key}，$pos，$count）。“\n”；
}
}
}
子读取文件{
我的$file=shift；
我的@行；
打开（文件，$FILE）或死亡“错误：无法打开$FILE\n$！”；
while（）{
chomp；#删除换行符
s/（^\s+|\s+$）//g；#去除前导/尾迹空白
下一个if/^$/；#跳过空格
按@行，$\；
}
关闭文件；
返回@行；
}
子读取索引{
我的$file=shift；
my@lines=读取文件（$file）；
我的%索引；
foreach（@行）{
我的（$key，$start，$stop）=拆分/\s+/；
推送{$index{$key}，[$start，$stop]；
}
返回%索引；
}
副读主机{
我的$file=shift；
我的主人；
我的$key；
my@lines=读取文件（$file）；
foreach（@行）{
如果（m{^>（\w+）}{$key=$1}
else{$master{$key}.=$\u0}
}
返回%master；
}
第二次更新也将主文件转换为数组散列
这将第二个文件中的每一行视为单独的序列
use strict;
# Driver: load the master sequence file once, then for every index file given
# on the command line print a ">key#start:stop" header followed by the
# requested substring of each sequence line stored for that key.
my $master_file = "dna_master.txt";

# At least one index file is required. Test the argument count directly:
# $#ARGV is -1 (true) with no arguments but is also true for two or more,
# which would wrongly reject several index files.
unless (@ARGV) {
    print "Usage: $0 [filename(s)]\n";
    exit 1;
}

my %Data = read_master($master_file);

foreach my $index_file (@ARGV) {
    my %Index = read_index($index_file);
    foreach my $key (sort keys %Index) {
        my $sequences = $Data{$key};

        # An index key with no matching ">key" record has nothing to cut
        # from; report it instead of printing a header with no fragments.
        unless (defined $sequences) {
            warn "No sequence named '$key' in $master_file; skipping\n";
            next;
        }

        foreach my $range (@{ $Index{$key} }) {
            my ($start, $stop) = @$range;
            print ">$key#$start:$stop\n";
            my $pos   = $start - 1;            # 1-based start => 0-based offset
            my $count = $stop - $start + 1;    # stop is inclusive
            foreach my $seq (@$sequences) {
                print substr($seq, $pos, $count) . "\n";
            }
        }
    }
}

# Read a text file and return its non-blank lines, trimmed.
#
# Parameters:
#   $file - path of the file to read.
# Returns:
#   A list of the file's lines in order, each with its newline and any
#   leading/trailing whitespace removed; lines that are empty after
#   trimming are omitted.
# Dies:
#   With "Error: cannot open <file>" plus the OS error when the file
#   cannot be opened.
sub read_file {
    my $file = shift;
    my @lines;

    # Three-argument open with a lexical handle: the name is used verbatim
    # (no mode characters or surrounding whitespace are interpreted) and no
    # global FILE handle is shared between calls.
    open(my $fh, '<', $file) or die "Error: cannot open $file\n$!";
    while (my $line = <$fh>) {
        chomp $line;                  # remove newline
        $line =~ s/^\s+|\s+$//g;      # strip lead/trail whitespace
        next if $line eq '';          # skip blanks
        push @lines, $line;
    }
    close $fh;
    return @lines;
}

# Build the range table from an index file.
#
# Parameters:
#   $file - path of the index file; it is read through read_file().
# Returns:
#   A hash (as a flat list) mapping the first whitespace-separated field of
#   each line to an array reference of [second field, third field] pairs,
#   in the order the lines were read.
sub read_index {
    my ($file) = @_;
    my %index;
    for my $row (read_file($file)) {
        my ($key, $start, $stop) = split /\s+/, $row;
        push @{ $index{$key} }, [ $start, $stop ];
    }
    return %index;
}

# Load the master sequence file into a hash of per-line sequence lists.
#
# Parameters:
#   $file - path of the master file; it is read through read_file().
# Returns:
#   A hash (as a flat list) mapping each header name (the word characters
#   following a leading ">") to an array reference holding every non-header
#   line that follows it, one element per line, up to the next header.
sub read_master {
    my $file = shift;
    my %master;
    my $key;
    foreach my $line (read_file($file)) {
        if ($line =~ m{^>(\w+)}) {
            $key = $1;
        }
        elsif (defined $key) {
            push @{ $master{$key} }, $line;
        }
        else {
            # No header has been seen yet, so there is no name to file
            # this line under; skip it rather than use an undefined key.
            warn "Skipping sequence data before the first '>' header in $file\n";
        }
        # NOTE(review): a line starting with ">" whose next character is not
        # a word character does not match the header pattern and is stored
        # as a sequence line for the current key.
    }
    return %master;
}

第二次更新也将主文件转换为数组散列
这将第二个文件中的每一行视为单独的序列
use strict;
# Driver: load the master sequence file once, then for every index file given
# on the command line print a ">key#start:stop" header followed by the
# requested substring of each sequence line stored for that key.
my $master_file = "dna_master.txt";

# At least one index file is required. Test the argument count directly:
# $#ARGV is -1 (true) with no arguments but is also true for two or more,
# which would wrongly reject several index files.
unless (@ARGV) {
    print "Usage: $0 [filename(s)]\n";
    exit 1;
}

my %Data = read_master($master_file);

foreach my $index_file (@ARGV) {
    my %Index = read_index($index_file);
    foreach my $key (sort keys %Index) {
        my $sequences = $Data{$key};

        # An index key with no matching ">key" record has nothing to cut
        # from; report it instead of printing a header with no fragments.
        unless (defined $sequences) {
            warn "No sequence named '$key' in $master_file; skipping\n";
            next;
        }

        foreach my $range (@{ $Index{$key} }) {
            my ($start, $stop) = @$range;
            print ">$key#$start:$stop\n";
            my $pos   = $start - 1;            # 1-based start => 0-based offset
            my $count = $stop - $start + 1;    # stop is inclusive
            foreach my $seq (@$sequences) {
                print substr($seq, $pos, $count) . "\n";
            }
        }
    }
}

# Read a text file and return its non-blank lines, trimmed.
#
# Parameters:
#   $file - path of the file to read.
# Returns:
#   A list of the file's lines in order, each with its newline and any
#   leading/trailing whitespace removed; lines that are empty after
#   trimming are omitted.
# Dies:
#   With "Error: cannot open <file>" plus the OS error when the file
#   cannot be opened.
sub read_file {
    my $file = shift;
    my @lines;

    # Three-argument open with a lexical handle: the name is used verbatim
    # (no mode characters or surrounding whitespace are interpreted) and no
    # global FILE handle is shared between calls.
    open(my $fh, '<', $file) or die "Error: cannot open $file\n$!";
    while (my $line = <$fh>) {
        chomp $line;                  # remove newline
        $line =~ s/^\s+|\s+$//g;      # strip lead/trail whitespace
        next if $line eq '';          # skip blanks
        push @lines, $line;
    }
    close $fh;
    return @lines;
}

# Build the range table from an index file.
#
# Parameters:
#   $file - path of the index file; it is read through read_file().
# Returns:
#   A hash (as a flat list) mapping the first whitespace-separated field of
#   each line to an array reference of [second field, third field] pairs,
#   in the order the lines were read.
sub read_index {
    my ($file) = @_;
    my %index;
    for my $row (read_file($file)) {
        my ($key, $start, $stop) = split /\s+/, $row;
        push @{ $index{$key} }, [ $start, $stop ];
    }
    return %index;
}

# Load the master sequence file into a hash of per-line sequence lists.
#
# Parameters:
#   $file - path of the master file; it is read through read_file().
# Returns:
#   A hash (as a flat list) mapping each header name (the word characters
#   following a leading ">") to an array reference holding every non-header
#   line that follows it, one element per line, up to the next header.
sub read_master {
    my $file = shift;
    my %master;
    my $key;
    foreach my $line (read_file($file)) {
        if ($line =~ m{^>(\w+)}) {
            $key = $1;
        }
        elsif (defined $key) {
            push @{ $master{$key} }, $line;
        }
        else {
            # No header has been seen yet, so there is no name to file
            # this line under; skip it rather than use an undefined key.
            warn "Skipping sequence data before the first '>' header in $file\n";
        }
        # NOTE(review): a line starting with ">" whose next character is not
        # a word character does not match the header pattern and is stored
        # as a sequence line for the current key.
    }
    return %master;
}

谢谢你的回复。但在文件2中，我不知道有多少A1，A2…An（以及它们出现在哪一行），我只知道它们后面跟着符号“>”。我应该如何将所有A1等作为键，将所有DNA序列作为值？是的，这就是我的意思。你不需要知道有多少，只需要知道如何识别它们。如果奇数行是键、偶数行是序列，那就很容易了。此外，您还需要从键中删除前导“>”，这可以通过正则表达式 /^>(\w+)/
并从此正则表达式中提取 $1
轻松完成。感谢您的回复。但在文件2中，我不知道有多少A1，A2…An（以及它们出现在哪一行），我只知道它们后面跟着符号“>”。我应该如何将所有A1等作为键，将所有DNA序列作为值？是的，这就是我的意思。你不需要知道有多少，只需要知道如何识别它们。如果奇数行是键、偶数行是序列，那就很容易了。此外，您还需要从键中删除前导“>”，这可以通过正则表达式 /^>(\w+)/
并从此正则表达式中提取 $1 轻松完成。非常感谢您的帮助
>A1#1:10
CTATTATTTA
AAGTGTGTTA
>A1#15:20
ACCTAC
ATTAAT
>A2#2:11
TCTGCACAGC
ACCCCCCCCT
AAACCCCAAA
>A2#13:16
GCTT
CCCC
ACAA