Performance 编写高效的Perl代码在大目录中爬行_Performance_Perl_Directory_Delete File_Large Data

Performance 编写高效的Perl代码在大目录中爬行

performance perl directory

Performance 编写高效的Perl代码在大目录中爬行,performance,perl,directory,delete-file,large-data,Performance,Perl,Directory,Delete File,Large Data,我正在编写一个Perl脚本，它在一个包含300000多个文件的目录中爬行，并删除给定年份中除第一个文件之外的所有文件。我的问题是，我的代码迫使Perl对包含300000个文件的目录进行大约300001次的扫描。到目前为止，它已经运行了四天，我希望你们有一些技巧，使这样的代码在将来更有效脚本： #!/usr/bin/perl use Date::Calc qw(Delta_Days Decode_Date_EU); # Note: must use default perl moudle on

我正在编写一个Perl脚本，它在一个包含300000多个文件的目录中爬行，并删除给定年份中除第一个文件之外的所有文件。我的问题是，我的代码迫使Perl对包含300000个文件的目录进行大约300001次的扫描。到目前为止，它已经运行了四天，我希望你们有一些技巧，使这样的代码在将来更有效

脚本：

#!/usr/bin/perl
use Date::Calc qw(Delta_Days Decode_Date_EU);
# Note: must use default perl moudle on Killdevil (module add perl)

@base = (1993, 1, 1);
$count = 0;

@files = <*>; # Creates array of all files in directory
foreach $file (@files) {
    # Splits indivdual filename into an array seperated by
    # comma (CIK, 10, K, Year, Month, Date) indexed by 0-5
    @filearray = split(/\-/, $file);

    $cik = $filearray[0];
    $cikyear = $filearray[3];

    # Defines a new array as all files in directory with the
    # same CIK and year as our file
    @cikfiles = grep { /^$cik-10-K-$cikyear/ } <*>;

    $sizecik = @cikfiles;
    $best = 0; # Index for file with earliest date
    $bestsize = 1000000000000000000000000000; # Initial value to beat

    # Only run through the proccess if there are
    # multiple files with same CIK same year.
    if ($sizecik != 1) {

        for($i = 1; $i < $sizecik + 1; $i = $i + 1) {
            # Read filename and creates an array deliminated by "-"
            @filearray1 = split(/-/, $cikfiles[$i-1]);

            $year = $filearray1[3];
            $month = $filearray1[4];

            # Deletes leading zero from months if there exists one
            $month =~ s/^0//;
            $day = $filearray1[5];
            $day =~ s/^0//; # Removes leading zero

            # Calculates number of days from base year
            $dd = Delta_Days($base[0], $base[1], $base[2], $year, $month, $day);

            if ($dd < $bestsize) {
                # If has lower number of days than current best, index
                # this file as the new leader
                $best = $i;

                # Reset the size to beat to the dd of this file
                $bestsize = $dd;
            }
        }

        for ($i = 1; $i < $sizecik + 1; $i = $i + 1) {
            # Runs through current array and deletes all
            # files that are not the best
            if($i != $best) {
                $rm = "rm " . $cikfiles[$i-1];
                system($rm);
                $count = $count + 1;
            }
        }
    }
}

# Displays total number of files removed
print "Number of files deleted: $count";

close(MYOUTFILE);

#/usr/bin/perl
使用日期：：计算qw（增量天解码日期）；
#注意：必须在Killdevil上使用默认的perl Moude（模块添加perl）
@基数=（1993,1,1）；
$count=0；
@文件=；#创建目录中所有文件的数组
foreach$file（@files）{
#将单个文件名拆分为以分隔的数组
#逗号（CIK，10，K，年，月，日期）索引为0-5
@filearray=split（/\-/，$file）；
$cik=$filearray[0]；
$cikyear=$filearray[3]；
#将新数组定义为具有
#与我们的文件相同的CIK和年份
@cikfiles=grep{/^$cik-10-K-$cikyear/}；
$sizecik=@cikfiles；
$best=0；#最早日期文件的索引
$bestsize=10000000000000000000；#要节拍的初始值
#仅当存在以下情况时才运行该过程：
#同一年具有相同CIK的多个文件。
如果（$sizecik！=1）{
对于（$i=1；$i<$sizecik+1；$i=$i+1）{
#读取文件名并创建以“-”清除的数组
@filearray1=拆分（/-/，$cikfiles[$i-1]）；
$year=$filearray1[3]；
$month=$filearray1[4]；
#如果存在月份，则从月份中删除前导零
$month=~s/^0/；
$day=$filearray1[5]；
$day=~s/^0/#删除前导零
#计算从基准年算起的天数
$dd=增量天数（$base[0]，$base[1]，$base[2]，$year，$month，$day）；
如果（$dd<$bestsize）{
#如果天数低于当前最佳天数，则索引
#此文件将作为新的领导
$best=$i；
#将要节拍的大小重置为此文件的dd
$bestsize=$dd；
}
}
对于（$i=1；$i<$sizecik+1；$i=$i+1）{
#运行当前数组并删除所有
#不是最好的文件
如果（$i！=$best）{
$rm=“rm”。$cikfiles[$i-1]；
系统（rm）；
$count=$count+1；
}
}
}
}
#显示删除的文件总数
打印“删除的文件数：$count”；
关闭（MYOUTFILE）；

如果不查看目录，会更有效吗

@cikfiles = grep { /^$cik-10-K-$cikyear/ } <*>;

@cikfiles=grep{/^$cik-10-K-$cikyear/}；

相反，我搜索了原始数组，然后删除了条目

@cikfiles = grep { /^$cik-10-K-$cikyear/ } <@files>;

@cikfiles=grep{/^$cik-10-K-$cikyear/}；

如何删除@files数组中删除的元素？

无需多次扫描目录。扫描目录一次，收集所需信息

如果日期的格式为

YYYYMMDD

，则可以使用简单的字符串比较来确定两个日期中哪一个较旧

my $opt_dry_run = 1;

my %files_by_cik_and_year;
while (<*>) {
   my ($cik, undef, undef, $year, $month, $day) = split(/-/, $_);
   push @{ $files_by_cik_and_year{$cik}{$year} },
      [ $_, sprintf("%04d%02d%02d", $year, $month, $day) ];
}

for my $cik (keys(%files_by_cik_and_year)) {
   for my $year (keys(%{ $files_by_cik_and_year{$cik} })) {
      my @files =
         map { $_->[0] }
            sort { $a->[1] cmp $b->[1] }
               @{ $files_by_cik_and_year{$cik}{$year} };

      shift(@files);

      for (@files) {
         print("Deleting $_\n");
         if (!$opt_dry_run) {
            unlink($_)
               or warn("Couldn't delete $_\n");
         }
      }
   }
}

my$opt_dry_run=1；
我的%U文件按cik和年列出；
而（）{
my（$cik，unde，unde，$year，$month，$day）=拆分（/-/，$）；
按"cik"和"year"{$cik}{$year}推送{$files"，
[$年，sprintf（“%04d%02d%02d”，$year，$month，$day）]；
}
对于我的$cik（密钥（%files\u by\u cik\u和\u year））{
对于我的$year（键（%{$files_by_cik_和_year{$cik}））{
我的@文件=
映射{$\->[0]}
排序{$a->[1]cmp$b->[1]}
@{$cik{$year}{$cik}{$year}}{$cik{$year}}}的{$files{U；
shift（@files）；
对于（@文件）{
打印（“删除$）；
如果（！$opt\u dry\u run）{
取消链接（美元）
或警告（“无法删除$\n”）；
}
}
}
}

简化：

my $opt_dry_run = 1;

my %files_by_cik_and_year;
while (<*>) {
   my ($cik, undef, undef, $year, $month, $day) = split(/-/, $_);
   push @{ $files_by_cik_and_year{"$cik-$year"} },
      [ $_, sprintf("%04d%02d%02d", $year, $month, $day) ];
}

for (values(%files_by_cik_and_year)) {
   my @files =
      map { $_->[0] }
         sort { $a->[1] cmp $b->[1] }
            @$_;

   shift(@files);

   for (@files) {
      print("Deleting $_\n");
      if (!$opt_dry_run) {
         unlink($_)
            or warn("Couldn't delete $_\n");
      }
   }
}

my$opt_dry_run=1；
我的%U文件按cik和年列出；
而（）{
my（$cik，unde，unde，$year，$month，$day）=拆分（/-/，$）；
按"cik"和"year"{“$cik-$year”}推送{$files{u}，
[$年，sprintf（“%04d%02d%02d”，$year，$month，$day）]；
}
用于（值（%files_by_cik_和_year））{
我的@文件=
映射{$\->[0]}
排序{$a->[1]cmp$b->[1]}
@$_;
shift（@files）；
对于（@文件）{
打印（“删除$）；
如果（！$opt\u dry\u run）{
取消链接（美元）
或警告（“无法删除$\n”）；
}
}
}

你能解释一下“my（$cik，unde，unde，$year，$month，$day）=split（/-/，$）；push{$files_by_cik_和_year{“$cik-$year”}，[$\uu，sprintf（%04d%02d，$year，$month，$day）]”的语法吗？第一句话很明显。RHS返回的第一个值被分配给LHS上的第一个变量，等等。

undef

是占位符。我需要帮助的最后两行更多。第二个附加到由

$files\u by\u cik\u和{“$cik-$year”}

引用的数组。如果需要，可以通过自动活化创建一个。用于对文件进行分组。它指定的是对两个元素数组（由

[…]

创建）的引用。第一个元素是文件名，第二个元素是从文件名中提取的日期。它将用于排序，因此使用了

sprintf

来确保存在必要的前导零。