Python 使用脚本对数字进行分组的一种方法_Python_Algorithm_Shell_Awk

Python 使用脚本对数字进行分组的一种方法

python algorithm shell awk

Python 使用脚本对数字进行分组的一种方法,python,algorithm,shell,awk,Python,Algorithm,Shell,Awk,我有一个大的（800K-唯一和排序）数字列表。比如说 1002230091 => 1002230091 <- not a complete set of digits ... 1112223000 -- 1112223001 | 1112223002 | ... | => 111223 1112223009 | ...

我有一个大的（800K-唯一和排序）数字列表。比如说

    1002230091         => 1002230091 <- not a complete set of digits
    ...
    1112223000   --
    1112223001     |
    1112223002     |  
    ...            |   => 111223
    1112223009     |
    ...            |
    1112223999     |
    ...            |
    1112223999   --
    ...

我尝试使用 Tree::Trie（用于更快的查找）和普通哈希（用于遍历键）编写了一个脚本

我组合的逻辑没有到达根前缀，它只执行一轮分组：

1000  --
1001    |
1002    | => 100
...     |
1009  --
1010      => 1010

此外，遍历这一数量的数据非常缓慢

我确信有**更好的选择**，从处理数据的速度和满足这一需求两方面考虑

非常感谢您在满足这一需求方面提供的建议/帮助。不过，我最熟悉Shell或Perl脚本，可以使用任何类型的脚本解决方案

这是我总结的逻辑，它进行一轮分组，但不执行第二轮分组

#!/usr/bin/perl -w

use Tree::Trie;
use strict;
use Getopt::Long;
use Pod::Usage;

## Working set: number (or reduced prefix) => processing marker.
## Rows loaded from the input file are stored with marker 1; keys that have
## been examined or produced by a reduction are stored with marker 2.
my %w_mk;
my $csv = "./test.csv";     # default input file, overridable with -s
my $debug = 1;              # verbosity level; larger values print more tracing
## Direct method call instead of the indirect-object form `new Tree::Trie`,
## which the parser can misread and which newer feature bundles disable.
my $trie = Tree::Trie->new;
my $help = 0;
my $man  = 0;
my $cycle = 1;              # accepted on the command line; not read in the visible script
my $max_key_length = 1;
my $min_key_length = 1;

GetOptions("debug=i"             => \$debug,
           "source_file|s=s"     => \$csv,
           "cycle|c=i"           => \$cycle,
           "help|?"              => \$help,
           "man!"                => \$man
           ) or pod2usage("Try '$0 --help' for more information." );

## Pod::Usage documents the section selector as `-sections` (plural); it is
## the option consulted when -verbose is 99.
pod2usage(-verbose => 99, -sections => "NAME") if $help;
pod2usage(-verbose => 2) if $man;

## clean_ds($key, @keys)
##   Collapses a complete group into its prefix. When exactly ten child keys
##   are supplied, each child is removed from the trie and from %w_mk, and
##   $key is recorded in %w_mk with marker 2. Any other count is a no-op.
sub clean_ds
{
  my ($key, @keys) = @_;

  ## Anything other than a full set of ten children is left untouched.
  return unless scalar(@keys) == 10;

  for my $child (@keys) {
    $trie->remove($child);
  }

  my $verbose = ($debug > 1);
  print "\t\tRoot key $key found!!\n" if $verbose;

  ## The prefix takes the place of its ten children in the working set.
  $w_mk{$key} = 2;
  delete @w_mk{@keys};

  print "\t\tRemoved keys: [@keys]\n\n" if $verbose;
}

## is_complete_key($key)
##   Asks the trie for the entries that extend $key and marks $key as
##   processed (marker 2) in %w_mk. When exactly ten entries come back the
##   group is treated as complete: the entries are prefixed with $key, handed
##   to clean_ds(), and (1, @full_keys) is returned. Otherwise the sub returns
##   (0, @raw_lookup_result).
sub is_complete_key
{
  my ($key) = @_;

  my @key_list = $trie->lookup($key, length($key) + 1);
  my $key_list_len = @key_list;

  ## Remember that this key has been looked at, whatever the outcome.
  $w_mk{$key} = 2;

  print "\t\tSearch for key: '$key'\n\t\tNo. of items found: $key_list_len\n\t\titems : [@key_list]\n" if ($debug >= 3);

  if ($key_list_len != 10) {
    print "\t\tRoot key $key not adding!!\n\n" if ($debug > 1);
    return (0, @key_list);
  }

  ## Per the original author's note, a lookup with a length argument yields
  ## only the suffix part (e.g. 0, 1, 2, ... for prefix '100'), so the full
  ## keys are rebuilt by prepending the prefix.
  my @full_keys = map { $key . $_ } @key_list;

  clean_ds($key, @full_keys);

  return (1, @full_keys);
}

## Load the input file: every non-empty line becomes a key in both the trie
## and the working hash (marker 1), and the longest key length is tracked
## for the reduction loop further down.
open(my $handle, '<', $csv) or die "Could not open file '$csv' $!";

while (my $row = <$handle>) {
  ## Strip the line ending, including the "\r" left behind by chomp on
  ## files with CRLF line endings.
  $row =~ s/\r?\n\z//;

  ## A blank line would otherwise be stored (and later printed) as an
  ## empty key.
  next if $row eq '';

  my $k_len = length($row);
  $max_key_length = $k_len if ($k_len > $max_key_length);

  $trie->add($row);
  $w_mk{$row} = 1;

  print "data: '$row'\n" if ($debug >= 4);
}

close($handle);

## group_keys($key, $iteration)
##   Walks from $key towards shorter prefixes, dropping one trailing
##   character per step, for as long as the current marker is at least
##   $iteration and more than one character remains. The first prefix whose
##   marker is below 2 is handed to is_complete_key() and the walk stops
##   there, whatever that call reports.
sub group_keys
{
  my ($key, $iteration) = @_;

  ## Marker stored for a key, or the supplied fallback when the key is absent.
  my $marker_of = sub {
    my ($candidate, $fallback) = @_;
    return $fallback unless exists $w_mk{$candidate};
    my $marker = $w_mk{$candidate};
    chomp($marker);
    return $marker;
  };

  my $value = $marker_of->($key, 0);

  while ($value >= $iteration && length($key) > 1) {
    chop($key); # shorten the working key by one character

    ## An unknown prefix keeps the marker carried over from the longer key.
    $value = $marker_of->($key, $value);

    print "\t(w_key => w_value): '$key' => '$value'\n" if ($debug >= 2);

    ## Prefixes already marked as processed are skipped; keep shortening.
    next if $value >= 2;

    is_complete_key($key);

    ## Stop after the first evaluated prefix (the check on the returned
    ## status was disabled in the original, so the exit is unconditional).
    last;
  }
}

## go_through_keys($lcycle)
##   Runs one reduction pass: takes the keys of %w_mk in sorted order as they
##   are when the pass starts and calls group_keys() on each of them with the
##   pass number. Debug levels 2 and 3 add progressively more tracing.
sub go_through_keys
{
  my ($lcycle) = @_;

  print "Reduction Cycle: '$lcycle'\n\n" if ($debug >= 3);

  ## The key list is fixed up front; entries removed during the pass are
  ## still visited and simply report marker 0.
  my @pending = sort keys %w_mk;

  for my $key (@pending) {
    my $w_value = exists $w_mk{$key} ? $w_mk{$key} : 0;
    chomp($w_value);

    print "(Key => Value): '$key' => '$w_value'\n" if ($debug >= 2);

    if ($debug >= 3) {
      my @keys = $trie->lookup($key);
      my $key_len = @keys;
      print "\t\tNo. of items found: $key_len\n\t\titems : [@keys]\n";
    }

    group_keys($key, $lcycle);
  }
}

## reset_key_values()
##   Puts every entry of %w_mk back to marker 1, the value given to rows
##   when they are first loaded.
sub reset_key_values
{
  $w_mk{$_} = 1 for keys %w_mk;
}

## Run one reduction pass per pass number from $min_key_length up to, but not
## including, $max_key_length, then print the surviving keys in sorted order.
foreach my $pass ($min_key_length .. $max_key_length - 1) {
  go_through_keys($pass);
  ## Markers are deliberately not reset between passes:
  #reset_key_values();
}

foreach my $remaining (sort keys %w_mk) {
  print "$remaining\n";
}

__END__

=head1 NAME

  group_dnis.pl - A script to group and reduce a list of numbers

=head1 SYNOPSIS

  group_dnis.pl - A script to group and reduce a list of numbers

              ------------------------------
                 dnis(s)  => common root
              ------------------------------
                 1000   --
                 1001    |
                 1002    | ==> 100
                 1003    |
                 ...     |
                 1009   --
                 1010      ==> 1010


group_dnis.pl [options]
  Options:
    -help     brief help message
    -man      full documentation

=head1 OPTIONS

=over 4

=item B<-source_file>

  Source file containing the list of numbers to be grouped.

=item B<-help>

  Prints usage with some examples of how to use this script.

  group_dnis.pl -s <file name>

=back

  Documentation ends here.

=cut

#/usr/bin/perl-w
使用Tree:：Trie；
严格使用；
使用Getopt:：Long；
使用Pod：：用法；
我的%w_mk；
my$csv=“./test.csv”；
我的$debug=1；
my（$trie）=新树：：trie；
我的$help=0；
我的$man=0；
我的$cycle=1；
我的$max_key_length=1；
我的$min_key_length=1；
GetOptions（“debug=i”=>\$debug，
“源文件| s=s”=>\$csv，
“循环| c=i”=>\$cycle，
“帮助”？“=>\$help，
“男人！”=>\$man
)或pod2usage（“请尝试“$0--help”以获取更多信息”）；
pod2usage（-verbose=>99，-section=>“NAME”）如果$help；
pod2usage（-verbose=>2）如果$man；
次清洁
{
我的（$key，@key）=@；
my$key\u len=标量@keys；
如果（$key_len==10）{
foreach我的$k（@keys）{
$trie->remove（$k）；
}
如果（$debug>1），则打印“\t\t已找到密钥$key！！\n”；
##将此工作密钥添加为新密钥
$w_mk{$key}=2；
##删除所有相关的完整密钥
删除@w_mk{@keys}；
如果（$debug>1），则打印“\t\t移动的密钥：[@keys]\n\n”；
}
}
sub是_complete_键
{
我的（$key）=@；
my$len=长度$key；
my（@key_list）=$trie->lookup（$key$len+1）；
my（$key\u list\u len）=标量@key\u list；
##当密钥被处理一次时，
##让我们标记它已被处理
$w_mk{$key}=2；
如果（$debug>=3），则打印“\t\t搜索关键字：'$key'\n\t\t未找到任何项：$key\U list\u len\n\t\t项目：[@key\U list]\n”；
#找到完整的DNA
如果（$key\u list\u len==10）{
#因为提供前缀长度时的trie查找只返回后缀部分
#e、 g.1000、1001、1002、1003
#查找时（'100'，4）返回0、1、2、3
#通过在返回的密钥列表前面加上原始密钥来更新它
我的@t_key_list=@key_list；
我的$elem（@t_key_list）{
$elem=$key.$elem；
}
清理数据（$key，@t\u key\u list）；
返回（1，@t_key_list）；
}
否则{
如果（$debug>1），则打印“\t\t密钥$key未添加！！\n\n”；
}
返回（0，@key_list）；
}
打开（我的$handle，这里有一些JavaScript中的线性内容（假设列表
已排序）。转换为AWK应该不会太难。不确定它是否万无一失……可能需要根据实际数据进行调试
函数f（list）{
var i=0，j=9，k=0，templast=[list[i]]；
函数组（）{
while（list[i+1]&&list[i]。substr（0，j）==list[i+1]。substr（0，j）
&&数字（列表[i].子列表（j-10））+1==列表[i+1].子列表（j-10））{
圣堂武士推（列表[i+1]）；
i++；
}
}
函数isComplete（）{
返回编号（templast[0].substr（j-10））+Math.pow（10,10-j）-1
==templast[templast.length-1].substr（j-10）；
}
而（i对于（l=Math.pow（10，k）；l在JavaScript中有一些线性的东西（假设list
已排序）。转换为AWK应该不会太难。不确定它是否万无一失……可能需要根据实际数据进行调试
函数f（list）{
var i=0，j=9，k=0，templast=[list[i]]；
函数组（）{
while（list[i+1]&&list[i]。substr（0，j）==list[i+1]。substr（0，j）
&&数字（列表[i].子列表（j-10））+1==列表[i+1].子列表（j-10））{
圣堂武士推（列表[i+1]）；
i++；
}
}
函数isComplete（）{
返回编号（templast[0].substr（j-10））+Math.pow（10,10-j）-1
==templast[templast.length-1].substr（j-10）；
}
而（i对于（l=Math.pow（10，k）；l在您确定了“完整”前缀之后，您如何知道（这是否很重要？）它有多少额外的数字？您已经给出了两个示例，10[00…10]
和1112223[000…999]
，但是我们如何知道1112223
不是1112223[0…9]
之前，也许10
实际上是10[00000…99999]
？所有数字的位数都相同吗？谢谢你的评论。出于我的需要，不管有多少额外的数字，我对根前缀更感兴趣。在你评论的第二部分，所有数字的位数都相同。因此10[00…10]
将不存在于您的数据中，对吗？它必须是10[00000000…999999999]
才能被视为完整的？数据组1组2 1000 100 100 10
1000  --
1001    |
1002    | => 100
...     |
1009  --
1010      => 1010 

#!/usr/bin/perl -w

use Tree::Trie;
use strict;
use Getopt::Long;
use Pod::Usage;

## Working set: number (or reduced prefix) => processing marker.
## Rows loaded from the input file are stored with marker 1; keys that have
## been examined or produced by a reduction are stored with marker 2.
my %w_mk;
my $csv = "./test.csv";     # default input file, overridable with -s
my $debug = 1;              # verbosity level; larger values print more tracing
## Direct method call instead of the indirect-object form `new Tree::Trie`,
## which the parser can misread and which newer feature bundles disable.
my $trie = Tree::Trie->new;
my $help = 0;
my $man  = 0;
my $cycle = 1;              # accepted on the command line; not read in the visible script
my $max_key_length = 1;
my $min_key_length = 1;

GetOptions("debug=i"             => \$debug,
           "source_file|s=s"     => \$csv,
           "cycle|c=i"           => \$cycle,
           "help|?"              => \$help,
           "man!"                => \$man
           ) or pod2usage("Try '$0 --help' for more information." );

## Pod::Usage documents the section selector as `-sections` (plural); it is
## the option consulted when -verbose is 99.
pod2usage(-verbose => 99, -sections => "NAME") if $help;
pod2usage(-verbose => 2) if $man;

## clean_ds($key, @keys)
##   Collapses a complete group into its prefix. When exactly ten child keys
##   are supplied, each child is removed from the trie and from %w_mk, and
##   $key is recorded in %w_mk with marker 2. Any other count is a no-op.
sub clean_ds
{
  my ($key, @keys) = @_;

  ## Anything other than a full set of ten children is left untouched.
  return unless scalar(@keys) == 10;

  for my $child (@keys) {
    $trie->remove($child);
  }

  my $verbose = ($debug > 1);
  print "\t\tRoot key $key found!!\n" if $verbose;

  ## The prefix takes the place of its ten children in the working set.
  $w_mk{$key} = 2;
  delete @w_mk{@keys};

  print "\t\tRemoved keys: [@keys]\n\n" if $verbose;
}

## is_complete_key($key)
##   Asks the trie for the entries that extend $key and marks $key as
##   processed (marker 2) in %w_mk. When exactly ten entries come back the
##   group is treated as complete: the entries are prefixed with $key, handed
##   to clean_ds(), and (1, @full_keys) is returned. Otherwise the sub returns
##   (0, @raw_lookup_result).
sub is_complete_key
{
  my ($key) = @_;

  my @key_list = $trie->lookup($key, length($key) + 1);
  my $key_list_len = @key_list;

  ## Remember that this key has been looked at, whatever the outcome.
  $w_mk{$key} = 2;

  print "\t\tSearch for key: '$key'\n\t\tNo. of items found: $key_list_len\n\t\titems : [@key_list]\n" if ($debug >= 3);

  if ($key_list_len != 10) {
    print "\t\tRoot key $key not adding!!\n\n" if ($debug > 1);
    return (0, @key_list);
  }

  ## Per the original author's note, a lookup with a length argument yields
  ## only the suffix part (e.g. 0, 1, 2, ... for prefix '100'), so the full
  ## keys are rebuilt by prepending the prefix.
  my @full_keys = map { $key . $_ } @key_list;

  clean_ds($key, @full_keys);

  return (1, @full_keys);
}

## Load the input file: every non-empty line becomes a key in both the trie
## and the working hash (marker 1), and the longest key length is tracked
## for the reduction loop further down.
open(my $handle, '<', $csv) or die "Could not open file '$csv' $!";

while (my $row = <$handle>) {
  ## Strip the line ending, including the "\r" left behind by chomp on
  ## files with CRLF line endings.
  $row =~ s/\r?\n\z//;

  ## A blank line would otherwise be stored (and later printed) as an
  ## empty key.
  next if $row eq '';

  my $k_len = length($row);
  $max_key_length = $k_len if ($k_len > $max_key_length);

  $trie->add($row);
  $w_mk{$row} = 1;

  print "data: '$row'\n" if ($debug >= 4);
}

close($handle);

## group_keys($key, $iteration)
##   Walks from $key towards shorter prefixes, dropping one trailing
##   character per step, for as long as the current marker is at least
##   $iteration and more than one character remains. The first prefix whose
##   marker is below 2 is handed to is_complete_key() and the walk stops
##   there, whatever that call reports.
sub group_keys
{
  my ($key, $iteration) = @_;

  ## Marker stored for a key, or the supplied fallback when the key is absent.
  my $marker_of = sub {
    my ($candidate, $fallback) = @_;
    return $fallback unless exists $w_mk{$candidate};
    my $marker = $w_mk{$candidate};
    chomp($marker);
    return $marker;
  };

  my $value = $marker_of->($key, 0);

  while ($value >= $iteration && length($key) > 1) {
    chop($key); # shorten the working key by one character

    ## An unknown prefix keeps the marker carried over from the longer key.
    $value = $marker_of->($key, $value);

    print "\t(w_key => w_value): '$key' => '$value'\n" if ($debug >= 2);

    ## Prefixes already marked as processed are skipped; keep shortening.
    next if $value >= 2;

    is_complete_key($key);

    ## Stop after the first evaluated prefix (the check on the returned
    ## status was disabled in the original, so the exit is unconditional).
    last;
  }
}

## go_through_keys($lcycle)
##   Runs one reduction pass: takes the keys of %w_mk in sorted order as they
##   are when the pass starts and calls group_keys() on each of them with the
##   pass number. Debug levels 2 and 3 add progressively more tracing.
sub go_through_keys
{
  my ($lcycle) = @_;

  print "Reduction Cycle: '$lcycle'\n\n" if ($debug >= 3);

  ## The key list is fixed up front; entries removed during the pass are
  ## still visited and simply report marker 0.
  my @pending = sort keys %w_mk;

  for my $key (@pending) {
    my $w_value = exists $w_mk{$key} ? $w_mk{$key} : 0;
    chomp($w_value);

    print "(Key => Value): '$key' => '$w_value'\n" if ($debug >= 2);

    if ($debug >= 3) {
      my @keys = $trie->lookup($key);
      my $key_len = @keys;
      print "\t\tNo. of items found: $key_len\n\t\titems : [@keys]\n";
    }

    group_keys($key, $lcycle);
  }
}

## reset_key_values()
##   Puts every entry of %w_mk back to marker 1, the value given to rows
##   when they are first loaded.
sub reset_key_values
{
  $w_mk{$_} = 1 for keys %w_mk;
}

## Run one reduction pass per pass number from $min_key_length up to, but not
## including, $max_key_length, then print the surviving keys in sorted order.
foreach my $pass ($min_key_length .. $max_key_length - 1) {
  go_through_keys($pass);
  ## Markers are deliberately not reset between passes:
  #reset_key_values();
}

foreach my $remaining (sort keys %w_mk) {
  print "$remaining\n";
}

__END__

=head1 NAME

  group_dnis.pl - A script to group and reduce a list of numbers

=head1 SYNOPSIS

  group_dnis.pl - A script to group and reduce a list of numbers

              ------------------------------
                 dnis(s)  => common root
              ------------------------------
                 1000   --
                 1001    |
                 1002    | ==> 100
                 1003    |
                 ...     |
                 1009   --
                 1010      ==> 1010


group_dnis.pl [options]
  Options:
    -help     brief help message
    -man      full documentation

=head1 OPTIONS

=over 4

=item B<-source_file>

  Source file containing the list of numbers to be grouped.

=item B<-help>

  Prints usage with some examples of how to use this script.

  group_dnis.pl -s <file name>

=back

  Documentation ends here.

=cut

// Example 1: one isolated number followed by ten consecutive numbers ending
// in 0 through 9. The expected output listed below keeps the isolated number
// unchanged and replaces the ten consecutive numbers with their shared
// nine-digit prefix.
console.log(f(['1002230091','1112223000','1112223001','1112223002','1112223003'
              ,'1112223004','1112223005','1112223006','1112223007','1112223008'
              ,'1112223009']));
/*
  1002230091
  111222300
*/

// Example 2: the same input plus three more numbers (...010, ...011, ...012)
// that do not form a full run of ten. The expected output listed below
// leaves those three numbers ungrouped.
console.log(f(['1002230091','1112223000','1112223001','1112223002','1112223003'
              ,'1112223004','1112223005','1112223006','1112223007','1112223008'
              ,'1112223009','1112223010','1112223011','1112223012']));

/*
1002230091
111222300
1112223010
1112223011
1112223012
*/