Warning: file_get_contents(/data/phpspider/zhask/data//catemap/2/python/323.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Python 使用脚本对数字进行分组的一种方法_Python_Algorithm_Shell_Awk - Fatal编程技术网

Python 使用脚本对数字进行分组的一种方法

Python 使用脚本对数字进行分组的一种方法,python,algorithm,shell,awk,Python,Algorithm,Shell,Awk,我有一个大的(800K-唯一和排序)数字列表。比如说 1002230091 => 1002230091 <- not a complete set of digits ... 1112223000 -- 1112223001 | 1112223002 | ... | => 111223 1112223009 | ...

我有一个大的(800K-唯一和排序)数字列表。比如说

    1002230091         => 1002230091 <- not a complete set of digits
    ...
    1112223000   --
    1112223001     |
    1112223002     |  
    ...            |   => 111223
    1112223009     |
    ...            |
    1112223999     |
    ...            |
    1112223999   --
    ...
我尝试使用Tree::Trie(用于更快的查找)和纯旧哈希(用于迭代键)创建一个脚本

我组合的逻辑没有到达根前缀,它只执行一轮分组:

1000  --
1001    |
1002    | => 100
...     |
1009  --
1010      => 1010 
此外,遍历这一数量的数据非常缓慢

我确信有**更好的选择**,从处理数据的速度和满足这一需求两方面考虑

非常感谢您在满足这一需求方面提供的建议/帮助。不过,我最熟悉Shell或Perl脚本,可以使用任何类型的脚本解决方案

这是我总结的逻辑,它进行一轮分组,但不执行第二轮分组

#!/usr/bin/perl -w

use Tree::Trie;
use strict;
use Getopt::Long;
use Pod::Usage;

## Working set of keys.  Value 1 marks a key as loaded from the input,
## value 2 marks a key that has been processed by the grouping code.
my %w_mk;

## Script configuration; the first four can be overridden on the command line.
my $csv = "./test.csv";
my $debug = 1;
## Direct method call instead of the indirect-object form "new Tree::Trie",
## which perl can mis-parse depending on what else is in scope.
my $trie = Tree::Trie->new;
my $help = 0;
my $man  = 0;
my $cycle = 1;           # accepted as --cycle / -c; not read by the code below
my $max_key_length = 1;  # raised to the longest input line while loading
my $min_key_length = 1;

GetOptions("debug=i"             => \$debug,
           "source_file|s=s"     => \$csv,
           "cycle|c=i"           => \$cycle,
           "help|?"              => \$help,
           "man!"                => \$man
           ) or pod2usage("Try '$0 --help' for more information." );

## --help prints only the NAME section of the POD, --man the full page.
pod2usage(-verbose => 99, -section => "NAME") if $help;
pod2usage(-verbose => 2) if $man;

## clean_ds($key, @keys)
##
## Replaces a full group of child keys by their common prefix $key.
## Nothing happens unless exactly ten child keys are supplied.  When they
## are, every child is removed from the trie and from %w_mk, and $key is
## stored in %w_mk with the value 2.
sub clean_ds
{
  my ($key, @keys) = @_;

  ## Anything other than a complete set of ten children is left untouched.
  return unless @keys == 10;

  for my $child (@keys) {
    $trie->remove($child);
  }

  print "\t\tRoot key $key found!!\n" if $debug > 1;

  ## The prefix takes the place of its children in the working set.
  $w_mk{$key} = 2;
  delete @w_mk{@keys};

  print "\t\tRemoved keys: [@keys]\n\n" if $debug > 1;
}

## Prefixes that have already been looked up and rejected as incomplete.
## This is deliberately kept apart from %w_mk: every key of %w_mk is printed
## as a result at the end of the run, so storing a "processed" marker there
## for an incomplete prefix would report it as if it were a real group.
my %incomplete_prefix;

## is_complete_key($key)
##
## Asks the trie for the entries that follow the prefix $key (lookup is
## called with length($key) + 1 as its second argument) and treats the
## prefix as complete when exactly ten entries come back.
##
## Returns a list whose first element is the status:
##   (1, @full_keys)  the prefix is complete; the children, rebuilt as
##                    $key . suffix, have been handed to clean_ds()
##   (0, @found)      the prefix is incomplete; @found is what lookup returned
##   (0)              the prefix was already rejected by an earlier call
##
## A rejected prefix is never looked up again, matching the original
## "process each key once" rule.
sub is_complete_key
{
  my ($key) = @_;

  ## Rejected before: skip the trie lookup.
  return (0) if exists $incomplete_prefix{$key};

  my @key_list = $trie->lookup($key, length($key) + 1);
  my $key_list_len = scalar @key_list;

  print "\t\tSearch for key: '$key'\n\t\tNo. of items found: $key_list_len\n\t\titems : [@key_list]\n" if ($debug >= 3);

  # Complete DNIS found
  if ($key_list_len == 10) {
    ## With a length argument the lookup returns only the suffix portion,
    ## e.g. lookup('100', 4) gives 0, 1, 2, 3 for 1000, 1001, 1002, 1003.
    ## Prepend the prefix to obtain the full keys again.
    my @t_key_list = map { $key . $_ } @key_list;

    ## clean_ds() records $key in %w_mk and drops the children.
    clean_ds($key, @t_key_list);

    return (1, @t_key_list);
  }

  $incomplete_prefix{$key} = 1;
  print "\t\tRoot key $key not adding!!\n\n" if ($debug > 1);

  return (0, @key_list);
}

## Load the input file: every non-empty line becomes one key, stored both
## in the trie and in %w_mk (value 1).  $max_key_length ends up as the
## length of the longest line.
open (my $handle, '<', $csv) or die "Could not open file '$csv' $!";

while (my $row = <$handle>) {
  ## Strip the line ending, including the CR of Windows-style files, so a
  ## stray "\r" never becomes part of a key.
  $row =~ s/\r?\n\z//;

  ## A blank line (typically at the end of the file) is not a key; adding
  ## it would put an empty string into the trie and into the final output.
  next if $row eq '';

  my $k_len = length($row);
  $max_key_length = $k_len if ($k_len > $max_key_length);

  $trie->add($row);
  $w_mk{$row} = 1;

  print "data: '$row'\n" if ($debug >= 4);
}

close ($handle);

## group_keys($key, $iteration)
##
## Walks from $key towards shorter prefixes by dropping one trailing
## character per step, and passes the first prefix whose recorded value is
## below 2 to is_complete_key(); the walk stops right after that call.
## The walk only runs while the current value is at least $iteration and
## the key is longer than one character.
##
## NOTE(review): when a shortened key is absent from %w_mk, $state keeps the
## value read for the previous (longer) key.  A carried-over 2 therefore
## makes the loop skip that prefix without checking it -- confirm whether
## this is intended.
sub group_keys
{
  my ($key, $iteration) = @_;

  my $state = 0;
  if (exists $w_mk{$key}) {
    chomp($state = $w_mk{$key});
  }

  until ($state < $iteration || length($key) <= 1) {
    chop $key;    # drop the last character of the key

    chomp($state = $w_mk{$key}) if exists $w_mk{$key};

    print "\t(w_key => w_value): '$key' => '$state'\n" if $debug >= 2;

    ## A value of 2 means this prefix was handled before: keep climbing.
    next if $state >= 2;

    ## First unprocessed prefix: check it once, then stop for this key
    ## whatever the outcome.
    is_complete_key($key);
    last;
  }
}

## go_through_keys($lcycle)
##
## Performs one reduction pass: takes the keys currently in %w_mk in string
## sort order and calls group_keys() for each of them with $lcycle as the
## iteration number.  The key list is taken once, before the loop starts.
sub go_through_keys
{
  my ($lcycle) = @_;

  print "Reduction Cycle: '$lcycle'\n\n" if $debug >= 3;

  for my $key (sort keys %w_mk) {
    ## The key may have been deleted from %w_mk earlier in this pass;
    ## in that case 0 is reported as its value.
    my $w_value = 0;
    chomp($w_value = $w_mk{$key}) if exists $w_mk{$key};

    print "(Key => Value): '$key' => '$w_value'\n" if $debug >= 2;

    if ($debug >= 3) {
      my @matches = $trie->lookup($key);
      my $match_count = @matches;
      print "\t\tNo. of items found: $match_count\n\t\titems : [@matches]\n";
    }

    group_keys($key, $lcycle);
  }
}

## reset_key_values()
##
## Sets the value of every key currently in %w_mk back to 1.
sub reset_key_values
{
  ## values() yields aliases to the hash values, so assigning through $_
  ## updates %w_mk in place.
  $_ = 1 for values %w_mk;
}

## Run one reduction pass for every cycle number from $min_key_length up to,
## but not including, $max_key_length.
foreach my $pass ($min_key_length .. $max_key_length - 1) {
  go_through_keys($pass);
  # reset values for each key
  #reset_key_values();
}

## Print the keys left in %w_mk, one per line, in string sort order.
foreach my $remaining (sort keys %w_mk) {
  print "$remaining\n";
}

__END__

=head1 NAME

  group_dnis.pl - A script to group and reduce a list of numbers

=head1 SYNOPSIS

  group_dnis.pl - A script to group and reduce a list of numbers

              ------------------------------
                 dnis(s)  => common root
              ------------------------------
                 1000   --
                 1001    |
                 1002    | ==> 100
                 1003    |
                 ...     |
                 1009   --
                 1010      ==> 1010


group_dnis.pl [options]
  Options:
    -help     brief help message
    -man      full documentation

=head1 OPTIONS

=over 4

=item B<-source_file>

  Source file containing the list of numbers to be grouped.

=item B<-help>

  Prints usage with some examples of how to use this script.

  group_dnis.pl -s <file name>

=back

  Documentation ends here.

=cut
#/usr/bin/perl-w
使用Tree::Trie;
严格使用;
使用Getopt::Long;
使用Pod::用法;
我的%w_mk;
my$csv=“./test.csv”;
我的$debug=1;
my($trie)=新树::trie;
我的$help=0;
我的$man=0;
我的$cycle=1;
我的$max_key_length=1;
我的$min_key_length=1;
GetOptions(“debug=i”=>\$debug,
“源文件| s=s”=>\$csv,
“循环| c=i”=>\$cycle,
“帮助”?“=>\$help,
“男人!”=>\$man
)或pod2usage(“请尝试“$0--help”以获取更多信息”);
pod2usage(-verbose=>99,-section=>“NAME”)如果$help;
pod2usage(-verbose=>2)如果$man;
次清洁
{
我的($key,@key)=@;
my$key\u len=标量@keys;
如果($key_len==10){
foreach我的$k(@keys){
$trie->remove($k);
}
如果($debug>1),则打印“\t\t已找到密钥$key!!\n”;
##将此工作密钥添加为新密钥
$w_mk{$key}=2;
##删除所有相关的完整密钥
删除@w_mk{@keys};
如果($debug>1),则打印“\t\t移动的密钥:[@keys]\n\n”;
}
}
sub是_complete_键
{
我的($key)=@;
my$len=长度$key;
my(@key_list)=$trie->lookup($key$len+1);
my($key\u list\u len)=标量@key\u list;
##当密钥被处理一次时,
##让我们标记它已被处理
$w_mk{$key}=2;
如果($debug>=3),则打印“\t\t搜索关键字:'$key'\n\t\t未找到任何项:$key\U list\u len\n\t\t项目:[@key\U list]\n”;
#找到完整的DNA
如果($key\u list\u len==10){
#因为提供前缀长度时的trie查找只返回后缀部分
#e、 g.1000、1001、1002、1003
#查找时('100',4)返回0、1、2、3
#通过在返回的密钥列表前面加上原始密钥来更新它
我的@t_key_list=@key_list;
我的$elem(@t_key_list){
$elem=$key.$elem;
}
清理数据($key,@t\u key\u list);
返回(1,@t_key_list);
}
否则{
如果($debug>1),则打印“\t\t密钥$key未添加!!\n\n”;
}
返回(0,@key_list);
}

打开(我的$handle,这里有一些JavaScript中的线性内容(假设
列表
已排序)。转换为 AWK 应该不会太难。不确定它是否万无一失……可能需要根据实际数据进行调试

函数f(list){
var i=0,j=9,k=0,templast=[list[i]];
函数组(){
while(list[i+1]&&list[i]。substr(0,j)==list[i+1]。substr(0,j)
&&数字(列表[i].子列表(j-10))+1==列表[i+1].子列表(j-10)){
圣堂武士推(列表[i+1]);
i++;
}
}
函数isComplete(){
返回编号(templast[0].substr(j-10))+Math.pow(10,10-j)-1
==templast[templast.length-1].substr(j-10);
}
而(i对于(l=Math.pow(10,k);l在JavaScript中有一些线性的东西(假设
list
已排序)。转换为 AWK 应该不会太难。不确定它是否万无一失……可能需要根据实际数据进行调试

函数f(list){
var i=0,j=9,k=0,templast=[list[i]];
函数组(){
while(list[i+1]&&list[i]。substr(0,j)==list[i+1]。substr(0,j)
&&数字(列表[i].子列表(j-10))+1==列表[i+1].子列表(j-10)){
圣堂武士推(列表[i+1]);
i++;
}
}
函数isComplete(){
返回编号(templast[0].substr(j-10))+Math.pow(10,10-j)-1
==templast[templast.length-1].substr(j-10);
}
而(i对于(l=Math.pow(10,k);l在您确定了“完整”前缀之后,您如何知道(这是否很重要?)它有多少额外的数字?您已经给出了两个示例,
10[00…10]
1112223[000…999]
,但是我们如何知道
1112223
不是
1112223[0…9]
之前,也许
10
实际上是
10[00000…99999]
?所有数字的位数都相同吗?谢谢你的评论。出于我的需要,不管有多少额外的数字,我对根前缀更感兴趣。在你评论的第二部分,所有数字的位数都相同。因此
10[00…10]
将不存在于您的数据中,对吗?它必须是
10[00000000…999999999]
才能被视为完整的?
数据组1组2 1000 100 100 10
1000  --
1001    |
1002    | => 100
...     |
1009  --
1010      => 1010 
#!/usr/bin/perl -w

use Tree::Trie;
use strict;
use Getopt::Long;
use Pod::Usage;

## Working set of keys.  Value 1 marks a key as loaded from the input,
## value 2 marks a key that has been processed by the grouping code.
my %w_mk;

## Script configuration; the first four can be overridden on the command line.
my $csv = "./test.csv";
my $debug = 1;
## Direct method call instead of the indirect-object form "new Tree::Trie",
## which perl can mis-parse depending on what else is in scope.
my $trie = Tree::Trie->new;
my $help = 0;
my $man  = 0;
my $cycle = 1;           # accepted as --cycle / -c; not read by the code below
my $max_key_length = 1;  # raised to the longest input line while loading
my $min_key_length = 1;

GetOptions("debug=i"             => \$debug,
           "source_file|s=s"     => \$csv,
           "cycle|c=i"           => \$cycle,
           "help|?"              => \$help,
           "man!"                => \$man
           ) or pod2usage("Try '$0 --help' for more information." );

## --help prints only the NAME section of the POD, --man the full page.
pod2usage(-verbose => 99, -section => "NAME") if $help;
pod2usage(-verbose => 2) if $man;

## clean_ds($key, @keys)
##
## Replaces a full group of child keys by their common prefix $key.
## Nothing happens unless exactly ten child keys are supplied.  When they
## are, every child is removed from the trie and from %w_mk, and $key is
## stored in %w_mk with the value 2.
sub clean_ds
{
  my ($key, @keys) = @_;

  ## Anything other than a complete set of ten children is left untouched.
  return unless @keys == 10;

  for my $child (@keys) {
    $trie->remove($child);
  }

  print "\t\tRoot key $key found!!\n" if $debug > 1;

  ## The prefix takes the place of its children in the working set.
  $w_mk{$key} = 2;
  delete @w_mk{@keys};

  print "\t\tRemoved keys: [@keys]\n\n" if $debug > 1;
}

## Prefixes that have already been looked up and rejected as incomplete.
## This is deliberately kept apart from %w_mk: every key of %w_mk is printed
## as a result at the end of the run, so storing a "processed" marker there
## for an incomplete prefix would report it as if it were a real group.
my %incomplete_prefix;

## is_complete_key($key)
##
## Asks the trie for the entries that follow the prefix $key (lookup is
## called with length($key) + 1 as its second argument) and treats the
## prefix as complete when exactly ten entries come back.
##
## Returns a list whose first element is the status:
##   (1, @full_keys)  the prefix is complete; the children, rebuilt as
##                    $key . suffix, have been handed to clean_ds()
##   (0, @found)      the prefix is incomplete; @found is what lookup returned
##   (0)              the prefix was already rejected by an earlier call
##
## A rejected prefix is never looked up again, matching the original
## "process each key once" rule.
sub is_complete_key
{
  my ($key) = @_;

  ## Rejected before: skip the trie lookup.
  return (0) if exists $incomplete_prefix{$key};

  my @key_list = $trie->lookup($key, length($key) + 1);
  my $key_list_len = scalar @key_list;

  print "\t\tSearch for key: '$key'\n\t\tNo. of items found: $key_list_len\n\t\titems : [@key_list]\n" if ($debug >= 3);

  # Complete DNIS found
  if ($key_list_len == 10) {
    ## With a length argument the lookup returns only the suffix portion,
    ## e.g. lookup('100', 4) gives 0, 1, 2, 3 for 1000, 1001, 1002, 1003.
    ## Prepend the prefix to obtain the full keys again.
    my @t_key_list = map { $key . $_ } @key_list;

    ## clean_ds() records $key in %w_mk and drops the children.
    clean_ds($key, @t_key_list);

    return (1, @t_key_list);
  }

  $incomplete_prefix{$key} = 1;
  print "\t\tRoot key $key not adding!!\n\n" if ($debug > 1);

  return (0, @key_list);
}

## Load the input file: every non-empty line becomes one key, stored both
## in the trie and in %w_mk (value 1).  $max_key_length ends up as the
## length of the longest line.
open (my $handle, '<', $csv) or die "Could not open file '$csv' $!";

while (my $row = <$handle>) {
  ## Strip the line ending, including the CR of Windows-style files, so a
  ## stray "\r" never becomes part of a key.
  $row =~ s/\r?\n\z//;

  ## A blank line (typically at the end of the file) is not a key; adding
  ## it would put an empty string into the trie and into the final output.
  next if $row eq '';

  my $k_len = length($row);
  $max_key_length = $k_len if ($k_len > $max_key_length);

  $trie->add($row);
  $w_mk{$row} = 1;

  print "data: '$row'\n" if ($debug >= 4);
}

close ($handle);

## group_keys($key, $iteration)
##
## Walks from $key towards shorter prefixes by dropping one trailing
## character per step, and passes the first prefix whose recorded value is
## below 2 to is_complete_key(); the walk stops right after that call.
## The walk only runs while the current value is at least $iteration and
## the key is longer than one character.
##
## NOTE(review): when a shortened key is absent from %w_mk, $state keeps the
## value read for the previous (longer) key.  A carried-over 2 therefore
## makes the loop skip that prefix without checking it -- confirm whether
## this is intended.
sub group_keys
{
  my ($key, $iteration) = @_;

  my $state = 0;
  if (exists $w_mk{$key}) {
    chomp($state = $w_mk{$key});
  }

  until ($state < $iteration || length($key) <= 1) {
    chop $key;    # drop the last character of the key

    chomp($state = $w_mk{$key}) if exists $w_mk{$key};

    print "\t(w_key => w_value): '$key' => '$state'\n" if $debug >= 2;

    ## A value of 2 means this prefix was handled before: keep climbing.
    next if $state >= 2;

    ## First unprocessed prefix: check it once, then stop for this key
    ## whatever the outcome.
    is_complete_key($key);
    last;
  }
}

## go_through_keys($lcycle)
##
## Performs one reduction pass: takes the keys currently in %w_mk in string
## sort order and calls group_keys() for each of them with $lcycle as the
## iteration number.  The key list is taken once, before the loop starts.
sub go_through_keys
{
  my ($lcycle) = @_;

  print "Reduction Cycle: '$lcycle'\n\n" if $debug >= 3;

  for my $key (sort keys %w_mk) {
    ## The key may have been deleted from %w_mk earlier in this pass;
    ## in that case 0 is reported as its value.
    my $w_value = 0;
    chomp($w_value = $w_mk{$key}) if exists $w_mk{$key};

    print "(Key => Value): '$key' => '$w_value'\n" if $debug >= 2;

    if ($debug >= 3) {
      my @matches = $trie->lookup($key);
      my $match_count = @matches;
      print "\t\tNo. of items found: $match_count\n\t\titems : [@matches]\n";
    }

    group_keys($key, $lcycle);
  }
}

## reset_key_values()
##
## Sets the value of every key currently in %w_mk back to 1.
sub reset_key_values
{
  ## values() yields aliases to the hash values, so assigning through $_
  ## updates %w_mk in place.
  $_ = 1 for values %w_mk;
}

## Run one reduction pass for every cycle number from $min_key_length up to,
## but not including, $max_key_length.
foreach my $pass ($min_key_length .. $max_key_length - 1) {
  go_through_keys($pass);
  # reset values for each key
  #reset_key_values();
}

## Print the keys left in %w_mk, one per line, in string sort order.
foreach my $remaining (sort keys %w_mk) {
  print "$remaining\n";
}

__END__

=head1 NAME

  group_dnis.pl - A script to group and reduce a list of numbers

=head1 SYNOPSIS

  group_dnis.pl - A script to group and reduce a list of numbers

              ------------------------------
                 dnis(s)  => common root
              ------------------------------
                 1000   --
                 1001    |
                 1002    | ==> 100
                 1003    |
                 ...     |
                 1009   --
                 1010      ==> 1010


group_dnis.pl [options]
  Options:
    -help     brief help message
    -man      full documentation

=head1 OPTIONS

=over 4

=item B<-source_file>

  Source file containing the list of numbers to be grouped.

=item B<-help>

  Prints usage with some examples of how to use this script.

  group_dnis.pl -s <file name>

=back

  Documentation ends here.

=cut
// Usage example 1: one unrelated number followed by the ten consecutive
// numbers 1112223000 .. 1112223009. `f` is not defined in this snippet.
console.log(f(['1002230091','1112223000','1112223001','1112223002','1112223003'
              ,'1112223004','1112223005','1112223006','1112223007','1112223008'
              ,'1112223009']));
// Output as given by the author: the lone number is kept as is, the ten
// consecutive numbers are shown as their common 9-digit prefix.
/*
  1002230091
  111222300
*/

// Usage example 2: the same list plus 1112223010 .. 1112223012, i.e. only
// three members of the next group of ten.
console.log(f(['1002230091','1112223000','1112223001','1112223002','1112223003'
              ,'1112223004','1112223005','1112223006','1112223007','1112223008'
              ,'1112223009','1112223010','1112223011','1112223012']));

// Output as given by the author: the three extra numbers are listed
// individually instead of being reduced to a prefix.
/*
1002230091
111222300
1112223010
1112223011
1112223012
*/