Python 使用脚本对数字进行分组的一种方法
我有一个大的(800K-唯一和排序)数字列表。比如说Python 使用脚本对数字进行分组的一种方法,python,algorithm,shell,awk,Python,Algorithm,Shell,Awk,我有一个大的(800K-唯一和排序)数字列表。比如说 1002230091 => 1002230091 <- not a complete set of digits ... 1112223000 -- 1112223001 | 1112223002 | ... | => 111223 1112223009 | ...
1002230091 => 1002230091 <- not a complete set of digits
...
1112223000 --
1112223001 |
1112223002 |
... | => 111223
1112223009 |
... |
1112223999 |
... |
1112223999 --
...
我尝试使用Tree::Trie(用于更快的查找)和纯旧哈希(用于迭代键)创建一个脚本
我组合的逻辑没有到达根前缀,它只执行一轮分组:
1000 --
1001 |
1002 | => 100
... |
1009 --
1010 => 1010
此外,遍历这一数量的数据非常缓慢
我确信有**更好的选择**,从处理数据的速度和满足这一需求两方面考虑
非常感谢您在满足这一需求方面提供的建议/帮助。不过,我最熟悉Shell或Perl脚本,可以使用任何类型的脚本解决方案
这是我总结的逻辑,它进行一轮分组,但不执行第二轮分组
#!/usr/bin/perl -w
use Tree::Trie;
use strict;
use Getopt::Long;
use Pod::Usage;
# Working set of keys. Values are state flags: 1 is assigned to every line
# loaded from the input, 2 to prefixes that have been probed or collapsed.
my %w_mk;
my $csv = "./test.csv";
my $debug = 1;
# Direct class-method call. The indirect-object form (`new Tree::Trie`) is
# discouraged by perlobj because how it parses depends on what is in scope.
my $trie = Tree::Trie->new;
my $help = 0;
my $man = 0;
my $cycle = 1;              # bound to --cycle; not read elsewhere in the visible script
my $max_key_length = 1;     # raised to the longest input line while loading
my $min_key_length = 1;

GetOptions("debug=i" => \$debug,
           "source_file|s=s" => \$csv,
           "cycle|c=i" => \$cycle,
           "help|?" => \$help,
           "man!" => \$man
) or pod2usage("Try '$0 --help' for more information." );

pod2usage(-verbose => 99, -section => "NAME") if $help;
pod2usage(-verbose => 2) if $man;
# clean_ds($prefix, @members)
#
# Replaces a group of keys by their common prefix, but only when the group
# holds exactly ten members; any other count leaves everything untouched.
#   $prefix  - the shared prefix that will stand for the group
#   @members - the full keys belonging to that prefix
# Side effects: the members are removed from $trie and %w_mk, and $prefix is
# stored in %w_mk with the flag value 2.
sub clean_ds
{
    my ($prefix, @members) = @_;

    if (@members == 10) {
        for my $member (@members) {
            $trie->remove($member);
        }
        print "\t\tRoot key $prefix found!!\n" if ($debug > 1);

        # The prefix joins the working set, flagged as processed ...
        $w_mk{$prefix} = 2;
        # ... and the keys it now represents leave it.
        delete @w_mk{@members};
        print "\t\tRemoved keys: [@members]\n\n" if ($debug > 1);
    }
}
# is_complete_key($key)
#
# Probes whether prefix $key has ten entries one character longer than itself
# in $trie. If so, the ten full keys are handed to clean_ds(), which collapses
# them into $key.
#
# Returns a list: (1, @full_keys) when the prefix is complete, otherwise
# (0, @lookup_result) with the raw lookup result.
#
# A rejected prefix is remembered in a private hash instead of being written
# to %w_mk. Writing it to %w_mk (as a "processed" marker) made every probed
# but incomplete prefix part of the final result set.
{
    # prefix => array ref of the lookup result that caused the rejection
    my %incomplete_prefix;

    sub is_complete_key
    {
        my ($key) = @_;

        # Already probed and rejected: answer without another trie lookup.
        if (exists $incomplete_prefix{$key}) {
            return (0, @{ $incomplete_prefix{$key} });
        }

        my @key_list = $trie->lookup($key, length($key) + 1);
        my $key_list_len = scalar @key_list;
        print "\t\tSearch for key: '$key'\n\t\tNo. of items found: $key_list_len\n\t\titems : [@key_list]\n" if ($debug >= 3);

        if ($key_list_len != 10) {
            $incomplete_prefix{$key} = [@key_list];
            print "\t\tRoot key $key not adding!!\n\n" if ($debug > 1);
            return (0, @key_list);
        }

        # Complete DNIS found.
        # Per the original author's note, lookup() with a length argument
        # returns only the suffix portion (lookup('100', 4) on 1000, 1001, ...
        # yields 0, 1, ...), so the prefix is put back in front of each entry.
        # NOTE(review): confirm this against the Tree::Trie documentation.
        my @t_key_list = map { $key . $_ } @key_list;
        clean_ds($key, @t_key_list);
        return (1, @t_key_list);
    }
}
# Load the input file: every non-empty line becomes a key in both the trie
# and the working hash (flag 1), and the longest line length is recorded.
open(my $handle, '<', $csv) or die "Could not open file '$csv' $!";
while (my $row = <$handle>) {
    # Strip the whole line ending. chomp alone would leave the "\r" of a
    # CRLF file inside the key.
    $row =~ s/[\r\n]+\z//;
    # A blank line would otherwise be stored as an empty-string key and
    # printed as an empty result line.
    next if $row eq '';

    my $k_len = length($row);
    $max_key_length = $k_len if ($k_len > $max_key_length);
    $trie->add($row);
    $w_mk{$row} = 1;
    print "data: '$row'\n" if ($debug >= 4);
}
close($handle);
# group_keys($key, $iteration)
#
# Walks from $key towards shorter prefixes, one character at a time, and
# probes the first prefix whose flag is still below 2 with is_complete_key().
#   $key       - key to start from (a private copy is shortened)
#   $iteration - current reduction cycle; the walk only runs while the flag
#                value in hand is >= this number
# The return value is not meaningful.
sub group_keys
{
    my ($key, $iteration) = @_;

    # Keys that are no longer in the working set get 0, so with any
    # $iteration >= 1 the loop below is skipped for them.
    my $value = exists $w_mk{$key} ? $w_mk{$key} : 0;

    while ($value >= $iteration && length($key) > 1) {
        chop($key);    # Remove last character of the key

        # When the shortened key is not in %w_mk, $value keeps the flag read
        # for the longer key.
        $value = $w_mk{$key} if exists $w_mk{$key};
        print "\t(w_key => w_value): '$key' => '$value'\n" if ($debug >= 2);

        # A flag of 2 means the prefix was handled before; keep walking up.
        # Otherwise probe it once and stop, whether or not it was complete.
        if ($value < 2) {
            is_complete_key($key);
            last;
        }
    }
}
# go_through_keys($lcycle)
#
# Runs one reduction cycle: calls group_keys() for every key that was in
# %w_mk when the cycle started, in string-sorted order.
#   $lcycle - cycle number, passed through to group_keys()
sub go_through_keys
{
    my ($lcycle) = @_;
    print "Reduction Cycle: '$lcycle'\n\n" if ($debug >= 3);

    # The key list is a snapshot. clean_ds() may delete entries while the
    # loop runs, which is why existence is re-checked per key.
    foreach my $key (sort keys %w_mk) {
        my $w_value = exists $w_mk{$key} ? $w_mk{$key} : 0;
        print "(Key => Value): '$key' => '$w_value'\n" if ($debug >= 2);

        if ($debug >= 3) {
            my @keys = $trie->lookup($key);
            my $key_len = scalar @keys;
            print "\t\tNo. of items found: $key_len\n\t\titems : [@keys]\n";
        }

        group_keys($key, $lcycle);
    }
}
# reset_key_values()
#
# Sets the flag of every key currently in %w_mk back to 1.
sub reset_key_values
{
    # One slice assignment instead of a per-key loop.
    @w_mk{ keys %w_mk } = (1) x scalar(keys %w_mk);
}
# Run the reduction cycles, numbered from $min_key_length up to
# $max_key_length - 1, then print the remaining keys in string-sorted order.
foreach my $pass ($min_key_length .. $max_key_length - 1) {
    go_through_keys($pass);
    # reset values for each key
    #reset_key_values();
}
print map { "$_\n" } sort keys %w_mk;
__END__
=head1 NAME
group_dnis.pl - A script to group and reduce a list of numbers
=head1 SYNOPSIS
group_dnis.pl - A script to group and reduce a list of numbers
------------------------------
dnis(s) => common root
------------------------------
1000 --
1001 |
1002 | ==> 100
1003 |
... |
1009 --
1010 ==> 1010
group_dnis.pl [options]
Options:
-help brief help message
-man full documentation
=head1 OPTIONS
=over 4
=item B<-source_file>
Source file containing the list of numbers to be grouped.
=item B<-help>
Prints usage with some examples of how to use this script.
group_dnis.pl -s <file name>
=back
Documentation ends here.
=cut
#/usr/bin/perl-w
使用Tree::Trie;
严格使用;
使用Getopt::Long;
使用Pod::用法;
我的%w_mk;
my$csv=“./test.csv”;
我的$debug=1;
my($trie)=新树::trie;
我的$help=0;
我的$man=0;
我的$cycle=1;
我的$max_key_length=1;
我的$min_key_length=1;
GetOptions(“debug=i”=>\$debug,
“源文件| s=s”=>\$csv,
“循环| c=i”=>\$cycle,
“帮助”?“=>\$help,
“男人!”=>\$man
)或pod2usage(“请尝试“$0--help”以获取更多信息”);
pod2usage(-verbose=>99,-section=>“NAME”)如果$help;
pod2usage(-verbose=>2)如果$man;
次清洁
{
我的($key,@key)=@;
my$key\u len=标量@keys;
如果($key_len==10){
foreach我的$k(@keys){
$trie->remove($k);
}
如果($debug>1),则打印“\t\t已找到密钥$key!!\n”;
##将此工作密钥添加为新密钥
$w_mk{$key}=2;
##删除所有相关的完整密钥
删除@w_mk{@keys};
如果($debug>1),则打印“\t\t移动的密钥:[@keys]\n\n”;
}
}
sub是_complete_键
{
我的($key)=@;
my$len=长度$key;
my(@key_list)=$trie->lookup($key$len+1);
my($key\u list\u len)=标量@key\u list;
##当密钥被处理一次时,
##让我们标记它已被处理
$w_mk{$key}=2;
如果($debug>=3),则打印“\t\t搜索关键字:'$key'\n\t\t未找到任何项:$key\U list\u len\n\t\t项目:[@key\U list]\n”;
#找到完整的DNA
如果($key\u list\u len==10){
#因为提供前缀长度时的trie查找只返回后缀部分
#e、 g.1000、1001、1002、1003
#查找时('100',4)返回0、1、2、3
#通过在返回的密钥列表前面加上原始密钥来更新它
我的@t_key_list=@key_list;
我的$elem(@t_key_list){
$elem=$key.$elem;
}
清理数据($key,@t\u key\u list);
返回(1,@t_key_list);
}
否则{
如果($debug>1),则打印“\t\t密钥$key未添加!!\n\n”;
}
返回(0,@key_list);
}
打开(我的$handle,这里有一些JavaScript中的线性内容(假设列表
已排序)。转换为 AWK 应该不会太难。不确定它是否万无一失……可能需要根据实际数据进行调试
函数f(list){
var i=0,j=9,k=0,templast=[list[i]];
函数组(){
while(list[i+1]&&list[i]。substr(0,j)==list[i+1]。substr(0,j)
&&数字(列表[i].子列表(j-10))+1==列表[i+1].子列表(j-10)){
圣堂武士推(列表[i+1]);
i++;
}
}
函数isComplete(){
返回编号(templast[0].substr(j-10))+Math.pow(10,10-j)-1
==templast[templast.length-1].substr(j-10);
}
而(i 对于(l=Math.pow(10,k);l在JavaScript中有一些线性的东西(假设list
已排序)。转换为 AWK 应该不会太难。不确定它是否万无一失……可能需要根据实际数据进行调试
函数f(list){
var i=0,j=9,k=0,templast=[list[i]];
函数组(){
while(list[i+1]&&list[i]。substr(0,j)==list[i+1]。substr(0,j)
&&数字(列表[i].子列表(j-10))+1==列表[i+1].子列表(j-10)){
圣堂武士推(列表[i+1]);
i++;
}
}
函数isComplete(){
返回编号(templast[0].substr(j-10))+Math.pow(10,10-j)-1
==templast[templast.length-1].substr(j-10);
}
而(i 对于(l=Math.pow(10,k);l在您确定了“完整”前缀之后,您如何知道(这是否很重要?)它有多少额外的数字?您已经给出了两个示例,10[00…10]
和1112223[000…999]
,但是我们如何知道1112223
不是1112223[0…9]
之前,也许10
实际上是10[00000…99999]
?所有数字的位数都相同吗?谢谢你的评论。出于我的需要,不管有多少额外的数字,我对根前缀更感兴趣。在你评论的第二部分,所有数字的位数都相同。因此10[00…10]
将不存在于您的数据中,对吗?它必须是10[00000000…999999999]
才能被视为完整的?数据组1组2 1000 100 100 10
1000 --
1001 |
1002 | => 100
... |
1009 --
1010 => 1010
#!/usr/bin/perl -w
use Tree::Trie;
use strict;
use Getopt::Long;
use Pod::Usage;
# Working set of keys. Values are state flags: 1 is assigned to every line
# loaded from the input, 2 to prefixes that have been probed or collapsed.
my %w_mk;
my $csv = "./test.csv";
my $debug = 1;
# Direct class-method call. The indirect-object form (`new Tree::Trie`) is
# discouraged by perlobj because how it parses depends on what is in scope.
my $trie = Tree::Trie->new;
my $help = 0;
my $man = 0;
my $cycle = 1;              # bound to --cycle; not read elsewhere in the visible script
my $max_key_length = 1;     # raised to the longest input line while loading
my $min_key_length = 1;

GetOptions("debug=i" => \$debug,
           "source_file|s=s" => \$csv,
           "cycle|c=i" => \$cycle,
           "help|?" => \$help,
           "man!" => \$man
) or pod2usage("Try '$0 --help' for more information." );

pod2usage(-verbose => 99, -section => "NAME") if $help;
pod2usage(-verbose => 2) if $man;
# clean_ds($prefix, @members)
#
# Replaces a group of keys by their common prefix, but only when the group
# holds exactly ten members; any other count leaves everything untouched.
#   $prefix  - the shared prefix that will stand for the group
#   @members - the full keys belonging to that prefix
# Side effects: the members are removed from $trie and %w_mk, and $prefix is
# stored in %w_mk with the flag value 2.
sub clean_ds
{
    my ($prefix, @members) = @_;

    if (@members == 10) {
        for my $member (@members) {
            $trie->remove($member);
        }
        print "\t\tRoot key $prefix found!!\n" if ($debug > 1);

        # The prefix joins the working set, flagged as processed ...
        $w_mk{$prefix} = 2;
        # ... and the keys it now represents leave it.
        delete @w_mk{@members};
        print "\t\tRemoved keys: [@members]\n\n" if ($debug > 1);
    }
}
# is_complete_key($key)
#
# Probes whether prefix $key has ten entries one character longer than itself
# in $trie. If so, the ten full keys are handed to clean_ds(), which collapses
# them into $key.
#
# Returns a list: (1, @full_keys) when the prefix is complete, otherwise
# (0, @lookup_result) with the raw lookup result.
#
# A rejected prefix is remembered in a private hash instead of being written
# to %w_mk. Writing it to %w_mk (as a "processed" marker) made every probed
# but incomplete prefix part of the final result set.
{
    # prefix => array ref of the lookup result that caused the rejection
    my %incomplete_prefix;

    sub is_complete_key
    {
        my ($key) = @_;

        # Already probed and rejected: answer without another trie lookup.
        if (exists $incomplete_prefix{$key}) {
            return (0, @{ $incomplete_prefix{$key} });
        }

        my @key_list = $trie->lookup($key, length($key) + 1);
        my $key_list_len = scalar @key_list;
        print "\t\tSearch for key: '$key'\n\t\tNo. of items found: $key_list_len\n\t\titems : [@key_list]\n" if ($debug >= 3);

        if ($key_list_len != 10) {
            $incomplete_prefix{$key} = [@key_list];
            print "\t\tRoot key $key not adding!!\n\n" if ($debug > 1);
            return (0, @key_list);
        }

        # Complete DNIS found.
        # Per the original author's note, lookup() with a length argument
        # returns only the suffix portion (lookup('100', 4) on 1000, 1001, ...
        # yields 0, 1, ...), so the prefix is put back in front of each entry.
        # NOTE(review): confirm this against the Tree::Trie documentation.
        my @t_key_list = map { $key . $_ } @key_list;
        clean_ds($key, @t_key_list);
        return (1, @t_key_list);
    }
}
# Load the input file: every non-empty line becomes a key in both the trie
# and the working hash (flag 1), and the longest line length is recorded.
open(my $handle, '<', $csv) or die "Could not open file '$csv' $!";
while (my $row = <$handle>) {
    # Strip the whole line ending. chomp alone would leave the "\r" of a
    # CRLF file inside the key.
    $row =~ s/[\r\n]+\z//;
    # A blank line would otherwise be stored as an empty-string key and
    # printed as an empty result line.
    next if $row eq '';

    my $k_len = length($row);
    $max_key_length = $k_len if ($k_len > $max_key_length);
    $trie->add($row);
    $w_mk{$row} = 1;
    print "data: '$row'\n" if ($debug >= 4);
}
close($handle);
# group_keys($key, $iteration)
#
# Walks from $key towards shorter prefixes, one character at a time, and
# probes the first prefix whose flag is still below 2 with is_complete_key().
#   $key       - key to start from (a private copy is shortened)
#   $iteration - current reduction cycle; the walk only runs while the flag
#                value in hand is >= this number
# The return value is not meaningful.
sub group_keys
{
    my ($key, $iteration) = @_;

    # Keys that are no longer in the working set get 0, so with any
    # $iteration >= 1 the loop below is skipped for them.
    my $value = exists $w_mk{$key} ? $w_mk{$key} : 0;

    while ($value >= $iteration && length($key) > 1) {
        chop($key);    # Remove last character of the key

        # When the shortened key is not in %w_mk, $value keeps the flag read
        # for the longer key.
        $value = $w_mk{$key} if exists $w_mk{$key};
        print "\t(w_key => w_value): '$key' => '$value'\n" if ($debug >= 2);

        # A flag of 2 means the prefix was handled before; keep walking up.
        # Otherwise probe it once and stop, whether or not it was complete.
        if ($value < 2) {
            is_complete_key($key);
            last;
        }
    }
}
# go_through_keys($lcycle)
#
# Runs one reduction cycle: calls group_keys() for every key that was in
# %w_mk when the cycle started, in string-sorted order.
#   $lcycle - cycle number, passed through to group_keys()
sub go_through_keys
{
    my ($lcycle) = @_;
    print "Reduction Cycle: '$lcycle'\n\n" if ($debug >= 3);

    # The key list is a snapshot. clean_ds() may delete entries while the
    # loop runs, which is why existence is re-checked per key.
    foreach my $key (sort keys %w_mk) {
        my $w_value = exists $w_mk{$key} ? $w_mk{$key} : 0;
        print "(Key => Value): '$key' => '$w_value'\n" if ($debug >= 2);

        if ($debug >= 3) {
            my @keys = $trie->lookup($key);
            my $key_len = scalar @keys;
            print "\t\tNo. of items found: $key_len\n\t\titems : [@keys]\n";
        }

        group_keys($key, $lcycle);
    }
}
# reset_key_values()
#
# Sets the flag of every key currently in %w_mk back to 1.
sub reset_key_values
{
    # One slice assignment instead of a per-key loop.
    @w_mk{ keys %w_mk } = (1) x scalar(keys %w_mk);
}
# Run the reduction cycles, numbered from $min_key_length up to
# $max_key_length - 1, then print the remaining keys in string-sorted order.
foreach my $pass ($min_key_length .. $max_key_length - 1) {
    go_through_keys($pass);
    # reset values for each key
    #reset_key_values();
}
print map { "$_\n" } sort keys %w_mk;
__END__
=head1 NAME
group_dnis.pl - A script to group and reduce a list of numbers
=head1 SYNOPSIS
group_dnis.pl - A script to group and reduce a list of numbers
------------------------------
dnis(s) => common root
------------------------------
1000 --
1001 |
1002 | ==> 100
1003 |
... |
1009 --
1010 ==> 1010
group_dnis.pl [options]
Options:
-help brief help message
-man full documentation
=head1 OPTIONS
=over 4
=item B<-source_file>
Source file containing the list of numbers to be grouped.
=item B<-help>
Prints usage with some examples of how to use this script.
group_dnis.pl -s <file name>
=back
Documentation ends here.
=cut
// Example call of f(): one unrelated number followed by ten numbers that
// share the 9-digit prefix 111222300 and cover every final digit 0-9.
// The expected output is listed in the comment after the call.
console.log(f(['1002230091','1112223000','1112223001','1112223002','1112223003'
,'1112223004','1112223005','1112223006','1112223007','1112223008'
,'1112223009']));
/*
1002230091
111222300
*/
// Example call of f(): the same input as the previous example plus three
// numbers (…010, …011, …012) that do not fill a whole group of ten.
// Per the expected output below, those three are listed unchanged.
console.log(f(['1002230091','1112223000','1112223001','1112223002','1112223003'
,'1112223004','1112223005','1112223006','1112223007','1112223008'
,'1112223009','1112223010','1112223011','1112223012']));
/*
1002230091
111222300
1112223010
1112223011
1112223012
*/