在SQL Server中使用Royal Mail PAF文件
我接到了一项令人头疼的任务:处理我们刚刚购买的皇家邮政(Royal Mail)PAF文件。现在我已将所有相关表加载到我们的SQL Server中,并根据相关ID将它们关联在一起。接下来我需要根据多条规则把正确的地址拼装出来,例如:只提供了建筑物名称或只提供了建筑物编号的情况,或者某个邮政编码对应的是邮政信箱(PO Box)的情况。由于这似乎是一个被广泛使用的文件,我希望已经有人做过这方面的工作。大家有什么想法吗?因为他们希望在本周末之前完成。 在SQL Server中使用Royal Mail PAF文件,sql,tsql,Sql,Tsql, 我接到了一项令人头疼的任务:处理我们刚刚购买的皇家邮政(Royal Mail)PAF文件。现在我已将所有相关表加载到我们的SQL Server中,并根据相关ID将它们关联在一起。接下来我需要根据多条规则把正确的地址拼装出来,例如:只提供了建筑物名称或只提供了建筑物编号的情况,或者某个邮政编码对应的是邮政信箱(PO Box)的情况。由于这似乎是一个被广泛使用的文件,我希望已经有人做过这方面的工作。大家有什么想法吗?因为他们希望在本周末之前完成。希望有人能帮忙,P。 确实令人沮丧——这是一项大工程:规则很多,例外也很多。 我想你已经看过PAF程……
希望有人能帮忙,P。 确实令人沮丧——这是一项大工程:规则很多,例外也很多。 我想你已经看过《PAF程序员指南》(PAF Programmers' Guide)中的所有规则了吧? 如果你想在周末之前完成,最好直接获取“压缩标准”(Compressed Standard)版本,它包含的正是你想要的、已展开的地址,不过它当然不是规范化的数据库形式。 很久以前,我曾用关系格式的数据拼装过地址。我看看能不能找到当时的代码。 找到了一些丑陋的、八年前写的Perl代码——请小心使用:
# ##############################################################################
# Main loop
#
# Reads one tab-separated PAF record per line from the PAF filehandle (opened
# elsewhere), applies the building / sub-building formatting rules and prints
# one pipe-delimited formatted address per record on STDOUT.
#
# NOTE: $fmtDepSteet, $fmtSteet, $fmtDepLocality and $fmtLocality are left as
# package globals on purpose (no "my"): InsertNumber() prepends the premise
# number to whichever of them is the first non-blank line, and it can only
# reach them as globals.  They are reassigned on every iteration.
# ##############################################################################
while ( <PAF> ) # Reading from mysql view - you can work this out from the fields populated by the split /\t/
{
    chomp;
    $incount++;
    print STDERR "Processed ..... $incount\n" if $incount % 1000000 == 0;

    # Field order must match the column order of the input view.  Several
    # keys are unused below but are needed to keep the positions aligned.
    # The -1 limit keeps trailing empty fields.
    my ( $postcode ,    $aKey ,          $lkey ,       $skey ,
         $sekey ,       $dskey ,         $dsekey ,     $bnumber ,
         $bkey ,        $sbkey ,         $households , $oKey ,
         $fmtPcType ,   $concat ,        $dps ,        $fmtsUser ,
         $poBox ,       $version ,       $active ,
         $fmtCompany ,  $fmtDepartment , $rawSubLine , $rawBldLine ,
         $DepSteet ,    $fmtDepSteetEnd , $Steet ,     $fmtSteetEnd ,
         $DepLocality , $Locality ,      $PostTown )
        = split /\t/ , $_ , -1 ;

    # Zero-pad the address / organisation keys to 8 characters.
    my $fmtaKey = sprintf("%08s", $aKey);
    my $fmtoKey = sprintf("%08s", $oKey);
    $fmtoKey = ""       if $fmtoKey eq "00000000";   # all zeroes => no organisation key
    $fmtoKey = $fmtaKey if $fmtPcType eq "L";        # postcode type "L": reuse the address key

    # Strip punctuation.  These four are globals - see the note above the loop.
    $fmtDepLocality = removePunch($DepLocality);
    $fmtLocality    = removePunch($Locality, "dq");
    $fmtDepSteet    = removePunch($DepSteet);
    $fmtSteet       = removePunch($Steet, "dh");
    $rawSubLine     = removePunch($rawSubLine, "d");
    $rawBldLine     = removePunch($rawBldLine, "d");
    my $fmtPostTown = removePunch($PostTown);

    # Split the postcode into its first 4 and next 3 characters.
    my ($opc, $ipc) = unpack("A4A3", $postcode);
    $opc =~ s/\s$//;

    # Was "my $fmtPoBox = ... if (...)": a "my" combined with a statement
    # modifier has undefined behaviour (perlsyn).  The printed value is the
    # same: blank when there is no PO box.
    my $fmtPoBox = length($poBox) > 0 ? "PO BOX $poBox" : "";

    my ($fmtSubBuilding, $fmtBuilding) = ("", "");

    # ##########################################################################
    # Format rules are based on presence of building name, sub-building and
    # number.  See PAF Digest for details.
    # ##########################################################################
    my ($subBuildingFlag, $buildingFlag, $buildingNumberFlag) = ("N", "N", "N");
    $buildingNumberFlag = "Y" if $bnumber ne "0";             # "0" means no building number
    $buildingFlag       = "Y" if length($rawBldLine) > 0;
    $subBuildingFlag    = "Y" if length($rawSubLine) > 0;

    # Three-letter key: sub-building / building name / building number.
    my $formatKey = "${subBuildingFlag}${buildingFlag}${buildingNumberFlag}";

    # Stays blank when no rule matches.  The key "YNN" (sub-building only)
    # has no branch below, so such a record prints an empty rule id.
    my $ruleid = "";

    # --------------------------------------------------------------------------
    # Formatting Rule 1 (Org name only)
    # --------------------------------------------------------------------------
    if ( $formatKey eq "NNN" )
    {
        $ruleid = "1";
    }
    # --------------------------------------------------------------------------
    # Formatting Rule 2 (Building number only)
    # --------------------------------------------------------------------------
    elsif ( $formatKey eq "NNY" )
    {
        $ruleid = "2";
        InsertNumber($bnumber);
    }
    # --------------------------------------------------------------------------
    # Formatting Rule 3 (Building name only)
    # --------------------------------------------------------------------------
    elsif ( $formatKey eq "NYN" )
    {
        $ruleid = "3";
        my $N = "";
        # F1Check returns "name|number"; the number may have been extracted
        # from the building name itself.
        ($fmtBuilding, $N) = split /\|/ , F1Check($rawBldLine) , -1 ;
        InsertNumber($N);
    }
    # --------------------------------------------------------------------------
    # Formatting Rule 4 (Building name and building number)
    # --------------------------------------------------------------------------
    elsif ( $formatKey eq "NYY" )
    {
        $ruleid = "4";
        $fmtBuilding = $rawBldLine;
        InsertNumber($bnumber);
    }
    # --------------------------------------------------------------------------
    # Formatting Rule 5 (Sub-building name and building number)
    # --------------------------------------------------------------------------
    elsif ( $formatKey eq "YNY" )
    {
        $ruleid = "5";
        if ( $concat eq "Y" )
        {
            # Concatenation indicator "Y": the number and the sub-building
            # are joined with no space (12 + A => "12A").  The original
            # inserted a space ("12 A"), which getLoHiNum() cannot read as
            # a number suffix.
            my $numSub = "$bnumber$rawSubLine";
            InsertNumber($numSub);
        }
        else
        {
            $fmtSubBuilding = $rawSubLine;
            InsertNumber($bnumber);
        }
    }
    # --------------------------------------------------------------------------
    # Formatting Rule 6 (Sub-building name and building name)
    # --------------------------------------------------------------------------
    elsif ( $formatKey eq "YYN" )
    {
        $ruleid = "6";
        my ($N1, $N2);      # numbers extracted from sub-building / building
        ($fmtSubBuilding, $N1) = split /\|/ , F1Check($rawSubLine) , -1 ;
        ($fmtBuilding,    $N2) = split /\|/ , F1Check($rawBldLine) , -1 ;
        if ( $fmtSubBuilding eq "" )
        {
            # The whole sub-building line was consumed as a number.
            if ( $N2 =~ /^REAR OF/ )
            {
                # "REAR OF nn" is kept as text, not inserted as a number.
                $fmtSubBuilding .= "$N1 $N2" ;
                $N1 = "" ;
                $N2 = "" ;
            }
            else
            {
                # Fold the sub-building number into the building name line.
                $fmtSubBuilding .= "$N1 " . $fmtBuilding ;
                $fmtBuilding = "" ;
                $N1 = "" ;
            }
        }
        # Fall back to the sub-building number when the building had none.
        $N2 = $N1 if $N2 eq "" ;
        InsertNumber($N2);
    }
    # --------------------------------------------------------------------------
    # Formatting Rule 7 (Sub-building name, building name and building number)
    # --------------------------------------------------------------------------
    elsif ( $formatKey eq "YYY" )
    {
        $ruleid = "7";
        my ($N1, $N2);      # $N2 is extracted but not used by this rule
        ($fmtSubBuilding, $N1) = split /\|/ , F1Check($rawSubLine) , -1 ;
        ($fmtBuilding,    $N2) = split /\|/ , F1Check($rawBldLine) , -1 ;
        if ( $fmtSubBuilding eq "" )
        {
            $fmtSubBuilding .= "$N1 " . $fmtBuilding ;
            $fmtBuilding = "" ;
        }
        InsertNumber($bnumber);
    }

    # ##########################################################################
    # Format the address
    # ##########################################################################
    #----------------------------------------------------------------
    # subbuilding = building if no subbuilding (why?)
    #----------------------------------------------------------------
    if ( length($fmtBuilding) > 0 && $fmtSubBuilding eq "" )
    {
        $fmtSubBuilding = $fmtBuilding;
        $fmtBuilding    = "";
    }
    #----------------------------------------------------------------
    # Get rid of duplicate lines
    #----------------------------------------------------------------
    if ( "${fmtSubBuilding}${fmtBuilding}" eq "${fmtSteet}${fmtSteetEnd}" )
    {
        $fmtSubBuilding = "";
        $fmtBuilding    = "";
    }
    #----------------------------------------------------------------
    # Parse out number ranges (including number suffixes)
    #----------------------------------------------------------------
    my ($fmtStreetName, $lo_num, $low_suf, $hi_num, $hi_suf)
        = split /\|/ , getLoHiNum($fmtSteet) , -1 ;

    my $dependentTfare = $fmtDepSteet;
    $dependentTfare .= " $fmtDepSteetEnd" if length($fmtDepSteetEnd) > 0;

    # ABERDEEN CITY COUNCIL|EDUCATION DEPARTMENT||ST NICHOLAS HOUSE||||BROAD|STREET|||ABERDEEN|ABERDEENSHIRE|AB10|1AG|1A|0|01901355|01901355|L||
    my $fmtaddr = "$postcode|$dps|$fmtaKey|$fmtoKey|$fmtPoBox|$fmtSubBuilding|$fmtBuilding|$fmtDepSteet|$fmtDepSteetEnd|" ;
    $fmtaddr .= "$fmtSteet|$fmtSteetEnd|$fmtDepLocality|$fmtLocality|$fmtPostTown|$opc|$ipc|" ;
    $fmtaddr .= "$ruleid|$lo_num|$low_suf|$hi_num|$hi_suf|$dependentTfare|$fmtStreetName";
    print "$fmtaddr\n";
}
close PAF;
# # ############################################################################
# SUB F1Check (PAF Digest Note:1 Page 42)
#
# Try to extract a number embedded in a (sub)building name.
# There are loads of exceptions to the rules in the PAF digest,
# hence the horrible regexes.
#
# Argument : the (sub)building name line.
# Returns  : the string "name|number".  Either part may be empty:
#              - "REAR OF ... <digit>" names are returned whole as the number;
#              - names without a digit, or matching the exclusion list below,
#                are returned unchanged with an empty number.
# # ############################################################################
sub F1Check
{
    my ($building) = @_;
    my $bnumber = "";

    if ( $building =~ m/^REAR OF.+\d/ )
    {
        # The whole line is treated as the "number" part.
        $bnumber  = $building;
        $building = "";
    }
    # Only names that contain a digit and are NOT on the exclusion list are
    # candidates for number extraction.  The list is written under /x, so
    # literal whitespace in the pattern is ignored and any real space must
    # be escaped (see MOBILE\ HOME below).
    elsif ( $building =~ m/\d+/
        && $building !~ m/UNIT[S]?\s|
                          FLAT[S]?\s|
                          ^DP\d+[A-Z]?$|
                          BLOCK[S]?\s|
                          ^OFFICE[S]?\s|
                          HANG[EA]R\s|
                          ^BUILDING[S]?\s|
                          ^HOUSE\s|
                          HOLDING\s|
                          APARTMENT\s|
                          CHALET\s|
                          ^BUNGALOW\s|
                          ^CARAVAN\s|
                          ^ANNEXE\s|
                          ^PLOT[S]?\s|
                          ^HOME[S]?\s|
                          ^LODGE\s\d|
                          ^PLATFORM\s|
                          ^DOMUS\s|
                          ^BARLOW\s|
                          ^BEALAH\s|
                          ^KIOSK[S]?\s|
                          ^LEVEL[S]?\s|
                          ^VILLA\s|
                          MEOTA\s|
                          MAXI[NM]\s|
                          ^AQUEOUS\s|
                          SUITE[S]?\s|
                          WING\s|
                          ^CAMPUS\s|
                          ^STUDIO\s|
                          ^COTTAGE\s|
                          ^STALL\s|
                          ^SHOP\s|
                          ^ARCH\s|
                          ^QUAY\s|
                          ^ABOVE\s\d|
                          ^LINK\s|
                          JETTY\s|
                          WAREHOUSE\s|
                          ^HOLDING\s|
                          ^PENTHOUSE\s|
                          ^MOORING\s|
                          ^BOTHY\s|
                          ^MAISONETTE\s|
                          ^SITE\s|
                          ^WORKSHOP\s|
                          ^BARN[S]?\s|
                          STALL\s|
                          ^BOAT\s|
                          ^STAND\s|
                          ^TOWER\s\d|
                          ^YARD\s\d|
                          ^STANCE\s|
                          ^VAN\s|
                          ^BAY\s\d|
                          ^MOBILE\ HOME\s|
                          ^STABLE\s|
                          ^ROOM\s|
                          ^[A-Z]\d+$
                         /x )
    {
        # Case 1: the whole line is a number, a number range, or one letter.
        if ( $building =~ m/^([A-Z]?\d+[A-Z]{0,2}|
                             \d+[A-Z]{0,2}[\-\&\ \\\/]{0,1}\d+[A-Z]{0,2}|
                             [A-Z])$/x )
        {
            $bnumber  = $building;
            $building = "";
        }
        # Case 2: a name followed by a trailing number or range.
        elsif ( $building =~ m/^([A-Z]?\d*\s*[A-Z\s\.]+)(\d+[A-Z]{0,2}[\-\&\ \\\/]{0,1}\d*[A-Z]{0,2})$/ )
        {
            # Only split when the name part is longer than two characters;
            # otherwise the line is left untouched.
            if ( length($1) > 2 )
            {
                $bnumber  = $2;
                $building = $1;
                $building =~ s/\s+$//g;
            }
        }
        # Case 3: non-digit text followed by an explicit number range.
        elsif ( $building =~ m/^(\D+)(\d+[A-Z]{0,2}[\-\&\ \\\/]\d+[A-Z]{0,2})$/ )
        {
            $bnumber  = $2;
            $building = $1;
            $building =~ s/\s+$//g;
        }
    }
    return "$building|$bnumber";
}
# # ############################################################################
# SUB InsertNumber
#
# Prepend a premise number to the first non-blank address line, trying in
# order: dependent thoroughfare, thoroughfare, dependent locality.  If all
# three are blank the number goes in front of the locality.
#
# Works on the package globals $fmtDepSteet, $fmtSteet, $fmtDepLocality and
# $fmtLocality.  Does nothing unless the argument contains a digit and is not
# the string "0".
# # ############################################################################
sub InsertNumber
{
    my ($N) = @_;

    # Guard: only real numbers are inserted.
    return unless $N =~ /\d/ && $N ne "0";

    # First populated candidate line wins.
    for my $line_ref ( \$fmtDepSteet, \$fmtSteet, \$fmtDepLocality )
    {
        next unless length( ${$line_ref} ) > 0;
        ${$line_ref} = $N . " " . ${$line_ref};
        return;
    }

    # Nothing else was populated: fall back to the locality.
    $fmtLocality = $N . " " . $fmtLocality;
    return;
}
# # ############################################################################
# SUB getLoHiNum
#
# Extract a leading premise number or number range from a thoroughfare line.
#
# Argument : thoroughfare text, e.g. "12A-14B HIGH STREET".
# Returns  : "street|low|low_suffix|high|high_suffix", e.g.
#            "HIGH STREET|12|A|14|B".  When the line does not start with a
#            number followed by whitespace, the text is returned unchanged
#            and the four number fields are empty.
#
# A leading token with a hyphen that does not parse as "<n>-<n>" (for
# example "12-") is still stripped from the street, and both numbers come
# back empty.
# # ############################################################################
sub getLoHiNum
{
    my ($tfare) = @_;
    my ( $low, $low_suffix, $high, $high_suffix ) = ( "", "", "", "" );

    # Split "<digits><letter>" into (digits, letter).  A value that does not
    # end in a letter is handed back untouched with an empty suffix.
    my $peel_suffix = sub {
        my ($value) = @_;
        return ( $value, "" ) unless defined $value && $value =~ /[A-Z]$/;
        return $value =~ /(\d+)([A-Z])$/;
    };

    # Leading number (optionally a range), whitespace, then the rest.
    if ( my ( $number, $remainder ) = $tfare =~ /^(\d+[A-Z]?[\-]?\d*[A-Z]?)\s(.*)$/ )
    {
        $tfare = $remainder;

        if ( index( $number, "-" ) >= 0 )
        {
            ( $low, $high ) = $number =~ /(\d+.*)[\-](\d+.*)/;
        }
        else
        {
            $low = $number;
        }

        ( $low,  $low_suffix )  = $peel_suffix->($low);
        ( $high, $high_suffix ) = $peel_suffix->($high);
    }

    return join "|", $tfare, $low, $low_suffix, $high, $high_suffix;
}
# # ############################################################################
# SUB removePunch
#
# Remove punctuation from a string.
#
# Arguments: the text to clean, and an optional string of flag letters that
#            selects what to strip:
#              d - delete dots
#              h - replace hyphens with a space
#              q - delete single quotes
#              a - delete at-signs
#            All four are applied when no flags are given.
# Returns  : the cleaned text.
# # ############################################################################
sub removePunch
{
    my ( $dirtyWord, $punch ) = @_;

    # No flags supplied: apply every rule.
    $punch = "dhqa" unless length($punch);

    $dirtyWord =~ tr/.//d  if index( $punch, "d" ) >= 0;
    $dirtyWord =~ tr/-/ /  if index( $punch, "h" ) >= 0;
    $dirtyWord =~ tr/'//d  if index( $punch, "q" ) >= 0;
    $dirtyWord =~ tr/@//d  if index( $punch, "a" ) >= 0;

    return $dirtyWord;
}
while()#从mysql视图读取-您可以从split/\t填充的字段中计算出来/
{
咀嚼;
$incount++;
如果$incount%1000000==0,则打印STDERR“已处理…”$incount\n;
我的($postcode,$aKey,$lkey,$skey,
$sekey,$dskey,$dsekey,$B编号,
$bkey,$sbkey,$HOMENTS,$oKey,
$fmtPcType、$concat、$dps、$fmtsUser、,
$poBox、$version、$active、,
$FMT公司,$FMT部门,$rawSubLine,$rawBldLine,
$DepSteet、$fmtDepSteetEnd、$Steet、$fmtSteetEnd、,
$DepLocation,$Locality,$PostTown)
=split/\t/,$\u1;
my$fmtaKey=sprintf(“%08s”,$aKey);
my$fmtoKey=sprintf(“%08s”,$oKey);
$fmtoKey=”“如果$fmtoKey eq“00000000”;
$fmtoKey=$fmtaKey,如果$fmtPcType eq“L”;
$fmtDepLocality=removePunch($depplocality);
$fmtLocality=removePunch($Locality,“dq”);
$fmtDepSteet=removePunch($DepSteet);
$fmtSteet=移除冲压($Steet,“dh”);
$rawSubLine=removePunch($rawSubLine,“d”);
$rawBldLine=removePunch($rawBldLine,“d”);
my$fmtPostTown=removePunch($PostTown);
my($opc,$ipc)=解包(“A4A3”,美元邮政编码);
$opc=~s/\s$/;
my$fmtPoBox=“PO BOX$poBox”如果(长度($poBox)>0);
我的($fmtsubuilding,$fmtBuilding)=(“”,“”);
# ##############################################################################
#格式规则基于建筑物名称、子建筑物和编号的存在
#有关详细信息,请参阅PAF摘要
# ##############################################################################
我的($subBuildingFlag,$buildingFlag,$buildingNumberFlag)=(“N”、“N”、“N”);
$buildingNumberFlag=“Y”如果$B编号为“0”;
$buildingFlag=“Y”如果长度($rawBldLine)>0;
$subBuildingFlag=“Y”如果长度($rawSubLine)>0;
my$formatKey=“${subBuildingFlag}${buildingFlag}${buildingNumberFlag}”;
我的$ruleid;
# ----------------------------------------------------------------------------
#表单匹配规则1(仅限组织名称)
# ----------------------------------------------------------------------------
如果($formatKey eq“NNN”)
{
$ruleid=“1”;
}
# ----------------------------------------------------------------------------
#格式匹配规则2(仅限建筑编号)
# ----------------------------------------------------------------------------
elsif($formatKey eq“NNY”)
{
$ruleid=“2”;
插入编号($b编号);
}
# ----------------------------------------------------------------------------
#格式匹配规则3(仅建筑物名称)
# ----------------------------------------------------------------------------
elsif($formatKey eq“NYN”)
{
$ruleid=“3”;
我的$N=“”;
($fmtBuilding,$N)=拆分/\\\\;/,F1Check($rawBldLine),-1;
插入编号($N);
}
# ----------------------------------------------------------------------------
#格式匹配规则4(建筑物名称和建筑物编号)
# ----------------------------------------------------------------------------
elsif($formatKey eq“NYY”)
{
$ruleid=“4”;
$fmtBuilding=“$rawBldLine”;
插入编号($b编号);
}
# ----------------------------------------------------------------------------
#格式匹配规则5(子建筑物名称和建筑物编号)
# ----------------------------------------------------------------------------
elsif($formatKey eq“YNY”)
{
$ruleid=“5”;
如果($concat eq“Y”)
{
my$numSub=“$bnumber$rawSubLine”;
插入编号($numSub);
}
其他的
{
$fmtSubBuilding=“$rawSubLine”;
插入编号($b编号);
}
}
# ----------------------------------------------------------------------------
#格式匹配规则6(子建筑物名称和建筑物名称)
# ----------------------------------------------------------------------------
elsif($formatKey eq“YYN”)
{
$ruleid=“6”;
($fmtSubBuilding,$N1)=拆分/\\|/,F1Check($rawSubLine),-1;
($fmtBuilding,$N2)=拆分/\\\\;/,F1Check($rawBldLine),-1;
如果($fmtSubBuilding eq“”)
{
如果($N2=~/^后面/)
{
$fmtSubBuilding.=“$N1$N