在SQL Server中使用Royal Mail PAF文件

在SQL Server中使用Royal Mail PAF文件,sql,tsql,Sql,Tsql,我被赋予了一项令人沮丧的任务,试图对我们刚刚购买的皇家邮政PAF文件进行处理,现在我已将所有相关表加载到我们的SQL server中,并根据相关ID将它们链接在一起。我现在需要做的是根据多个规则将适当的地址放在一起,例如,如果只提供了建筑物名称或编号,或者如果提供了特定邮政编码时提供了邮政信箱。我希望,因为这似乎是一个广泛使用的文件,有人可能已经做了一些这方面的工作,任何人的想法,因为他们希望在本周末它 希望有人能帮忙,p心情沮丧-这是一件大事-有很多规则,也有很多例外。 我想你已经看过PAF程

我接到了一项令人头疼的任务:处理我们刚刚购买的皇家邮政(Royal Mail)PAF文件。目前我已将所有相关表加载到我们的SQL Server中,并根据相关ID将它们关联起来。接下来需要根据多条规则拼出正确的地址,例如只提供了建筑物名称或只提供了建筑物编号的情况,或者某个邮政编码对应的是邮政信箱(PO Box)的情况。由于这似乎是一个被广泛使用的文件,我希望已经有人做过这方面的工作。大家有什么想法吗?因为他们希望在本周末之前完成。


希望有人能帮忙,p

我理解你的沮丧——这是个大工程,有很多规则,也有很多例外。我想你已经看过《PAF程序员指南》中的所有规则了吧?如果你想在周末之前完成,最好购买“压缩标准”(Compressed Standard)版本,它实际上就是你想要的已展开的地址,当然它不是规范化的数据库版本。很久以前,我曾根据关系格式的数据来构建地址。我看看能不能找到那段代码。

找到了一些8年前写的、比较难看的Perl代码——请谨慎使用:

    # Main loop: read one tab-separated address record per line from the PAF
    # filehandle, apply the PAF building/sub-building formatting rules and
    # print one pipe-delimited formatted address per record on STDOUT.
    #
    # NOTE(review): the PAF handle and the $incount counter are set up outside
    # this excerpt.  $fmtDepSteet, $fmtSteet, $fmtDepLocality and $fmtLocality
    # are deliberately NOT declared with "my" here: InsertNumber() updates them
    # by name, so they must stay visible to that sub.
    while ( <PAF> )  # Reading from mysql view - you can work this out from the fields populated by the split /\t/
    {
      chomp;
      $incount++;
      print STDERR "Processed ..... $incount\n" if $incount%1000000 == 0;

      # Positional fields of the input record; several are unused below but
      # are kept so the list documents the column order.  The -1 limit keeps
      # trailing empty columns.
      my ( $postcode       , $aKey           , $lkey        , $skey         , 
           $sekey          , $dskey          , $dsekey      , $bnumber      ,
           $bkey           , $sbkey          , $households  , $oKey         ,
           $fmtPcType      , $concat         , $dps         , $fmtsUser     ,
           $poBox          , $version        , $active      , 
           $fmtCompany     , $fmtDepartment  , $rawSubLine  , $rawBldLine   , 
           $DepSteet       , $fmtDepSteetEnd , $Steet       , $fmtSteetEnd  , 
           $DepLocality    , $Locality       , $PostTown )
           = split /\t/ , $_ , -1 ;

      # Keys are zero-padded to 8 characters.
      my $fmtaKey  = sprintf("%08s",$aKey);
      my $fmtoKey  = sprintf("%08s",$oKey);

      # An all-zero organisation key means "none"; postcode type "L" reuses
      # the address key as the organisation key.
      $fmtoKey = "" if $fmtoKey eq "00000000" ;
      $fmtoKey = $fmtaKey if $fmtPcType eq "L"; 

      # Strip punctuation; which characters go depends on the element.
      $fmtDepLocality       = removePunch($DepLocality);
      $fmtLocality          = removePunch($Locality,"dq");
      $fmtDepSteet          = removePunch($DepSteet);
      $fmtSteet             = removePunch($Steet,"dh");
      $rawSubLine           = removePunch($rawSubLine,"d");
      $rawBldLine           = removePunch($rawBldLine,"d");
      my $fmtPostTown       = removePunch($PostTown);

      # Split the postcode into a 4-character and a 3-character part.
      my ($opc,$ipc) = unpack("A4A3" ,$postcode);

      $opc =~ s/\s$//;

      # Was "my $fmtPoBox = ... if (...)": a "my" combined with a statement
      # modifier has undefined behaviour (perlsyn), so always initialise.
      my $fmtPoBox = ( defined $poBox && length($poBox) > 0 ) ? "PO BOX $poBox" : "";

      my ($fmtSubBuilding , $fmtBuilding ) = ( "","") ;

      # ##############################################################################
      # Format rules are based on presence of building name, sub-building and number
      # See PAF Digest for details
      # ##############################################################################

      my ($subBuildingFlag,$buildingFlag,$buildingNumberFlag) = ("N","N","N") ;

      $buildingNumberFlag ="Y" if $bnumber ne "0" ;
      $buildingFlag       ="Y" if length($rawBldLine) > 0;
      $subBuildingFlag    ="Y" if length($rawSubLine) > 0;

      # Three-letter key, in the order sub-building / building name / number.
      my $formatKey="${subBuildingFlag}${buildingFlag}${buildingNumberFlag}";

      # Stays empty when no rule matches.
      # NOTE(review): the key "YNN" (sub-building only) has no branch below,
      # so such records get an empty rule id and the sub-building text is not
      # emitted - confirm that this is intended.
      my $ruleid = "";

      # ----------------------------------------------------------------------------
      # Formatting Rule 1 (Org name only)
      # ----------------------------------------------------------------------------

      if ( $formatKey eq "NNN" )
      {
          $ruleid="1";
      }

      # ----------------------------------------------------------------------------
      # Formatting Rule 2 (Building number only)
      # ----------------------------------------------------------------------------

      elsif ( $formatKey eq "NNY" )
      {
        $ruleid="2";

        InsertNumber ($bnumber) ;
      }

      # ----------------------------------------------------------------------------
      # Formatting Rule 3 (Building Name only)
      # ----------------------------------------------------------------------------

      elsif ($formatKey eq "NYN")
      {
        $ruleid="3";
        my $N="";

        # The building name may carry an embedded number; split it off.
        ($fmtBuilding,$N) = split /\|/ , F1Check ($rawBldLine) , -1 ;

        InsertNumber ($N) ;
      }

      # ----------------------------------------------------------------------------
      # Formatting Rule 4 (Building Name and building Number)
      # ----------------------------------------------------------------------------

      elsif ($formatKey eq "NYY")
      {
        $ruleid="4";
        $fmtBuilding  = "$rawBldLine";
        InsertNumber ($bnumber) ;
      }

      # ----------------------------------------------------------------------------
      # Formatting Rule 5 (SubBuilding Name and building Number)
      # ----------------------------------------------------------------------------

      elsif ($formatKey eq "YNY")
      {
        $ruleid="5";

        if ($concat eq "Y")
        {
          # Concatenation flag set: number and sub-building go in together.
          my $numSub  = "$bnumber $rawSubLine";
          InsertNumber ($numSub) ;
        }
        else
        {
          $fmtSubBuilding = "$rawSubLine";
          InsertNumber ($bnumber) ;
        }

      }

      # ----------------------------------------------------------------------------
      # Formatting Rule 6 (SubBuilding Name and building name)
      # ----------------------------------------------------------------------------

      elsif ($formatKey eq "YYN" )
      {
        $ruleid="6";

        # Lexical now; these used to be undeclared globals that survived
        # from one record to the next.
        my ($N1, $N2);
        ($fmtSubBuilding,$N1) = split /\|/ , F1Check ($rawSubLine) , -1 ;
        ($fmtBuilding,$N2) = split /\|/ , F1Check ($rawBldLine) , -1 ;

        # The sub-building was nothing but a number.
        if ( $fmtSubBuilding eq "" )
        {
          if ( $N2 =~ /^REAR OF/ )
          {
            $fmtSubBuilding .= "$N1 $N2" ;
            $N1 = "" ;
            $N2 = "" ;
          }
          else
          {
            $fmtSubBuilding .= "$N1 " . $fmtBuilding;
            $fmtBuilding = "";
            $N1 = "" ;
          }
        }

        # Fall back to the sub-building number when the building had none.
        $N2 = $N1 if $N2 eq "";
        InsertNumber ($N2) ;
      }

      # ----------------------------------------------------------------------------
      # Formatting Rule 7 (SubBuilding Name, building name and building number)
      # ----------------------------------------------------------------------------

      elsif ($formatKey eq "YYY" )
      {
        $ruleid="7";

        # $N2 is split off for symmetry with rule 6 but is not used here.
        my ($N1, $N2);
        ($fmtSubBuilding,$N1) = split /\|/ , F1Check ($rawSubLine) , -1 ;
        ($fmtBuilding,$N2) = split /\|/ , F1Check ($rawBldLine) , -1 ;

        if ( $fmtSubBuilding eq "" )
        {
          $fmtSubBuilding .= "$N1 " . $fmtBuilding;
          $fmtBuilding = "";
        }

        InsertNumber ($bnumber) ;
      }


      # ##############################################################################
      # Format the address
      # ##############################################################################

      #----------------------------------------------------------------
      # subbuilding = building if no subbuilding (why?)
      #----------------------------------------------------------------

      if (( length($fmtBuilding) > 0) &&  ($fmtSubBuilding eq "" ))
      {
        $fmtSubBuilding = $fmtBuilding;
        $fmtBuilding = "";
      }

      #----------------------------------------------------------------
      # Get rid of duplicate lines
      #----------------------------------------------------------------

      if ("${fmtSubBuilding}${fmtBuilding}" eq "${fmtSteet}${fmtSteetEnd}") 
      {
        $fmtSubBuilding="";
        $fmtBuilding=""; 
      }

      #----------------------------------------------------------------
      # Parse out number ranges (including number suffixes)
      #----------------------------------------------------------------  

      my ($fmtStreetName ,$lo_num , $low_suf , $hi_num ,$hi_suf) = split /\|/ ,  getLoHiNum ("$fmtSteet") , -1 ;
      my $dependentTfare = "$fmtDepSteet";
      $dependentTfare .= " $fmtDepSteetEnd" if length($fmtDepSteetEnd) > 0;


    # NOTE(review): the sample below does not follow the field order produced
    # by the join that follows; it looks like an older layout.
    # ABERDEEN CITY COUNCIL|EDUCATION DEPARTMENT||ST NICHOLAS HOUSE||||BROAD|STREET|||ABERDEEN|ABERDEENSHIRE|AB10|1AG|1A|0|01901355|01901355|L||

      # 23 pipe-delimited output fields (was built with "my $fmtaddr .= ...").
      my $fmtaddr = join "|" ,
           $postcode       , $dps            , $fmtaKey        , $fmtoKey     ,
           $fmtPoBox       , $fmtSubBuilding , $fmtBuilding    , $fmtDepSteet ,
           $fmtDepSteetEnd , $fmtSteet       , $fmtSteetEnd    , $fmtDepLocality ,
           $fmtLocality    , $fmtPostTown    , $opc            , $ipc         ,
           $ruleid         , $lo_num         , $low_suf        , $hi_num      ,
           $hi_suf         , $dependentTfare , $fmtStreetName ;

      print "$fmtaddr\n";
    }

    close PAF;

    # # ############################################################################
    # SUB F1Check (PAF Digest Note:1 Page 42)
    #
    # Try to extract a number embedded in the (sub)building name.
    # There are loads of exceptions to the rules in the PAF digest,
    # hence the horrible regexes.
    #
    # Argument:
    #   $building - the (sub)building name text
    #
    # Returns a single string "NAME|NUMBER".  NUMBER is empty when nothing was
    # split off; NAME is empty when the whole input was treated as a number.
    # Callers split the result on "|".
    # # ############################################################################

    sub F1Check
    {

      my ($building) = (@_);

      # Local to this sub; holds whatever gets split off as the number part.
      my $bnumber = "";

     # "REAR OF ..." followed somewhere by a digit: the whole text goes into
     # the number slot and the name is emptied.
     if ($building =~ m/^REAR OF.+\d/ )
     {
        $bnumber=$building;
        $building="" ;
     }

     # Otherwise only try to split when the text contains a digit AND does not
     # match any of the exception patterns below (UNIT, FLAT, BLOCK, ...,
     # a bare "DP<digits>" code, or a single letter followed only by digits).
     # Matching any exception leaves the text unsplit.
     elsif (($building =~ m/\d+/) && ($building !~ /UNIT[S]?\s|
                                                  FLAT[S]?\s|
                                                  ^DP\d+[A-Z]?$|
                                                  BLOCK[S]?\s|
                                                  ^OFFICE[S]?\s|
                                                  HANG[EA]R\s|
                                                  ^BUILDING[S]?\s|
                                                  ^HOUSE\s|
                                                  HOLDING\s|
                                                  APARTMENT\s|
                                                  CHALET\s|
                                                  ^BUNGALOW\s|
                                                  ^CARAVAN\s|
                                                  ^ANNEXE\s|
                                                  ^PLOT[S]?\s|
                                                  ^HOME[S]?\s|
                                                  ^LODGE\s\d|
                                                  ^PLATFORM\s|
                                                  ^DOMUS\s|
                                                  ^BARLOW\s|
                                                  ^BEALAH\s|
                                                  ^KIOSK[S]?\s|
                                                  ^LEVEL[S]?\s|
                                                  ^VILLA\s|
                                                  MEOTA\s|
                                                  MAXI[NM]\s|
                                                  ^AQUEOUS\s|
                                                  SUITE[S]?\s|
                                                  WING\s|
                                                  ^CAMPUS\s|
                                                  ^STUDIO\s|
                                                  ^COTTAGE\s|
                                                  ^STALL\s|
                                                  ^SHOP\s|
                                                  ^ARCH\s|
                                                  ^QUAY\s|
                                                  ^ABOVE\s\d|
                                                  ^LINK\s|
                                                  JETTY\s|
                                                  WAREHOUSE\s|
                                                  ^HOLDING\s|
                                                  ^PENTHOUSE\s|
                                                  ^MOORING\s|
                                                  ^BOTHY\s|
                                                  ^MAISONETTE\s|
                                                  ^SITE\s|
                                                  ^WORKSHOP\s|
                                                  ^BARN[S]?\s|
                                                  STALL\s|
                                                  ^BOAT\s|
                                                  ^STAND\s|
                                                  ^TOWER\s\d|
                                                  ^YARD\s\d|
                                                  ^STANCE\s|
                                                  ^VAN\s|
                                                  ^BAY\s\d|
                                                  ^MOBILE HOME\s|
                                                  ^STABLE\s|
                                                  ^ROOM\s|
                                                  ^[A-Z]\d+$
                                                  /xo
                                                  ))
      {

        # Case 1: the entire text is number-like - a number with optional
        # leading letter and up to two trailing letters, or two such numbers
        # with at most one separator (- & space \ /) between them.
        # Everything moves to the number slot.
        if  ( $building =~ m/^([A-Z]?\d+[A-Z]{0,2}|
                            \d+[A-Z]{0,2}[\-\&\ \\\/]{0,1}\d+[A-Z]{0,2}|
                            [A-Z])$/ox )
        {
          $bnumber=$building;
          $building="" ;
        }
        # Case 2: a name (letters, whitespace and dots, optionally led by a
        # letter and/or digits) followed by a trailing number or range.
        # Note this pattern has no /x, so the whitespace in it is literal.
        elsif  ($building =~ m/^([A-Z]?\d*\s*[A-Z\s\.]+)(\d+[A-Z]{0,2}[\-\&\ \\\/]{0,1}\d*[A-Z]{0,2})$/o)
        {
          # Only split when the name part is longer than two characters;
          # otherwise the text is left as it is (case 3 is not tried).
          if (length($1) > 2)
          {
            $bnumber=$2;
            $building=$1;
            $building =~ s/\s+$//g;
          }
        }
       # Case 3: any non-digit prefix followed by a number range whose two
       # numbers are joined by exactly one separator character.
       elsif   ($building =~ m/^(\D+)(\d+[A-Z]{0,2}[\-\&\ \\\/]\d+[A-Z]{0,2})$/)
       {
        $bnumber=$2;
        $building=$1;
        $building =~ s/\s+$//g;
       }  
     }   
     # Name and number are handed back joined by "|".
     return "$building|$bnumber"
    }

    # # ############################################################################
    # SUB InsertNumber
    #
    # Prefix a premise number to the first non-blank address line, searching in
    # this order: dependent thoroughfare, thoroughfare, dependent locality.
    # When all three are blank the number is put in front of the locality.
    #
    # Argument:
    #   $N - the number text; ignored unless it contains a digit and is not "0"
    #
    # Works by side effect on the file-level variables $fmtDepSteet, $fmtSteet,
    # $fmtDepLocality and $fmtLocality; callers do not use the return value.
    # # ############################################################################

    sub InsertNumber
    {
        my ($N) = @_;

        # Nothing usable to insert.
        return unless $N =~ /\d/ && $N ne "0";

        # First populated line, in priority order, takes the number.
        for my $line ( \$fmtDepSteet, \$fmtSteet, \$fmtDepLocality )
        {
            next unless length($$line) > 0;
            $$line = "$N $$line";
            return;
        }

        # Every preferred line was blank: fall back to the locality.
        $fmtLocality = "$N $fmtLocality";
        return;
    }

    # # ############################################################################
    # SUB getLoHiNum
    #
    # Extract a leading premise number or number range from a thoroughfare.
    #
    # Argument:
    #   $tfare - thoroughfare text, e.g. "12A-14B HIGH STREET"
    #
    # Returns one string "STREET|LONUM|LOSUF|HINUM|HISUF":
    #   STREET - the thoroughfare with the leading number removed
    #   LONUM  - single number, or low end of a range
    #   LOSUF  - single trailing capital letter of LONUM, if any
    #   HINUM  - high end of a range, else empty
    #   HISUF  - single trailing capital letter of HINUM, if any
    # Unused parts are empty strings.  When the text does not start with a
    # number that can be parsed, it is returned unchanged in STREET with all
    # four number fields empty.
    # # ############################################################################

    sub getLoHiNum
    {

      my ($tfare) = @_;
      $tfare = "" unless defined $tfare;

      my ($lonum, $losuf, $hinum, $hisuf) = ("", "", "", "");

      # Leading token: digits, optional letter, optional hyphen, optional
      # digits, optional letter - then one whitespace and the street name.
      if ( my ($num, $street) = $tfare =~ /^(\d+[A-Z]?[\-]?\d*[A-Z]?)\s(.*)$/ )
      {
        if ( $num =~ m/[\-]{1}/ )
        {
          # A hyphen only makes a range when a number follows it.  Tokens
          # such as "12-" or "12-A" used to lose the number altogether; leave
          # the thoroughfare untouched instead.
          return "$tfare||||" unless $num =~ /(\d+.*)[\-](\d+.*)/;
          ($lonum, $hinum) = ($1, $2);
        }
        else
        {
          $lonum = $num;
        }

        $tfare = $street;

        # Peel a trailing letter off each number.  A value that does not end
        # in digit+letter (e.g. "12AB") is kept whole rather than wiped.
        ($lonum, $losuf) = ($1, $2) if $lonum =~ /(\d+)([A-Z])$/;
        ($hinum, $hisuf) = ($1, $2) if $hinum =~ /(\d+)([A-Z])$/;
      }

      return "$tfare|$lonum|$losuf|$hinum|$hisuf" ;
    }


    # # ############################################################################
    # SUB removePunch
    #
    # Remove selected punctuation from an address element.
    #
    # Arguments:
    #   $dirtyWord - text to clean; undef is handed back unchanged
    #   $punch     - optional string of flag letters selecting what to strip:
    #                  d = delete full stops        (.)
    #                  h = turn hyphens into spaces (-)
    #                  q = delete apostrophes       (')
    #                  a = delete at-signs          (@)
    #                missing or empty means "dhqa" (all of them)
    #
    # Returns the cleaned copy; the caller's variable is not modified.
    # # ############################################################################

    sub removePunch
    {

      my ($dirtyWord, $punch) = @_;

      # Nothing to clean, and s/// on undef would only raise warnings.
      return $dirtyWord unless defined $dirtyWord;

      # Check definedness explicitly: the old "length($punch) == 0" compared
      # undef with 0 for the one-argument call, which warns on modern perls.
      $punch = "dhqa" unless defined $punch && length $punch;

      $dirtyWord =~ s/\.//g     if $punch =~ m/d/ ;
      $dirtyWord =~ s/\-/ /g    if $punch =~ m/h/ ;
      $dirtyWord =~ s/\'//g     if $punch =~ m/q/ ;
      $dirtyWord =~ s/\@//g     if $punch =~ m/a/ ;

      return $dirtyWord;
    }
while()#从mysql视图读取-您可以从split/\t填充的字段中计算出来/
{
咀嚼;
$incount++;
如果$incount%1000000==0,则打印STDERR“已处理…”$incount\n;
我的($postcode,$aKey,$lkey,$skey,
$sekey,$dskey,$dsekey,$B编号,
$bkey,$sbkey,$HOMENTS,$oKey,
$fmtPcType、$concat、$dps、$fmtsUser、,
$poBox、$version、$active、,
$FMT公司,$FMT部门,$rawSubLine,$rawBldLine,
$DepSteet、$fmtDepSteetEnd、$Steet、$fmtSteetEnd、,
$DepLocation,$Locality,$PostTown)
=split/\t/,$\u1;
my$fmtaKey=sprintf(“%08s”,$aKey);
my$fmtoKey=sprintf(“%08s”,$oKey);
$fmtoKey=”“如果$fmtoKey eq“00000000”;
$fmtoKey=$fmtaKey,如果$fmtPcType eq“L”;
$fmtDepLocality=removePunch($depplocality);
$fmtLocality=removePunch($Locality,“dq”);
$fmtDepSteet=removePunch($DepSteet);
$fmtSteet=移除冲压($Steet,“dh”);
$rawSubLine=removePunch($rawSubLine,“d”);
$rawBldLine=removePunch($rawBldLine,“d”);
my$fmtPostTown=removePunch($PostTown);
my($opc,$ipc)=解包(“A4A3”,美元邮政编码);
$opc=~s/\s$/;
my$fmtPoBox=“PO BOX$poBox”如果(长度($poBox)>0);
我的($fmtsubuilding,$fmtBuilding)=(“”,“”);
# ##############################################################################
#格式规则基于建筑物名称、子建筑物和编号的存在
#有关详细信息,请参阅PAF摘要
# ##############################################################################
我的($subBuildingFlag,$buildingFlag,$buildingNumberFlag)=(“N”、“N”、“N”);
$buildingNumberFlag=“Y”如果$B编号为“0”;
$buildingFlag=“Y”如果长度($rawBldLine)>0;
$subBuildingFlag=“Y”如果长度($rawSubLine)>0;
my$formatKey=“${subBuildingFlag}${buildingFlag}${buildingNumberFlag}”;
我的$ruleid;
# ----------------------------------------------------------------------------
#表单匹配规则1(仅限组织名称)
# ----------------------------------------------------------------------------
如果($formatKey eq“NNN”)
{
$ruleid=“1”;
}
# ----------------------------------------------------------------------------
#格式匹配规则2(仅限建筑编号)
# ----------------------------------------------------------------------------
elsif($formatKey eq“NNY”)
{
$ruleid=“2”;
插入编号($b编号);
}
# ----------------------------------------------------------------------------
#格式匹配规则3(仅建筑物名称)
# ----------------------------------------------------------------------------
elsif($formatKey eq“NYN”)
{
$ruleid=“3”;
我的$N=“”;
($fmtBuilding,$N)=拆分/\\\\;/,F1Check($rawBldLine),-1;
插入编号($N);
}
# ----------------------------------------------------------------------------
#格式匹配规则4(建筑物名称和建筑物编号)
# ----------------------------------------------------------------------------
elsif($formatKey eq“NYY”)
{
$ruleid=“4”;
$fmtBuilding=“$rawBldLine”;
插入编号($b编号);
}
# ----------------------------------------------------------------------------
#格式匹配规则5(子建筑物名称和建筑物编号)
# ----------------------------------------------------------------------------
elsif($formatKey eq“YNY”)
{
$ruleid=“5”;
如果($concat eq“Y”)
{
my$numSub=“$bnumber$rawSubLine”;
插入编号($numSub);
}
其他的
{
$fmtSubBuilding=“$rawSubLine”;
插入编号($b编号);
}
}
# ----------------------------------------------------------------------------
#格式匹配规则6(子建筑物名称和建筑物名称)
# ----------------------------------------------------------------------------
elsif($formatKey eq“YYN”)
{
$ruleid=“6”;
($fmtSubBuilding,$N1)=拆分/\\|/,F1Check($rawSubLine),-1;
($fmtBuilding,$N2)=拆分/\\\\;/,F1Check($rawBldLine),-1;
如果($fmtSubBuilding eq“”)
{
如果($N2=~/^后面/)
{
$fmtSubBuilding.=“$N1$N