Warning: file_get_contents(/data/phpspider/zhask/data//catemap/2/unit-testing/4.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
SAS中的Jaro-Winkler字符串比较函数_Sas_Jaro Winkler - Fatal编程技术网

SAS中的Jaro-Winkler字符串比较函数

SAS中的Jaro-Winkler字符串比较函数,sas,jaro-winkler,Sas,Jaro Winkler,SAS中是否有字符串比较的实现 它看起来有Jaro Winkler,但我更喜欢自己调用函数的灵活性 谢谢 我不这么认为。它可以执行Levenshtein距离(complev函数)或广义编辑距离(compged),但我没有看到任何其他编辑距离函数 如果你死心塌地想在SAS中实现这一点,你可以在PROC IML中编写一个程序。据我所知,jaro winkler距离没有内置函数@伊齐已经提到了我所知道的唯一一个。不过,如果您愿意,您可以使用proc fcmp滚动自己的函数。我甚至会用下面的代码给你一个

SAS中是否有Jaro-Winkler字符串比较的实现?

它看起来有Jaro Winkler,但我更喜欢自己调用函数的灵活性


谢谢

我认为没有。SAS可以计算Levenshtein距离(complev函数)或广义编辑距离(
compged
函数),但我没有看到其他任何编辑距离函数。


如果你死心塌地想在SAS中实现这一点,你可以在
PROC IML

中编写一个程序。

据我所知,SAS没有计算Jaro-Winkler距离的内置函数;@伊齐 已经提到了我所知道的仅有的几个编辑距离函数。不过,如果您愿意,您可以使用
proc fcmp
编写自己的函数。下面的代码可以给你一个不错的起点。我只是参照维基百科上的文章写的;它绝不是Bill Winkler的strcmp.c文件的完美再现,而且可能有不少bug。

proc fcmp outlib=work.jaro.chars;

  subroutine jaromatch ( string1 $ , string2 $ , matchChars $);
    outargs matchChars;
    /* Collects, in string1 order, the non-blank characters of string1 that
       have a partner in string2, and returns them through matchChars.

       string1, string2 : strings to compare (leading/trailing blanks ignored)
       matchChars       : output; the caller must pass a character variable
                          long enough to hold the matches

       Two characters match when they are equal and their positions differ by
       no more than floor(max(|s1|, |s2|)/2)-1 (never less than 0). Each
       character of string2 can be paired at most once. */

    /* Left-align the local copies so that character positions agree with the
       stripped lengths computed below. */
    string1 = left(string1);
    string2 = left(string2);

    /* lengthn returns 0 for an all-blank string, so the loop is skipped. */
    str1_len = lengthn(string1);
    str2_len = lengthn(string2);

    /* Clamp at 0: for one-character strings the raw formula yields -1, which
       would make even identical strings produce no match. */
    allowedDist = max(0, floor(max(str1_len, str2_len)/2) - 1);

    matchChars="";

    /* walk through string 1 and match characters to string2 */
    do i = 1 to str1_len;
      x = substr(string1, i, 1);
      found = 0;
      /* scan only the part of string2 within allowedDist of position i */
      do j = max(1, i - allowedDist) to min(str2_len, i + allowedDist);
        if found = 0 and x ne ' ' and substr(string2, j, 1) = x then do;
          /* build list of matched characters */
          matchChars = cats(matchChars, x);
          /* Blank out the partner in the local copy so it cannot be paired
             again; blanks themselves are never matched (see x ne ' '). */
          substr(string2, j, 1) = ' ';
          found = 1;
        end;
      end;
    end;
    matchChars = strip(matchChars);
  endsub;


  function jarotrans (string1 $ , string2 $ );
    /* Compares the two strings position by position over the shorter
       stripped length and returns half the number of differing positions. */
    limit = min(length(strip(string1)), length(strip(string2)));
    diffs = 0;
    do pos = 1 to limit;
      if substr(string1, pos, 1) ne substr(string2, pos, 1) then diffs + 1;
    end;
    halved = diffs / 2;
    return(halved);
  endsub;

  function getPrefixlen( string1 $ , string2 $, maxprelen);
     /* Returns the number of leading characters that string1 and string2
        share, counting at most maxprelen characters.

        Returns 0 when the first characters differ or either string is blank,
        and the full compared length when every compared character matches. */
     n = min(maxprelen, lengthn(string1), lengthn(string2));
     do i = 1 to n;
       /* first mismatch at position i means i-1 characters were shared */
       if substr(string1,i,1) ne substr(string2,i,1)
       then return(i-1);
     end;
     /* no mismatch found: the whole compared prefix is shared */
     return(max(0, n));
  endsub;

  function jarodist(string1 $, string2 $);
    /* Returns the Jaro score of string1 and string2: the mean of
       (matches / |s1|), (matches / |s2|) and
       ((matches - transpositions) / matches). Returns 0 when no character
       of one string matches the other. */

    /* Receivers for the OUTARGS parameter of jaromatch.
       NOTE(review): 200 is an assumed upper bound on the input length;
       raise it if longer strings are compared. */
    length m1 m2 $ 200;

    /* get number of matched characters */
    call jaromatch(string1, string2, m1);
    /* lengthn (not length) so that an empty match list really yields 0 */
    m1_len = lengthn(m1);
    if m1_len = 0 then return(0);
    call jaromatch(string2, string1, m2);
    m2_len = lengthn(m2);
    if m2_len = 0 then return(0);

    /* get number of transposed characters */
    ntrans = jarotrans(m1, m2);

    /* Divide by the stripped lengths, the same lengths jaromatch works with.
       Both are non-zero here because at least one character matched. */
    j_dist =  (m1_len/lengthn(strip(string1))
             + m2_len/lengthn(strip(string2))
             + (m1_len-ntrans)/m1_len )  /  3;
    return(j_dist);
  endsub;

  function jarowink( string1 $, string2 $, prefixscale);
    /* Jaro-Winkler score: the jarodist() result raised by
       prefix length * prefixscale * (1 - jarodist result), where the prefix
       length comes from getPrefixlen() with a cap of 4. */
    base = jarodist(string1, string2);
    shared = getPrefixlen(string1, string2, 4);
    /* a non-zero shared prefix earns the bonus; otherwise return the base */
    if shared ne 0 then return(base + shared * prefixscale * (1-base));
    return(base);
  endsub;

run;quit;

/* Register the library that holds the compiled functions so the DATA step
   can resolve jarodist() and jarowink(). */
options cmplib=work.jaro;

/* Smoke test: score one pair of names and write both results to the log. */
data _null_;
  x = jarodist('DIXON', 'DICKSONX');
  y = jarowink('DIXON', 'DICKSONX', 0.1);
  put x= y=;
run;
proc fcmp outlib=work.jaro.chars;
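  subroutine jaromatch ( string1 $ , string2 $ , matchChars $);
    outargs matchChars;
    /* Returns number of matched characters between 2 strings excluding blanks */
    /* two chars from string1 and string2 are considered matching
       if they are no farther than floor(max(|s1|, |s2|)/2)-1 */
    str1_len = length(strip(string1));
    str2_len = length(strip(string2));
    allowedDist = floor(max(str1_len, str2_len)/2) -1;
    matchChars="";
    /* walk through string 1 and match characters to string2 */
    do i= 1 to str1_len;
      x=substr(string1,i,1);
      position = findc(string2,x ,max(1,i-allowedDist));
      if position > 0 then do;
          if position - i <= allowedDist then do;
    /* ...(代码片段在原网页中被截断) */

我修改并纠正了cmjohns的代码,感谢他/她让我有了起点。Winkler在以下论文中发表了一些示例:Winkler, W. E. (2006). "Overview of Record Linkage and Current Research Directions"(记录链接概述与当前研究方向). Research Report Series, RRS(参见表6)。我使用这些示例来测试我的代码。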

proc fcmp outlib=work.jaro.chars;

  /* Returns, in string1 order, the non-blank characters of string1 that have
     a partner in string2 (result is limited to 40 characters).

     Two characters match when they are equal and their positions are no
     farther apart than floor(max(|s1|, |s2|)/2)-1 (never less than 0).
     Each character of string2 can be paired at most once.                    */
  function jaromatch(string1 $, string2 $) $ 40;
    length matchChars $ 40 ch $ 1;

    str1_len = lengthn(string1);
    str2_len = lengthn(string2);

    /* Clamp at 0: for one-character strings the raw formula yields -1, which
       would leave the search window empty even for identical characters. */
    allowedDist = max(0, floor(max(str1_len, str2_len) / 2) - 1);

    /* walk through string1 and match characters to string2 */
    matchChars = "";
    do i = 1 to str1_len;
      ch = substr(string1, i, 1);
      found = 0;

      /* Window of string2 positions within allowedDist of i, clipped to the
         real extent of string2 so substr never reads past its end. The loop
         does not run when the window is empty. */
      win_start = max(1, i - allowedDist);
      win_end = min(str2_len, i + allowedDist);

      do j = win_start to win_end;
        if found = 0 and ch ne ' ' and substr(string2, j, 1) = ch then do;
          matchChars = cats(matchChars, ch);
          /* Once a char is assigned it can not be assigned again, so blank it
             out in the local copy of string2. Blanks are never matched
             (see ch ne ' '), so a blank is a safe marker. */
          substr(string2, j, 1) = ' ';
          found = 1;
        end;
      end;
    end;
    return(strip(matchChars));
  endsub;

  /* Half the number of positions at which the two strings differ, compared
     over the shorter stripped length. */
  function jarotrans(string1 $, string2 $);
    limit = min(lengthn(strip(string1)), lengthn(strip(string2)));
    diffs = 0;
    do pos = 1 to limit;
      if substr(string1, pos, 1) ne substr(string2, pos, 1) then do;
        diffs + 1;
      end;
    end;
    halved = diffs / 2;
    return(halved);
  endsub;

  /* Number of leading characters shared by string1 and string2, comparing
     at most maxprelen characters. */
  function getPrefixlen(string1 $, string2 $, maxprelen);
    cap = min(maxprelen, lengthn(string1), lengthn(string2));

    /* The loop body never runs when cap is below 1, so that case falls
       straight through to the final return. */
    do pos = 1 to cap;
      if substr(string1, pos, 1) ne substr(string2, pos, 1) then do;
        return(pos - 1);
      end;
    end;

    /* every compared character matched (or there was nothing to compare) */
    return(cap);
  endsub;

  /* Jaro score of two strings: the mean of (matches1 / |s1|),
     (matches2 / |s2|) and ((matches1 - transpositions) / matches1).
     Returns 0 as soon as either direction yields no matched character. */
  function jarodist(string1 $, string2 $);
    /* characters of string1 that found a partner in string2 */
    fwd = jaromatch(string1, string2);
    fwd_len = lengthn(fwd);
    if fwd_len = 0 then return(0);

    /* characters of string2 that found a partner in string1 */
    rev = jaromatch(string2, string1);
    rev_len = lengthn(rev);
    if rev_len = 0 then return(0);

    /* half-transposition count between the two matched sequences */
    swaps = jarotrans(fwd, rev);

    /* the three ratios, summed in the same order as the formula above */
    share1 = fwd_len / lengthn(string1);
    share2 = rev_len / lengthn(string2);
    ordered = (fwd_len - swaps) / fwd_len;

    return((share1 + share2 + ordered) / 3);
  endsub;

  /* Jaro-Winkler score of two strings, compared case-insensitively and
     without leading/trailing blanks. prefixscale weights the bonus given
     for a shared prefix of up to 4 characters. */
  function jarowink(string1 $, string2 $, prefixscale);
    /* normalise the local copies of both arguments */
    string1 = strip(upcase(string1));
    string2 = strip(upcase(string2));

    /* identical strings score 1 without further work */
    if string1 = string2 then return(1.0);

    base = jarodist(string1, string2);
    shared = getPrefixlen(string1, string2, 4);
    return(base + shared * prefixscale * (1 - base));
  endsub;

run;

/* Register the library that holds the compiled functions so the DATA step
   can resolve jarowink(). */
options cmplib=work.jaro;

/* Generates the statements for one test case: writes the banner line
   (the two names and the expected score), then the computed score, then a
   blank line. Arguments are passed to jarowink() as quoted literals. */
%macro jw_case(banner, first, second);
  put "&banner";
  jw = jarowink("&first", "&second", 0.1);
  put jw=;
  put;
%mend jw_case;

/* test code */
data _null_;
  %jw_case(SHACKLEFORD SHACKELFORD 0.982, SHACKLEFORD, SHACKELFORD)
  %jw_case(DUNNINGHAM  CUNNIGHAM   0.896, DUNNINGHAM, CUNNIGHAM)
  %jw_case(NICHLESON   NICHULSON   0.956, NICHLESON, NICHULSON)
  %jw_case(JONES       JOHNSON     0.832, JONES, JOHNSON)
  %jw_case(MASSEY      MASSIE      0.933, MASSEY, MASSIE)
  %jw_case(ABROMS      ABRAMS      0.922, ABROMS, ABRAMS)
  %jw_case(JERALDINE   GERALDINE   0.926, JERALDINE, GERALDINE)
  %jw_case(MARHTA      MARTHA      0.961, MARHTA, MARTHA)
  %jw_case(MICHELLE    MICHAEL     0.921, MICHELLE, MICHAEL)
  %jw_case(JULIES      JULIUS      0.933, JULIES, JULIUS)
  %jw_case(TANYA       TONYA       0.880, TANYA, TONYA)
  %jw_case(DWAYNE      DUANE       0.840, DWAYNE, DUANE)
  %jw_case(SEAN        SUSAN       0.805, SEAN, SUSAN)
  %jw_case(JON         JOHN        0.933, JON, JOHN)
run;

你必须注意一点:Jaro-Winkler是一种类似距离的度量,因此你的函数必须是对称的,即无论两个字符串参数的顺序如何,都应返回相同的结果:d(s1,s2)=d(s2,s1)。为此,你需要修改jarodist和jaromatch函数,使两个方向得到的匹配字符串长度相同。