如何在SAS中解析大型html中的文本

如何在SAS中解析大型html中的文本,sas,Sas,我在网上到处寻找一个好的解析代码,但是所有的例子都很琐碎。下面的PERL表达式仅适用于5个字节 rx1=prxparse(“s//”) 我的表包含一个文本文件,其中包含如下字符串 <meta name="generator" content="HTML Tidy, see www.w3.org" /> Test <table style="WIDTH: 360.0pt;BORDER-COLLAPSE: collapse;" border="0" cellsp

我在网上到处寻找一个好的解析代码,但是所有的例子都很琐碎。下面的PERL表达式仅适用于5个字节

rx1=prxparse("s//")(注:表达式中原有的标签匹配模式,例如 <.*?>,似乎在页面抓取时被当作HTML标签丢失了)

我的表包含一个文本文件,其中包含如下字符串

    <meta name="generator" content="HTML Tidy, see www.w3.org" /> Test 
    <table style="WIDTH: 360.0pt;BORDER-COLLAPSE: collapse;" border="0" 
cellspacing="0" cellpadding="0" width="480"> <tr style="HEIGHT: 15.0pt;"> 
    <td style="BORDER-BOTTOM: rgb(236,233,216);BORDER-LEFT: rgb(236,233,216);
BACKGROUND-COLOR: transparent;WIDTH: 360.0pt;HEIGHT: 15.0pt; " width="480"> 
测试

因此,它包含了……(问题原文在此处被截断)

@Joe在评论中所说的是对的,但遗憾的是,这并不能否定人们经常需要解决这类问题的事实。下面是我需要从XML/HTML中提取某些值时使用的一些宏。它们并不完美,但满足了我的全部需求。

以下宏的主要限制是它们需要解析的HTML/XML存在于SAS中的单个字段中。SAS中单个字段的大小限制为32767个字符,这意味着如果您的HTML文件大于此大小,则只需获取需要处理的部分

其中包括一些示例,了解其工作原理的最佳方法就是运行这些示例

/*****************************************************************************
**  PROGRAM: PRXEXTRACT.SAS
**
**  SEARCHES THROUGH AN XML (OR HTML) FILE FOR AN ELEMENT AND EXTRACTS THE 
**  VALUE BETWEEN AN ELEMENTS TAGS.
**
**  USAGE NOTES:
**  - THE MACRO GENERATES DATA STEP STATEMENTS, SO CALL IT INSIDE A DATA STEP.
**  - LINE FEED AND CARRIAGE RETURN CHARACTERS ARE REMOVED FROM THE FIELD NAMED
**    IN iXMLField (THE FIELD ITSELF IS MODIFIED).
**  - THE WORK VARIABLES crLf AND prx_* ARE CREATED AND THEN DROPPED.
**  - IF THE REQUESTED OCCURRENCE IS NOT FOUND, iField IS NOT ASSIGNED.
**  
**  PARAMETERS:
**  iElement        : The element to search through the blob for.
**  iField          : The fieldname to save the result to. If it contains a [
**                    it is treated as an array reference and no ATTRIB
**                    statement is generated for it.
**  iType           : (N or C) for Numeric or Character.
**  iLength         : The length of the field to create.  
**  iXMLField       : The name of the field that contains the XML blob to parse.
**  iDelimiterType  : (1, 2 or 3). Defaults to 1.  1 USES <> AS DELIMS. 2 USES [].
**                    3 USES THE MASKED HTML CODES FOR < AND >.
**  iSequence       : Which occurrence of the element to extract (1 = first).
**                    Defaults to 1. May be a number or a data step variable.
**  iAttributesField: Optional. Field to receive the attribute text found
**                    inside the opening tag (including its leading whitespace).
**                    Set to blank when the element has no attributes.
**
******************************************************************************
**  HISTORY:
**  1.0 MODIFIED: 14-FEB-2011  BY:RP
**  - CREATED. 
**  1.1 MODIFIED: 16-FEB-2011  BY:RP
**  - ADDED OPTION TO CHANGE DELIMITERS FROM <> TO []
**  1.1 MODIFIED: 17-FEB-2011  BY:RP
**  - CORRECTED ERROR WHEN MATCH RETURNS A LENGTH OF ZERO
**  - CORRECTED MISSING AMPERSAND FROM IDELIMITERTYPE CHECK.
**  - ADDED ESCAPING QUOTES TO [] DELIMITER TYPE
**  - CORRECTED WARNING WHEN MATCH RETURNS MISSING NUMERIC FIELD
**  1.2 MODIFIED: 25-FEB-2011  BY:RP
**  - ADDED DELIMITER TYPES TO WORK WITH MASKED HTML CODES
**  1.3 MODIFIED: 11-MAR-2011  BY:RP
**  - MODIFIED TO ALLOW FOR OPTIONAL ATTRIBUTES ON THE ELEMENT BEING SEARCHED FOR.
**  1.4 MODIFIED: 14-MAR-2011  BY:RP
**  - CORRECTED TO REMOVE FALSE MATCHES FROM PRIOR VERSION. ADDED EXAMPLE.
**  1.5 MODIFIED: 10-APR-2012  BY:RP
**  - CORRECTED PROBLEM WITH ZERO LENGTH STRING MATCHES
**  1.6 MODIFIED: 22-MAY-2012  BY:RP
**  - ADDED ABILITY TO CAPTURE ATTRIBUTES
*****************************************************************************/

%macro prxExtract(iElement=, iField=, iType=, iLength=, iXMLField=, iDelimiterType=1, iSequence=1, iAttributesField=);

  %local delim_open delim_close;

  /* REMOVE LINE FEEDS AND CARRIAGE RETURNS SO THE PATTERN CAN MATCH ACROSS WHAT WERE SEPARATE LINES */
  crLf = byte(10) || byte(13);
  &iXMLField = compress(&iXMLField, crLf);

  /* CHOOSE THE TEXT THAT OPENS AND CLOSES A TAG. [ AND ] ARE ESCAPED BECAUSE THEY ARE SPECIAL IN A REGEX */
  %if &iDelimiterType eq 1 %then %do;
    %let delim_open  = <;
    %let delim_close = >;
  %end;
  %else %if &iDelimiterType eq 2 %then %do;
    %let delim_open  = \[;
    %let delim_close = \];
  %end;
  %else %if &iDelimiterType eq 3 %then %do;
    %let delim_open  = %nrbquote(&)lt%quote(%str(;)) ;
    %let delim_close = %nrbquote(&)gt%quote(%str(;)) ;
  %end;
  %else %do;
    %put ERR%str()ROR (prxExtract.sas): You specified an incorrect option for the iDelimiterType parameter.;
  %end;

  /* ONLY DECLARE THE TARGET FIELD WHEN IT IS A PLAIN VARIABLE. AN ARRAY REFERENCE CANNOT APPEAR IN AN ATTRIB STATEMENT */
  %if %sysfunc(index(&iField,[)) eq 0 %then %do;
    %if "%upcase(&iType)" eq "N" %then %do;
      attrib &iField length=&iLength format=best.;
    %end;
    %else %do;
      attrib &iField length=$&iLength format=$&iLength..;
    %end;
  %end;

  /*
  ** BREAKDOWN OF REGULAR EXPRESSION (EXAMPLE USES < AND > AS DELIMS AND ANI AS THE ELEMENT BEING LOOKED FOR):
  **
  ** &delim_open&iElement                            -->  FINDS <ANI
  ** ((\s+.*?)&delim_close|&delim_close){1}?         -->  CAPTURE BUFFER 1. FINDS A SINGLE INSTANCE OF EITHER:
  **                                                      - ONE OR MORE WHITESPACE CHARACTERS FOLLOWED BY ANYTHING UNTIL A > CHARACTER.
  **                                                        THE INNER BRACKETS CAPTURE THAT ATTRIBUTE TEXT INTO BUFFER 2.
  **                                                      - OR JUST A > CHARACTER
  **                                                      REQUIRING WHITESPACE OR > HERE STOPS <ANI FROM MATCHING <ANI2DIGITS
  ** (.*?)                                           -->  FINDS WHAT WE ARE SEARCHING FOR AND CAPTURES IT INTO BUFFER 3.
  ** &delim_open                                     -->  FINDS <
  ** \/                                              -->  FINDS THE / CHARACTER. THE FIRST SLASH ESCAPES IT SO IT KNOWS ITS NOT A SPECIAL REGEX SLASH
  ** &iElement&delim_close                           -->  FINDS ANI>
  ** THE TRAILING i                                  -->  MAKES THE MATCH CASE INSENSITIVE
  */
  prx_id = prxparse("/&delim_open&iElement((\s+.*?)&delim_close|&delim_close){1}?(.*?)&delim_open\/&iElement&delim_close/i"); 

  /* WALK THE MATCHES FROM LEFT TO RIGHT, COUNTING THEM UNTIL THE REQUESTED OCCURRENCE IS REACHED */
  prx_start = 1;
  prx_stop = length(&iXMLField);
  prx_sequence = 0;
  call prxnext(prx_id, prx_start, prx_stop, &iXMLField, prx_pos, prx_length);
  do while (prx_pos > 0);
    prx_sequence = prx_sequence + 1;
    if prx_sequence = &iSequence then do;
      if prx_length > 0 then do;

        /* BUFFER 3 HOLDS THE TEXT BETWEEN THE OPENING AND CLOSING TAGS */
        call prxposn(prx_id, 3, prx_pos, prx_length);
        %if "%upcase(&iType)" eq "N" %then %do;
          length prx_tmp_n $200;
          /* AN EMPTY ELEMENT GIVES A ZERO LENGTH BUFFER. SUBSTR WITH A ZERO LENGTH IS AN INVALID ARGUMENT SO SKIP IT */
          if prx_length > 0 then do;
            prx_tmp_n = substr(&iXMLField, prx_pos, prx_length);
            if cats(prx_tmp_n) ne "" then do;
              /* WIDTH 32 SO LONG NUMBERS ARE NOT CUT OFF AT THE DEFAULT INFORMAT WIDTH */
              &iField = input(prx_tmp_n, ?best32.);
            end;
          end;
        %end;
        %else %do;          
          if prx_length ne 0 then do;
            &iField = substr(&iXMLField, prx_pos, prx_length);
          end;
          else do;
            &iField = "";
          end;
        %end;

        /* ALSO SAVE THE ATTRIBUTES TO A FIELD IF REQUESTED. BUFFER 2 HOLDS THE ATTRIBUTE TEXT */
        %if "%upcase(&iAttributesField)" ne "" %then %do;
          call prxposn(prx_id, 2, prx_pos, prx_length);
          if prx_length ne 0 then do;
            &iAttributesField = substr(&iXMLField, prx_pos, prx_length);
          end;
          else do;
            &iAttributesField = "";
          end;
        %end;

      end;
      /* THE REQUESTED OCCURRENCE HAS BEEN HANDLED SO THERE IS NO NEED TO SCAN THE REST OF THE FIELD */
      leave;
    end;
    call prxnext(prx_id, prx_start, prx_stop, &iXMLField, prx_pos, prx_length);
  end;

  /* ALL WORK VARIABLES START WITH prx_ SO DROP ONLY THOSE AND LEAVE ANY OTHER prx... VARIABLES OF THE CALLER ALONE */
  drop crLf prx_:;

%mend;

在一般意义上不可能,参见示例。HTML对于任何类型的正则表达式来说都过于复杂和不规则。你可以在有限的意义上做它,就像解析一个特定的页面或结构,但实际上是逐个情况,而不是一个适合于这个表单中的问题的问题。但是我已经用java、C++等做了很多,有很多HTML解析器API。我认为SAS可能也有一些
SAS html解析器。SAS
用户定义的文件,我可以
%include 这些文件,并像使用API一样使用它们。SAS中并没有真正这样的文件。有
proc http
可用于从表单等获取数据,并且在一些BI套件中有一些更复杂的工具,但是在基本SAS中没有直接的HTML解析器。如果您有直接XML解析器(libname方法,使用XML映射),谢谢。很高兴知道。我应该停止搜索,如果您熟悉groovy,您可以做的一件有趣的事情就是使用
proc groovy
。我相信,有一些为它编写的HTML解析器非常强大。这太棒了。感谢您与大家分享您的代码。我想我不是唯一一个从中受益的人。我尝试了一个简单的测试,但失败了:
data example; xml="测试"; %prxExtract(iElement=xni, iField=my_xni, iType=c, iLength=15, iXMLField=xml, iSequence=1, iAttributesField=my_xni_attribs); run;
@buras对于
iElement
参数,您应该传入
test
,因为示例中的元素名为
<test>。此外,仅当元素带有属性(如下例所示)时,才会填充iAttributesField所指定的字段:
data example;

  /* The whole XML document has to sit on one line inside a single character value. */
  xml = "<test><ANI2Digits>00</ANI2Digits><XNI xniattrib=1>7606256091</XNI><ANI>number2</ANI><ANI x=hmm y=yay>number3</ANI></test>";

  /* Pull the first XNI element into my_xni and the text of its attributes into my_xni_attribs. */
  %prxExtract(iXMLField=xml, iElement=xni, iSequence=1, iType=c, iLength=15, iField=my_xni, iAttributesField=my_xni_attribs);

run;
data example;

  /* The whole XML document has to sit on one line inside a single character value. */
  xml = "<test><ANI2Digits>00</ANI2Digits><ANI>7606256091</ANI><ANI>number2</ANI><ANI x=hmm y=yay>number3</ANI></test>";

  /* A single element: the first (default) ANI2Digits occurrence goes into a plain variable. */
  %prxExtract(iXMLField=xml, iElement=ani2digits, iType=c, iLength=50, iField=ani2digits);

  /* A repeating element: room for up to six ANI values and their attribute text. */
  length ani1-ani6 $15 attr1-attr6 $100;
  array aniValue {6} $ ani1-ani6;
  array aniAttr  {6} $ attr1-attr6;

  /* prxCount leaves the number of ANI elements in prx_count; extract each occurrence in turn. */
  %prxCount(iXMLField=xml, iElement=ani, iDelimiterType=1);
  do cnt = 1 to prx_count;
    %prxExtract(iXMLField=xml, iElement=ani, iSequence=cnt, iType=c, iLength=15, iField=aniValue[cnt], iAttributesField=aniAttr[cnt]);
  end;

run;
/*****************************************************************************
**  PROGRAM: MACROS.PRXCOUNT.SAS
**
**  RETURNS THE NUMBER OF TIMES AN ELEMENT IS FOUND IN AN HTML/XML FILE.
**
**  USAGE NOTES:
**  - THE MACRO GENERATES DATA STEP STATEMENTS, SO CALL IT INSIDE A DATA STEP.
**  - THE RESULT IS LEFT IN THE DATA STEP VARIABLE prx_count.
**  - prx_count AND THE OTHER WORK VARIABLES (crLf, prx_*) ARE DROPPED FROM THE
**    OUTPUT. ASSIGN prx_count TO ANOTHER VARIABLE IF IT MUST BE KEPT.
**  - LINE FEED AND CARRIAGE RETURN CHARACTERS ARE REMOVED FROM THE FIELD NAMED
**    IN iXMLField (THE FIELD ITSELF IS MODIFIED).
**  
**  PARAMETERS:
**  iElement      : The element to search through the blob for.
**  iXMLField     : The name of the field that contains the XML blob to parse.
**  iDelimiterType: (1/2/3). Defaults to 1.  1 USES <> AS DELIMS. 2 USES [].
**                  3 USES ENCODED VALUES FOR <>.   
**
******************************************************************************
**  HISTORY:
**  1.0 MODIFIED: 25-FEB-2011  BY:RP
**  - CREATED. 
**  1.1 MODIFIED: 14-MAR-2011  BY:RP
**  - MODIFIED TO ALLOW FOR OPTIONAL ATTRIBUTES ON THE ELEMENT BEING SEARCHED FOR.
*****************************************************************************/

%macro prxCount(iElement=, iXMLField=, iDelimiterType=1);

  %local delim_open delim_close;

  /* REMOVE LINE FEEDS AND CARRIAGE RETURNS SO THE PATTERN CAN MATCH ACROSS WHAT WERE SEPARATE LINES */
  crLf = byte(10) || byte(13);
  &iXMLField = compress(&iXMLField, crLf);

  /* CHOOSE THE TEXT THAT OPENS AND CLOSES A TAG. [ AND ] ARE ESCAPED BECAUSE THEY ARE SPECIAL IN A REGEX */
  %if &iDelimiterType eq 1 %then %do;
    %let delim_open  = <;
    %let delim_close = >;
  %end;
  %else %if &iDelimiterType eq 2 %then %do;
    %let delim_open  = \[;
    %let delim_close = \];
  %end;
  %else %if &iDelimiterType eq 3 %then %do;
    %let delim_open  = %nrbquote(&)lt%quote(%str(;)) ;
    %let delim_close = %nrbquote(&)gt%quote(%str(;)) ;
  %end;
  %else %do;
    %put ERR%str()ROR (prxCount.sas): You specified an incorrect option for the iDelimiterType parameter.;
  %end;

  /*
  ** THE PATTERN MATCHES AN OPENING TAG FOR THE ELEMENT (WITH OR WITHOUT ATTRIBUTES), THE SHORTEST
  ** RUN OF TEXT AFTER IT, AND THEN THE CLOSING TAG FOR THE SAME ELEMENT. THE TRAILING i MAKES IT
  ** CASE INSENSITIVE.
  */
  prx_id = prxparse("/&delim_open&iElement(\s+.*?&delim_close|&delim_close){1}?(.*?)&delim_open\/&iElement&delim_close/i"); 

  /* STEP THROUGH THE FIELD ONE MATCH AT A TIME. ONLY THE NUMBER OF MATCHES IS NEEDED, NOT THEIR CONTENT */
  prx_count = 0;
  prx_start = 1;
  prx_stop  = length(&iXMLField);
  call prxnext(prx_id, prx_start, prx_stop, &iXMLField, prx_pos, prx_length);
  do while (prx_pos > 0);
    prx_count = prx_count + 1;
    call prxnext(prx_id, prx_start, prx_stop, &iXMLField, prx_pos, prx_length);
  end;

  drop crLf prx_:;

%mend;