Php 查找两个XML格式不正确且包含嵌套标记的标记之间的文本_Php_Xml_Parsing_Xml Parsing_Html Parsing

Php 查找两个XML格式不正确且包含嵌套标记的标记之间的文本

php xml parsing

Php 查找两个XML格式不正确且包含嵌套标记的标记之间的文本,php,xml,parsing,xml-parsing,html-parsing,Php,Xml,Parsing,Xml Parsing,Html Parsing,有没有简单的方法可以在两个XML格式不正确的标记之间查找文本并忽略嵌套鉴于这一内容： <div> Some content 1 </ <some:tag> Section 1 </some:tag> <b>Some content 2 <some:tag> Section 2 <some:tag>

有没有一种简单的方法，可以在格式不正确的XML中查找两个标记之间的文本，并且不受嵌套标记的影响？

鉴于这一内容：

<div>
    Some content 1
    </
    <some:tag>
        Section 1
    </some:tag>
    <b>Some content 2
    <some:tag>
        Section 2
        <some:tag>
            Section 3
        </some:tag>
    </some:tag>
    Some content 3
    </p>
</div>


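我希望得到如下结果：

array (size=2)
  0 => string '<some:tag>
        Section 1
    </some:tag>' (length=52)
  1 => string '<some:tag>
        Section 2
        <some:tag>
            Section 3
        </some:tag>
    </some:tag>' (length=125)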

你尝试过什么

我曾尝试使用strpos/substr提取匹配项，但我对其中的逻辑有点迷茫：

/**
 * Extract every outermost <some:tag>...</some:tag> section from a string.
 *
 * The input is scanned as plain text (no XML parser), so it may be malformed
 * around the tags. Nested <some:tag> elements are kept inside the section of
 * their outermost ancestor instead of being reported separately.
 *
 * @param string $str Text to scan.
 * @return string[] Outermost sections, each including its opening and closing
 *                  tag, in document order. An opening tag that is never
 *                  balanced by a closing tag ends the scan and is not reported.
 */
function findSomeTag($str) {
    $result = [];
    $startTag = "<some:tag>";
    $endTag = "</some:tag>";
    $startLength = strlen($startTag);
    $endLength = strlen($endTag);

    $start = strpos($str, $startTag);
    while ($start !== false) {
        // One tag is open; walk forward until the depth returns to zero.
        $depth = 1;
        $cursor = $start + $startLength;

        while ($depth > 0) {
            $nextEnd = strpos($str, $endTag, $cursor);
            if ($nextEnd === false) {
                // Unbalanced input: nothing left can close the open tag.
                return $result;
            }

            $nextStart = strpos($str, $startTag, $cursor);
            if ($nextStart !== false && $nextStart < $nextEnd) {
                // A nested opening tag comes before the next closing tag.
                $depth++;
                $cursor = $nextStart + $startLength;
            } else {
                $depth--;
                $cursor = $nextEnd + $endLength;
            }
        }

        // $cursor now sits just past the closing tag matching $start.
        $result[] = substr($str, $start, $cursor - $start);
        $start = strpos($str, $startTag, $cursor);
    }

    return $result;
}

函数findSomeTag（$str）{
$result=[]；
$startTag=“<some:tag>”；
$endTag=“</some:tag>”；
$offset=0；
$start=strpos（$str、$startTag、$offset）；
while（$start！==false）{
$nextStart=strpos（$str、$startTag、$start+1）；
$nextEnd=strpos（$str，$endTag，$start+1）；
如果（$nextStart==false | |$nextEnd<$nextStart）{
$result[]=substr（$str，$start，$nextEnd-$start+strlen（$endTag））；
}
$start=$nextStart；
}
返回$result；
}

（注意：上述函数完全不起作用，可能会无限循环。）

要包含嵌套的标记，可以计算当前打开的标记数

因此，当

$nextEnd>$nextStart

递增

$counter

时，仅当

$nextEnd<$nextStart&&$counter==1

时添加新结果（您有一个打开的标记）。如果

$nextEnd<$nextStart&&$counter>1

则递减

$counter

我认为最简单的解析方法是使用类似状态机的东西。基本上，您定义了一组特定的状态以及离开这些状态并进入其他状态的条件

假设您的文本位于某种文本阅读器中，该阅读器可以为您提供下一个字符并向前移动指针，还可以将指针倒回一定数量的字符

然后您可以创建一个类似这样的状态机（它原来是一个简单的状态机，只有一个状态基本上在其内部循环）：

然后就可以这样调用它：

// Ask the state machine for the <some:tag> sections.
// NOTE(review): the StateMachine class is not shown in this text — confirm the return shape of getTagContents().
$found = $myStateMachine->getTagContents("<some:tag>", "</some:tag>");

/**
 * Forward-only character reader over an in-memory string, with the ability
 * to step the read position back by a number of characters.
 */
class TextReader {
    /** @var int Index of the next character nextChar() will return. */
    private $idx = 0;
    /** @var string Text being read. */
    private $reading;
    /** @var int Index of the final character (-1 when the text is empty). */
    private $lastIdx;

    /**
     * @param string $reading Text to read.
     */
    public function __construct($reading) {
        $this->reading = (string) $reading;
        $this->lastIdx = strlen($this->reading) - 1;
    }

    /**
     * @return bool True while at least one unread character remains.
     */
    public function hasMore() {
        // Inclusive bound: the character AT $lastIdx is still unread when
        // $idx === $lastIdx (a strict "<" would silently drop it).
        return $this->idx <= $this->lastIdx;
    }

    /**
     * Return the next character and advance, or null once the text is exhausted.
     *
     * @return string|null
     */
    public function nextChar() {
        if( !$this->hasMore() ) return null;

        return $this->reading[$this->idx++];
    }

    /**
     * Move the read position back, stopping at the start of the text.
     *
     * @param int $howFar Number of characters to step back.
     */
    public function rewind($howFar) {
        $this->idx -= $howFar;
        if( $this->idx < 0 ) $this->idx = 0;
    }
}

// Wrap the document text in a TextReader, hand it to the state machine,
// then request the <some:tag> sections.
// NOTE(review): the StateMachine class is not shown in this text.
$myStateMachine = new StateMachine( new TextReader($myXmlFileContents) );
$found = $myStateMachine->getTagContents("<some:tag>", "</some:tag>");

$found=$myStateMachine->getTagContents（“<some:tag>”，“</some:tag>”）；

文本阅读器可能如下所示：

// Ask the state machine for the <some:tag> sections.
// NOTE(review): the StateMachine class is not shown in this text — confirm the return shape of getTagContents().
$found = $myStateMachine->getTagContents("<some:tag>", "</some:tag>");

/**
 * Forward-only character reader over an in-memory string, with the ability
 * to step the read position back by a number of characters.
 */
class TextReader {
    /** @var int Index of the next character nextChar() will return. */
    private $idx = 0;
    /** @var string Text being read. */
    private $reading;
    /** @var int Index of the final character (-1 when the text is empty). */
    private $lastIdx;

    /**
     * @param string $reading Text to read.
     */
    public function __construct($reading) {
        $this->reading = (string) $reading;
        $this->lastIdx = strlen($this->reading) - 1;
    }

    /**
     * @return bool True while at least one unread character remains.
     */
    public function hasMore() {
        // Inclusive bound: the character AT $lastIdx is still unread when
        // $idx === $lastIdx (a strict "<" would silently drop it).
        return $this->idx <= $this->lastIdx;
    }

    /**
     * Return the next character and advance, or null once the text is exhausted.
     *
     * @return string|null
     */
    public function nextChar() {
        if( !$this->hasMore() ) return null;

        return $this->reading[$this->idx++];
    }

    /**
     * Move the read position back, stopping at the start of the text.
     *
     * @param int $howFar Number of characters to step back.
     */
    public function rewind($howFar) {
        $this->idx -= $howFar;
        if( $this->idx < 0 ) $this->idx = 0;
    }
}

// Wrap the document text in a TextReader, hand it to the state machine,
// then request the <some:tag> sections.
// NOTE(review): the StateMachine class is not shown in this text.
$myStateMachine = new StateMachine( new TextReader($myXmlFileContents) );
$found = $myStateMachine->getTagContents("<some:tag>", "</some:tag>");

类文本阅读器{
私人$idx=0；
私人阅读；
私人$lastIdx；
公共功能构造（$reading）{
$this->reading=$reading；
$this->lastIdx=strlen（$reading）-1；
}
公共功能hasMore（）{
返回$this->idx<$this->lastIdx；
}
公共功能nextChar（）{
如果（！$this->hasMore（））返回null；
返回$this->reading[$this->idx++]；
}
公共功能倒带（$howFar）{
$this->idx-=$howFar；
如果（$this->idx<0）$this->idx=0；
}
}

然后，您可以这样调用您的状态机：

// Ask the state machine for the <some:tag> sections.
// NOTE(review): the StateMachine class is not shown in this text — confirm the return shape of getTagContents().
$found = $myStateMachine->getTagContents("<some:tag>", "</some:tag>");

/**
 * Forward-only character reader over an in-memory string, with the ability
 * to step the read position back by a number of characters.
 */
class TextReader {
    /** @var int Index of the next character nextChar() will return. */
    private $idx = 0;
    /** @var string Text being read. */
    private $reading;
    /** @var int Index of the final character (-1 when the text is empty). */
    private $lastIdx;

    /**
     * @param string $reading Text to read.
     */
    public function __construct($reading) {
        $this->reading = (string) $reading;
        $this->lastIdx = strlen($this->reading) - 1;
    }

    /**
     * @return bool True while at least one unread character remains.
     */
    public function hasMore() {
        // Inclusive bound: the character AT $lastIdx is still unread when
        // $idx === $lastIdx (a strict "<" would silently drop it).
        return $this->idx <= $this->lastIdx;
    }

    /**
     * Return the next character and advance, or null once the text is exhausted.
     *
     * @return string|null
     */
    public function nextChar() {
        if( !$this->hasMore() ) return null;

        return $this->reading[$this->idx++];
    }

    /**
     * Move the read position back, stopping at the start of the text.
     *
     * @param int $howFar Number of characters to step back.
     */
    public function rewind($howFar) {
        $this->idx -= $howFar;
        if( $this->idx < 0 ) $this->idx = 0;
    }
}

// Wrap the document text in a TextReader, hand it to the state machine,
// then request the <some:tag> sections.
// NOTE(review): the StateMachine class is not shown in this text.
$myStateMachine = new StateMachine( new TextReader($myXmlFileContents) );
$found = $myStateMachine->getTagContents("<some:tag>", "</some:tag>");

$myStateMachine=new StateMachine（new TextReader（$myXmlFileContents））；
$found=$myStateMachine->getTagContents（“<some:tag>”，“</some:tag>”）；

与我的其他答案不同，此版本将读取带有嵌套标记的标记：

// Sample malformed document: a dangling "</", an unclosed <b>, a stray </p>,
// and one <some:tag> nested inside another. The blank first and last lines of
// the nowdoc keep the leading and trailing newline of the text.
$text = <<<'XML'

<div>
    Some content 1
    </
    <some:tag>
        Section 1
    </some:tag>
    <b>Some content 2
    <some:tag>
        Section 2
        <some:tag>
            Section 3
        </some:tag>
    </some:tag>
    Some content 3
    </p>
</div>

XML;

// Feed the sample through the parser and collect the <some:tag> sections.
$parser = new Parser(new TextReader($text));
$found = $parser->findTags("<some:tag>", "</some:tag>");

/**
 * Forward-only character reader over an in-memory string, with the ability
 * to step the read position back by a number of characters.
 */
class TextReader {
    /** @var int Index of the next character nextChar() will return. */
    private $idx = 0;
    /** @var string Text being read. */
    private $reading;
    /** @var int Index of the final character (-1 when the text is empty). */
    private $lastIdx;

    /**
     * @param string $reading Text to read.
     */
    public function __construct($reading) {
        $this->reading = (string) $reading;
        $this->lastIdx = strlen($this->reading) - 1;
    }

    /**
     * @return bool True while at least one unread character remains.
     */
    public function hasMore() {
        // Inclusive bound: the character AT $lastIdx is still unread when
        // $idx === $lastIdx (a strict "<" would silently drop it).
        return $this->idx <= $this->lastIdx;
    }

    /**
     * Return the next character and advance, or null once the text is exhausted.
     *
     * @return string|null
     */
    public function nextChar() {
        if( !$this->hasMore() ) return null;

        return $this->reading[$this->idx++];
    }

    /**
     * Move the read position back, stopping at the start of the text.
     *
     * @param int $howFar Number of characters to step back.
     */
    public function rewind($howFar) {
        $this->idx -= $howFar;
        if( $this->idx < 0 ) $this->idx = 0;
    }
}


/**
 * Pulls balanced start/end tag sections out of a character stream without
 * requiring the surrounding markup to be well-formed.
 *
 * The reader passed in must provide hasMore(), nextChar() and rewind($n).
 */
class Parser {
    private $TextReader;

    /**
     * @param object $TextReader Character source exposing hasMore(), nextChar() and rewind().
     */
    public function __construct($TextReader) {
        $this->TextReader = $TextReader;
    }

    /**
     * Collect every outermost tag section remaining in the reader.
     *
     * @param string $startTagName Literal opening tag, e.g. "<some:tag>".
     * @param string $endTagName   Literal closing tag, e.g. "</some:tag>".
     * @return string[] Sections in document order, each including both tags.
     */
    public function findTags($startTagName, $endTagName) {
        $found = array();

        while( ($next = $this->findNextTag($startTagName, $endTagName)) !== null ) {
            $found[] = $next;
        }

        return $found;
    }

    /**
     * Read the next outermost tag section, nested occurrences included.
     *
     * @param string $startTagName Literal opening tag.
     * @param string $endTagName   Literal closing tag.
     * @return string|null The section from its opening tag through the matching
     *                     closing tag, or null when no further complete section exists.
     */
    public function findNextTag($startTagName, $endTagName) {
        // Skip ahead to the next opening tag; the reader is left at its first character.
        if( $this->readForTag($startTagName) === null ) return null;

        $nests = 0;
        $started = false;
        $readSoFar = "";

        while($this->TextReader->hasMore()) {
            $readSoFar .= $this->TextReader->nextChar();

            // Check the suffix AFTER reading, so the section ends exactly on
            // the closing tag and a tag finishing at end-of-input is still seen.
            if( $this->endsWith($readSoFar, $startTagName) ) {
                $started = true;
                $nests++;
            } elseif( $started && $this->endsWith($readSoFar, $endTagName) ) {
                $nests--;
            }

            // As many closing tags as opening tags: the outermost section is complete.
            if( $started && $nests === 0 ) return $readSoFar;
        }

        // Input ended while a tag was still open.
        return null;
    }

    /*
     * Read the Text Reader until a certain tag is found, and return what was
     * read up to and including the tag itself, or null if the tag never appears.
     *
     * On success the Text Reader is rewound to the beginning of the tag.
     */
    private function readForTag($tagName) {
        $readSoFar = "";

        $tagNameLength = strlen($tagName);

        while($this->TextReader->hasMore()) {
            $readSoFar .= $this->TextReader->nextChar();

            // if the last few characters read are the tag
            if( $this->endsWith($readSoFar, $tagName) ) {
                // rewind so the caller re-reads the tag itself
                $this->TextReader->rewind($tagNameLength);

                // return what we've read
                return $readSoFar;
            }
        }

        return null;
    }

    /*
     * True when $haystack ends with the non-empty string $needle.
     */
    private function endsWith($haystack, $needle) {
        $needleLength = strlen($needle);

        return $needleLength > 0 && substr($haystack, -$needleLength) === $needle;
    }
}

$text=”
部分内容1
结果是：
/**
 * Extracts every outermost <tag>...</tag> section from a string using plain
 * substring searches, so the surrounding markup does not have to be valid XML.
 */
class TagExtractor {

    /** @var string Text to search. */
    public $content;
    /** @var string Tag name without angle brackets, e.g. "some:tag". */
    public $tag;

    /**
     * Return every outermost section of the configured tag.
     *
     * @return string[] Sections in document order, each including its opening
     *                  tag. A section whose tag is never closed extends to the
     *                  end of the content.
     */
    public function getTagContent() {
        $result = [];
        $startTag = "<{$this->getTag()}>";
        $endTag = "</{$this->getTag()}>";
        $content = (string) $this->getContent();
        $offset = strpos($content, $startTag);
        while ($offset !== false) {
            $end = $this->findEnd($content, $offset, $startTag, $endTag);
            $result[] = substr($content, $offset, $end - $offset);
            // findEnd() never returns more than strlen($content), so this
            // offset is always valid for strpos().
            $offset = strpos($content, $startTag, $end);
        }
        return $result;
    }

    /**
     * Find the offset just past the closing tag that balances an open tag.
     *
     * @param string $content  Text to search.
     * @param int    $offset   Offset of the tag currently being processed.
     * @param string $startTag Literal opening tag.
     * @param string $endTag   Literal closing tag.
     * @param int    $counter  Number of tags currently open.
     * @return int Offset immediately after the balancing closing tag, or
     *             strlen($content) when the open tags are never closed.
     */
    public function findEnd($content, $offset, $startTag, $endTag, $counter = 1) {
        $contentLength = strlen($content);

        do {
            // Step past the first character of the tag found last time.
            $offset++;
            if ($offset > $contentLength) {
                return $contentLength;
            }

            $nextEnd = strpos($content, $endTag, $offset);
            if ($nextEnd === false) {
                // Unbalanced input: treat the section as running to the end.
                return $contentLength;
            }

            $nextStart = strpos($content, $startTag, $offset);
            if ($nextStart !== false && $nextStart < $nextEnd) {
                // A nested opening tag comes first.
                $counter++;
                $offset = $nextStart;
            } else {
                $counter--;
                $offset = $nextEnd;
            }
        } while ($counter !== 0);

        return $offset + strlen($endTag);
    }

    // <editor-fold defaultstate="collapsed" desc="Getters and setters">
    public function getContent() {
        return $this->content;
    }

    public function setContent($content) {
        $this->content = $content;
        return $this;
    }

    public function getTag() {
        return $this->tag;
    }

    public function setTag($tag) {
        $this->tag = $tag;
        return $this;
    }
    // </editor-fold>
}

类标记提取器{
公共内容；
公共$标签；
公共函数getTagContent（）{
$result=[]；
$startTag=“<{$this->getTag（）}>”；
$endTag=“</{$this->getTag（）}>”；
$content=$this->getContent（）；
$offset=strpos（$content，$startTag）；
而（$offset！==false）{
$end=$this->findEnd（$content、$offset、$startTag、$endTag）；
$result[]=substr（$content，$offset，$end-$offset）；
$offset=strpos（$content，$startTag，$end）；
}
返回$result；
}
公共函数findEnd（$content、$offset、$startTag、$endTag、$counter=1）{
$offset++；
$nextStart=strpos（$content、$startTag、$offset）；
$nextEnd=strpos（$content，$endTag，$offset）；
如果（$nextEnd==false）{
$counter=0；
}elseif（$nextStart<$nextEnd&&$nextStart！==false）{
$counter++；
$offset=$nextStart；
}elseif（$nextEnd<$nextStart | |（$nextStart==false&$nextEnd！==false））{
$counter--；
$offset=$nextEnd；
}
如果（$counter==0）{
返回$offset+strlen（$endTag）；
}
返回$this->findEnd（$content、$offset、$startTag、$endTag、$counter）；
}
// 
公共函数getContent（）{
返回$this->content；
}
公共函数setContent（$content）{
$this->content=$content；
退还$this；
}
公共函数getTag（）{
返回$this->tag；
}
公共函数setTag（$tag）{
$this->tag=$tag；
退还$this；
}
// 
}

似乎几乎可以正常工作，但它不会在标记结束后返回内容：@Petah它将一直读取，直到找到您要查找的标记的结尾。哦，我明白了，您希望在原始标记中允许其他

。我将添加一些内容。@Petah 我添加了另一个支持嵌套标记的版本作为答案。它并不是真正的状态机，所以我把它作为另一个答案单独贴出：）似乎是有效的，尽管我最终得到了这个：