Php 查找两个XML格式不正确且包含嵌套标记的标记之间的文本
有没有简单的方法可以在两个XML格式不正确的标记之间查找文本并忽略嵌套 鉴于这一内容:Php 查找两个XML格式不正确且包含嵌套标记的标记之间的文本,php,xml,parsing,xml-parsing,html-parsing,Php,Xml,Parsing,Xml Parsing,Html Parsing,有没有简单的方法可以在两个XML格式不正确的标记之间查找文本并忽略嵌套 鉴于这一内容: <div> Some content 1 </ <some:tag> Section 1 </some:tag> <b>Some content 2 <some:tag> Section 2 <some:tag>
<div>
Some content 1
</
<some:tag>
Section 1
</some:tag>
<b>Some content 2
<some:tag>
Section 2
<some:tag>
Section 3
</some:tag>
</some:tag>
Some content 3
</p>
</div>
部分内容1
字符串'
第一节
'(长度=52)
1=>string'
第二节
第三节
'(长度=125)
你尝试过什么
我曾尝试使用strpos/substr退出匹配,但我对逻辑有点迷茫:
/**
 * Extract every top-level <some:tag>...</some:tag> section from a string.
 *
 * The surrounding markup does not have to be well formed: the text is scanned
 * purely for the literal opening and closing tag, and a depth counter keeps
 * nested sections inside the outermost section that contains them.
 *
 * Closing tags with no matching opening tag are skipped, and an opening tag
 * that is never closed produces no result.
 *
 * @param string $str Text to scan.
 * @return string[] Outermost sections, each including its own opening and closing tag.
 */
function findSomeTag($str) {
    $result = [];
    $startTag = "<some:tag>";
    $endTag = "</some:tag>";
    $startLength = strlen($startTag);
    $endLength = strlen($endTag);
    $length = strlen($str);

    $depth = 0;         // number of currently open <some:tag> elements
    $sectionStart = 0;  // offset of the outermost open tag
    $cursor = 0;

    while ($cursor < $length) {
        $nextEnd = strpos($str, $endTag, $cursor);
        if ($nextEnd === false) {
            // Without another closing tag no further section can be completed.
            break;
        }
        $nextStart = strpos($str, $startTag, $cursor);

        if ($nextStart !== false && $nextStart < $nextEnd) {
            if ($depth === 0) {
                $sectionStart = $nextStart;
            }
            $depth++;
            $cursor = $nextStart + $startLength;
            continue;
        }

        $cursor = $nextEnd + $endLength;
        if ($depth === 0) {
            // Stray closing tag: nothing is open, so just step over it.
            continue;
        }
        $depth--;
        if ($depth === 0) {
            $result[] = substr($str, $sectionStart, $cursor - $sectionStart);
        }
    }

    return $result;
}
函数findSomeTag($str){
$result=[];
$startTag=“”;
$endTag=“”;
$offset=0;
$start=strpos($str、$startTag、$offset);
while($start!==false){
$nextStart=strpos($str、$startTag、$start+1);
$nextEnd=strpos($str,$endTag,$start+1);
如果($nextStart==false | |$nextEnd<$nextStart){
$result[]=substr($str,$start,$nextEnd-$start+strlen($endTag));
}
$start=$nextStart;
}
返回$result;
}
(注意:上述函数完全不起作用,而且可能会陷入无限循环。)要把嵌套的标记也包含进来,可以用一个计数器 $counter 记录当前处于打开状态的标记数:
当 $nextEnd > $nextStart 时(先遇到新的开始标记),递增 $counter;
仅当 $nextEnd < $nextStart && $counter == 1 时(此时只有一个打开的标记)才添加新结果;
如果 $nextEnd < $nextStart && $counter > 1,则递减 $counter。
我认为最简单的解析方法是使用类似状态机的东西。基本上,您定义了一组特定的状态以及离开这些状态并进入其他状态的条件
假设您的文本位于某种文本阅读器中,该阅读器可以为您提供下一个字符并向前移动指针,还可以将指针倒回一定数量的字符
然后您可以创建一个类似这样的状态机(它原来是一个简单的状态机,只有一个状态基本上在其内部循环):
然后像这样调用它:
// NOTE(review): $myStateMachine must already be constructed before this call; neither the
// StateMachine class nor getTagContents() is defined in this excerpt, so the shape of $found is unknown here.
$found = $myStateMachine->getTagContents("<some:tag>", "</some:tag>");
/**
 * Minimal forward-reading cursor over a string, with the ability to step back.
 */
class TextReader {
    /** @var int Index of the next character that nextChar() will return. */
    private $idx = 0;
    /** @var string The text being read. */
    private $reading;
    /** @var int Index of the final character (-1 when the text is empty). */
    private $lastIdx;

    /**
     * @param string $reading Text to read; it is cast to string before use.
     */
    public function __construct($reading) {
        $this->reading = (string) $reading;
        $this->lastIdx = strlen($this->reading) - 1;
    }

    /**
     * @return bool True while at least one unread character remains.
     */
    public function hasMore() {
        // Must be "<=": the character sitting at lastIdx is still unread.
        // Comparing with "<" made the final character of the text unreachable.
        return $this->idx <= $this->lastIdx;
    }

    /**
     * Return the next character and advance the cursor.
     *
     * @return string|null The next character, or null once the text is exhausted.
     */
    public function nextChar() {
        if (!$this->hasMore()) {
            return null;
        }
        return $this->reading[$this->idx++];
    }

    /**
     * Move the cursor backwards, stopping at the start of the text.
     *
     * @param int $howFar Number of characters to step back.
     */
    public function rewind($howFar) {
        $this->idx -= $howFar;
        if ($this->idx < 0) {
            $this->idx = 0;
        }
    }
}
// Wrap the raw text in a TextReader and hand that reader to the state machine.
// NOTE(review): StateMachine is not defined in this excerpt, and $myXmlFileContents must be set by the caller.
$myStateMachine = new StateMachine( new TextReader($myXmlFileContents) );
// Presumably returns the text found between the two tags — confirm against the StateMachine implementation.
$found = $myStateMachine->getTagContents("<some:tag>", "</some:tag>");
$found = $myStateMachine->getTagContents("<some:tag>", "</some:tag>");
文本阅读器可能如下所示:
// NOTE(review): relies on a $myStateMachine instance created elsewhere; the StateMachine class
// and its getTagContents() method are not part of this excerpt.
$found = $myStateMachine->getTagContents("<some:tag>", "</some:tag>");
/**
 * Minimal forward-reading cursor over a string, with the ability to step back.
 */
class TextReader {
    /** @var int Index of the next character that nextChar() will return. */
    private $idx = 0;
    /** @var string The text being read. */
    private $reading;
    /** @var int Index of the final character (-1 when the text is empty). */
    private $lastIdx;

    /**
     * @param string $reading Text to read; it is cast to string before use.
     */
    public function __construct($reading) {
        $this->reading = (string) $reading;
        $this->lastIdx = strlen($this->reading) - 1;
    }

    /**
     * @return bool True while at least one unread character remains.
     */
    public function hasMore() {
        // Must be "<=": the character sitting at lastIdx is still unread.
        // Comparing with "<" made the final character of the text unreachable.
        return $this->idx <= $this->lastIdx;
    }

    /**
     * Return the next character and advance the cursor.
     *
     * @return string|null The next character, or null once the text is exhausted.
     */
    public function nextChar() {
        if (!$this->hasMore()) {
            return null;
        }
        return $this->reading[$this->idx++];
    }

    /**
     * Move the cursor backwards, stopping at the start of the text.
     *
     * @param int $howFar Number of characters to step back.
     */
    public function rewind($howFar) {
        $this->idx -= $howFar;
        if ($this->idx < 0) {
            $this->idx = 0;
        }
    }
}
// Build the state machine on top of a TextReader over the raw text.
// NOTE(review): StateMachine is not defined in this excerpt, and $myXmlFileContents must be set by the caller.
$myStateMachine = new StateMachine( new TextReader($myXmlFileContents) );
// Presumably returns the text found between the two tags — confirm against the StateMachine implementation.
$found = $myStateMachine->getTagContents("<some:tag>", "</some:tag>");
类文本阅读器{
私人$idx=0;
私人阅读;
私人$lastIdx;
公共功能构造($reading){
$this->reading=$reading;
$this->lastIdx=strlen($reading)-1;
}
公共功能hasMore(){
返回$this->idx<$this->lastIdx;
}
公共功能nextChar(){
如果(!$this->hasMore())返回null;
返回$this->reading[$this->idx++];
}
公共功能倒带($howFar){
$this->idx-=$howFar;
如果($this->idx<0)$this->idx=0;
}
}
然后,您可以这样调用您的状态机:
// NOTE(review): relies on a $myStateMachine instance created elsewhere; the StateMachine class
// and its getTagContents() method are not part of this excerpt.
$found = $myStateMachine->getTagContents("<some:tag>", "</some:tag>");
/**
 * Minimal forward-reading cursor over a string, with the ability to step back.
 */
class TextReader {
    /** @var int Index of the next character that nextChar() will return. */
    private $idx = 0;
    /** @var string The text being read. */
    private $reading;
    /** @var int Index of the final character (-1 when the text is empty). */
    private $lastIdx;

    /**
     * @param string $reading Text to read; it is cast to string before use.
     */
    public function __construct($reading) {
        $this->reading = (string) $reading;
        $this->lastIdx = strlen($this->reading) - 1;
    }

    /**
     * @return bool True while at least one unread character remains.
     */
    public function hasMore() {
        // Must be "<=": the character sitting at lastIdx is still unread.
        // Comparing with "<" made the final character of the text unreachable.
        return $this->idx <= $this->lastIdx;
    }

    /**
     * Return the next character and advance the cursor.
     *
     * @return string|null The next character, or null once the text is exhausted.
     */
    public function nextChar() {
        if (!$this->hasMore()) {
            return null;
        }
        return $this->reading[$this->idx++];
    }

    /**
     * Move the cursor backwards, stopping at the start of the text.
     *
     * @param int $howFar Number of characters to step back.
     */
    public function rewind($howFar) {
        $this->idx -= $howFar;
        if ($this->idx < 0) {
            $this->idx = 0;
        }
    }
}
// Build the state machine on top of a TextReader over the raw text.
// NOTE(review): StateMachine is not defined in this excerpt, and $myXmlFileContents must be set by the caller.
$myStateMachine = new StateMachine( new TextReader($myXmlFileContents) );
// Presumably returns the text found between the two tags — confirm against the StateMachine implementation.
$found = $myStateMachine->getTagContents("<some:tag>", "</some:tag>");
$myStateMachine = new StateMachine( new TextReader($myXmlFileContents) );
$found = $myStateMachine->getTagContents("<some:tag>", "</some:tag>");
与我的其他答案不同,此版本将读取带有嵌套标记的标记:
// Sample input: deliberately malformed markup (a dangling "</", an unclosed <b>, a stray </p>)
// that contains one flat <some:tag> section and one <some:tag> section with another nested inside it.
$text = "
<div>
Some content 1
</
<some:tag>
Section 1
</some:tag>
<b>Some content 2
<some:tag>
Section 2
<some:tag>
Section 3
</some:tag>
</some:tag>
Some content 3
</p>
</div>
";
// Feed the sample to the Parser through a TextReader.
$parser = new Parser( new TextReader($text) );
// Collect every <some:tag> section the parser finds into $found.
$found = $parser->findTags("<some:tag>", "</some:tag>");
/**
 * Minimal forward-reading cursor over a string, with the ability to step back.
 */
class TextReader {
    /** @var int Index of the next character that nextChar() will return. */
    private $idx = 0;
    /** @var string The text being read. */
    private $reading;
    /** @var int Index of the final character (-1 when the text is empty). */
    private $lastIdx;

    /**
     * @param string $reading Text to read; it is cast to string before use.
     */
    public function __construct($reading) {
        $this->reading = (string) $reading;
        $this->lastIdx = strlen($this->reading) - 1;
    }

    /**
     * @return bool True while at least one unread character remains.
     */
    public function hasMore() {
        // Must be "<=": the character sitting at lastIdx is still unread.
        // Comparing with "<" made the final character of the text unreachable.
        return $this->idx <= $this->lastIdx;
    }

    /**
     * Return the next character and advance the cursor.
     *
     * @return string|null The next character, or null once the text is exhausted.
     */
    public function nextChar() {
        if (!$this->hasMore()) {
            return null;
        }
        return $this->reading[$this->idx++];
    }

    /**
     * Move the cursor backwards, stopping at the start of the text.
     *
     * @param int $howFar Number of characters to step back.
     */
    public function rewind($howFar) {
        $this->idx -= $howFar;
        if ($this->idx < 0) {
            $this->idx = 0;
        }
    }
}
/**
 * Pulls balanced start-tag/end-tag sections out of a character reader.
 *
 * The reader only has to provide hasMore(), nextChar() and rewind($n); the
 * surrounding markup does not have to be well formed because matching is done
 * on the literal tag strings.
 */
class Parser {
    /** @var object Reader exposing hasMore(), nextChar() and rewind($howFar). */
    private $TextReader;

    /**
     * @param object $TextReader Source of characters to scan.
     */
    public function __construct($TextReader) {
        $this->TextReader = $TextReader;
    }

    /**
     * Collect every remaining outermost section from the reader.
     *
     * @param string $startTagName Literal opening tag, e.g. "<some:tag>".
     * @param string $endTagName   Literal closing tag, e.g. "</some:tag>".
     * @return string[] Sections in document order, each including its own tags.
     */
    public function findTags($startTagName, $endTagName) {
        $found = [];
        while (($next = $this->findNextTag($startTagName, $endTagName)) !== null) {
            $found[] = $next;
        }
        return $found;
    }

    /**
     * Read the next outermost section, keeping nested sections inside it.
     *
     * @param string $startTagName Literal opening tag.
     * @param string $endTagName   Literal closing tag.
     * @return string|null The section from its opening tag through the matching
     *                     closing tag, or null when no complete section remains.
     */
    public function findNextTag($startTagName, $endTagName) {
        $startLength = strlen($startTagName);
        $endLength = strlen($endTagName);
        // An empty tag would match at every position; treat it as "nothing to find".
        if ($startLength === 0 || $endLength === 0) {
            return null;
        }

        // Skip ahead to the next opening tag; the reader is left on its first character.
        if ($this->readForTag($startTagName) === null) {
            return null;
        }

        $nests = 0;
        $readSoFar = "";
        while ($this->TextReader->hasMore()) {
            $readSoFar .= $this->TextReader->nextChar();

            // Test right after each read so that the match ends exactly on the
            // closing tag (no extra trailing character is consumed) and a tag
            // that finishes on the last character of the input is still seen.
            if (substr($readSoFar, -$startLength) === $startTagName) {
                $nests++;
            } elseif (substr($readSoFar, -$endLength) === $endTagName) {
                $nests--;
                if ($nests <= 0) {
                    return $readSoFar;
                }
            }
        }

        // Input ended while a section was still open.
        return null;
    }

    /**
     * Read from the reader until the given tag has just been consumed.
     *
     * On success the reader is rewound to the first character of the tag.
     *
     * @param string $tagName Literal tag to look for.
     * @return string|null Everything read up to and including the tag, or null
     *                     when the input ends before the tag is found.
     */
    private function readForTag($tagName) {
        $readSoFar = "";
        $tagNameLength = strlen($tagName);
        while ($this->TextReader->hasMore()) {
            $readSoFar .= $this->TextReader->nextChar();
            // Compare the tail of what has been read against the tag.
            if (substr($readSoFar, -$tagNameLength) === $tagName) {
                $this->TextReader->rewind($tagNameLength);
                return $readSoFar;
            }
        }
        return null;
    }
}
$text=”
部分内容1
结果是:
/**
 * Extracts every outermost <tag>...</tag> section from a string by plain
 * substring search, so the surrounding markup does not need to be well formed.
 *
 * Usage: (new TagExtractor())->setTag('some:tag')->setContent($text)->getTagContent()
 */
class TagExtractor {
    /** @var string|null Text to search. */
    public $content;
    /** @var string|null Tag name without angle brackets, e.g. "some:tag". */
    public $tag;

    /**
     * Return each outermost section, including its opening and closing tag.
     *
     * Nested sections of the same tag stay inside the section that contains
     * them. A section whose closing tag is missing extends to the end of the
     * content.
     *
     * @return string[] Sections in document order.
     */
    public function getTagContent() {
        $result = [];
        $startTag = "<{$this->getTag()}>";
        $endTag = "</{$this->getTag()}>";
        // Cast so that unset content behaves like an empty string.
        $content = (string) $this->getContent();

        $offset = strpos($content, $startTag);
        while ($offset !== false) {
            $end = $this->findEnd($content, $offset, $startTag, $endTag);
            $result[] = substr($content, $offset, $end - $offset);
            // findEnd() never returns more than strlen($content), so this offset is always valid.
            $offset = strpos($content, $startTag, $end);
        }
        return $result;
    }

    /**
     * Find where the section that opens at $offset ends.
     *
     * @param string $content  Text being searched.
     * @param int    $offset   Offset of an opening tag in $content.
     * @param string $startTag Literal opening tag.
     * @param string $endTag   Literal closing tag.
     * @param int    $counter  Number of sections already open at $offset (normally 1).
     * @return int Offset just past the matching closing tag, or strlen($content)
     *             when the section is never closed.
     */
    public function findEnd($content, $offset, $startTag, $endTag, $counter = 1) {
        $length = strlen($content);
        $endLength = strlen($endTag);
        $cursor = $offset;

        while (true) {
            // Step past the tag found last so the same occurrence is not matched again.
            $cursor++;
            if ($cursor > $length) {
                return $length;
            }

            $nextEnd = strpos($content, $endTag, $cursor);
            if ($nextEnd === false) {
                // Unterminated section: let it run to the end of the content
                // instead of returning an offset that may lie outside the string.
                return $length;
            }

            $nextStart = strpos($content, $startTag, $cursor);
            if ($nextStart !== false && $nextStart < $nextEnd) {
                // Another section opens before the next close: one level deeper.
                $counter++;
                $cursor = $nextStart;
                continue;
            }

            $counter--;
            $cursor = $nextEnd;
            if ($counter <= 0) {
                return $nextEnd + $endLength;
            }
        }
    }

    // <editor-fold defaultstate="collapsed" desc="Getters and setters">
    /**
     * @return string|null Text to search.
     */
    public function getContent() {
        return $this->content;
    }

    /**
     * @param string $content Text to search.
     * @return $this
     */
    public function setContent($content) {
        $this->content = $content;
        return $this;
    }

    /**
     * @return string|null Tag name without angle brackets.
     */
    public function getTag() {
        return $this->tag;
    }

    /**
     * @param string $tag Tag name without angle brackets.
     * @return $this
     */
    public function setTag($tag) {
        $this->tag = $tag;
        return $this;
    }
    // </editor-fold>
}
类标记提取器{
公共内容;
公共$标签;
公共函数getTagContent(){
$result=[];
$startTag=“getTag()}>”;
$endTag=“getTag()}>”;
$content=$this->getContent();
$offset=strpos($content,$startTag);
而($offset!==false){
$end=$this->findEnd($content、$offset、$startTag、$endTag);
$result[]=substr($content,$offset,$end-$offset);
$offset=strpos($content,$startTag,$end);
}
返回$result;
}
公共函数findEnd($content、$offset、$startTag、$endTag、$counter=1){
$offset++;
$nextStart=strpos($content、$startTag、$offset);
$nextEnd=strpos($content,$endTag,$offset);
如果($nextEnd==false){
$counter=0;
}elseif($nextStart<$nextEnd&&$nextStart!==false){
$counter++;
$offset=$nextStart;
}elseif($nextEnd<$nextStart | |($nextStart==false&$nextEnd!==false)){
$counter--;
$offset=$nextEnd;
}
如果($counter==0){
返回$offset+strlen($endTag);
}
返回$this->findEnd($content、$offset、$startTag、$endTag、$counter);
}
//
公共函数getContent(){
返回$this->content;
}
公共函数setContent($content){
$this->content=$content;
退还$this;
}
公共函数getTag(){
返回$this->tag;
}
公共函数setTag($tag){
$this->tag=$tag;
退还$this;
}
//
}
似乎几乎可以正常工作,但它不会返回标记结束之后的内容。
@Petah 它会一直读取,直到找到您要查找的标记的结尾。哦,我明白了,您希望在原始标记内部允许出现其他 <some:tag>。我会补充一些内容。
@Petah 我添加了另一个支持嵌套的答案。它并不是真正的状态机,所以我把它作为另一个答案单独添加了 :)
似乎是有效的,尽管我最终得到了这个: