Parsing 解析维基百科{{Location map}}模板

Parsing 解析维基百科{{Location map}}模板,parsing,wikipedia,text-parsing,Parsing,Wikipedia,Text Parsing,我想解析包含{{Location map}}模板的Wikipedia电厂列表。在我的例子中,我使用的是德语翻译,但这不应该改变基本过程 如何从这样的代码中获取label=、lat=、lon=和region=参数? 对于像BeautifulSoup这样的html解析器来说,这可能不是什么,而是awk {{ Positionskarte+ | Tadschikistan | maptype = relief | width = 600 | float = ri

我想解析包含{{Location map}}模板的Wikipedia电厂列表。在我的例子中,我使用的是德语翻译,但这不应该改变基本过程

如何从这样的代码中获取label=、lat=、lon=和region=参数? 对于像BeautifulSoup这样的html解析器来说,这可能不是什么,而是awk

{{ Positionskarte+
 | Tadschikistan
 | maptype     = relief
 | width       = 600
 | float       = right
 | caption     =
 | places      =
 {{ Positionskarte~
  | Tadschikistan
  | label      = <small>[[Talsperre Baipasa|Baipasa]]</small>
  | marktarget =
  | mark       = Blue pog.svg
  | position   = right
  | lat        = 38.267584
  | long       = 69.123906
  | region     = TJ
  | background = #FEFEE9
 }}
 {{ Positionskarte~
  | Tadschikistan
  | label      = <small>[[Kraftwerk Duschanbe|Duschanbe]]</small>
  | marktarget =
  | mark       = Red pog.svg
  | position   = left
  | lat        = 38.5565
  | long       = 68.776
  | region     = TJ
  | background = #FEFEE9
 }}
...
}}
{{Positionskarte+
|塔吉克斯坦
|地图类型=浮雕
|宽度=600
|浮动=右
|标题=
|地点=
{{Positionskarte~
|塔吉克斯坦
|标签=[[Talsperre Baipasa | Baipasa]]
|标记目标=
|标记=蓝色pog.svg
|位置=右
|lat=38.267584
|长=69.123906
|区域=TJ
|背景=#FEFEE9
}}
{{Positionskarte~
|塔吉克斯坦
|标签=[[Kraftwerk Duschanbe | Duschanbe]]
|标记目标=
|标记=红色pog.svg
|位置=左
|lat=38.5565
|长=68.776
|区域=TJ
|背景=#FEFEE9
}}
...
}}

提前谢谢

只需使用正则表达式提取信息即可。 例如这样(
PHP

$k=“{{Positionskarte+
|塔吉克斯坦
|地图类型=浮雕
|宽度=600
|浮动=右
|标题=
|地点=
{{Positionskarte~
|塔吉克斯坦
|标签=[[Talsperre Baipasa | Baipasa]]
|标记目标=
|标记=蓝色pog.svg
|位置=右
|lat=38.267584
|长=69.123906
|区域=TJ
|背景=#FEFEE9
}}
{{Positionskarte~
|塔吉克斯坦
|标签=[[Kraftwerk Duschanbe | Duschanbe]]
|标记目标=
|标记=红色pog.svg
|位置=左
|lat=38.5565
|长=68.776
|区域=TJ
|背景=#FEFEE9
}}
}}";
$items=explode(“Positionskarte~”,$k);
$result=[];
foreach($items作为$item){
$info=[];
$pattern1='/label\s+=\s+(.+)/';
预匹配($pattern1,$item,$matches);
如果(!空($matches)){
$info['label']=$matches[1];
}
$pattern2='/lat\s+=\s+(.+)/';
预匹配($pattern2,$item,$matches);
如果(!空($matches)){
$info['lat']=$matches[1];
}
$pattern3='/long\s+=\s+(.+)/';
预匹配($pattern3,$item,$matches);
如果(!空($matches)){
$info['long']=$matches[1];
}
$pattern4='/region\s+=\s+(.+)/';
预匹配($pattern4,$item,$matches);
如果(!空($matches)){
$info['region']=$matches[1];
}
如果(!空($info)){
$result[]=$info;
}
}
var_dump($结果);

谢谢!是否可以在管道符号处拆分标签?我想我可以用str|u replace删除“”是的,这是可能的:
$labelStringParts=explode('|',$labelString)
如果您阅读了有关
preg_match
explode
str_replace
函数的文档,您将能够用
PHP
中的信息编写解析任何字符串。或者对你最喜欢的编程语言也这样做。它对{{Location map~ | rusia | label=[[Balakovo核电厂| Balakovo]]| lat=52.09121454 | long=47.95528861 | position=bottom | mark=Location dot red.svg}有何作用??
$k = "{{ Positionskarte+
 | Tadschikistan
 | maptype     = relief
 | width       = 600
 | float       = right
 | caption     =
 | places      =
 {{ Positionskarte~
  | Tadschikistan
  | label      = <small>[[Talsperre Baipasa|Baipasa]]</small>
  | marktarget =
  | mark       = Blue pog.svg
  | position   = right
  | lat        = 38.267584
  | long       = 69.123906
  | region     = TJ
  | background = #FEFEE9
 }}
 {{ Positionskarte~
  | Tadschikistan
  | label      = <small>[[Kraftwerk Duschanbe|Duschanbe]]</small>
  | marktarget =
  | mark       = Red pog.svg
  | position   = left
  | lat        = 38.5565
  | long       = 68.776
  | region     = TJ
  | background = #FEFEE9
 }}
}}";

$items = explode("Positionskarte~", $k);

$result = [];

foreach ($items as $item) {
    $info = [];
    $pattern1 = '/label\s+=\s+(.+)/';
    preg_match($pattern1, $item, $matches);
    if (!empty($matches)) {
        $info['label'] = $matches[1];       
    }
    $pattern2 = '/lat\s+=\s+(.+)/';
    preg_match($pattern2, $item, $matches);
    if (!empty($matches)) {
        $info['lat'] = $matches[1];     
    }
    $pattern3 = '/long\s+=\s+(.+)/';
    preg_match($pattern3, $item, $matches);
    if (!empty($matches)) {
    $info['long'] = $matches[1];        
    }

    $pattern4 = '/region\s+=\s+(.+)/';
    preg_match($pattern4, $item, $matches);
    if (!empty($matches)) {
        $info['region'] = $matches[1];      
    }

    if(!empty($info)) {
        $result[] = $info;
    }
}

var_dump($result);