Python:嵌套键值数据解析
我正在尝试创建一个 Python 脚本,用来解析下面这种由键和值组成的日志条目。每个键的值既可能是普通值,也可能是另一组嵌套的键值对。下面是一个例子。嵌套的深度会随我拿到的日志而变化,所以解析必须是动态的;不过每一层嵌套都用大括号括起来。带有键和值的字符串如下所示:
Countries = {
"USA" = 0;
"Spain" = 0;
Connections = 1;
Flights = {
"KLM" = 11;
"Air America" = 15;
"Emirates" = 2;
"Delta" = 3;
};
"Belgium" = 1;
"Czech Republic" = 0;
"Netherlands" = 1;
"Hungary" = 0;
"Luxembourg" = 0;
"Italy" = 0;
};
def parseNestedData(data):
    """Recursively print every non-dict (leaf) value found in ``data``.

    ``data`` may be a leaf value or a dict nested to any depth. Dict values
    are visited in the dict's iteration order and each leaf is printed on
    its own line. Returns None.
    """
    if isinstance(data, dict):
        for value in data.values():
            parseNestedData(value)
    else:
        # print() call: the Python 2 ``print data`` statement is a
        # SyntaxError on Python 3.
        print(data)
import re
class datastruct():
    """Parse one ``Countries = { ... };`` log entry into plain dicts.

    Attributes set by ``__init__``:
      country     -- dict of quoted name -> int count; names that also occur
                     inside the ``Flights`` block are left out
      flight      -- dict of name -> int count taken from the ``Flights`` block
                     (empty when the entry has no such block)
      connections -- int value of the unquoted ``Connections`` key
                     (0 when the key is absent)
    """

    # Raw strings keep ``\s`` a regex escape rather than an invalid string escape.
    _FLIGHTS_RE = re.compile(r'(?:Flights\s=\s*\{)([\s"A-Z=0-9;a-z]*)};')
    # ``[0-9]+`` so counts of 100 or more are not cut to their first two digits.
    _COUNTRY_RE = re.compile(r'("[A-Za-z]+\s?[A-Za-z]*"\s=\s[0-9]+)')
    _CONNECTIONS_RE = re.compile(r'(?:Connections\s=\s)([0-9]+);')

    def __init__(self, data_in):
        """Extract flights, countries and the connection count from ``data_in``."""
        flight_dict = {}
        flight_match = self._FLIGHTS_RE.search(data_in)
        if flight_match:
            # Entries are ';'-terminated, so the split leaves blank pieces.
            for flight in flight_match.group(1).split(';'):
                if flight.strip():
                    key, val = self.split_data(flight)
                    flight_dict[key] = val

        countries_dict = {}
        for country in self._COUNTRY_RE.findall(data_in):
            key, val = self.split_data(country)
            # The country pattern also matches the quoted flight entries.
            if key not in flight_dict:
                countries_dict[key] = val

        connections_match = self._CONNECTIONS_RE.search(data_in)

        self.country = countries_dict
        self.flight = flight_dict
        self.connections = int(connections_match.group(1)) if connections_match else 0

    def split_data(self, data2):
        """Split a ``"key" = value`` fragment into ``(key, int_value)``.

        Surrounding whitespace and double quotes are removed from the key.
        Raises ValueError when the value is not an integer.
        """
        item = data2.split('=')
        key = item[0].strip().strip('"')
        val = int(item[1].strip())
        return key, val
上面的数据也可能有多层嵌套。我想编写一个函数来解析它,并把结果放入某种数据结构(数组或类似的结构),这样我就可以取得某个特定键的值,例如:
print countries.belgium
value should be printed as 1
同样地
print countries.flights.delta
value should be printed as 3.
请注意,输入中并非所有键都带引号(例如 Connections 或 Flights)
有没有什么思路可以让我着手?是否已有能够完成这类解析的 Python 库?迭代数据并检查元素是否是另一个键值对,如果是,则递归调用该函数。大概是这样的:
Countries = {
"USA" = 0;
"Spain" = 0;
Connections = 1;
Flights = {
"KLM" = 11;
"Air America" = 15;
"Emirates" = 2;
"Delta" = 3;
};
"Belgium" = 1;
"Czech Republic" = 0;
"Netherlands" = 1;
"Hungary" = 0;
"Luxembourg" = 0;
"Italy" = 0;
};
def parseNestedData(data):
    """Recursively print every non-dict (leaf) value found in ``data``.

    ``data`` may be a leaf value or a dict nested to any depth. Dict values
    are visited in the dict's iteration order and each leaf is printed on
    its own line. Returns None.
    """
    if isinstance(data, dict):
        for value in data.values():
            parseNestedData(value)
    else:
        # print() call: the Python 2 ``print data`` statement is a
        # SyntaxError on Python 3.
        print(data)
import re
class datastruct():
    """Parse one ``Countries = { ... };`` log entry into plain dicts.

    Attributes set by ``__init__``:
      country     -- dict of quoted name -> int count; names that also occur
                     inside the ``Flights`` block are left out
      flight      -- dict of name -> int count taken from the ``Flights`` block
                     (empty when the entry has no such block)
      connections -- int value of the unquoted ``Connections`` key
                     (0 when the key is absent)
    """

    # Raw strings keep ``\s`` a regex escape rather than an invalid string escape.
    _FLIGHTS_RE = re.compile(r'(?:Flights\s=\s*\{)([\s"A-Z=0-9;a-z]*)};')
    # ``[0-9]+`` so counts of 100 or more are not cut to their first two digits.
    _COUNTRY_RE = re.compile(r'("[A-Za-z]+\s?[A-Za-z]*"\s=\s[0-9]+)')
    _CONNECTIONS_RE = re.compile(r'(?:Connections\s=\s)([0-9]+);')

    def __init__(self, data_in):
        """Extract flights, countries and the connection count from ``data_in``."""
        flight_dict = {}
        flight_match = self._FLIGHTS_RE.search(data_in)
        if flight_match:
            # Entries are ';'-terminated, so the split leaves blank pieces.
            for flight in flight_match.group(1).split(';'):
                if flight.strip():
                    key, val = self.split_data(flight)
                    flight_dict[key] = val

        countries_dict = {}
        for country in self._COUNTRY_RE.findall(data_in):
            key, val = self.split_data(country)
            # The country pattern also matches the quoted flight entries.
            if key not in flight_dict:
                countries_dict[key] = val

        connections_match = self._CONNECTIONS_RE.search(data_in)

        self.country = countries_dict
        self.flight = flight_dict
        self.connections = int(connections_match.group(1)) if connections_match else 0

    def split_data(self, data2):
        """Split a ``"key" = value`` fragment into ``(key, int_value)``.

        Surrounding whitespace and double quotes are removed from the key.
        Raises ValueError when the value is not an integer.
        """
        item = data2.split('=')
        key = item[0].strip().strip('"')
        val = int(item[1].strip())
        return key, val
输出:
>>> Countries = {
"USA" : 0,
"Spain" : 0,
"Connections" : 1,
"Flights" : {
"KLM" : 11,
"Air America" : 15,
"Emirates" : 2,
"Delta" : 3,
},
"Belgium" : 1,
"Czech Republic" : 0,
"Netherlands" : 1,
"Hungary" : 0,
"Luxembourg" : 0,
"Italy" :0
};
>>> Countries
{'Connections': 1,
'Flights': {'KLM': 11, 'Air America': 15, 'Emirates': 2, 'Delta': 3},
'Netherlands': 1,
'Italy': 0,
'Czech Republic': 0,
'USA': 0,
'Belgium': 1,
'Hungary': 0,
'Luxembourg': 0, 'Spain': 0}
>>> parseNestedData(Countries)
1
11
15
2
3
1
0
0
0
1
0
0
0
迭代数据并检查元素是否是另一个键值对,如果是,则递归调用该函数。大概是这样的:
Countries = {
"USA" = 0;
"Spain" = 0;
Connections = 1;
Flights = {
"KLM" = 11;
"Air America" = 15;
"Emirates" = 2;
"Delta" = 3;
};
"Belgium" = 1;
"Czech Republic" = 0;
"Netherlands" = 1;
"Hungary" = 0;
"Luxembourg" = 0;
"Italy" = 0;
};
def parseNestedData(data):
    """Recursively print every non-dict (leaf) value found in ``data``.

    ``data`` may be a leaf value or a dict nested to any depth. Dict values
    are visited in the dict's iteration order and each leaf is printed on
    its own line. Returns None.
    """
    if isinstance(data, dict):
        for value in data.values():
            parseNestedData(value)
    else:
        # print() call: the Python 2 ``print data`` statement is a
        # SyntaxError on Python 3.
        print(data)
import re
class datastruct():
    """Parse one ``Countries = { ... };`` log entry into plain dicts.

    Attributes set by ``__init__``:
      country     -- dict of quoted name -> int count; names that also occur
                     inside the ``Flights`` block are left out
      flight      -- dict of name -> int count taken from the ``Flights`` block
                     (empty when the entry has no such block)
      connections -- int value of the unquoted ``Connections`` key
                     (0 when the key is absent)
    """

    # Raw strings keep ``\s`` a regex escape rather than an invalid string escape.
    _FLIGHTS_RE = re.compile(r'(?:Flights\s=\s*\{)([\s"A-Z=0-9;a-z]*)};')
    # ``[0-9]+`` so counts of 100 or more are not cut to their first two digits.
    _COUNTRY_RE = re.compile(r'("[A-Za-z]+\s?[A-Za-z]*"\s=\s[0-9]+)')
    _CONNECTIONS_RE = re.compile(r'(?:Connections\s=\s)([0-9]+);')

    def __init__(self, data_in):
        """Extract flights, countries and the connection count from ``data_in``."""
        flight_dict = {}
        flight_match = self._FLIGHTS_RE.search(data_in)
        if flight_match:
            # Entries are ';'-terminated, so the split leaves blank pieces.
            for flight in flight_match.group(1).split(';'):
                if flight.strip():
                    key, val = self.split_data(flight)
                    flight_dict[key] = val

        countries_dict = {}
        for country in self._COUNTRY_RE.findall(data_in):
            key, val = self.split_data(country)
            # The country pattern also matches the quoted flight entries.
            if key not in flight_dict:
                countries_dict[key] = val

        connections_match = self._CONNECTIONS_RE.search(data_in)

        self.country = countries_dict
        self.flight = flight_dict
        self.connections = int(connections_match.group(1)) if connections_match else 0

    def split_data(self, data2):
        """Split a ``"key" = value`` fragment into ``(key, int_value)``.

        Surrounding whitespace and double quotes are removed from the key.
        Raises ValueError when the value is not an integer.
        """
        item = data2.split('=')
        key = item[0].strip().strip('"')
        val = int(item[1].strip())
        return key, val
输出:
>>> Countries = {
"USA" : 0,
"Spain" : 0,
"Connections" : 1,
"Flights" : {
"KLM" : 11,
"Air America" : 15,
"Emirates" : 2,
"Delta" : 3,
},
"Belgium" : 1,
"Czech Republic" : 0,
"Netherlands" : 1,
"Hungary" : 0,
"Luxembourg" : 0,
"Italy" :0
};
>>> Countries
{'Connections': 1,
'Flights': {'KLM': 11, 'Air America': 15, 'Emirates': 2, 'Delta': 3},
'Netherlands': 1,
'Italy': 0,
'Czech Republic': 0,
'USA': 0,
'Belgium': 1,
'Hungary': 0,
'Luxembourg': 0, 'Spain': 0}
>>> parseNestedData(Countries)
1
11
15
2
3
1
0
0
0
1
0
0
0
我已经创建了一个示例python脚本来完成这项工作,只需根据需要进行调整即可。它将您的格式转换为嵌套的dict,并且它是动态的 请看这里: 代码: 样本数据的结果:
{'Countries': {'USA': 1, 'Technical Fault': 0, 'No Connections': 0, 'Delayed': 0, 'Connections': {'17 Flights': 0, '10 Flights': 0, '11 Flights': 0, 'More than 25 Flights': 0, '14 Flights': 0, '15 Flights': 0, '12 Flights': 0, '18 Flights': 0, '16 Flights': 0, '1 Flights': 0, '13 Flights': 0}, 'Single Connections': 0, 'Others': 0}}
您可以像普通dict一样获得值:)希望它有帮助…我已经创建了一个示例python脚本来完成这项工作,只需根据您的喜好对其进行调整即可。它将您的格式转换为嵌套的dict,并且它是动态的 请看这里: 代码: 样本数据的结果:
{'Countries': {'USA': 1, 'Technical Fault': 0, 'No Connections': 0, 'Delayed': 0, 'Connections': {'17 Flights': 0, '10 Flights': 0, '11 Flights': 0, 'More than 25 Flights': 0, '14 Flights': 0, '15 Flights': 0, '12 Flights': 0, '18 Flights': 0, '16 Flights': 0, '1 Flights': 0, '13 Flights': 0}, 'Single Connections': 0, 'Others': 0}}
您可以像普通dict一样获得值:)希望它有助于…定义一个类结构来处理和存储信息,可以为您提供如下信息:
Countries = {
"USA" = 0;
"Spain" = 0;
Connections = 1;
Flights = {
"KLM" = 11;
"Air America" = 15;
"Emirates" = 2;
"Delta" = 3;
};
"Belgium" = 1;
"Czech Republic" = 0;
"Netherlands" = 1;
"Hungary" = 0;
"Luxembourg" = 0;
"Italy" = 0;
};
def parseNestedData(data):
    """Recursively print every non-dict (leaf) value found in ``data``.

    ``data`` may be a leaf value or a dict nested to any depth. Dict values
    are visited in the dict's iteration order and each leaf is printed on
    its own line. Returns None.
    """
    if isinstance(data, dict):
        for value in data.values():
            parseNestedData(value)
    else:
        # print() call: the Python 2 ``print data`` statement is a
        # SyntaxError on Python 3.
        print(data)
import re
class datastruct():
    """Parse one ``Countries = { ... };`` log entry into plain dicts.

    Attributes set by ``__init__``:
      country     -- dict of quoted name -> int count; names that also occur
                     inside the ``Flights`` block are left out
      flight      -- dict of name -> int count taken from the ``Flights`` block
                     (empty when the entry has no such block)
      connections -- int value of the unquoted ``Connections`` key
                     (0 when the key is absent)
    """

    # Raw strings keep ``\s`` a regex escape rather than an invalid string escape.
    _FLIGHTS_RE = re.compile(r'(?:Flights\s=\s*\{)([\s"A-Z=0-9;a-z]*)};')
    # ``[0-9]+`` so counts of 100 or more are not cut to their first two digits.
    _COUNTRY_RE = re.compile(r'("[A-Za-z]+\s?[A-Za-z]*"\s=\s[0-9]+)')
    _CONNECTIONS_RE = re.compile(r'(?:Connections\s=\s)([0-9]+);')

    def __init__(self, data_in):
        """Extract flights, countries and the connection count from ``data_in``."""
        flight_dict = {}
        flight_match = self._FLIGHTS_RE.search(data_in)
        if flight_match:
            # Entries are ';'-terminated, so the split leaves blank pieces.
            for flight in flight_match.group(1).split(';'):
                if flight.strip():
                    key, val = self.split_data(flight)
                    flight_dict[key] = val

        countries_dict = {}
        for country in self._COUNTRY_RE.findall(data_in):
            key, val = self.split_data(country)
            # The country pattern also matches the quoted flight entries.
            if key not in flight_dict:
                countries_dict[key] = val

        connections_match = self._CONNECTIONS_RE.search(data_in)

        self.country = countries_dict
        self.flight = flight_dict
        self.connections = int(connections_match.group(1)) if connections_match else 0

    def split_data(self, data2):
        """Split a ``"key" = value`` fragment into ``(key, int_value)``.

        Surrounding whitespace and double quotes are removed from the key.
        Raises ValueError when the value is not an integer.
        """
        item = data2.split('=')
        key = item[0].strip().strip('"')
        val = int(item[1].strip())
        return key, val
请注意,如果数据不完全如我在下面假设的那样,正则表达式可能需要调整。数据可按如下方式设置和引用:
# Example: parse one raw log entry with datastruct and read the values back.
raw_data = 'Countries = { "USA" = 0; "Spain" = 0; Connections = 1; Flights = { "KLM" = 11; "Air America" = 15; "Emirates" = 2; "Delta" = 3; }; "Belgium" = 1; "Czech Republic" = 0; "Netherlands" = 1; "Hungary" = 0; "Luxembourg" = 0; "Italy" = 0;};'
flight_data = datastruct(raw_data)
print("No. Connections:", flight_data.connections)
# The closing parenthesis was missing on this call, which made the whole
# script a SyntaxError.
print("Country 'USA':", flight_data.country['USA'], '\n')
print("Flight 'KLM':", flight_data.flight['KLM'], '\n')
for country, count in flight_data.country.items():
    print("Country: {0} -> {1}".format(country, count))
定义一个类结构来处理和存储信息,可以提供如下功能:
Countries = {
"USA" = 0;
"Spain" = 0;
Connections = 1;
Flights = {
"KLM" = 11;
"Air America" = 15;
"Emirates" = 2;
"Delta" = 3;
};
"Belgium" = 1;
"Czech Republic" = 0;
"Netherlands" = 1;
"Hungary" = 0;
"Luxembourg" = 0;
"Italy" = 0;
};
def parseNestedData(data):
    """Recursively print every non-dict (leaf) value found in ``data``.

    ``data`` may be a leaf value or a dict nested to any depth. Dict values
    are visited in the dict's iteration order and each leaf is printed on
    its own line. Returns None.
    """
    if isinstance(data, dict):
        for value in data.values():
            parseNestedData(value)
    else:
        # print() call: the Python 2 ``print data`` statement is a
        # SyntaxError on Python 3.
        print(data)
import re
class datastruct():
    """Parse one ``Countries = { ... };`` log entry into plain dicts.

    Attributes set by ``__init__``:
      country     -- dict of quoted name -> int count; names that also occur
                     inside the ``Flights`` block are left out
      flight      -- dict of name -> int count taken from the ``Flights`` block
                     (empty when the entry has no such block)
      connections -- int value of the unquoted ``Connections`` key
                     (0 when the key is absent)
    """

    # Raw strings keep ``\s`` a regex escape rather than an invalid string escape.
    _FLIGHTS_RE = re.compile(r'(?:Flights\s=\s*\{)([\s"A-Z=0-9;a-z]*)};')
    # ``[0-9]+`` so counts of 100 or more are not cut to their first two digits.
    _COUNTRY_RE = re.compile(r'("[A-Za-z]+\s?[A-Za-z]*"\s=\s[0-9]+)')
    _CONNECTIONS_RE = re.compile(r'(?:Connections\s=\s)([0-9]+);')

    def __init__(self, data_in):
        """Extract flights, countries and the connection count from ``data_in``."""
        flight_dict = {}
        flight_match = self._FLIGHTS_RE.search(data_in)
        if flight_match:
            # Entries are ';'-terminated, so the split leaves blank pieces.
            for flight in flight_match.group(1).split(';'):
                if flight.strip():
                    key, val = self.split_data(flight)
                    flight_dict[key] = val

        countries_dict = {}
        for country in self._COUNTRY_RE.findall(data_in):
            key, val = self.split_data(country)
            # The country pattern also matches the quoted flight entries.
            if key not in flight_dict:
                countries_dict[key] = val

        connections_match = self._CONNECTIONS_RE.search(data_in)

        self.country = countries_dict
        self.flight = flight_dict
        self.connections = int(connections_match.group(1)) if connections_match else 0

    def split_data(self, data2):
        """Split a ``"key" = value`` fragment into ``(key, int_value)``.

        Surrounding whitespace and double quotes are removed from the key.
        Raises ValueError when the value is not an integer.
        """
        item = data2.split('=')
        key = item[0].strip().strip('"')
        val = int(item[1].strip())
        return key, val
请注意,如果数据不完全如我在下面假设的那样,正则表达式可能需要调整。数据可按如下方式设置和引用:
# Example: parse one raw log entry with datastruct and read the values back.
raw_data = 'Countries = { "USA" = 0; "Spain" = 0; Connections = 1; Flights = { "KLM" = 11; "Air America" = 15; "Emirates" = 2; "Delta" = 3; }; "Belgium" = 1; "Czech Republic" = 0; "Netherlands" = 1; "Hungary" = 0; "Luxembourg" = 0; "Italy" = 0;};'
flight_data = datastruct(raw_data)
print("No. Connections:", flight_data.connections)
# The closing parenthesis was missing on this call, which made the whole
# script a SyntaxError.
print("Country 'USA':", flight_data.country['USA'], '\n')
print("Flight 'KLM':", flight_data.flight['KLM'], '\n')
for country, count in flight_data.country.items():
    print("Country: {0} -> {1}".format(country, count))
谢谢 Himanshu。我怎样才能得到 "Czech Republic" 的值(应该只返回 0)?这是否也需要一些预处理?因为并非所有键都用双引号括起来,例如 Connections。如果您知道 "Czech Republic" 键位于第一层,那么只需执行
data.get('Czech Republic')
数据中的任何键都必须是不可变的,也就是说,它可以是 string、integer 或 tuple 类型。仅写 Connections(不带引号)
无效,这就是我编辑此问题的原因。谢谢 Himanshu。我怎样才能得到 "Czech Republic" 的值(应该只返回 0)?这是否也需要一些预处理?因为并非所有键都用双引号括起来,例如 Connections。如果您知道 "Czech Republic" 键位于第一层,那么只需执行 data.get('Czech Republic')
数据中的任何键都必须是不可变的,也就是说,它可以是 string、integer 或 tuple 类型。仅写 Connections(不带引号)
无效,这就是我编辑问题的原因。您确实需要在答案中包含代码,仅仅给出链接是不够的。@richmondwang,这正是我想要的。然而,这次我的动态字符串如下所示,它导致了一个语法错误:@user2605278 您传入的是什么数据?啊,这是因为前面的键的数值。我会修改它。只要用 {data_string} 括起数据,就不会出现解析错误 :)
您确实需要在答案中包含代码,仅仅给出链接是不够的。@richmondwang,这正是我想要的。然而,这次我的动态字符串如下所示,它导致了一个语法错误:@user2605278 您传入的是什么数据?啊,这是因为前面的键的数值。我将修改它。只需用 {data_string} 将数据括起来,这样就不会出现解析错误 :)