Python 如何处理多行日志文件并转换为csv
假设我有一个如下所示的日志文件:
'2021-05-26T09:06:42Z UTC [ db=dev user=rdsdb pid=18771 userid=1 xid=20341068 ]' LOG: SET TRANSACTION READ ONLY;
'2021-05-26T09:06:42Z UTC [ db=dev user=rdsdb pid=18771 userid=1 xid=20341068 ]' LOG: SET STATEMENT_TIMEOUT TO 300000;
'2021-05-26T09:06:42Z UTC [ db=dev user=rdsdb pid=18771 userid=1 xid=20341068 ]' LOG: /* hash: 720a01bd6ef3747b7f0585c0a70c01e9 */
select logtime, tbl_id, trim(tbl_name) as tbl_name, col_id, src_encode, tgt_encode, scan_rows,
case
when command_phase = 'Add shadow column complete' then 0
when command_phase = 'Reset Table Partition Manager complete' then 1
when command_phase like 'Shadow Col corrupt sorted regions%' then 2
when command_phase like 'shadow cols must contain same data%' then 3
when command_phase like 'Shadow Col not conform to range partition%' then 4
when command_phase = 'Data copy phase 1 complete' then 5
when command_phase = 'Data copy phase 2 complete' then 6
when command_phase = 'Drop existing shadow column complete' then 7
else -1
end as command_phase,
t2.metadatawritten as committed
from stl_alter_column_encode_events t1, stl_commit_stats t2
where logtime > getdate() - INTERVAL '1 day' and node = -1 and t1.xid = t2.xid;
'2021-05-26T09:06:42Z UTC [ db=dev user=rdsdb pid=18771 userid=1 xid=20341068 ]' LOG: SELECT pg_catalog.stll_alter_column_encode_events.logtime AS logtime, pg_catalog.stll_alter_column_encode_events.tbl_id AS tbl_id, btrim( pg_catalog.stll_alter_column_encode_events.tbl_name ) AS tbl_name, pg_catalog.stll_alter_column_encode_events.col_id AS col_id, pg_catalog.stll_alter_column_encode_events.src_encode AS src_encode, pg_catalog.stll_alter_column_encode_events.tgt_encode AS tgt_encode, pg_catalog.stll_alter_column_encode_events.scan_rows AS scan_rows, CASE WHEN pg_catalog.stll_alter_column_encode_events.command_phase = 'Add shadow column complete'::Char(26) THEN 0 WHEN pg_catalog.stll_alter_column_encode_events.command_phase = 'Reset Table Partition Manager complete'::Char(38) THEN 1 WHEN pg_catalog.stll_alter_column_encode_events.command_phase LIKE 'Shadow Col corrupt sorted regions%' THEN 2 WHEN pg_catalog.stll_alter_column_encode_events.command_phase LIKE 'shadow cols must contain same data%' THEN 3 WHEN pg_catalog.stll_alter_column_encode_events.command_phase LIKE 'Shadow Col not conform to range partition%' THEN 4 WHEN pg_catalog.stll_alter_column_encode_events.command_phase = 'Data copy phase 1 complete'::Char(26) THEN 5 WHEN pg_catalog.stll_alter_column_encode_events.command_phase = 'Data copy phase 2 complete'::Char(26) THEN 6 WHEN pg_catalog.stll_alter_column_encode_events.command_phase = 'Drop existing shadow column complete'::Char(36) THEN 7 ELSE -1 END AS command_phase, pg_catalog.stll_commit_stats.metadatawritten AS committed FROM pg_catalog.stll_alter_column_encode_events, pg_catalog.stll_commit_stats WHERE pg_catalog.stll_alter_column_encode_events.xid = pg_catalog.stll_commit_stats.xid AND pg_catalog.stll_commit_stats.node = -1 AND pg_catalog.stll_alter_column_encode_events.logtime > getdate() - interval '1 day'::Interval;
'2021-05-26T09:06:42Z UTC [ db=dev user=rdsdb pid=18771 userid=1 xid=20341068 ]' LOG: COMMIT;
'2021-05-26T09:06:42Z UTC [ db=dev user=rdsdb pid=18771 userid=1 xid=20341069 ]' LOG: SET query_group to ''
我想转换成csv,如何找到合适的分隔符?
我希望 csv 的记录是这样的:等号左边是列名,右边是对应的值
timestamp='2021-05-26T09:06:42Z UTC
db=dev
user=rdsdb
pid=18771
userid=1
xid=20341068
query=SET TRANSACTION READ ONLY;
对于多行查询:
timestamp='2021-05-26T09:06:42Z UTC
db=dev
user=rdsdb
pid=18771
userid=1
xid=20341068
query='select logtime, tbl_id, trim(tbl_name) as tbl_name... t2.xid;';
我必须使用正则表达式来创建捕获组吗?如何将其分解为 csv 记录?问题是,有时查询部分在同一行上,有时它跨越多行。回答:你并非“必须”使用正则表达式,尽管一个合适的正则表达式高手(我不是)或许能写出不那么脆弱的方案。
下面是一个手动解析的示例,它会生成一个字典(dict)的列表,可以传给 csv.DictWriter。
它不一定非要基于 dict;
你也可以在解析的同时直接写出每条记录,而不是像我这样先把结果累积起来。
import json
## Sample Redshift-style log copied from the question.  Each entry begins
## with a quoted timestamp; a multi-line SQL statement continues on the
## following physical lines until the next quoted timestamp appears.
data = '''
'2021-05-26T09:06:42Z UTC [ db=dev user=rdsdb pid=18771 userid=1 xid=20341068 ]' LOG: SET TRANSACTION READ ONLY;
'2021-05-26T09:06:42Z UTC [ db=dev user=rdsdb pid=18771 userid=1 xid=20341068 ]' LOG: SET STATEMENT_TIMEOUT TO 300000;
'2021-05-26T09:06:42Z UTC [ db=dev user=rdsdb pid=18771 userid=1 xid=20341068 ]' LOG: /* hash: 720a01bd6ef3747b7f0585c0a70c01e9 */
select logtime, tbl_id, trim(tbl_name) as tbl_name, col_id, src_encode, tgt_encode, scan_rows,
case
when command_phase = 'Add shadow column complete' then 0
when command_phase = 'Reset Table Partition Manager complete' then 1
when command_phase like 'Shadow Col corrupt sorted regions%' then 2
when command_phase like 'shadow cols must contain same data%' then 3
when command_phase like 'Shadow Col not conform to range partition%' then 4
when command_phase = 'Data copy phase 1 complete' then 5
when command_phase = 'Data copy phase 2 complete' then 6
when command_phase = 'Drop existing shadow column complete' then 7
else -1
end as command_phase,
t2.metadatawritten as committed
from stl_alter_column_encode_events t1, stl_commit_stats t2
where logtime > getdate() - INTERVAL '1 day' and node = -1 and t1.xid = t2.xid;
'2021-05-26T09:06:42Z UTC [ db=dev user=rdsdb pid=18771 userid=1 xid=20341068 ]' LOG: SELECT pg_catalog.stll_alter_column_encode_events.logtime AS logtime, pg_catalog.stll_alter_column_encode_events.tbl_id AS tbl_id, btrim( pg_catalog.stll_alter_column_encode_events.tbl_name ) AS tbl_name, pg_catalog.stll_alter_column_encode_events.col_id AS col_id, pg_catalog.stll_alter_column_encode_events.src_encode AS src_encode, pg_catalog.stll_alter_column_encode_events.tgt_encode AS tgt_encode, pg_catalog.stll_alter_column_encode_events.scan_rows AS scan_rows, CASE WHEN pg_catalog.stll_alter_column_encode_events.command_phase = 'Add shadow column complete'::Char(26) THEN 0 WHEN pg_catalog.stll_alter_column_encode_events.command_phase = 'Reset Table Partition Manager complete'::Char(38) THEN 1 WHEN pg_catalog.stll_alter_column_encode_events.command_phase LIKE 'Shadow Col corrupt sorted regions%' THEN 2 WHEN pg_catalog.stll_alter_column_encode_events.command_phase LIKE 'shadow cols must contain same data%' THEN 3 WHEN pg_catalog.stll_alter_column_encode_events.command_phase LIKE 'Shadow Col not conform to range partition%' THEN 4 WHEN pg_catalog.stll_alter_column_encode_events.command_phase = 'Data copy phase 1 complete'::Char(26) THEN 5 WHEN pg_catalog.stll_alter_column_encode_events.command_phase = 'Data copy phase 2 complete'::Char(26) THEN 6 WHEN pg_catalog.stll_alter_column_encode_events.command_phase = 'Drop existing shadow column complete'::Char(36) THEN 7 ELSE -1 END AS command_phase, pg_catalog.stll_commit_stats.metadatawritten AS committed FROM pg_catalog.stll_alter_column_encode_events, pg_catalog.stll_commit_stats WHERE pg_catalog.stll_alter_column_encode_events.xid = pg_catalog.stll_commit_stats.xid AND pg_catalog.stll_commit_stats.node = -1 AND pg_catalog.stll_alter_column_encode_events.logtime > getdate() - interval '1 day'::Interval;
'2021-05-26T09:06:42Z UTC [ db=dev user=rdsdb pid=18771 userid=1 xid=20341068 ]' LOG: COMMIT;
'2021-05-26T09:06:42Z UTC [ db=dev user=rdsdb pid=18771 userid=1 xid=20341069 ]' LOG: SET query_group to ''
'''
## -----------------------
## You can clean this up :-)
## -----------------------
def parse_log_entry(log_entry):
    """Split one (possibly re-joined) log entry into its named fields.

    The entry has the shape:
        '<timestamp> [ db=.. user=.. pid=.. userid=.. xid=.. ]' LOG: <query>

    Returns a dict with keys timestamp/db/user/pid/userid/xid/query.
    The query text is truncated to 80 characters, with "..." appended
    when anything was cut off.  This is still a brittle hand parse —
    it assumes the exact spacing shown above.
    """
    ## carve the entry into timestamp / bracketed fields / query text
    parts = log_entry.split(" [ ")
    ts_part, remainder = parts[0], parts[1]
    parts = remainder.split(" ]' LOG: ")
    meta, query = parts[0], parts[1]
    tokens = meta.split(" ")
    ## truncate long (multi-line) queries for readability
    snippet = query[:80] + ("..." if len(query) > 80 else "")
    ## each token is "key=value"; pick out the value by position
    record = {"timestamp": ts_part.strip()}
    for pos, key in enumerate(("db", "user", "pid", "userid", "xid")):
        record[key] = tokens[pos].split("=")[1]
    record["query"] = snippet
    return record
## -----------------------
## -----------------------
## Walk the sample data line by line.  A line that begins with the
## quoted timestamp starts a fresh log entry; any other line is a
## continuation of the current (multi-line) query and is glued onto it.
## -----------------------
nice_logs = []
pending = ""
for raw_line in data.splitlines():
    if raw_line.startswith("'2021-"):
        ## -----------------------
        ## new entry begins: flush the one collected so far (if any),
        ## then start accumulating this line in case the next physical
        ## line(s) belong to it
        ## -----------------------
        if pending:
            nice_logs.append(parse_log_entry(pending))
        pending = raw_line
    else:
        ## NOTE: continuation lines are joined without a separator,
        ## matching the output shown in the question
        pending += raw_line
## -----------------------
## don't forget the very last accumulated entry
## -----------------------
if pending:
    nice_logs.append(parse_log_entry(pending))
## -----------------------
## At this point a csv.DictWriter could consume nice_logs;
## here we just pretty-print the parsed records
## -----------------------
print(json.dumps(nice_logs, indent=4))
## -----------------------
## -----------------------
这将生成一个 dict 的列表,我希望您可以看到如何用 Python 的 csv 模块(例如 csv.DictWriter)把它写成 CSV
[
{
"timestamp": "'2021-05-26T09:06:42Z UTC",
"db": "dev",
"user": "rdsdb",
"pid": "18771",
"userid": "1",
"xid": "20341068",
"query": "SET TRANSACTION READ ONLY;"
},
{
"timestamp": "'2021-05-26T09:06:42Z UTC",
"db": "dev",
"user": "rdsdb",
"pid": "18771",
"userid": "1",
"xid": "20341068",
"query": "SET STATEMENT_TIMEOUT TO 300000;"
},
{
"timestamp": "'2021-05-26T09:06:42Z UTC",
"db": "dev",
"user": "rdsdb",
"pid": "18771",
"userid": "1",
"xid": "20341068",
"query": "/* hash: 720a01bd6ef3747b7f0585c0a70c01e9 */select logtime, tbl_id, trim(tbl_nam..."
},
{
"timestamp": "'2021-05-26T09:06:42Z UTC",
"db": "dev",
"user": "rdsdb",
"pid": "18771",
"userid": "1",
"xid": "20341068",
"query": "SELECT pg_catalog.stll_alter_column_encode_events.logtime AS logtime, pg_catalog..."
},
{
"timestamp": "'2021-05-26T09:06:42Z UTC",
"db": "dev",
"user": "rdsdb",
"pid": "18771",
"userid": "1",
"xid": "20341068",
"query": "COMMIT;"
},
{
"timestamp": "'2021-05-26T09:06:42Z UTC",
"db": "dev",
"user": "rdsdb",
"pid": "18771",
"userid": "1",
"xid": "20341069",
"query": "SET query_group to ''"
}
]
一个快速而简陋的办法就是先把每个条目拼接到一行上——之后你仍然需要从中挑出你想要的字段……