有没有更高效的方法用 Python 按空格分割带引号的字符串？_Python_Lexical Analysis

有没有更高效的方法用 Python 按空格分割带引号的字符串？

python

用python按空间分割带引号的字符串更有效吗？,python,lexical-analysis,Python,Lexical Analysis,最近我正在处理nginx的访问日志，用python分析工作我找到了使用shlex根据但是它真的很慢，分析2000行日志需要1.2秒以上。我的nginx服务器每秒生成2500多行因此，我尝试了使用re或更为自然（和粗糙）的方法来索引字符串这些代码在虚拟机中运行，对于2000行日志来说，这两种代码都需要0.5秒以上的时间我还有其他选择来提高效率吗提前谢谢这是我的密码 import re import time import datetime line = '0.278 0.264 11

最近我正在处理nginx的访问日志，用python分析工作

我根据相关资料找到了使用 shlex 进行分割的方法，

但是它真的很慢，分析2000行日志需要1.2秒以上。我的nginx服务器每秒生成2500多行

因此，我尝试了使用 re，或者用更直接（也更粗糙）的字符串索引方法。

这些代码在虚拟机中运行，对于2000行日志来说，这两种代码都需要0.5秒以上的时间

我还有其他选择来提高效率吗

提前谢谢

这是我的代码

import re
import time
import datetime
# Sample access-log entry used as the input for convert() in the benchmark below.
line = '0.278 0.264 113.116.52.174 - 10.10.3.41:20080  [08/Apr/2012:23:59:08 +0800] shenzhen.anjuke.com "GET /ajax/propext/?proid=104178677&commid=97047&brokerid=262646&rand=0.905798233768953 HTTP/1.0" 200 10914 "http://shenzhen.anjuke.com/prop/view/104178677" "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; 360SE)" "-" "-" - "114.80.230.198 026AF756-C5DF-3EDA-B96A-F31A39F7C65E"'
# Compiled once at import time: convert() runs once per log line, so building
# the patterns inside the function repeats the same setup on every call.
_WHITESPACE_RE = re.compile(r"\s+")
_METHOD_RE = re.compile(r"^((?:GET|POST|OPTIONS))\s+?.*HTTP\/1\.[10]$")
_STATUS_BYTES_RE = re.compile(r"^\s+(\d{1,3})\s+(\d+)")
_PORT_SUFFIX_RE = re.compile(r":\d+$")
_REQUEST_URI_RE = re.compile(r"^[A-Z]+\s+?(.*)HTTP\/1\.[10]$")
_SERVER_GUID_RE = re.compile(r"^([0-9\.]+)\s+(.*)")


def convert(line):
    """Parse one access-log line into a dict of typed fields.

    The line is first cut on double quotes; the text before the first quote
    is then cut on whitespace. Fields are picked out of those two lists by
    fixed position, so the line must follow the layout of the sample entry.

    Returned keys:
        request_time, upstream_response_time: the first two tokens, read as
            floats, multiplied by 1000 and truncated to int.
            upstream_response_time is -1 when the token is '-'.
        remote_addr, upstream_addr (port suffix removed), host: str.
        year, date (month * 100 + day), hour, time (minute): int, taken from
            the bracketed timestamp.
        sec: float from time.mktime(). The token that follows the timestamp
            (e.g. '+0800') is not consulted, so the value depends on the
            local time zone of the machine running this code.
        method: 'GET', 'POST' or 'OPTIONS', or '' when the request string
            does not match.
        request_uri: the *list* returned by findall() -- empty, or holding
            one string that still carries the space preceding 'HTTP/1.x'.
        status, bytes_sent: int.
        http_referer, user_agent, gzip_ratio, http_x_forwarded_for,
        server_addr, guid: str.

    Raises:
        IndexError: if the line has fewer quoted or unquoted segments than
            expected, or the status/bytes or server/guid segment does not
            match its pattern.
        ValueError: if a numeric field or the timestamp cannot be parsed.
    """
    quoted = line.split('"')
    head = _WHITESPACE_RE.split(quoted[0])

    methods = _METHOD_RE.findall(quoted[1])
    http_method = methods[0] if methods else ''

    status_text, bytes_text = _STATUS_BYTES_RE.findall(quoted[2])[0]
    status = int(status_text)
    bytes_sent = int(bytes_text)

    upstream_addr = _PORT_SUFFIX_RE.sub("", head[4])
    request_time = int(float(head[0]) * 1000)
    if head[1] == '-':
        upstream_response_time = -1
    else:
        upstream_response_time = int(float(head[1]) * 1000)
    remote_addr = head[2]
    # Whitespace-split tokens cannot contain spaces, so no stripping is needed.
    host = head[7]

    logdatetime = head[5].replace('[', '')
    dt = datetime.datetime.strptime(logdatetime, "%d/%b/%Y:%H:%M:%S")
    sec = time.mktime(dt.timetuple())

    request_uri = _REQUEST_URI_RE.findall(quoted[1])
    http_referer = quoted[3]
    user_agent = quoted[5]
    gzip_ratio = quoted[7]
    http_x_forwarded_for = quoted[9]
    server_addr, guid = _SERVER_GUID_RE.findall(quoted[11])[0]

    return {
        "hour": dt.hour,
        "year": dt.year,
        "date": dt.month * 100 + dt.day,
        "time": dt.minute,
        "sec": sec,
        "request_time": request_time,
        "upstream_response_time": upstream_response_time,
        "remote_addr": remote_addr,
        "upstream_addr": upstream_addr,
        "host": host,
        "method": http_method,
        "request_uri": request_uri,
        "status": status,
        "bytes_sent": bytes_sent,
        "http_referer": http_referer,
        "user_agent": user_agent,
        "gzip_ratio": gzip_ratio,
        "http_x_forwarded_for": http_x_forwarded_for,
        "server_addr": server_addr,
        "guid": guid,
    }
# Benchmark: call convert() 12000 times and print the elapsed wall-clock
# seconds after every 2000 calls.
t2 = time.time()
count = 0
for i in range(12000):
    convert(line)
    count += 1
    if count % 2000 == 0:
        t1 = t2
        t2 = time.time()
        # Parenthesised so the statement is valid on both Python 2 and 3.
        print(str(t2 - t1))

以及基于索引的方式：

import time
import datetime
# Sample access-log entry used as the input for convert() in the benchmark below.
line = '0.278 0.264 113.116.52.174 - 10.10.3.41:20080  [08/Apr/2012:23:59:08 +0800] shenzhen.anjuke.com "GET /ajax/propext/?proid=104178677&commid=97047&brokerid=262646&rand=0.905798233768953 HTTP/1.0" 200 10914 "http://shenzhen.anjuke.com/prop/view/104178677" "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; 360SE)" "-" "-" - "114.80.230.198 026AF756-C5DF-3EDA-B96A-F31A39F7C65E"'

def pair(l):
    """Yield consecutive, non-overlapping 2-tuples from the sequence `l`.

    Elements 0 and 1 form the first tuple, 2 and 3 the second, and so on.
    A sequence of odd length raises IndexError when the final, incomplete
    pair is reached.
    """
    position = 0
    total = len(l)
    while position < total:
        yield (l[position], l[position + 1])
        position += 2

def convert(line):
    """Split `line` on spaces that lie outside double-quoted sections.

    Quoted sections are kept verbatim, including their quote characters, and
    stay attached to any unquoted text that touches them. A run of several
    spaces counts as a single separator, so no empty fields are produced.

    Returns:
        A list with every field of the line, in order, or None when the line
        contains no double quote or an odd number of them (unbalanced quotes).
    """
    quote_count = line.count('"')
    if quote_count == 0 or quote_count % 2 != 0:
        return None

    fields = []
    pending = []  # fragments of the field currently being assembled
    # Cutting on the quote character alternates the chunks: even positions
    # are outside quotes, odd positions are inside.
    for index, chunk in enumerate(line.split('"')):
        if index % 2:
            pending.append('"' + chunk + '"')
            continue
        pieces = chunk.split(" ")
        # The first piece continues the pending field; every further piece
        # follows a separator and therefore starts a new field.
        pending.append(pieces[0])
        for piece in pieces[1:]:
            field = "".join(pending)
            if field:
                fields.append(field)
            pending = [piece]

    field = "".join(pending)
    if field:
        fields.append(field)
    return fields


def allindices(string, sub):
    """Return a list of every index at which `sub` occurs in `string`.

    Indices are in ascending order. Each search resumes one character after
    the previous hit, so overlapping occurrences are all reported.
    """
    positions = []
    cursor = string.find(sub)
    while cursor != -1:
        positions.append(cursor)
        cursor = string.find(sub, cursor + 1)
    return positions

# Benchmark: call convert() 12000 times and print the elapsed wall-clock
# seconds after every 2000 calls.
t2 = time.time()
count = 0
for i in range(12000):
    convert(line)
    count += 1
    if count % 2000 == 0:
        t1 = t2
        t2 = time.time()
        # Parenthesised so the statement is valid on both Python 2 and 3.
        print(str(t2 - t1))

导入时间
导入日期时间
line='0.278 0.264 113.116.52.174-10.10.3.41:20080[08/Apr/2012:23:59:08+0800]shenzhen.anjuke.com“GET/ajax/propext/？proid=104178677&commid=97047&brokerid=262646&rand=0.905798233768953 HTTP/1.0“200 10914”http://shenzhen.anjuke.com/prop/view/104178677“Mozilla/4.0（兼容；MSIE 6.0；Windows NT 5.1；SV1；360SE）”-“”-“114.80.230.198 026AF756-C5DF-3EDA-B96A-F31A39F7C65E”
def对（l）：
对于范围内的i（0，len（l），2）：
收益率（l[i]，l[i+1]）
def转换（行）：
行=行。替换（“，”）
quotes\u positions=allindices（行“\”）
如果len（引用位置）qs和s=0时：
listindex.append（一）
i=string.find（sub，i+1）
返回列表索引
t2=时间。时间（）
计数=0
对于范围内的i（12000）：
转换（行）
计数+=1
如果计数%2000==0：
t1=t2
t2=时间。时间（）
打印str（t2-t1）

这看起来有点像 CSV；我在想，是否可以变通地借用 csv 模块来处理这个问题：

>>> for row in csv.reader([line], delimiter=' '):
...     print repr(row)
... 
['0.278', '0.264', '113.116.52.174', '-', '10.10.3.41:20080', '', '[08/Apr/2012:23:59:08', '+0800]', 'shenzhen.anjuke.com', 'GET /ajax/propext/?proid=104178677&commid=97047&brokerid=262646&rand=0.905798233768953 HTTP/1.0', '200', '10914', 'http://shenzhen.anjuke.com/prop/view/104178677', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; 360SE)', '-', '-', '-', '114.80.230.198 026AF756-C5DF-3EDA-B96A-F31A39F7C65E']

我只是基于示例行编写了一个正则表达式，实际上我不知道某些字段的含义，所以我使用了占位符名称，您可以将它们重命名为更有意义的字段。在我的机器上，这个片段比第一个快4~5倍

# One pattern for the whole log line. re.VERBOSE ignores layout whitespace,
# so every literal space separator is written as the character class [ ].
# Group names mirror the field names used elsewhere on this page where the
# meaning is known; field1 and field2 are placeholders -- rename as needed.
# NOTE(review): upstream_response_time only accepts digits and dots here, so
# a line that logs '-' in that position will not match.
log_line_re = re.compile(
    r"""
    (?P<request_time>[0-9.]+)
    [ ]
    (?P<upstream_response_time>[0-9.]+)
    [ ]
    (?P<remote_addr>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})
    [ ]
    (?P<field1>.+?)
    [ ]
    (?P<upstream_addr>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5})
    [ ]+
    \[(?P<time_local>.+?)\]
    [ ]
    (?P<host>.+?)
    [ ]
    "
    (?P<method>[A-Z]+)
    [ ]
    (?P<request_uri>.+?)
    [ ]
    HTTP/(?P<http_version>[0-9.]+)
    "
    [ ]
    (?P<status>\d{3})
    [ ]
    (?P<bytes_sent>\d+)
    [ ]
    "
    (?P<http_referer>.+?)
    "
    [ ]
    "(?P<user_agent>.+?)"
    [ ]
    "(?P<gzip_ratio>.+?)"
    [ ]
    "(?P<http_x_forwarded_for>.+?)"
    [ ]
    (?P<field2>.+?)
    [ ]
    "
    (?P<server_addr>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})
    [ ]
    (?P<guid>.+?)
    "
    """, re.VERBOSE)


def convert(line):
    """Return the named fields of one access-log line as a dict of strings.

    Raises:
        AttributeError: if `line` does not match log_line_re, because
            match() then returns None.
    """
    return log_line_re.match(line).groupdict()

代码太多了，我不打算坐在这里琢磨它到底要做什么。你能描述一下具体的解析规则，和/或给出示例输入及对应的输出吗？

也许你的思路本身就有点偏了？可以考虑修改服务器上的日志格式选项来配合你的解析器：比如让它输出类似 JSON 的格式，或者用一个不常见的分隔符（原文此处的示例字符已丢失）代替空白。

是的，我怎么能忘了用单个正则表达式呢 :D