在 Python 中,有没有更高效的方法按空格分割带引号的字符串?

用python按空间分割带引号的字符串更有效吗?,python,lexical-analysis,Python,Lexical Analysis,最近我正在处理nginx的访问日志,用python分析工作 我找到了使用shlex根据 但是它真的很慢,分析2000行日志需要1.2秒以上。我的nginx服务器每秒生成2500多行 因此,我尝试了使用re或更为自然(和粗糙)的方法来索引字符串 这些代码在虚拟机中运行,对于2000行日志来说,这两种代码都需要0.5秒以上的时间 我还有其他选择来提高效率吗 提前谢谢 这是我的密码 import re import time import datetime line = '0.278 0.264 11

最近我正在处理nginx的访问日志,用python分析工作

我根据网上的资料,找到了使用
shlex
的方法

但是它真的很慢,分析2000行日志需要1.2秒以上。我的nginx服务器每秒生成2500多行

因此,我尝试了使用
re
或更为自然(和粗糙)的方法来索引字符串

这些代码在虚拟机中运行,对于2000行日志来说,这两种代码都需要0.5秒以上的时间

我还有其他选择来提高效率吗

提前谢谢

这是我的代码

import re
import time
import datetime
# Sample nginx access-log line; it is the input passed to convert() in the
# timing loop at the bottom of this snippet.
line = '0.278 0.264 113.116.52.174 - 10.10.3.41:20080  [08/Apr/2012:23:59:08 +0800] shenzhen.anjuke.com "GET /ajax/propext/?proid=104178677&commid=97047&brokerid=262646&rand=0.905798233768953 HTTP/1.0" 200 10914 "http://shenzhen.anjuke.com/prop/view/104178677" "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; 360SE)" "-" "-" - "114.80.230.198 026AF756-C5DF-3EDA-B96A-F31A39F7C65E"'
# Patterns are compiled once at import time; convert() is called once per log
# line, so rebuilding (or re-looking-up) them on every call is wasted work.
# All patterns are raw strings so that "\s" / "\d" are not treated as
# (invalid) string escape sequences.
_WHITESPACE_RE = re.compile(r"\s+")
_METHOD_RE = re.compile(r"^((?:GET|POST|OPTIONS))\s+?.*HTTP\/1\.[10]$")
_STATUS_BYTES_RE = re.compile(r"^\s+(\d{1,3})\s+(\d+)")
_PORT_SUFFIX_RE = re.compile(r":\d+$")
_REQUEST_URI_RE = re.compile(r"^[A-Z]+\s+?(.*)HTTP\/1\.[10]$")
_SERVER_GUID_RE = re.compile(r"^([0-9\.]+)\s+(.*)")


def convert(line):
    """Parse one access-log line into a dict of typed fields.

    The line is first cut on double quotes; the unquoted prefix is then cut
    on whitespace. Fields are picked out by position, so the line must follow
    the same layout as the sample line in this snippet.

    Args:
        line: One log line (str).

    Returns:
        dict with the keys "hour", "year", "date" (month * 100 + day),
        "time" (minute), "sec" (time.mktime() of the parsed timestamp,
        i.e. interpreted in the local timezone), "request_time" and
        "upstream_response_time" (integer milliseconds; the latter is -1
        when the log contains "-"), "remote_addr", "upstream_addr" (port
        suffix removed), "host", "method" ("" unless GET/POST/OPTIONS),
        "request_uri" (the list returned by findall, kept for backward
        compatibility), "status", "bytes_sent", "http_referer",
        "user_agent", "gzip_ratio", "http_x_forwarded_for", "server_addr"
        and "guid".

    Raises:
        IndexError: if the line has fewer quoted/unquoted fields than
            expected, or the status/bytes or server/guid section does not
            match its pattern.
        ValueError: if a numeric field or the timestamp cannot be parsed.
    """
    quoted = line.split('"')
    head = _WHITESPACE_RE.split(quoted[0])

    method_match = _METHOD_RE.match(quoted[1])
    http_method = method_match.group(1) if method_match else ''

    # findall()[0] is kept (rather than match().groups()) so that a
    # non-matching section still raises IndexError as before.
    status_text, bytes_text = _STATUS_BYTES_RE.findall(quoted[2])[0]
    status = int(status_text)
    bytes_sent = int(bytes_text)

    upstream_addr = _PORT_SUFFIX_RE.sub("", head[4])
    request_time = int(float(head[0]) * 1000)
    if head[1] == '-':
        upstream_response_time = -1
    else:
        upstream_response_time = int(float(head[1]) * 1000)
    remote_addr = head[2]
    # head[] items come from a whitespace split, so they cannot contain
    # spaces; no further stripping is needed for the host.
    host = head[7]

    # head[5] is "[dd/Mon/yyyy:HH:MM:SS"; the UTC offset in head[6] is not
    # used, so the timestamp is parsed as a naive datetime.
    logdatetime = head[5].replace('[', '')
    dt = datetime.datetime.strptime(logdatetime, "%d/%b/%Y:%H:%M:%S")
    year = dt.year
    monthday = dt.month * 100 + dt.day
    hour = dt.hour
    logtime = dt.minute
    sec = time.mktime(dt.timetuple())

    request_uri = _REQUEST_URI_RE.findall(quoted[1])
    http_referer = quoted[3]
    user_agent = quoted[5]
    gzip_ratio = quoted[7]
    http_x_forwarded_for = quoted[9]
    server_addr, guid = _SERVER_GUID_RE.findall(quoted[11])[0]

    doc = {
        "hour": hour,
        "year": year,
        "date": monthday,
        "time": logtime,
        "sec": sec,
        "request_time": request_time,
        "upstream_response_time": upstream_response_time,
        "remote_addr": remote_addr,
        "upstream_addr": upstream_addr,
        "host": host,
        "method": http_method,
        "request_uri": request_uri,
        "status": status,
        "bytes_sent": bytes_sent,
        "http_referer": http_referer,
        "user_agent": user_agent,
        "gzip_ratio": gzip_ratio,
        "http_x_forwarded_for": http_x_forwarded_for,
        "server_addr": server_addr,
        "guid": guid,
    }
    return doc
# Benchmark: call convert() 12000 times on the sample line and print the
# elapsed wall-clock seconds for each batch of 2000 calls.
t2 = time.time()
count = 0
for i in range(12000):
    convert(line)
    count += 1
    if count % 2000 == 0:
        # Everything that belongs to the batch report must sit inside the
        # `if`; the previous batch end becomes this batch's start.
        t1 = t2
        t2 = time.time()
        # Parenthesised single argument prints identically on Python 2 and 3.
        print(str(t2 - t1))

索引方式

import time
import datetime
# Sample nginx access-log line; it is the input passed to convert() in the
# timing loop at the bottom of this snippet.
line = '0.278 0.264 113.116.52.174 - 10.10.3.41:20080  [08/Apr/2012:23:59:08 +0800] shenzhen.anjuke.com "GET /ajax/propext/?proid=104178677&commid=97047&brokerid=262646&rand=0.905798233768953 HTTP/1.0" 200 10914 "http://shenzhen.anjuke.com/prop/view/104178677" "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; 360SE)" "-" "-" - "114.80.230.198 026AF756-C5DF-3EDA-B96A-F31A39F7C65E"'

def pair(l):
    """Yield consecutive, non-overlapping 2-tuples from an indexable sequence.

    Produces (l[0], l[1]), (l[2], l[3]), ... in order. If the sequence has an
    odd number of items, IndexError is raised when the last, incomplete pair
    is reached.
    """
    total = len(l)
    left = 0
    while left < total:
        yield (l[left], l[left + 1])
        left += 2

def convert(line):
    """Split a log line on spaces while keeping double-quoted sections whole.

    The line is scanned once. A space separates fields only when it lies
    outside a pair of double quotes; runs of consecutive separator spaces
    count as a single separator, so no empty fields are produced. Quoted
    sections are returned unchanged, including their surrounding quotes.

    Compared with the earlier index-based version this fixes three defects:
    the first field is no longer dropped, a double space no longer glues the
    two neighbouring fields together, and text inside quotes is no longer
    modified.

    Args:
        line: One log line (str).

    Returns:
        A list of field strings, or None when the line contains no double
        quote at all or an odd number of them.
    """
    quote_count = line.count('"')
    if quote_count == 0 or quote_count % 2 != 0:
        return None

    fields = []
    field_start = 0
    inside_quotes = False
    for pos, char in enumerate(line):
        if char == '"':
            inside_quotes = not inside_quotes
        elif char == ' ' and not inside_quotes:
            # Only emit a field when there is text since the last separator;
            # this collapses runs of spaces.
            if pos > field_start:
                fields.append(line[field_start:pos])
            field_start = pos + 1
    # Trailing field after the last separator (absent if the line ends in
    # a space).
    if field_start < len(line):
        fields.append(line[field_start:])
    return fields


# def allindices(string, sub, listindex=[], offset=0):
def allindices(string, sub):
    """Return every index at which `sub` occurs in `string`, in ascending order.

    The search resumes one character after each hit, so overlapping
    occurrences are all reported. An empty list means `sub` was not found.
    """
    hits = []
    cursor = 0
    while True:
        cursor = string.find(sub, cursor)
        if cursor < 0:
            return hits
        hits.append(cursor)
        cursor += 1

# Benchmark: call convert() 12000 times on the sample line and print the
# elapsed wall-clock seconds for each batch of 2000 calls.
t2 = time.time()
count = 0
for i in range(12000):
    convert(line)
    count += 1
    if count % 2000 == 0:
        # Everything that belongs to the batch report must sit inside the
        # `if`; the previous batch end becomes this batch's start.
        t1 = t2
        t2 = time.time()
        # Parenthesised single argument prints identically on Python 2 and 3.
        print(str(t2 - t1))
导入时间
导入日期时间
line='0.278 0.264 113.116.52.174-10.10.3.41:20080[08/Apr/2012:23:59:08+0800]shenzhen.anjuke.com“GET/ajax/propext/?proid=104178677&commid=97047&brokerid=262646&rand=0.905798233768953 HTTP/1.0“200 10914”http://shenzhen.anjuke.com/prop/view/104178677“Mozilla/4.0(兼容;MSIE 6.0;Windows NT 5.1;SV1;360SE)”-“”-“114.80.230.198 026AF756-C5DF-3EDA-B96A-F31A39F7C65E”
def对(l):
对于范围内的i(0,len(l),2):
收益率(l[i],l[i+1])
def转换(行):
行=行。替换(“,”)
quotes\u positions=allindices(行“\”)
如果len(引用位置)qs和s=0时:
listindex.append(一)
i=string.find(sub,i+1)
返回列表索引
t2=时间。时间()
计数=0
对于范围内的i(12000):
转换(行)
计数+=1
如果计数%2000==0:
t1=t2
t2=时间。时间()
打印str(t2-t1)

这看起来有点像CSV;我想知道csv模块是否会被滥用来处理这个问题

>>> for row in csv.reader([line], delimiter=' '):
...     print repr(row)
... 
['0.278', '0.264', '113.116.52.174', '-', '10.10.3.41:20080', '', '[08/Apr/2012:23:59:08', '+0800]', 'shenzhen.anjuke.com', 'GET /ajax/propext/?proid=104178677&commid=97047&brokerid=262646&rand=0.905798233768953 HTTP/1.0', '200', '10914', 'http://shenzhen.anjuke.com/prop/view/104178677', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; 360SE)', '-', '-', '-', '114.80.230.198 026AF756-C5DF-3EDA-B96A-F31A39F7C65E']

我只是基于示例行编写了一个正则表达式,实际上我不知道某些字段的含义,所以我使用了占位符名称,您可以将它们重命名为更有意义的字段。在我的机器上,这个片段比第一个快4~5倍

# Single verbose regular expression matching one whole log line of the
# sample format.
# NOTE(review): this snippet was reconstructed from a garbled extraction in
# which the named-group names, the escaped spaces and the straight quotes
# were lost. The group names below are reviewer-chosen (borrowed from the
# field names used by the first convert() in this document where the position
# matches); "unknown1"/"unknown2" are fields whose meaning is not evident.
# Rename as appropriate.
log_line_re = re.compile(
    r"""
    (?P<request_time>[0-9.]+)
    [ ]
    (?P<upstream_response_time>[0-9.]+)
    [ ]
    (?P<remote_addr>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})
    [ ]
    (?P<unknown1>.+?)
    [ ]
    (?P<upstream_addr>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5})
    [ ]+
    \[(?P<time_local>.+?)\]
    [ ]
    (?P<host>.+?)
    [ ]
    "
    (?P<method>[A-Z]+)
    [ ]
    (?P<request_uri>.+?)
    [ ]
    HTTP/(?P<http_version>[0-9.]+)
    "
    [ ]
    (?P<status>\d{3})
    [ ]
    (?P<bytes_sent>\d+)
    [ ]
    "
    (?P<http_referer>.+?)
    "
    [ ]
    "(?P<user_agent>.+?)"
    [ ]
    "(?P<gzip_ratio>.+?)"
    [ ]
    "(?P<http_x_forwarded_for>.+?)"
    [ ]
    (?P<unknown2>.+?)
    "
    (?P<server_addr>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})
    [ ]
    (?P<guid>.+?)
    "
    """, re.VERBOSE)


def convert(line):
    """Parse one log line with log_line_re and return its named groups.

    Args:
        line: One log line (str) in the same layout as the sample line.

    Returns:
        dict mapping each group name of log_line_re to the matched text
        (all values are strings). Because no separator is matched between
        the "unknown2" group and the following quote, that value includes
        the trailing space (e.g. "- ").

    Raises:
        AttributeError: if the line does not match the pattern (match()
            returns None). For example a "-" in the second field would not
            match, since that group only accepts digits and dots.
    """
    return log_line_re.match(line).groupdict()

代码太多了。我不会坐在这里想它应该做什么。你能描述一下精确的解析,和/或显示示例输入和相应的输出吗?也许你在这方面有点错误?考虑更改服务器上的选项以适合您的解析器;也许让它看起来像json,或者添加一个不规则的分隔符(比如,
|
)而不是空白。是的,我怎么能忘记使用单个re:D呢