Python清理域名-regex还是lambda？_Python_Regex_Lambda

Python清理域名-regex还是lambda？

python regex lambda

Python清理域名-regex还是lambda？,python,regex,lambda,Python,Regex,Lambda,我不确定这是否是用python解决这个问题的最佳方法。在bash中，我可能只使用awk和sed，然后就可以使用它了基于此，有两个建议，但我难以实施。我想清理域名代码 import re log = ["4/19/2020 11:59:09 PM 2604 PACKET 0000014DE1921330 UDP Rcv 192.168.1.28 f975 Q [0001 D NOERROR] A (7)pagead2(17)googlesyndication(3

我不确定这是否是用python解决这个问题的最佳方法。在bash中，我可能只使用awk和sed，然后就可以使用它了

基于此，有两个建议，但我难以实施。我想清理域名

代码

import re

# Sample log lines. The last whitespace-separated token of each line is the
# queried name, written as "(N)label" groups.
log = [
    "4/19/2020 11:59:09 PM 2604 PACKET  0000014DE1921330 UDP Rcv 192.168.1.28   f975   Q [0001   D   NOERROR] A      (7)pagead2(17)googlesyndication(3)com(0)",
    "4/19/2020 11:59:09 PM 0574 PACKET  0000014DE18C4720 UDP R cv 192.168.2.54    9c63   Q [0001   D   NOERROR] A      (2)pg(3)cdn(5)viber(3)com(0)",
]

# One compiled pattern per output column, in output order. Each pattern exposes
# its value through a named group that has the same name as its dictionary key.
rx_dict = {
    'date': re.compile(r'(?P<date>(\d+)[\/](\d+)[\/](\d+))'),
    # The hour is \d{1,2}: a two-digit-only pattern cannot match "1:02:03 AM".
    'time': re.compile(r'(?P<time>\d{1,2}:\d{2}:\d{2}.(?:AM|PM))'),
    'client': re.compile(r'(?P<client>(?:[0-9]{1,3}\.){3}[0-9]{1,3})'),
    # The token that follows "]" plus one separator character.
    'flags': re.compile(r'(?P<flags>(?<=\].)(.\S{0,}))'),
    # The last run of non-whitespace characters on the line.
    'query': re.compile(r'(?P<query>[\S]*)$'),
}


def extract_fields(line):
    """Return the fields of one log line as a list of strings.

    The list has one entry per pattern in ``rx_dict``, in the same order
    (date, time, client, flags, query). A field whose pattern does not match
    is returned as an empty string instead of raising ``AttributeError``.
    """
    # Trailing whitespace (for example a stray "\r") would otherwise make the
    # "$"-anchored query pattern match an empty string.
    line = line.rstrip()
    fields = []
    for name, pattern in rx_dict.items():
        found = pattern.search(line)
        fields.append(found.group(name) if found else '')
    return fields


for item in log:
    # join() puts commas only between fields, so there is no trailing comma.
    print(','.join(extract_fields(item)))

当前输出

4/19/2020,11:59:09 PM,192.168.1.28,A,(7)pagead2(17)googlesyndication(3)com(0)
4/19/2020,11:59:09 PM,192.168.2.54,A,(2)pg(3)cdn(5)viber(3)com(0)

首选输出

4/19/2020,11:59:09 PM,192.168.1.28,A,pagead2.googlesyndication.com
4/19/2020,11:59:09 PM,192.168.2.54,A,pg.cdn.viber.com

我假设您希望清理 query 字段的结果。可以使用 re.sub：第一个参数是模式（这里是 \(\d+\)，即括号括起来的任意数字，形如 (AnyNumber)）；第二个参数是 repl（这里是 clean_up_query 函数）。对于模式的每个非重叠出现，都将调用此函数。

>>> import re
>>>
>>> # Two sample log lines; each ends with the queried name written as "(N)label" groups.
>>> log = [
...     "4/19/2020 11:59:09 PM 2604 PACKET  0000014DE1921330 UDP Rcv 192.168.1.28   f975   Q [0001   D   NOERROR] A      (7)pagead2(17)googlesyndication(3)com(0)",
...     "4/19/2020 11:59:09 PM 0574 PACKET  0000014DE18C4720 UDP R cv 192.168.2.54    9c63   Q [0001   D   NOERROR] A      (2)pg(3)cdn(5)viber(3)com(0)",
... ]
>>>
>>> # One compiled pattern per output column, in the order the columns are printed.
>>> rx_dict = {
...     "date": re.compile(r"(?P<date>(\d+)[\/](\d+)[\/](\d+))"),
...     "time": re.compile(r"(?P<time>\d{2}:\d{2}:\d{2}.(?:AM|PM))"),
...     "client": re.compile(r"(?P<client>(?:[0-9]{1,3}\.){3}[0-9]{1,3})"),
...     "flags": re.compile(r"(?P<flags>(?<=\].)(.\S{0,}))"),
...     "query": re.compile(r"(?P<query>[\S]*)$"),
... }
>>>
>>> def clean_up_query(match):
...     # Replacement callback for re.sub: returns the text that replaces one match.
...     match_start, match_stop = match.span()
...     if (match_start == 0) or (
...         match_stop == len(match.string)
...     ):  # A match at the very start or very end of the string must not produce a ".".
...         return ""
...     return "."
...
>>> for item in log:
...     counter = 0
...     for key, r_exp in rx_dict.items():
...         if key == "query":
...             # Replace every "(digits)" group in the query through clean_up_query.
...             print(
...                 re.sub(r"\(\d+\)", clean_up_query, r_exp.search(item).group(1)), end=""
...             )
...         else:
...             print(f"{r_exp.search(item).group(1)}", end="")
...         # Print a comma after each of the first four fields only, so the line has no trailing comma.
...         if counter < 4:
...             print(",", end="")
...             counter = counter + 1
...     print()
...
4/19/2020,11:59:09 PM,192.168.1.28,A,pagead2.googlesyndication.com
4/19/2020,11:59:09 PM,192.168.2.54,A,pg.cdn.viber.com

>>> import re
>>>
>>> log = [
...     "4/19/2020 11:59:09 PM 2604 PACKET  0000014DE1921330 UDP Rcv 192.168.1.28   f975   Q [0001   D   NOERROR] A      (7)pagead2(17)googlesyndication(3)com(0)",
...     "4/19/2020 11:59:09 PM 0574 PACKET  0000014DE18C4720 UDP R cv 192.168.2.54    9c63   Q [0001   D   NOERROR] A      (2)pg(3)cdn(5)viber(3)com(0)",
... ]
>>>
>>> rx_dict = {
...     "date": re.compile(r"(?P<date>(\d+)[\/](\d+)[\/](\d+))"),
...     "time": re.compile(r"(?P<time>\d{2}:\d{2}:\d{2}.(?:AM|PM))"),
...     "client": re.compile(r"(?P<client>(?:[0-9]{1,3}\.){3}[0-9]{1,3})"),
...     "flags": re.compile(r"(?P<flags>(?<=\].)(.\S{0,}))"),
>>> import re
>>>
>>> # Two sample log lines; each ends with the queried name written as "(N)label" groups.
>>> log = [
...     "4/19/2020 11:59:09 PM 2604 PACKET  0000014DE1921330 UDP Rcv 192.168.1.28   f975   Q [0001   D   NOERROR] A      (7)pagead2(17)googlesyndication(3)com(0)",
...     "4/19/2020 11:59:09 PM 0574 PACKET  0000014DE18C4720 UDP R cv 192.168.2.54    9c63   Q [0001   D   NOERROR] A      (2)pg(3)cdn(5)viber(3)com(0)",
... ]
>>>
>>> # One compiled pattern per output column, in the order the columns are printed.
>>> rx_dict = {
...     "date": re.compile(r"(?P<date>(\d+)[\/](\d+)[\/](\d+))"),
...     "time": re.compile(r"(?P<time>\d{2}:\d{2}:\d{2}.(?:AM|PM))"),
...     "client": re.compile(r"(?P<client>(?:[0-9]{1,3}\.){3}[0-9]{1,3})"),
...     "flags": re.compile(r"(?P<flags>(?<=\].)(.\S{0,}))"),
...     "query": re.compile(r"(?P<query>[\S]*)$"),
... }
>>>
>>> def clean_up_query(match):
...     # Replacement callback for re.sub: returns the text that replaces one match.
...     match_start, match_stop = match.span()
...     if (match_start == 0) or (
...         match_stop == len(match.string)
...     ):  # A match at the very start or very end of the string must not produce a ".".
...         return ""
...     return "."
...
>>> for item in log:
...     counter = 0
...     for key, r_exp in rx_dict.items():
...         if key == "query":
...             # Replace every "(digits)" group in the query through clean_up_query.
...             print(
...                 re.sub(r"\(\d+\)", clean_up_query, r_exp.search(item).group(1)), end=""
...             )
...         else:
...             print(f"{r_exp.search(item).group(1)}", end="")
...         # Print a comma after each of the first four fields only, so the line has no trailing comma.
...         if counter < 4:
...             print(",", end="")
...             counter = counter + 1
...     print()
...
4/19/2020,11:59:09 PM,192.168.1.28,A,pagead2.googlesyndication.com
4/19/2020,11:59:09 PM,192.168.2.54,A,pg.cdn.viber.com