Python 如何根据日期筛选csv文件中的行?
我有一个名为Python 如何根据日期筛选csv文件中的行?,python,list,csv,datetime,glob,Python,List,Csv,Datetime,Glob,我有一个名为aa_20200907.txt的文件,它看起来像这样: #DATA:DD,CARS_INTERNATIONAL:VERSION01.1 2020-09-07T00:00:00.285+02:00,New-York,XX,Audi 2020-09-07T00:01:00.385+02:00,London,100,Mercedes 2020-09-07T00:02:00.255+02:00,New-York,90,Ford 2020-09-07T00:03:00.523+02:00,Ne
aa_20200907.txt的文件,它看起来像这样:
#DATA:DD,CARS_INTERNATIONAL:VERSION01.1
2020-09-07T00:00:00.285+02:00,New-York,XX,Audi
2020-09-07T00:01:00.385+02:00,London,100,Mercedes
2020-09-07T00:02:00.255+02:00,New-York,90,Ford
2020-09-07T00:03:00.523+02:00,New-York,91,BMW
2020-09-08T00:00:58.444+02:00,New-York,12,BMW
2020-09-08T00:01:55.336+02:00,New-York,11,Mercedes
我有一个基于2个条件过滤行的代码
条件\u 1:
我只想要索引[2]
是数字的行
Condition_2
:如果索引[1]
(日期)与处理文件名中提到的日期相同,我只需要这些行。文件名的日期被分配到名为missing\u dates
的列表中
现在下面的代码在条件_1
上完美工作,问题是条件_2
没有按我希望的方式工作。请注意,我通常在多个文件上运行此代码,这意味着缺少\u日期
包含更多值
这是我的代码:
import csv
import datetime
from pathlib import Path
root=Path(r'c:\data\PPE\Desktop\test_folder')
def filter_row(r, date):
condition_1 = r[2].isdigit() #<-- select only the rows if index 2 is numbers.
condition_2 = date != missing_date #<-- select only the rows of that specific day.
return condition_1 and condition_2
missing_dates = ['20200907']
output_list = []
for missing_date in missing_dates:
# print(f"processing {missing_date}")
files=[fn for fn in (e for e in root.glob(f"**/*_{missing_date}.txt") if e.is_file())]
for file in files:
with open(file, 'r') as log_file:
reader = csv.reader(log_file, delimiter = ',')
next(reader) # skip header
for row in reader:
if filter_row(row, missing_date):
output_list.append(row)
print(output_list)
[]
['2020-09-07T00:01:00.385+02:00', 'London', '100', 'Mercedes']
['2020-09-07T00:02:00.255+02:00', 'New-York', '90', 'Ford']
['2020-09-07T00:03:00.523+02:00', 'New-York', '91', 'BMW']
import os
import re
import csv
import datetime
with open(filePath) as infile:
filename = os.path.basename(filePath)
print(filename) # -> aa_20200907.txt
dateToCheck = datetime.datetime.strptime(re.sub(r"[^\d]", "", filename), "%Y%m%d").strftime("%Y-%m-%d") # Get Date
reader = csv.reader(infile)
for line in reader:
if line[0].startswith(dateToCheck) and line[2].isdigit(): # Conditions
print(line)
['2020-09-07T00:01:00.385+02:00', 'London', '100', 'Mercedes']
['2020-09-07T00:02:00.255+02:00', 'New-York', '90', 'Ford']
['2020-09-07T00:03:00.523+02:00', 'New-York', '91', 'BMW']
这是所需的输出:
import csv
import datetime
from pathlib import Path
root=Path(r'c:\data\PPE\Desktop\test_folder')
def filter_row(r, date):
condition_1 = r[2].isdigit() #<-- select only the rows if index 2 is numbers.
condition_2 = date != missing_date #<-- select only the rows of that specific day.
return condition_1 and condition_2
missing_dates = ['20200907']
output_list = []
for missing_date in missing_dates:
# print(f"processing {missing_date}")
files=[fn for fn in (e for e in root.glob(f"**/*_{missing_date}.txt") if e.is_file())]
for file in files:
with open(file, 'r') as log_file:
reader = csv.reader(log_file, delimiter = ',')
next(reader) # skip header
for row in reader:
if filter_row(row, missing_date):
output_list.append(row)
print(output_list)
[]
['2020-09-07T00:01:00.385+02:00', 'London', '100', 'Mercedes']
['2020-09-07T00:02:00.255+02:00', 'New-York', '90', 'Ford']
['2020-09-07T00:03:00.523+02:00', 'New-York', '91', 'BMW']
import os
import re
import csv
import datetime
with open(filePath) as infile:
filename = os.path.basename(filePath)
print(filename) # -> aa_20200907.txt
dateToCheck = datetime.datetime.strptime(re.sub(r"[^\d]", "", filename), "%Y%m%d").strftime("%Y-%m-%d") # Get Date
reader = csv.reader(infile)
for line in reader:
if line[0].startswith(dateToCheck) and line[2].isdigit(): # Conditions
print(line)
['2020-09-07T00:01:00.385+02:00', 'London', '100', 'Mercedes']
['2020-09-07T00:02:00.255+02:00', 'New-York', '90', 'Ford']
['2020-09-07T00:03:00.523+02:00', 'New-York', '91', 'BMW']
*请注意,我不想写一个全新的代码。我只想修复条件2
,并保留当前代码,因为我对它感到满意。这是一种方法
Ex:
import csv
import datetime
from pathlib import Path
root=Path(r'c:\data\PPE\Desktop\test_folder')
def filter_row(r, date):
condition_1 = r[2].isdigit() #<-- select only the rows if index 2 is numbers.
condition_2 = date != missing_date #<-- select only the rows of that specific day.
return condition_1 and condition_2
missing_dates = ['20200907']
output_list = []
for missing_date in missing_dates:
# print(f"processing {missing_date}")
files=[fn for fn in (e for e in root.glob(f"**/*_{missing_date}.txt") if e.is_file())]
for file in files:
with open(file, 'r') as log_file:
reader = csv.reader(log_file, delimiter = ',')
next(reader) # skip header
for row in reader:
if filter_row(row, missing_date):
output_list.append(row)
print(output_list)
[]
['2020-09-07T00:01:00.385+02:00', 'London', '100', 'Mercedes']
['2020-09-07T00:02:00.255+02:00', 'New-York', '90', 'Ford']
['2020-09-07T00:03:00.523+02:00', 'New-York', '91', 'BMW']
import os
import re
import csv
import datetime
with open(filePath) as infile:
filename = os.path.basename(filePath)
print(filename) # -> aa_20200907.txt
dateToCheck = datetime.datetime.strptime(re.sub(r"[^\d]", "", filename), "%Y%m%d").strftime("%Y-%m-%d") # Get Date
reader = csv.reader(infile)
for line in reader:
if line[0].startswith(dateToCheck) and line[2].isdigit(): # Conditions
print(line)
['2020-09-07T00:01:00.385+02:00', 'London', '100', 'Mercedes']
['2020-09-07T00:02:00.255+02:00', 'New-York', '90', 'Ford']
['2020-09-07T00:03:00.523+02:00', 'New-York', '91', 'BMW']
输出:
import csv
import datetime
from pathlib import Path
root=Path(r'c:\data\PPE\Desktop\test_folder')
def filter_row(r, date):
condition_1 = r[2].isdigit() #<-- select only the rows if index 2 is numbers.
condition_2 = date != missing_date #<-- select only the rows of that specific day.
return condition_1 and condition_2
missing_dates = ['20200907']
output_list = []
for missing_date in missing_dates:
# print(f"processing {missing_date}")
files=[fn for fn in (e for e in root.glob(f"**/*_{missing_date}.txt") if e.is_file())]
for file in files:
with open(file, 'r') as log_file:
reader = csv.reader(log_file, delimiter = ',')
next(reader) # skip header
for row in reader:
if filter_row(row, missing_date):
output_list.append(row)
print(output_list)
[]
['2020-09-07T00:01:00.385+02:00', 'London', '100', 'Mercedes']
['2020-09-07T00:02:00.255+02:00', 'New-York', '90', 'Ford']
['2020-09-07T00:03:00.523+02:00', 'New-York', '91', 'BMW']
import os
import re
import csv
import datetime
with open(filePath) as infile:
filename = os.path.basename(filePath)
print(filename) # -> aa_20200907.txt
dateToCheck = datetime.datetime.strptime(re.sub(r"[^\d]", "", filename), "%Y%m%d").strftime("%Y-%m-%d") # Get Date
reader = csv.reader(infile)
for line in reader:
if line[0].startswith(dateToCheck) and line[2].isdigit(): # Conditions
print(line)
['2020-09-07T00:01:00.385+02:00', 'London', '100', 'Mercedes']
['2020-09-07T00:02:00.255+02:00', 'New-York', '90', 'Ford']
['2020-09-07T00:03:00.523+02:00', 'New-York', '91', 'BMW']
给你,伙计:
输入:
#DATA:DD,CARS_INTERNATIONAL:VERSION01.1
2020-09-07T00:00:00.285+02:00,New-York,XX,Audi
2020-09-07T00:01:00.385+02:00,London,100,Mercedes
2020-09-07T00:02:00.255+02:00,New-York,90,Ford
2020-09-07T00:03:00.523+02:00,New-York,91,BMW
2020-09-08T00:00:58.444+02:00,New-York,12,BMW
2020-09-08T00:01:55.336+02:00,New-York,11,Mercedes
代码:
您的代码在条件_2和代码行下方存在问题,在代码段下方运行时,没有文件
files=[fn for fn in (e for e in root.glob(f"**/*_{missing_date}.txt") if e.is_file())]
不使用熊猫有什么好的理由吗?Pandas使这些任务变得更简单。@quest这段代码是更大脚本的一部分。将所有内容重写为pandas将花费大量时间。您正在执行此操作if filter_row(row,'20200907')
这将始终返回false,因为'20200907'将始终等于其自身20200907。你的条件2没有意义。@ChandanMalla这就是为什么我试图重新定义我的条件2,但我不知道如何…谢谢你,但我不想重写一个全新的代码。我想保留我现在的剧本。我只想在过滤器行
@TangerCity中修复条件2
。您不需要重写整个代码。将如果筛选行(行,缺少日期)
替换为如果行[0]。开始使用(日期检查)和行[2]。isdigit():
我想保留函数筛选行
。我只想调整条件_2
您可以使用代码将'20200907'更改为'2020-09-07'。另外,如果文件的最后一行有日期为2020-09-07的记录,它还将添加一些垃圾字符,请确保修复它。谢谢,但您已将我代码的第二部分更改得太多了。这是为什么?`files=[fn for fn in(e for e in root.glob(f“*/{missing_date}.txt”)如果e.is_file())]`这会产生空结果,因此我使用了不同的解决方案从目录中获取文件