Python 为什么多线程程序比单线程程序慢,尽管它们读取单独的txt文件?
我有几个包含CDR信息的txt文件,CDR分布在几个文件中。我需要在这些文件中找到电话号码,然后检查与xls文件的匹配情况。我编写了单线程版本,然后是多线程版本,发现有时多线程版本比单线程版本慢。
标签:python, python-3.x, multithreading, file
多线程:
import re
import os
import time
import sys
import pandas
import ipaddress
import threading
def improve_view_n(string_to_improve):
    """Turn a comma-separated record into a space-separated string.

    The input is split on ``,``. Empty fields (produced by consecutive
    commas) are dropped and every remaining field is followed by exactly
    one space, so a non-empty result always ends with a trailing space.

    Args:
        string_to_improve: Comma-separated text.

    Returns:
        str: The non-empty fields, each followed by one space, or ``""``
        when there are no non-empty fields.
    """
    fields = string_to_improve.split(",")
    # join() builds the result in one pass instead of repeated concatenation.
    return "".join(f"{field} " for field in fields if field)
def upload_number_list():
    """Read the numbers to search for from ``number_list.txt``.

    Every line of the file (looked up in the current working directory)
    yields one entry with leading/trailing backslash and newline characters
    stripped. Blank lines are skipped: an empty entry interpolated into a
    search pattern would match every record.

    Returns:
        list[str]: The entries in file order.

    Raises:
        SystemExit: If ``number_list.txt`` does not exist; a notice is
            printed and the function waits five seconds before exiting.
    """
    numbers = []
    try:
        with open(file="number_list.txt", mode="r") as f:
            for line in f:
                number = line.strip("\\\n")
                # Ignore empty / whitespace-only lines (e.g. a trailing
                # blank line at the end of the file).
                if number.strip():
                    numbers.append(number)
    except FileNotFoundError:
        print("number_list.txt file does not exist or corrupted.\n\n")
        print("The program will be terminated in 5 seconds")
        time.sleep(5)
        sys.exit()
    return numbers
def search_for_pattern(number, file_name, semaphore, found_ip):
    """Search one file for ``number`` and collect IPv4 addresses from the hits.

    Only files whose name starts with ``"MSK"`` are read; any other name is
    ignored. Every matching record is appended to the results file via
    ``write_searh_results_to_file``; if the file contains no match,
    ``nothing_was_found`` records that instead.

    Args:
        number: Number to look for; it is matched literally.
        file_name: Name of the file to scan.
        semaphore: Semaphore limiting how many workers run at once. It is
            held for the whole call and always released on exit.
        found_ip: List shared with the caller; every whitespace-separated
            token of a matching record that parses as an IPv4 address is
            appended to it.

    Returns:
        None. Results are reported through ``found_ip`` and the results file.
    """
    # "with" releases the semaphore even if reading or matching raises;
    # a leaked permit would otherwise block the remaining workers.
    with semaphore:
        if not file_name.startswith("MSK"):
            return
        with open(file=file_name, mode='r') as f:
            text_of_file = f.read()
        # Escape the number so characters such as "+" are matched literally
        # instead of being interpreted as regex operators.
        results = re.findall(pattern=f",,,,,.*{re.escape(number)}.*,",
                             string=text_of_file)
        if not results:
            nothing_was_found(file_name, number)
            return
        for element in results:
            write_searh_results_to_file(file_name, element)
            for subelement in improve_view_n(element).split():
                try:
                    ipaddress.IPv4Address(subelement)
                except ipaddress.AddressValueError:
                    # Not an IPv4 address: skip this token.
                    continue
                found_ip.append(subelement)
def write_searh_results_to_file(file_where_match_was_found, element):
    """Append one matched record to ``found_results.txt``.

    The record is cleaned up with ``improve_view_n`` and written on its own
    line, prefixed with the name of the file it was found in.
    """
    with open(file="found_results.txt", mode='a') as report:
        cleaned = improve_view_n(element)
        report.write(f"{file_where_match_was_found}: {cleaned} \n")
def nothing_was_found(file_where_match_wasnt_found, number_to_search):
    """Append a "no matches" notice for one number/file pair to ``found_results.txt``."""
    with open(file="found_results.txt", mode='a') as report:
        report.write(
            f"NO MATCHES FOUND FOR {number_to_search} IN {file_where_match_wasnt_found}\n\n"
        )
def check_if_ip_in_norma(ip, trunk_names):
    """Return every entry of ``trunk_names`` that contains ``ip`` as a substring.

    Args:
        ip: IP address text to look for.
        trunk_names: Iterable of entries to scan. Entries that are not
            strings are skipped, because a substring test on them (for
            example on the float NaN that stands in for an empty
            spreadsheet cell) would raise ``TypeError``.

    Returns:
        list[str]: The matching entries in their original order, or a
        single-element list holding a "does not contain" notice when
        nothing matches.
    """
    line_which_contains_ip = [
        line for line in trunk_names
        if isinstance(line, str) and ip in line
    ]
    if not line_which_contains_ip:
        line_which_contains_ip.append(f"Norma does not contain information about {ip}")
    return line_which_contains_ip
def main():
    """Run the threaded search and report which found IPs appear in norma.xls.

    Steps:
      1. List the current directory, leaving out the program's own files.
      2. Load column 2 of ``norma.xls`` (first two rows skipped).
      3. For every number from ``upload_number_list``, start one
         ``search_for_pattern`` thread per remaining file and wait for that
         batch to finish; a semaphore with 10 permits is shared by all
         threads.
      4. For every distinct IP collected, print the result of
         ``check_if_ip_in_norma`` and append it to ``found_results.txt``.
      5. Print the elapsed time, then wait 90 seconds before returning.

    Raises:
        SystemExit: If ``norma.xls`` is not in the current directory.
    """
    our_files = ('y.py', "found_results.txt", "number_list.txt", 'norma.xls', 'MultyThread.py')
    list_files = [name for name in os.listdir() if name not in our_files]
    semaphore = threading.Semaphore(10)
    t1 = int(round(time.time() * 1000))
    found_ip_list = []
    if "norma.xls" not in os.listdir():
        print("norma.xls file was not found in the current directory")
        print("The program will be terminated")
        # Pause BEFORE exiting so the message can be read; placed after
        # sys.exit() the pause could never run.
        time.sleep(3)
        sys.exit()
    normafile = pandas.read_excel('norma.xls', skiprows=2, header=None)
    trunk_names = normafile[2]
    numbers_to_search_list = upload_number_list()
    for number in numbers_to_search_list:
        # Track only this number's threads. Indexing one ever-growing list
        # by file position would pick up threads from an earlier number,
        # and a thread can only be started once.
        batch = []
        for file_name in list_files:
            worker = threading.Thread(
                target=search_for_pattern,
                args=(number, file_name, semaphore, found_ip_list),
            )
            worker.start()
            batch.append(worker)
        for worker in batch:
            worker.join()
    unique_ips = set(found_ip_list)
    print(unique_ips)
    for ip in unique_ips:
        x = check_if_ip_in_norma(ip, trunk_names)
        print(f"{x}\n")
        with open('found_results.txt', 'a') as f:
            f.write(f"{x}\n")
    print("The program completed fine!")
    print("Take found_results.txt from the current folder")
    print("If you want to repeat search, remove found_results.txt")
    t2 = int(round(time.time() * 1000))
    print(f"Job is done within {t2 - t1} miliseconds")
    time.sleep(90)
    print("Bye!")
    time.sleep(1)
if __name__ == '__main__':
    # Script entry point: run main(); if it raises an Exception, print it
    # and pause for 20 seconds before the script ends.
    try:
        main()
    except Exception as error:
        print("The following error happened:", error, sep="\n")
        time.sleep(20)
单线程:
import re
import os
import time
import sys
import pandas
import ipaddress
def improve_view_n(string_to_improve):
    """Turn a comma-separated record into a space-separated string.

    The input is split on ``,``. Empty fields (produced by consecutive
    commas) are dropped and every remaining field is followed by exactly
    one space, so a non-empty result always ends with a trailing space.

    Args:
        string_to_improve: Comma-separated text.

    Returns:
        str: The non-empty fields, each followed by one space, or ``""``
        when there are no non-empty fields.
    """
    fields = string_to_improve.split(",")
    # join() builds the result in one pass instead of repeated concatenation.
    return "".join(f"{field} " for field in fields if field)
def upload_number_list():
    """Read the numbers to search for from ``number_list.txt``.

    Every line of the file (looked up in the current working directory)
    yields one entry with leading/trailing backslash and newline characters
    stripped. Blank lines are skipped: an empty entry interpolated into a
    search pattern would match every record.

    Returns:
        list[str]: The entries in file order.

    Raises:
        SystemExit: If ``number_list.txt`` does not exist; a notice is
            printed and the function waits five seconds before exiting.
    """
    numbers = []
    try:
        with open(file="number_list.txt", mode="r") as f:
            for line in f:
                number = line.strip("\\\n")
                # Ignore empty / whitespace-only lines (e.g. a trailing
                # blank line at the end of the file).
                if number.strip():
                    numbers.append(number)
    except FileNotFoundError:
        print("number_list.txt file does not exist or corrupted.\n\n")
        print("The program will be terminated in 5 seconds")
        time.sleep(5)
        sys.exit()
    return numbers
def search_for_pattern(number):
    """Search every ``MSK*`` file in the current directory for ``number``.

    Each matching record is appended to the results file via
    ``write_searh_results_to_file``; for a file without any match,
    ``nothing_was_found`` records that instead.

    Args:
        number: Number to look for; it is matched literally.

    Returns:
        list[str]: Every whitespace-separated token of the matching records
        that parses as an IPv4 address, in the order encountered
        (duplicates included).
    """
    found_ip = []
    # Escape the number so characters such as "+" are matched literally,
    # and compile once because the same pattern is reused for every file.
    pattern = re.compile(f",,,,,.*{re.escape(number)}.*,")
    for file_name in os.listdir():
        if not file_name.startswith("MSK"):
            continue
        with open(file=file_name, mode='r') as f:
            text_of_file = f.read()
        results = pattern.findall(text_of_file)
        if not results:
            nothing_was_found(file_name, number)
            continue
        for element in results:
            write_searh_results_to_file(file_name, element)
            for subelement in improve_view_n(element).split():
                try:
                    ipaddress.IPv4Address(subelement)
                except ipaddress.AddressValueError:
                    # Not an IPv4 address: skip this token.
                    continue
                found_ip.append(subelement)
    return found_ip
def write_searh_results_to_file(file_where_match_was_found, element):
    """Append one matched record to ``found_results.txt``.

    The record is cleaned up with ``improve_view_n`` and written on its own
    line, prefixed with the name of the file it was found in.
    """
    with open(file="found_results.txt", mode='a') as report:
        cleaned = improve_view_n(element)
        report.write(f"{file_where_match_was_found}: {cleaned} \n")
def nothing_was_found(file_where_match_wasnt_found, number_to_search):
    """Append a "no matches" notice for one number/file pair to ``found_results.txt``."""
    with open(file="found_results.txt", mode='a') as report:
        report.write(
            f"NO MATCHES FOUND FOR {number_to_search} IN {file_where_match_wasnt_found}\n\n"
        )
def check_if_ip_in_norma(ip, trunk_names):
    """Return every entry of ``trunk_names`` that contains ``ip`` as a substring.

    Args:
        ip: IP address text to look for.
        trunk_names: Iterable of entries to scan. Entries that are not
            strings are skipped, because a substring test on them (for
            example on the float NaN that stands in for an empty
            spreadsheet cell) would raise ``TypeError``.

    Returns:
        list[str]: The matching entries in their original order, or a
        single-element list holding a "does not contain" notice when
        nothing matches.
    """
    line_which_contains_ip = [
        line for line in trunk_names
        if isinstance(line, str) and ip in line
    ]
    if not line_which_contains_ip:
        line_which_contains_ip.append(f"Norma does not contain information about {ip}")
    return line_which_contains_ip
def main():
    """Run the single-threaded search and report which found IPs appear in norma.xls.

    Steps:
      1. Load column 2 of ``norma.xls`` (first two rows skipped).
      2. Call ``search_for_pattern`` for every number returned by
         ``upload_number_list`` and gather all IPs it returns.
      3. For every distinct IP, print the result of
         ``check_if_ip_in_norma`` and append it to ``found_results.txt``.
      4. Print the elapsed time, then wait 90 seconds before returning.

    Raises:
        SystemExit: If ``norma.xls`` is not in the current directory.
    """
    t1 = int(round(time.time() * 1000))
    found_ip_list = []
    if "norma.xls" not in os.listdir():
        print("norma.xls file was not found in the current directory")
        print("The program will be terminated")
        # Pause BEFORE exiting so the message can be read; placed after
        # sys.exit() the pause could never run.
        time.sleep(3)
        sys.exit()
    normafile = pandas.read_excel('norma.xls', skiprows=2, header=None)
    trunk_names = normafile[2]
    numbers_to_search_list = upload_number_list()
    for number in numbers_to_search_list:
        # Gather the per-number results directly into one flat list.
        found_ip_list.extend(search_for_pattern(number))
    unique_ips = set(found_ip_list)
    print(unique_ips)
    for ip in unique_ips:
        x = check_if_ip_in_norma(ip, trunk_names)
        print(f"{x}\n")
        with open('found_results.txt', 'a') as f:
            f.write(f"{x}\n")
    print("The program completed fine!")
    print("Take found_results.txt from the current folder")
    print("If you want to repeat search, remove found_results.txt")
    t2 = int(round(time.time() * 1000))
    print(f"Job is done within {t2 - t1} miliseconds")
    time.sleep(90)
    print("Bye!")
    time.sleep(1)
if __name__ == '__main__':
    # Script entry point: run main(); if it raises an Exception, print it
    # and pause for 20 seconds before the script ends.
    try:
        main()
    except Exception as error:
        print("The following error happened:", error, sep="\n")
        time.sleep(20)
Python(CPython)不支持真正并行的多线程:全局解释器锁(GIL)保证同一时刻只有一个线程在执行Python字节码。因此,实际上只有一个线程在运行,再加上管理线程的额外开销,所以在大多数情况下处理速度反而会更慢。在I/O操作中可能会有一些加速,但并非总是如此。threading模块更多是一种不同的编程风格,而不是异步编程(Python另有asyncio模块用于异步编程)。如果您希望看到真正的性能改进,应该使用Python的multiprocessing模块,它不受GIL的影响,但进程之间的数据交换比使用线程更复杂。
Python使用全局解释器锁(GIL)。它本质上使整个程序在执行Python代码时表现得像一个单线程应用程序。
Python多线程只有在I/O密集型(I/O-bound)的情况下才有用。如果您想并行化您的工作负载,那么您应该使用multiprocessing模块。它的API与threading模块类似,只是进程之间不共享内存。在Python中使用多线程处理数据的速度较慢,因为实际上Python(由于GIL)只用一个线程在若干Python"线程"之间来回切换,详情请参阅相关文档。由于切换需要时间,速度反而更慢。
您应该使用多处理(multiprocessing)。
如果解决了您的问题,请将答案标记为正确。
@mrangry777 我使用multiprocessing模块编写了一个版本,它的运行速度比多线程和单线程版本慢得多,而且新脚本不能正确完成我需要的工作(没有正确给出找到的IP列表,而是给出空列表)。我会试着解决这个问题,如果没有其他问题,我就结束这个问题。谢谢你的支持。
你能发布你的测试结果吗?迭代次数、文件大小和执行时间?
@mrangry777 好的,测试完成后会立即向您提供信息。