Python 为什么多线程程序比单线程程序慢,尽管它们读取单独的txt文件?

Python 为什么多线程程序比单线程程序慢,尽管它们读取单独的txt文件?,python,python-3.x,multithreading,file,Python,Python 3.x,Multithreading,File,我有几个包含CDR信息的txt文件,CDR分布在几个文件中。我需要在这些文件中找到电话号码,然后检查与xls文件的匹配情况。我编写了单线程版本,然后是多线程,发现sometimers多线程比单线程慢 多线程: import re import os import time import sys import pandas import ipaddress import threading def improve_view_n(string_to_improve): string_to

我有几个包含CDR信息的txt文件,CDR分布在几个文件中。我需要在这些文件中找到电话号码,然后检查与xls文件的匹配情况。我先编写了单线程版本,然后编写了多线程版本,结果发现多线程版本有时比单线程版本还慢

多线程:

import re
import os
import time
import sys
import pandas
import ipaddress
import threading


def improve_view_n(string_to_improve):
    """Reformat a comma-separated record for display.

    The input is split on commas, empty fields are discarded, and the
    remaining fields are returned in their original order, each one followed
    by two spaces (so a non-empty result ends with two trailing spaces).
    """
    non_empty_fields = filter(None, string_to_improve.split(','))
    return "".join(field + "  " for field in non_empty_fields)


def upload_number_list():
    """Load the numbers to search for from ``number_list.txt``.

    The file is read from the current working directory, one number per
    line.  Backslash and newline characters are stripped from both ends of
    every line.  Lines that are empty after stripping are skipped, because
    an empty number would make the search pattern built from it match
    regardless of the phone number.

    Returns:
        list[str]: the non-empty numbers, in file order.

    Raises:
        SystemExit: if ``number_list.txt`` does not exist (raised after a
            message is printed and a 5 second pause).
    """
    numbers = []
    try:
        with open(file="number_list.txt", mode="r") as f:
            for line in f:
                number = line.strip("\\\n")
                # Blank lines (typically a trailing one) are not numbers.
                if number:
                    numbers.append(number)
    except FileNotFoundError:
        print("number_list.txt file does not exist or corrupted.\n\n")
        print("The program will be terminated in 5 seconds")
        time.sleep(5)
        sys.exit()
    return numbers


def search_for_pattern(number, file_name, semaphore, found_ip):
    """Scan one file for ``number`` and collect the IPv4 addresses it yields.

    Files whose name does not start with ``"MSK"`` are ignored.  For every
    regular-expression match the record is written to the results file via
    ``write_searh_results_to_file`` and every whitespace-separated token of
    the reformatted record that parses as an IPv4 address is appended to
    ``found_ip``.  When the file contains no match, ``nothing_was_found``
    records that instead.

    Args:
        number: text interpolated verbatim (not escaped) into the search
            regular expression.
        file_name: name of the file to scan.
        semaphore: semaphore held for the duration of the call; it limits
            how many searches run at the same time.
        found_ip: list shared with the caller; matching address strings are
            appended to it.
    """
    # ``with`` guarantees the semaphore is released even when reading the
    # file raises.  A bare acquire()/release() pair leaks one slot per
    # failure and can eventually block every remaining worker.
    with semaphore:
        if not file_name.startswith("MSK"):
            return
        with open(file=file_name, mode='r') as f:
            text_of_file = f.read()
        results = re.findall(pattern=f",,,,,.*{number}.*,", string=text_of_file)
        if not results:
            nothing_was_found(file_name, number)
            return
        for element in results:
            write_searh_results_to_file(file_name, element)
            for subelement in improve_view_n(element).split():
                try:
                    ipaddress.IPv4Address(subelement)
                except ipaddress.AddressValueError:
                    # Not an IPv4 address: ignore this token.
                    continue
                found_ip.append(subelement)


def write_searh_results_to_file(file_where_match_was_found, element):
    """Append one matched record to ``found_results.txt``.

    The line written consists of the name of the file in which the match
    was found, a colon, and the record as reformatted by ``improve_view_n``.
    """
    with open(file="found_results.txt", mode='a') as results_file:
        formatted_record = improve_view_n(element)
        results_file.write(f"{file_where_match_was_found}: {formatted_record} \n")


def nothing_was_found(file_where_match_wasnt_found, number_to_search):
    """Append a "no matches" notice for a number/file pair to ``found_results.txt``."""
    notice = f"NO MATCHES FOUND FOR {number_to_search} IN {file_where_match_wasnt_found}\n\n"
    with open(file="found_results.txt", mode='a') as report:
        report.write(notice)


def check_if_ip_in_norma(ip, trunk_names):
    """Return the entries of ``trunk_names`` that contain ``ip``.

    Args:
        ip: address text to look for; the check is a plain substring test.
        trunk_names: iterable of entries to scan (``main`` passes one column
            of the ``norma.xls`` sheet).

    Returns:
        list: the matching entries in their original order, or a
        single-element list holding a "Norma does not contain information"
        message when nothing matched.
    """
    # Non-string entries (for example the NaN pandas yields for an empty
    # spreadsheet cell) cannot contain the address, and testing ``ip in``
    # against them would raise TypeError, so they are skipped.
    line_which_contains_ip = [
        line for line in trunk_names
        if isinstance(line, str) and ip in line
    ]
    if not line_which_contains_ip:
        line_which_contains_ip.append(f"Norma does not contain information about {ip}")
    return line_which_contains_ip


def main():
    """Run the multi-threaded search and write the report.

    Steps:
      1. List the current directory, leaving out the helper files named in
         ``our_files``.
      2. Exit (after a 3 second pause) when ``norma.xls`` is missing.
      3. Read ``norma.xls`` with pandas (first two rows skipped, no header)
         and use column 2 as the trunk names.
      4. For every number returned by ``upload_number_list``, start one
         thread per listed file running ``search_for_pattern`` and wait for
         all of them; a semaphore of 10 is shared by the workers.
      5. Print the distinct addresses found, look each one up with
         ``check_if_ip_in_norma``, and print and append the result to
         ``found_results.txt``.
      6. Print the elapsed time in milliseconds, then pause before
         returning.
    """
    our_files = ('y.py', "found_results.txt", "number_list.txt", 'norma.xls', 'MultyThread.py')
    list_files = [name for name in os.listdir() if name not in our_files]
    semaphore = threading.Semaphore(10)
    t1 = int(round(time.time() * 1000))
    found_ip_list = []
    if "norma.xls" not in os.listdir():
        print("norma.xls file was not found in the current directory")
        print("The program will be terminated")
        # Pause BEFORE exiting so the message can be read; a sleep placed
        # after sys.exit() is never executed.
        time.sleep(3)
        sys.exit()
    normafile = pandas.read_excel('norma.xls', skiprows=2, header=None)
    trunk_names = normafile[2]
    numbers_to_search_list = upload_number_list()
    for number in numbers_to_search_list:
        # Build a fresh thread list for every number.  Indexing one list
        # that grows across numbers would call start() again on threads
        # already started for the previous number (RuntimeError).
        threads = [
            threading.Thread(target=search_for_pattern,
                             args=(number, file_name, semaphore, found_ip_list))
            for file_name in list_files
        ]
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()
    unique_ips = set(found_ip_list)
    print(unique_ips)
    for ip in unique_ips:
        x = check_if_ip_in_norma(ip, trunk_names)
        print(f"{x}\n")
        with open('found_results.txt', 'a') as f:
            f.write(f"{x}\n")
    print("The program completed fine!")
    print("Take found_results.txt from the current folder")
    print("If you want to repeat search, remove found_results.txt")
    t2 = int(round(time.time() * 1000))
    print(f"Job is done within {t2 - t1} miliseconds")
    time.sleep(90)
    print("Bye!")
    time.sleep(1)


# Script entry point: run main(), print any Exception it raises instead of
# letting it propagate, then pause for 20 seconds in either case.
if __name__ == '__main__':
    try:
        main()
    except Exception as error:
        print("The following error happened:", error, sep="\n")
    time.sleep(20)
单线程:

import re
import os
import time
import sys
import pandas
import ipaddress


def improve_view_n(string_to_improve):
    """Turn a comma-separated record into a space-separated one.

    Empty fields are dropped; every remaining field is followed by two
    spaces, including the last one.  An input without any non-empty field
    gives an empty string.
    """
    pieces = [piece for piece in string_to_improve.split(',') if piece != ""]
    if not pieces:
        return ""
    return "  ".join(pieces) + "  "


def upload_number_list():
    """Read the numbers to search for from ``number_list.txt``.

    Each line of the file (looked up in the current working directory) is
    one number; backslash and newline characters are stripped from both
    ends.  Entries that end up empty are left out, since an empty number
    would make the search pattern built from it match regardless of the
    phone number.

    Returns:
        list[str]: the non-empty numbers, in file order.

    Raises:
        SystemExit: if ``number_list.txt`` does not exist (raised after a
            message is printed and a 5 second pause).
    """
    try:
        with open(file="number_list.txt", mode="r") as f:
            stripped_lines = (line.strip("\\\n") for line in f)
            # Keep only real entries; blank lines are not numbers.
            numbers = [number for number in stripped_lines if number]
    except FileNotFoundError:
        print("number_list.txt file does not exist or corrupted.\n\n")
        print("The program will be terminated in 5 seconds")
        time.sleep(5)
        sys.exit()
    return numbers


def search_for_pattern(number):
    """Search every ``MSK*`` file in the current directory for ``number``.

    For each regular-expression match the record is written to the results
    file via ``write_searh_results_to_file``, and every whitespace-separated
    token of the reformatted record that parses as an IPv4 address is
    collected.  A file without any match is reported through
    ``nothing_was_found``.

    Args:
        number: text interpolated verbatim (not escaped) into the search
            regular expression.

    Returns:
        list[str]: the IPv4 address strings found, in encounter order
        (duplicates are kept).
    """
    found_ip = []
    # The pattern depends only on ``number``, so it is built once instead
    # of once per file.
    pattern = re.compile(f",,,,,.*{number}.*,")
    for file_name in os.listdir():
        if not file_name.startswith("MSK"):
            continue
        # Read the whole file, then close it before the scan starts.
        with open(file=file_name, mode='r') as f:
            text_of_file = f.read()
        results = pattern.findall(text_of_file)
        if not results:
            nothing_was_found(file_name, number)
            continue
        for element in results:
            write_searh_results_to_file(file_name, element)
            for subelement in improve_view_n(element).split():
                try:
                    ipaddress.IPv4Address(subelement)
                except ipaddress.AddressValueError:
                    # Not an IPv4 address: ignore this token.
                    continue
                found_ip.append(subelement)
    return found_ip


def write_searh_results_to_file(file_where_match_was_found, element):
    """Append a matched record, prefixed with its source file name, to ``found_results.txt``.

    The record is reformatted by ``improve_view_n`` before it is written.
    """
    with open(file="found_results.txt", mode='a') as out:
        readable_record = improve_view_n(element)
        out.write(f"{file_where_match_was_found}: {readable_record} \n")


def nothing_was_found(file_where_match_wasnt_found, number_to_search):
    """Note in ``found_results.txt`` that the number was not found in the given file."""
    line_to_write = f"NO MATCHES FOUND FOR {number_to_search} IN {file_where_match_wasnt_found}\n\n"
    with open(file="found_results.txt", mode='a') as results_file:
        results_file.write(line_to_write)


def check_if_ip_in_norma(ip, trunk_names):
    """Collect the entries of ``trunk_names`` in which ``ip`` occurs.

    Args:
        ip: address text to look for; the check is a plain substring test.
        trunk_names: iterable of entries to scan (``main`` passes one column
            of the ``norma.xls`` sheet).

    Returns:
        list: the matching entries in their original order, or a
        single-element list holding a "Norma does not contain information"
        message when nothing matched.
    """
    hits = []
    for line in trunk_names:
        # Skip non-string entries (for example the NaN pandas yields for an
        # empty spreadsheet cell): ``ip in line`` would raise TypeError.
        if not isinstance(line, str):
            continue
        if ip in line:
            hits.append(line)
    return hits or [f"Norma does not contain information about {ip}"]


def main():
    """Run the single-threaded search and write the report.

    Steps:
      1. Exit (after a 3 second pause) when ``norma.xls`` is missing from
         the current directory.
      2. Read ``norma.xls`` with pandas (first two rows skipped, no header)
         and use column 2 as the trunk names.
      3. Call ``search_for_pattern`` for every number returned by
         ``upload_number_list`` and gather all addresses found.
      4. Print the distinct addresses, look each one up with
         ``check_if_ip_in_norma``, and print and append the result to
         ``found_results.txt``.
      5. Print the elapsed time in milliseconds, then pause before
         returning.
    """
    t1 = int(round(time.time() * 1000))
    if "norma.xls" not in os.listdir():
        print("norma.xls file was not found in the current directory")
        print("The program will be terminated")
        # Pause BEFORE exiting so the message can be read; a sleep placed
        # after sys.exit() is never executed.
        time.sleep(3)
        sys.exit()
    normafile = pandas.read_excel('norma.xls', skiprows=2, header=None)
    trunk_names = normafile[2]
    found_ip_list = []
    for number in upload_number_list():
        # Extend the flat list directly; no intermediate list of lists.
        found_ip_list.extend(search_for_pattern(number))
    unique_ips = set(found_ip_list)
    print(unique_ips)
    for ip in unique_ips:
        x = check_if_ip_in_norma(ip, trunk_names)
        print(f"{x}\n")
        with open('found_results.txt', 'a') as f:
            f.write(f"{x}\n")
    print("The program completed fine!")
    print("Take found_results.txt from the current folder")
    print("If you want to repeat search, remove found_results.txt")
    t2 = int(round(time.time() * 1000))
    print(f"Job is done within {t2 - t1} miliseconds")
    time.sleep(90)
    print("Bye!")
    time.sleep(1)


# Script entry point: run main(); if it raises an Exception, print it and
# pause for 20 seconds before the script ends.
if __name__ == '__main__':
    try:
        main()
    except Exception as error:
        print("The following error happened:", error, sep="\n")
        time.sleep(20)

Python(CPython)不支持真正并行的多线程:解释器中始终存在全局解释器锁(GIL),它在同一时刻只允许一个线程执行 Python 代码。因此,实际上相当于只有一个线程在运行,再加上管理线程的额外开销,所以在大多数情况下处理速度反而更慢。

在 I/O 操作中可能会有一些加速,但并非总是如此。threading 模块更多是用于另一种编程风格,而不是异步编程(Python 对此也有专门的模块 asyncio)。如果您希望看到真正的性能提升,应该使用 Python 的 multiprocessing(多进程)模块,它不受 GIL 的影响,但进程之间的数据交换比线程之间更复杂。


Python使用全局解释器锁(GIL)。它本质上使整个过程成为一个单线程应用程序


Python 多线程只有在 I/O 密集型(I/O-bound)的场景下才有用。如果您想并行化工作负载,应该使用 multiprocessing 模块。它的 API 与 threading 类似,只是进程之间不共享内存。

在 Python 中用多线程处理数据的速度较慢,因为由于 GIL 的存在,Python 实际上只用一个线程在若干 Python“线程”之间来回切换(详见 GIL 的相关文档)。

由于切换时间的原因,速度较慢


您应该使用多处理

如果解决了您的问题,请将答案标记为正确。 @mrangry777 我用 multiprocessing 模块编写了一个版本,它比多线程和单线程版本都慢得多,而且新脚本不能正确完成我需要的工作(没有正确给出找到的 IP 列表,而是给出空列表)。我会试着解决这个问题,如果之后没有其他疑问,我就关闭这个问题。谢谢你的支持。 你能发布你的测试结果吗?迭代次数、文件大小和执行时间? @mrangry777 好的,测试完成后会立即向您提供信息。