Python xlsxwriter占用太多内存，进程被终止_Python_Django_Xlsxwriter

Python xlsxwriter占用太多内存，进程被终止

python django

Python xlsxwriter占用太多内存，进程被终止,python,django,xlsxwriter,Python,Django,Xlsxwriter,我正在使用xlsxwriter python包将数据从PostgreSQL数据库导出到django项目中的excel。我已经实现了一个Django命令来实现这一点，但问题是有超过400万条数据记录，写入文件会消耗我所有的RAM，并且进程会被终止日志：我试过使用一个名为“常量内存”的参数，但它似乎没有什么不同。以下是写入excel文件的方法： def write_to_excel_perf(filename, instagram_publications, instagram_tags, tw

我正在使用xlsxwriter python包将数据从PostgreSQL数据库导出到django项目中的excel。我已经实现了一个Django命令来实现这一点，但问题是有超过400万条数据记录，写入文件会消耗我所有的RAM，并且进程会被终止

日志：

我试过使用 `constant_memory` 这个参数，但它似乎没有什么不同。以下是写入 excel 文件的方法：

def write_to_excel_perf(filename, instagram_publications, instagram_tags, twitter_publications, twitter_tags, instance):
    """
    Dump publication querysets and hashtag counts into an xlsx workbook.

    When 'instagram' is contained in ``instance`` the sheets 'Instagram Media'
    and 'Instagram Tags' are written; when 'twitter' is contained in it the
    sheets 'Twitter Media' and 'Twitter Tags' are written. The workbook is
    opened with XlsxWriter's ``constant_memory`` option, see
    http://xlsxwriter.readthedocs.io/working_with_memory.html#memory-perf

    The workbook object is returned after it has been closed.
    """
    def fill_row(worksheet, row_number, values):
        # One write() call per cell, left to right, starting at column 0.
        for col_number, value in enumerate(values):
            worksheet.write(row_number, col_number, value)

    logger.info("[write_to_excel_perf]> Openning Workbook..")
    workbook = xlsxwriter.Workbook(filename, {'constant_memory': True})

    if 'instagram' in instance:
        logger.info("[write_to_excel_perf]> Writting Instagram publications..")
        # Media sheet: header in row 0, one publication per following row.
        media_sheet = workbook.add_worksheet('Instagram Media')
        fill_row(media_sheet, 0, [
            "Type", "City", "Date", "Instagram Id", "Instagram URL", "caption", "likes",
            "author", "location id", "location name", "lat", "lng",
        ])

        # Pull the related rows in the same query instead of one query per access.
        instagram_publications = instagram_publications.select_related('location__spot__city', 'author', 'location')
        for row_number, publication in enumerate(instagram_publications, start=1):
            if publication.mediaType == '1':
                media_type = 'Photo'
            else:
                media_type = 'Video'

            if publication.location is None or publication.location.spot.city is None:
                city = "Undefined"
            else:
                city = publication.location.spot.city.name

            published = publication.publication_date.strftime("%d/%m/%Y %H:%M")

            if publication.author is None:
                username = "Undefined"
            else:
                username = publication.author.username

            # All location-derived cells fall back to the same placeholder.
            if publication.location is None:
                location_id = location_name = latitude = longitude = "Undefined"
            else:
                location_id = publication.location.instagramID
                location_name = publication.location.name
                latitude = publication.location.position.y
                longitude = publication.location.position.x

            fill_row(media_sheet, row_number, [
                media_type, city, published, publication.instagramID, publication.instagram_url,
                publication.caption, publication.likes, username, location_id, location_name,
                latitude, longitude,
            ])

        # Tag sheet: hashtag label and its count.
        tag_sheet = workbook.add_worksheet('Instagram Tags')
        fill_row(tag_sheet, 0, ["Hashtag", "Quantity"])
        if instagram_tags is None:
            tag_sheet.write(1, 0, "No hashtags in query")
        else:
            logger.info("[write_to_excel_perf]> Writting Instagram hashtags..")
            for row_number, tag in enumerate(instagram_tags, start=1):
                label = Hashtag.objects.get(id=tag.get('hashtag__id')).label
                tag_sheet.write(row_number, 0, label)
                tag_sheet.write(row_number, 1, tag.get('count'))

    if 'twitter' in instance:
        logger.info("[write_to_excel_perf]> Writting Twitter publications..")
        # Media sheet: header in row 0, one publication per following row.
        media_sheet = workbook.add_worksheet('Twitter Media')
        fill_row(media_sheet, 0, [
            "City", "Date", "Twitter Id", "Twitter URL", "caption", "likes",
            "author", "lat", "lng",
        ])

        twitter_publications = twitter_publications.select_related('location__spot__city', 'author', 'location')
        for row_number, publication in enumerate(twitter_publications, start=1):
            if publication.location is None or publication.location.spot.city is None:
                city = "Undefined"
            else:
                city = publication.location.spot.city.name

            published = publication.publication_date.strftime("%d/%m/%Y %H:%M")

            if publication.author is None:
                username = "Undefined"
            else:
                username = publication.author.username

            if publication.location is None:
                latitude = longitude = "Undefined"
            else:
                latitude = publication.location.position.y
                longitude = publication.location.position.x

            fill_row(media_sheet, row_number, [
                city, published, publication.twitterID, publication.twitter_url,
                publication.caption, publication.likes, username, latitude, longitude,
            ])

        # Tag sheet: hashtag label and its count.
        tag_sheet = workbook.add_worksheet('Twitter Tags')
        fill_row(tag_sheet, 0, ["Hashtag", "Quantity"])
        if twitter_tags is None:
            tag_sheet.write(1, 0, "No hashtags in query")
        else:
            logger.info("[write_to_excel_perf]> Writting Twitter hashtags..")
            for row_number, tag in enumerate(twitter_tags, start=1):
                label = TwitterHashtag.objects.get(id=tag.get('hashtag__id')).label
                tag_sheet.write(row_number, 0, label)
                tag_sheet.write(row_number, 1, tag.get('count'))

    workbook.close()

    logger.info("[write_to_excel_perf]> Export file generated sucessfully.")
    return workbook

我试过使用一个名为 `constant_memory` 的参数，但它似乎没有什么不同

应该可以。如 XlsxWriter 文档所示，`constant_memory` 选项会让内存使用量保持恒定且较小

因此，如果它对您的应用程序没有影响，那么问题可能不是XlsxWriter，而是其他东西在消耗内存

您能否注释掉所有对 `worksheet.write()` 的调用，然后再次运行测试来验证这一点？
谢谢！我想你是对的，我还在调试，但我发现主要问题是使用 select_related() 时会一次把大量相关对象的数据加载到内存中。我想知道是否有办法在写入每一行后强制进行垃圾回收，因为内存虽然增长缓慢，但仍在持续增加。我不知道为什么，但我猜那可能是相关对象的数据。@Mariano 在 constant_memory 模式下，XlsxWriter 只在内存中保留一行数据，每写入新的一行就把上一行刷新到磁盘。您确定是 XlsxWriter 导致内存增加吗？问题不在 XlsxWriter，您是对的。问题是，我正在用 for 循环迭代一个巨大的 Django 查询集，而访问每个元素时还会执行一些其他查询，以便从相关对象获取数据。由于某种原因，直到循环结束内存才会被释放。在迭代大型查询集时，这似乎是一个常见的 Python/Django 问题，而我似乎还没有找到解决办法。
# Excel allows at most this many rows per worksheet. XlsxWriter's write()
# returns -1 and drops the cell for any row index at or beyond the limit, so
# larger exports have to continue on another sheet.
_XLSX_MAX_ROWS = 1048576

# Placeholder written whenever a related object is missing.
_UNDEFINED = "Undefined"


def _add_sheet(book, name, titles):
    """Add a worksheet called *name*, write *titles* into row 0 and return it."""
    sheet = book.add_worksheet(name)
    for column_index, title in enumerate(titles):
        sheet.write(0, column_index, title)
    return sheet


def _write_rows(book, sheet_name, titles, rows):
    """Write *rows* below a header row, rolling over to extra sheets when one is full.

    *rows* may be any iterable of cell-value sequences; it is consumed lazily,
    so a generator keeps only one row alive at a time. Overflow sheets are
    named "<sheet_name> (2)", "<sheet_name> (3)", ... and repeat the header.
    """
    # NOTE(review): write() turns URL-looking strings into hyperlinks and Excel
    # caps those at 65,530 per sheet; if link cells go missing in big exports,
    # look at the Workbook 'strings_to_urls' option -- confirm in XlsxWriter docs.
    sheet = _add_sheet(book, sheet_name, titles)
    sheet_count = 1
    row_index = 1
    for row in rows:
        if row_index >= _XLSX_MAX_ROWS:
            sheet_count += 1
            sheet = _add_sheet(book, "%s (%d)" % (sheet_name, sheet_count), titles)
            row_index = 1
        # Cell-by-cell on purpose: a rejected cell must not skip the rest of the row.
        for column_index, value in enumerate(row):
            sheet.write(row_index, column_index, value)
        row_index += 1


def _city_name(location):
    """Return the name of location.spot.city, or the placeholder if any link is None."""
    spot = location.spot if location is not None else None
    city = spot.city if spot is not None else None
    return city.name if city is not None else _UNDEFINED


def _lat_lng(location):
    """Return (position.y, position.x) of *location*, or placeholders if unavailable."""
    position = location.position if location is not None else None
    if position is None:
        return _UNDEFINED, _UNDEFINED
    return position.y, position.x


def _instagram_rows(publications):
    """Yield one list of cell values per Instagram publication."""
    for el in publications:
        location = el.location
        lat, lng = _lat_lng(location)
        yield [
            'Photo' if el.mediaType == '1' else 'Video',
            _city_name(location),
            el.publication_date.strftime("%d/%m/%Y %H:%M"),
            el.instagramID,
            el.instagram_url,
            el.caption,
            el.likes,
            el.author.username if el.author is not None else _UNDEFINED,
            location.instagramID if location is not None else _UNDEFINED,
            location.name if location is not None else _UNDEFINED,
            lat,
            lng,
        ]


def _twitter_rows(publications):
    """Yield one list of cell values per Twitter publication."""
    for el in publications:
        lat, lng = _lat_lng(el.location)
        yield [
            _city_name(el.location),
            el.publication_date.strftime("%d/%m/%Y %H:%M"),
            el.twitterID,
            el.twitter_url,
            el.caption,
            el.likes,
            el.author.username if el.author is not None else _UNDEFINED,
            lat,
            lng,
        ]


def _write_hashtag_sheet(book, sheet_name, tags, hashtag_model, log_message):
    """Write a "Hashtag"/"Quantity" sheet from *tags*, or a notice when *tags* is None.

    Each item of *tags* is read with .get('hashtag__id') and .get('count');
    the label is looked up on *hashtag_model* by id.
    """
    sheet = _add_sheet(book, sheet_name, ["Hashtag", "Quantity"])
    if tags is None:
        sheet.write(1, 0, "No hashtags in query")
        return
    logger.info(log_message)
    for row_index, el in enumerate(tags, start=1):
        # NOTE: this is one lookup query per tag; if the caller can include the
        # label in the aggregated values, the lookup becomes unnecessary.
        label = hashtag_model.objects.get(id=el.get('hashtag__id')).label
        sheet.write(row_index, 0, label)
        sheet.write(row_index, 1, el.get('count'))


def write_to_excel_perf(filename, instagram_publications, instagram_tags, twitter_publications, twitter_tags, instance):
    """
    Export the given querysets to an excel file in xlsx format.

    Optimized for low memory consumption:
    http://xlsxwriter.readthedocs.io/working_with_memory.html#memory-perf

    Args:
        filename: Path of the xlsx file to create.
        instagram_publications: Queryset of Instagram publications.
        instagram_tags: Iterable of mappings with 'hashtag__id' and 'count'
            keys, or None to write a "No hashtags in query" notice.
        twitter_publications: Queryset of Twitter publications.
        twitter_tags: Same shape as ``instagram_tags``, for Twitter.
        instance: Container (or string) tested for 'instagram' and 'twitter'
            to decide which sheets are written.

    Returns:
        The (already closed) ``xlsxwriter.Workbook``.

    Media sheets that exceed Excel's per-sheet row limit continue on extra
    sheets named "<name> (2)", "<name> (3)", ...
    """
    logger.info("[write_to_excel_perf]> Openning Workbook..")
    book = xlsxwriter.Workbook(filename, {'constant_memory': True})
    try:
        if 'instagram' in instance:
            logger.info("[write_to_excel_perf]> Writting Instagram publications..")
            # select_related avoids a query per related-object access; iterator()
            # streams the results instead of filling the queryset's result cache,
            # which would otherwise keep every model instance alive until the end.
            publications = instagram_publications.select_related(
                'location__spot__city', 'author', 'location').iterator()
            _write_rows(
                book,
                'Instagram Media',
                ["Type", "City", "Date", "Instagram Id", "Instagram URL", "caption", "likes",
                 "author", "location id", "location name", "lat", "lng"],
                _instagram_rows(publications),
            )
            _write_hashtag_sheet(
                book, 'Instagram Tags', instagram_tags, Hashtag,
                "[write_to_excel_perf]> Writting Instagram hashtags..")

        if 'twitter' in instance:
            logger.info("[write_to_excel_perf]> Writting Twitter publications..")
            publications = twitter_publications.select_related(
                'location__spot__city', 'author', 'location').iterator()
            _write_rows(
                book,
                'Twitter Media',
                ["City", "Date", "Twitter Id", "Twitter URL", "caption", "likes",
                 "author", "lat", "lng"],
                _twitter_rows(publications),
            )
            _write_hashtag_sheet(
                book, 'Twitter Tags', twitter_tags, TwitterHashtag,
                "[write_to_excel_perf]> Writting Twitter hashtags..")
    finally:
        # Always close: constant_memory mode spools rows to temporary files
        # that are only cleaned up when the workbook is closed.
        book.close()

    logger.info("[write_to_excel_perf]> Export file generated sucessfully.")
    return book