Python 解析XML并将大数据保存到Django DB_Python_Django_Xml_Database_Postgresql

Python 解析XML并将大数据保存到Django DB

python django xml database postgresql

Python 解析XML并将大数据保存到Django DB,python,django,xml,database,postgresql,Python,Django,Xml,Database,Postgresql,我有一个Django函数，它接收Nessus文件，然后在将其保存到数据库之前解析数据，我的Nessus文件通常有大约30k行，将其保存到数据库可能需要2个小时，我曾尝试使用bulk\u create但这破坏了代码，同时我使用Django 1.11，有没有办法加快这些大型数据库插入（postgres）这是我的代码： def process_nessus_file(*args, **kwargs): process_obj = kwargs.get('file') context

我有一个 Django 函数，它接收 Nessus 文件，解析其中的数据后保存到数据库。我的 Nessus 文件通常有大约 3 万行，全部保存到数据库可能需要 2 个小时。我曾尝试使用 `bulk_create`，但这会破坏现有代码。我使用的是 Django 1.11，有没有办法加快这种大批量的数据库插入（PostgreSQL）？

这是我的代码：

def _extract_banner(plugin_output, pluginid):
    """Return the banner text found in plugin 10107 output, or ''."""
    if not plugin_output or pluginid != 10107:
        return ''
    if not re.search(BANNER_PATTERN, plugin_output):
        return ''
    return plugin_output.replace("{}".format(BANNER_PATTERN), "").strip()


def _get_or_create_host_link(host, user_host):
    """Return the Host row for *host*, creating it when none exists."""
    try:
        return Host.objects.get(host=host)
    except Host.MultipleObjectsReturned:
        # Several rows share this hostname: fall back to the first one.
        return Host.objects.filter(host=host).first()
    except Host.DoesNotExist:
        return Host.objects.create(host=host, user_host=user_host)


def process_nessus_file(*args, **kwargs):
    """Parse an uploaded Nessus report and store its issues.

    The report is streamed one ``ReportHost`` element at a time. Every
    ``ReportItem`` below a host either becomes a new ``NessusFile`` row
    (logged as "new") or is logged as "duplicate".

    Keyword Args:
        file: object whose ``.first()`` returns the uploaded-file record;
            that record's ``file.path`` is the report to parse.

    Returns:
        None. The tallies kept below are not read by the code shown here.
    """
    process_obj = kwargs.get('file')
    file_obj = process_obj.first()
    file_path = file_obj.file.path
    context = etree.iterparse(
        file_path,
        events=('end', ),
        tag="ReportHost"
    )
    total_issues = 0
    detected_issues = 0
    undetected_issues = 0
    already_exist_issue = 0
    low_risk_count = 0
    medium_risk_count = 0
    high_risk_count = 0
    critical_risk_count = 0
    low_new_issue = 0
    medium_new_issue = 0
    high_new_issue = 0
    critical_new_issue = 0
    vul_history = []
    for event, elem in context:
        first_identified = None
        last_seen = None
        # The Host row does not change between the items of one host, so it
        # is resolved at most once per host instead of once per new issue.
        host_link_obj = None
        host = elem.get('name')
        logger.info('Processing issue for host : {}'.format(host))
        for child in elem:
            if child.tag == "HostProperties":
                for host_prop in child:
                    # .get() so a property tag without a name is skipped
                    # instead of aborting the whole import with KeyError.
                    prop_name = host_prop.attrib.get('name')
                    if prop_name == "HOST_START":
                        first_identified = host_prop.text
                    elif prop_name == "HOST_END":
                        last_seen = host_prop.text
                continue
            if child.tag != "ReportItem":
                continue

            main_tags = child.attrib
            child_tags = {}
            for ch_tag in child:
                text = ch_tag.text
                child_tags[ch_tag.tag] = text.strip() if text else text
            if child_tags.get('solution') in ('n/a', 'N/A'):
                child_tags['solution'] = ''

            plugin_output = child_tags.get('plugin_output')
            pluginid = int(main_tags.get('pluginID'))
            banner = _extract_banner(plugin_output, pluginid)
            risk = child_tags.get('risk_factor')
            synopsis = child_tags.get('synopsis')
            description = child_tags.get('description')
            solution = child_tags.get('solution')
            protocol = main_tags.get('protocol')
            port = main_tags.get('port')
            pluginname = main_tags.get('pluginName')
            svcname = main_tags.get('svc_type')
            try:
                host_type = get_host_type(host)
                user_host = check_host_exists(host, host_type)
                port_number = int(port)
                # NOTE(review): the original tested existence on
                # ``NessusData`` but created and re-read ``NessusFile``
                # with the same filter; this assumes both mean the same
                # table - confirm. One lookup serves both branches.
                existing = NessusFile.objects.filter(
                    plugin_id=pluginid, host=host,
                    port=port_number, name=pluginname
                ).first()
                if user_host and existing is None:
                    if host_link_obj is None:
                        host_link_obj = _get_or_create_host_link(
                            host, user_host
                        )
                    nessus_obj = NessusFile.objects.create(
                        user_host=user_host,
                        host_link=host_link_obj,
                        linked_file=file_obj,
                        plugin_id=pluginid,
                        risk=risk, host=host,
                        protocol=protocol, port=port_number,
                        banner=banner, name=pluginname,
                        svc_type=svcname,
                        description=description,
                        first_identified=first_identified,
                        last_seen=last_seen,
                        synopsis=synopsis,
                        plugin_output=plugin_output,
                        solution=solution
                    )
                    issue = "Issue with host {}, port {} and"\
                        " pluginID {} is added.".\
                        format(
                            nessus_obj.host, nessus_obj.port,
                            nessus_obj.plugin_id
                        )
                    NessusFileLog.objects.create(
                        linked_file=file_obj,
                        issue_type="new",
                        issue=issue
                    )
                    detected_issues = detected_issues + 1
                    if risk == 'Medium':
                        medium_new_issue = medium_new_issue + 1
                    elif risk == 'Low':
                        low_new_issue = low_new_issue + 1
                    elif risk == 'High':
                        high_new_issue = high_new_issue + 1
                    elif risk == 'Critical':
                        critical_new_issue = critical_new_issue + 1
                else:
                    # Only fill in last_seen when the stored row has none.
                    if existing is not None and not existing.last_seen:
                        existing.last_seen = last_seen
                        existing.save()
                    issue = "Issue with host {}, port {} and"\
                        " pluginID {} is already exists.".\
                        format(host, port, pluginid)
                    NessusFileLog.objects.create(
                        linked_file=file_obj,
                        issue_type="duplicate",
                        issue=issue
                    )
                    already_exist_issue = already_exist_issue + 1
            except Exception:
                # Still best-effort (one bad item must not stop the
                # import), but the failure is now logged, not hidden.
                logger.exception(
                    'Failed to store issue for host %s, port %s, '
                    'pluginID %s', host, port, pluginid
                )
            if risk == 'Medium':
                medium_risk_count = medium_risk_count + 1
            elif risk == 'Low':
                low_risk_count = low_risk_count + 1
            elif risk == 'High':
                high_risk_count = high_risk_count + 1
            elif risk == 'Critical':
                critical_risk_count = critical_risk_count + 1
            total_issues = total_issues + 1
        # Drop the processed host and its earlier siblings so the parsed
        # tree does not keep growing while the report is streamed.
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]

我听说使用原始sql查询会加快速度，但我无法理解这个过程

您正在使用 Django 自己的方法进行插入。您应该尝试使用原生 `SQL` 语句，如 `INSERT INTO`。它可以帮助您一次向数据库插入多行数据，但请注意，这种方式不会触发 Django 的任何 `save()` 方法或信号（signals）。