Python 解析XML并将大数据保存到Django DB
我有一个Django函数,它接收Nessus文件,并在保存到数据库之前解析其中的数据。我的Nessus文件通常有大约30k行,保存到数据库可能需要2个小时。我曾尝试使用 bulk_create,但这破坏了代码。我使用的是Django 1.11,有没有办法加快这类大批量的数据库插入(Postgres)?(标签:python, django, xml, database, postgresql)这是我的代码:
def process_nessus_file(*args, **kwargs):
    """Parse a Nessus report file and store its findings in the database.

    The report is streamed with ``etree.iterparse`` one ``ReportHost``
    element at a time. For every ``ReportItem`` of a host, either a new
    ``NessusFile`` row plus a "new" ``NessusFileLog`` entry is created, or
    the finding is logged as a "duplicate" (filling in ``last_seen`` on the
    stored row when it is empty).

    Keyword Args:
        file: object whose ``.first()`` returns the uploaded file record;
            that record's ``.file.path`` is the report that gets parsed.
        request: read but not used in the visible part of this function.

    Returns:
        None. The counters below are tallied but not consumed in the
        visible part of this function.
    """
    process_obj = kwargs.get('file')
    request = kwargs.get('request')
    file_obj = process_obj.first()
    file_path = file_obj.file.path
    # Stream the document host by host instead of loading it whole.
    context = etree.iterparse(
        file_path,
        events=('end', ),
        tag="ReportHost"
    )
    total_issues = 0
    detected_issues = 0
    undetected_issues = 0
    already_exist_issue = 0
    low_risk_count = 0
    medium_risk_count = 0
    high_risk_count = 0
    critical_risk_count = 0
    low_new_issue = 0
    medium_new_issue = 0
    high_new_issue = 0
    critical_new_issue = 0
    vul_history = []
    for event, elem in context:
        first_identified = None
        last_seen = None
        host = elem.get('name')
        logger.info('Processing issue for host : {}'.format(host))
        for child in elem:
            if child.tag == "HostProperties":
                # HOST_START / HOST_END carry the scan window for this host.
                for host_prop_tags in child:
                    prop_name = host_prop_tags.attrib['name']
                    if prop_name == "HOST_START":
                        first_identified = host_prop_tags.text
                    elif prop_name == "HOST_END":
                        last_seen = host_prop_tags.text
            if child.tag == "ReportItem":
                main_tags = child.attrib
                # Map each sub-element tag to its whitespace-stripped text.
                child_tags = dict()
                for ch_tags in child:
                    if ch_tags.text:
                        tag_text = ch_tags.text.strip()
                    else:
                        tag_text = ch_tags.text
                    child_tags[ch_tags.tag] = tag_text
                # Treat a "not applicable" solution as empty.
                if child_tags.get('solution') in ('n/a', 'N/A'):
                    child_tags['solution'] = ''
                plugin_output = child_tags.get('plugin_output')
                pluginid = int(main_tags.get('pluginID'))
                # Only plugin 10107 output is inspected for a banner.
                banner = ''
                if plugin_output and pluginid == 10107:
                    if re.search(BANNER_PATTERN, plugin_output):
                        banner = plugin_output.replace(
                            "{}".format(BANNER_PATTERN), ""
                        ).strip()
                risk = child_tags.get('risk_factor')
                synopsis = child_tags.get('synopsis')
                description = child_tags.get('description')
                solution = child_tags.get('solution')
                protocol = main_tags.get('protocol')
                port = main_tags.get('port')
                pluginname = main_tags.get('pluginName')
                svcname = main_tags.get('svc_type')
                try:
                    # NOTE(review): these two calls depend only on `host`;
                    # if they have no side effects they could be hoisted
                    # out of the per-item loop - confirm before changing.
                    host_type = get_host_type(host)
                    user_host = check_host_exists(host, host_type)
                    # NOTE(review): the duplicate check queries NessusData
                    # while rows are created and re-read through NessusFile;
                    # confirm which model is intended.
                    if user_host and not NessusData.objects.filter(
                        plugin_id=int(pluginid), host=host,
                        port=int(port), name=pluginname
                    ).exists():
                        try:
                            host_link_obj = Host.objects.get(
                                host=host
                            )
                        except Host.MultipleObjectsReturned:
                            # Fixed: the original called `.objects` on the
                            # `host` string instead of the Host model.
                            host_link_obj = Host.objects.filter(
                                host=host
                            ).first()
                        except Host.DoesNotExist:
                            host_link_obj = Host.objects.create(
                                host=host,
                                user_host=user_host
                            )
                        nessus_obj = NessusFile.objects.create(
                            user_host=user_host,
                            host_link=host_link_obj,
                            linked_file=file_obj,
                            plugin_id=int(pluginid),
                            risk=risk, host=host,
                            protocol=protocol, port=int(port),
                            banner=banner, name=pluginname,
                            svc_type=svcname,
                            description=description,
                            first_identified=first_identified,
                            last_seen=last_seen,
                            synopsis=synopsis,
                            plugin_output=plugin_output,
                            solution=solution
                        )
                        issue = "Issue with host {}, port {} and"\
                            " pluginID {} is added.".\
                            format(
                                nessus_obj.host, nessus_obj.port,
                                nessus_obj.plugin_id
                            )
                        NessusFileLog.objects.create(
                            linked_file=file_obj,
                            issue_type="new",
                            issue=issue
                        )
                        detected_issues += 1
                        if risk == 'Medium':
                            medium_new_issue += 1
                        elif risk == 'Low':
                            low_new_issue += 1
                        elif risk == 'High':
                            high_new_issue += 1
                        elif risk == 'Critical':
                            critical_new_issue += 1
                    else:
                        nessus_obj = NessusFile.objects.filter(
                            plugin_id=int(pluginid), host=host,
                            port=int(port), name=pluginname
                        ).first()
                        if nessus_obj and not nessus_obj.last_seen:
                            nessus_obj.last_seen = last_seen
                            nessus_obj.save()
                        issue = "Issue with host {}, port {} and"\
                            " pluginID {} is already exists.".\
                            format(host, port, pluginid)
                        NessusFileLog.objects.create(
                            linked_file=file_obj,
                            issue_type="duplicate",
                            issue=issue
                        )
                        already_exist_issue += 1
                except Exception:
                    # Stay best-effort (one bad item must not abort the
                    # import) but record the failure instead of hiding it.
                    logger.exception(
                        'Failed to store issue for host %s, port %s, '
                        'pluginID %s', host, port, pluginid
                    )
                if risk == 'Medium':
                    medium_risk_count += 1
                elif risk == 'Low':
                    low_risk_count += 1
                elif risk == 'High':
                    high_risk_count += 1
                elif risk == 'Critical':
                    critical_risk_count += 1
                total_issues += 1
        # Release the processed host and its earlier siblings so memory
        # stays bounded while streaming.
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]
我听说使用原始SQL查询会加快速度,但我无法理解这个过程。

回答:您正在使用Django自己的方法逐条插入。您应该尝试使用纯SQL语句,如 INSERT INTO。它可以帮助您一次将多条数据插入数据库,但请小心,它不会执行Django的任何 save() 方法或信号。