Python Asyncio Skipping Added Tasks
I am trying to use the asyncio, aiohttp, and BeautifulSoup libraries to create a web crawler that finds specific download URLs on a page and downloads their contents. Currently, I add roughly 1000 tasks to the task queue but only receive about 500 outputs. Each run of the file produces a different number of outputs, which leads me to believe asyncio is skipping some of the tasks. Additionally, every download URL that gets appended to the download URL list (mcif_urls) does seem to be downloaded by my download_data and write_to_file functions, so the problem appears to stem from my identify_structures function. I don't have much experience with these libraries, so I may just be missing something, but does anyone know why this is happening?
# -*- coding: utf-8 -*-
"""
Created on Sun Oct 18 10:11:19 2020

@author: hgheiberger
"""
import asyncio
import time

import aiohttp
import nest_asyncio
import requests
from bs4 import BeautifulSoup

headers = {"Accept-Language": "en-US, en;q=0.5"}
indexes = {"2.1.1", "3.4", "0.4", "0.5", "0.1"}
mcif_urls = ["http://webbdcrista1.ehu.es/magndata/tmp/0.409_TmNi.mcif"]

# Allows asyncio.run() inside the IDE's already-running event loop
nest_asyncio.apply()


def batch_indexes():
    """
    Scrapes MAGNDATA homepage and appends mcif structure index values

    Returns
    -------
    None.
    """
    # Pulls database homepage through an HTTP GET request
    url = "http://webbdcrista1.ehu.es/magndata/index.php?show_db=1"
    page = requests.get(url, headers=headers, timeout=10.00, allow_redirects=True)

    # Parses received HTML content
    parsed_page = BeautifulSoup(page.text, "lxml")

    # Finds and appends mcif index values
    for link in parsed_page.find_all('a'):
        link_text = str(link.get('href'))
        if "index=" in link_text:
            index = link_text.replace("?index=", "")
            indexes.add(str(index))


async def identify_structures(structure_index: str):
    """
    Scrapes individual structure database entries and appends mcif download link

    Parameters
    ----------
    structure_index : str
        Identification index of individual magnetic structure

    Returns
    -------
    link : str
        Mcif download link of individual magnetic structure
    """
    url = f"http://webbdcrista1.ehu.es/magndata/index_incomm.php?index={structure_index}"
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            page = await resp.text()
            parsed_page = BeautifulSoup(page, "lxml")

            # Finds and appends mcif download links
            for link in parsed_page.find_all('a'):
                if "mcif" in link.text:
                    mcif_urls.append("http://webbdcrista1.ehu.es/magndata/" + link.get("href"))
                    link = "http://webbdcrista1.ehu.es/magndata/" + link.get("href")
            return link


async def download_data(structure_index: str, link: str):
    """
    Reads individual download links and returns file data

    Parameters
    ----------
    structure_index : str
        Identification index of individual magnetic structure
    link : str
        Mcif download link of individual magnetic structure

    Returns
    -------
    file_data : bytes
        Mcif file data
    """
    url = link
    async with aiohttp.ClientSession() as session:
        async with session.get(url, timeout=1000*60) as resp:
            file_data = await resp.read()
            return file_data


async def write_to_file(structure_index: str, file_data: bytes):
    """
    Parameters
    ----------
    structure_index : str
        Identification index of individual magnetic structure
    file_data : bytes
        Mcif file data to write to disk

    Returns
    -------
    None.
    """
    filename = f"structure_{structure_index}.mcif"
    with open(filename, "wb") as structure_file:
        structure_file.write(file_data)
    print(f"Finished writing {filename}")


async def web_scrape_task(structure_index: str):
    """
    Parameters
    ----------
    structure_index : str
        Identification index of individual magnetic structure

    Returns
    -------
    None.
    """
    link = await identify_structures(structure_index)
    file_data = await download_data(structure_index, link)
    await write_to_file(structure_index, file_data)


async def main():
    """
    Returns
    -------
    None.
    """
    tasks = []
    for index in indexes:
        tasks.append(web_scrape_task(index))
    await asyncio.wait(tasks)


if __name__ == "__main__":
    batch_indexes()
    s = time.perf_counter()
    asyncio.run(main())
    elapsed = time.perf_counter() - s
    print(f"Execution time: {elapsed:0.2f} seconds.")
await asyncio.wait(tasks)
will wait for the tasks to complete, but it will not propagate the exceptions they raise (if any), because you are expected to retrieve the task results yourself. Replace it with await asyncio.gather(*tasks) and you will be notified of any exception that occurs; those exceptions are likely the reason tasks appear to be skipped.
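For reference, here is a minimal sketch of how main() could be adapted along these lines; the return_exceptions=True handling and the per-index failure report are illustrative additions, not something the answer above specifies:

async def main():
    """Run one scrape task per index and report any task that failed."""
    index_list = list(indexes)
    tasks = [web_scrape_task(index) for index in index_list]
    # Unlike asyncio.wait(), gather() propagates task exceptions.
    # return_exceptions=True is an assumption about the desired behavior:
    # it returns the exceptions as results, so one failure does not
    # abort the remaining downloads.
    results = await asyncio.gather(*tasks, return_exceptions=True)
    for index, result in zip(index_list, results):
        if isinstance(result, Exception):
            print(f"Task for index {index} failed: {result!r}")

With the default return_exceptions=False, the first exception instead propagates out of asyncio.gather() and crashes main(), which is often the quickest way to see the underlying traceback. If you would rather keep asyncio.wait(), wrap each coroutine in asyncio.create_task() and check task.exception() on each completed task yourself.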