使用 Python BeautifulSoup 进行网页抓取:如何抓取 dd 和 dt 元素中的文本?
我正试图在黄页(Yellow Pages)上搜寻一些公司信息。到目前为止一切进展顺利,但是我无法在特定公司的页面中获取 dd 和 dt 元素中的文本。你能帮我吗?非常感谢您的每一个建议!谢谢。这是我的代码:(我首先进入网站并获取搜索结果,然后获取单个公司网页的链接并解析其中的内容。问题是我无法获取存储在单个公司网页 dd 元素中的信息。)
如果您想大量修改代码或以完全不同的方式实现,请给出一些解释以便我能理解——我是编程新手,非常感谢。如能添加几条简短的说明就更好了。
import httpx
import trio
from bs4 import BeautifulSoup
import csv
# Global cap on concurrent tasks: at most 6 requests in flight at once,
# shared by both worker() and scrape(), to stay polite to the server.
limit = trio.CapacityLimiter(6)
async def scrape(client, item, sender):
    """Fetch one business detail page and send a result row down the channel.

    Parameters
    ----------
    client : httpx.AsyncClient shared by all tasks.
    item : (business_name, detail_href) tuple produced by worker().
    sender : cloned trio send channel; closed on exit via ``async with``.

    Fields that are absent on the page are reported as None instead of
    raising, so one sparse listing cannot kill the whole crawl.
    """
    async with limit, sender:
        r = await client.get(f'https://www.yellowpages.com{item[1]}')
        soup = BeautifulSoup(r.text, 'lxml')
        # select_one() returns None when the selector matches nothing:
        # subscripting None raises TypeError, attribute access raises
        # AttributeError — so each lookup guards against both.
        try:
            bw = soup.select_one('.primary-btn')['href']
        except (TypeError, AttributeError):
            bw = None
        try:
            phone = soup.select_one('p.phone').text
        except (TypeError, AttributeError):
            phone = None
        try:
            # Was `except AttributeError` only; widened to match the other
            # lookups for consistency (harmless — .text can only raise
            # AttributeError here, but the uniform guard is clearer).
            biy = soup.select_one('.number').text
        except (TypeError, AttributeError):
            biy = None
        result = [item[0], bw, biy, phone]
        print(result)
        await sender.send(result)
async def worker(client, num, sender, nurse):
    """Fetch search-results page *num* and start a scrape task per hit.

    Each spawned scrape() receives its own clone of the send channel so
    the channel only closes once every producer has finished.
    """
    async with limit, sender:
        query = {
            "search_terms": "expeditors",
            "geo_location_terms": "NJ",
            "page": num,
            "sort": "",
            "source": "real-user",
        }
        resp = await client.get('https://www.yellowpages.com/search',
                                params=query)
        page = BeautifulSoup(resp.text, 'lxml')
        # One (name, relative-URL) pair per business listing on the page.
        for link in page.select('.business-name'):
            entry = (link.span.text, link['href'])
            nurse.start_soon(scrape, client, entry, sender.clone())
async def main():
    """Wire up the HTTP client, the result channel, and the page workers."""
    ua = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0'
    }
    async with httpx.AsyncClient(timeout=None) as client, \
            trio.open_nursery() as nurse:
        client.headers.update(ua)
        # Zero-capacity channel: producers block until rec() consumes.
        sender, receiver = trio.open_memory_channel(0)
        nurse.start_soon(rec, receiver)
        # Close this original handle once all workers hold their clones;
        # the channel ends when the last clone is closed.
        async with sender:
            for page_num in range(1, 2):
                nurse.start_soon(worker, client, page_num,
                                 sender.clone(), nurse)
async def rec(receiver):
    """Drain the result channel and write each row to result.csv.

    Runs until every sender clone has been closed, which ends the
    ``async for`` loop and lets the file close cleanly.
    """
    # buffering=1 -> line-buffered, so rows reach disk as they arrive.
    # encoding='utf-8' added explicitly: business names can contain
    # non-ASCII and the platform default (e.g. cp1252 on Windows) would
    # raise UnicodeEncodeError on them.
    with open('result.csv', 'w', buffering=1, newline='',
              encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Business Name', 'Website',
                         'Years In Business', 'Phone'])
        async with receiver:
            async for value in receiver:
                writer.writerow(value)
# Entry point: trio drives the whole async pipeline.
if __name__ == "__main__":
    trio.run(main)
import httpx
import trio
from bs4 import BeautifulSoup
import csv
# Global cap on concurrent tasks: at most 6 requests in flight at once,
# shared by both worker() and scrape(), to stay polite to the server.
limit = trio.CapacityLimiter(6)
async def scrape(client, item, sender):
    """Fetch one business detail page and send a result row down the channel.

    Parameters
    ----------
    client : httpx.AsyncClient shared by all tasks.
    item : (business_name, detail_href) tuple produced by worker().
    sender : cloned trio send channel; closed on exit via ``async with``.

    Fields that are absent on the page are reported as None instead of
    raising, so one sparse listing cannot kill the whole crawl.
    """
    async with limit, sender:
        r = await client.get(f'https://www.yellowpages.com{item[1]}')
        soup = BeautifulSoup(r.text, 'lxml')
        # select_one() returns None when the selector matches nothing:
        # subscripting None raises TypeError, attribute access raises
        # AttributeError — so each lookup guards against both.
        try:
            bw = soup.select_one('.primary-btn')['href']
        except (TypeError, AttributeError):
            bw = None
        try:
            phone = soup.select_one('p.phone').text
        except (TypeError, AttributeError):
            phone = None
        try:
            # Was `except AttributeError` only; widened to match the other
            # lookups for consistency (harmless — .text can only raise
            # AttributeError here, but the uniform guard is clearer).
            biy = soup.select_one('.number').text
        except (TypeError, AttributeError):
            biy = None
        result = [item[0], bw, biy, phone]
        print(result)
        await sender.send(result)
async def worker(client, num, sender, nurse):
    """Fetch search-results page *num* and start a scrape task per hit.

    Each spawned scrape() receives its own clone of the send channel so
    the channel only closes once every producer has finished.
    """
    async with limit, sender:
        query = {
            "search_terms": "expeditors",
            "geo_location_terms": "NJ",
            "page": num,
            "sort": "",
            "source": "real-user",
        }
        resp = await client.get('https://www.yellowpages.com/search',
                                params=query)
        page = BeautifulSoup(resp.text, 'lxml')
        # One (name, relative-URL) pair per business listing on the page.
        for link in page.select('.business-name'):
            entry = (link.span.text, link['href'])
            nurse.start_soon(scrape, client, entry, sender.clone())
async def main():
    """Wire up the HTTP client, the result channel, and the page workers."""
    ua = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0'
    }
    async with httpx.AsyncClient(timeout=None) as client, \
            trio.open_nursery() as nurse:
        client.headers.update(ua)
        # Zero-capacity channel: producers block until rec() consumes.
        sender, receiver = trio.open_memory_channel(0)
        nurse.start_soon(rec, receiver)
        # Close this original handle once all workers hold their clones;
        # the channel ends when the last clone is closed.
        async with sender:
            for page_num in range(1, 2):
                nurse.start_soon(worker, client, page_num,
                                 sender.clone(), nurse)
async def rec(receiver):
    """Drain the result channel and write each row to result.csv.

    Runs until every sender clone has been closed, which ends the
    ``async for`` loop and lets the file close cleanly.
    """
    # buffering=1 -> line-buffered, so rows reach disk as they arrive.
    # encoding='utf-8' added explicitly: business names can contain
    # non-ASCII and the platform default (e.g. cp1252 on Windows) would
    # raise UnicodeEncodeError on them.
    with open('result.csv', 'w', buffering=1, newline='',
              encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Business Name', 'Website',
                         'Years In Business', 'Phone'])
        async with receiver:
            async for value in receiver:
                writer.writerow(value)
# Entry point: trio drives the whole async pipeline.
if __name__ == "__main__":
    trio.run(main)