使用 Python BeautifulSoup 进行网页抓取:如何抓取 dd 和 dt 元素中的文本?
我正试图在黄页(Yellow Pages)上搜寻一些公司信息。到目前为止一切进展顺利,但是我无法在特定公司的页面中获取 dd 和 dt 元素中的文本。你能帮我吗?非常感谢您的每一个建议!谢谢。这是我的代码:(我首先进入网站并获取搜索结果,然后获取单个公司网页的链接并解析其中的内容。问题是我无法获取存储在单个公司网页 dd 元素中的信息。)
如果您想大量修改代码或以完全不同的方式实现,请给出一些解释以便我能理解——我是编程新手,非常感谢。如能添加几条简短的说明就更好了。
import httpx
import trio
from bs4 import BeautifulSoup
import csv
# Global cap on concurrent tasks: at most 6 requests in flight at once,
# shared by both worker() and scrape(), to stay polite to the server.
limit = trio.CapacityLimiter(6)
async def scrape(client, item, sender):
    """Fetch one business detail page and send a result row down the channel.

    Parameters
    ----------
    client : httpx.AsyncClient shared by all tasks.
    item : (business_name, detail_href) tuple produced by worker().
    sender : cloned trio send channel; closed on exit via ``async with``.

    Fields that are absent on the page are reported as None instead of
    raising, so one sparse listing cannot kill the whole crawl.
    """
    async with limit, sender:
        r = await client.get(f'https://www.yellowpages.com{item[1]}')
        soup = BeautifulSoup(r.text, 'lxml')
        # select_one() returns None when the selector matches nothing:
        # subscripting None raises TypeError, attribute access raises
        # AttributeError — so each lookup guards against both.
        try:
            bw = soup.select_one('.primary-btn')['href']
        except (TypeError, AttributeError):
            bw = None
        try:
            phone = soup.select_one('p.phone').text
        except (TypeError, AttributeError):
            phone = None
        try:
            # Was `except AttributeError` only; widened to match the other
            # lookups for consistency (harmless — .text can only raise
            # AttributeError here, but the uniform guard is clearer).
            biy = soup.select_one('.number').text
        except (TypeError, AttributeError):
            biy = None
        result = [item[0], bw, biy, phone]
        print(result)
        await sender.send(result)
async def worker(client, num, sender, nurse):
    """Fetch search-results page *num* and start a scrape task per hit.

    Each spawned scrape() receives its own clone of the send channel so
    the channel only closes once every producer has finished.
    """
    async with limit, sender:
        query = {
            "search_terms": "expeditors",
            "geo_location_terms": "NJ",
            "page": num,
            "sort": "",
            "source": "real-user",
        }
        resp = await client.get('https://www.yellowpages.com/search',
                                params=query)
        page = BeautifulSoup(resp.text, 'lxml')
        # One (name, relative-URL) pair per business listing on the page.
        for link in page.select('.business-name'):
            entry = (link.span.text, link['href'])
            nurse.start_soon(scrape, client, entry, sender.clone())
async def main():
    """Wire up the HTTP client, the result channel, and the page workers."""
    ua = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0'
    }
    async with httpx.AsyncClient(timeout=None) as client, \
            trio.open_nursery() as nurse:
        client.headers.update(ua)
        # Zero-capacity channel: producers block until rec() consumes.
        sender, receiver = trio.open_memory_channel(0)
        nurse.start_soon(rec, receiver)
        # Close this original handle once all workers hold their clones;
        # the channel ends when the last clone is closed.
        async with sender:
            for page_num in range(1, 2):
                nurse.start_soon(worker, client, page_num,
                                 sender.clone(), nurse)
async def rec(receiver):
    """Drain the result channel and write each row to result.csv.

    Runs until every sender clone has been closed, which ends the
    ``async for`` loop and lets the file close cleanly.
    """
    # buffering=1 -> line-buffered, so rows reach disk as they arrive.
    # encoding='utf-8' added explicitly: business names can contain
    # non-ASCII and the platform default (e.g. cp1252 on Windows) would
    # raise UnicodeEncodeError on them.
    with open('result.csv', 'w', buffering=1, newline='',
              encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Business Name', 'Website',
                         'Years In Business', 'Phone'])
        async with receiver:
            async for value in receiver:
                writer.writerow(value)
# Entry point: trio drives the whole async pipeline.
if __name__ == "__main__":
    trio.run(main)
import httpx
import trio
from bs4 import BeautifulSoup
import csv
# Global cap on concurrent tasks: at most 6 requests in flight at once,
# shared by both worker() and scrape(), to stay polite to the server.
limit = trio.CapacityLimiter(6)
async def scrape(client, item, sender):
    """Fetch one business detail page and send a result row down the channel.

    Parameters
    ----------
    client : httpx.AsyncClient shared by all tasks.
    item : (business_name, detail_href) tuple produced by worker().
    sender : cloned trio send channel; closed on exit via ``async with``.

    Fields that are absent on the page are reported as None instead of
    raising, so one sparse listing cannot kill the whole crawl.
    """
    async with limit, sender:
        r = await client.get(f'https://www.yellowpages.com{item[1]}')
        soup = BeautifulSoup(r.text, 'lxml')
        # select_one() returns None when the selector matches nothing:
        # subscripting None raises TypeError, attribute access raises
        # AttributeError — so each lookup guards against both.
        try:
            bw = soup.select_one('.primary-btn')['href']
        except (TypeError, AttributeError):
            bw = None
        try:
            phone = soup.select_one('p.phone').text
        except (TypeError, AttributeError):
            phone = None
        try:
            # Was `except AttributeError` only; widened to match the other
            # lookups for consistency (harmless — .text can only raise
            # AttributeError here, but the uniform guard is clearer).
            biy = soup.select_one('.number').text
        except (TypeError, AttributeError):
            biy = None
        result = [item[0], bw, biy, phone]
        print(result)
        await sender.send(result)
async def worker(client, num, sender, nurse):
    """Fetch search-results page *num* and start a scrape task per hit.

    Each spawned scrape() receives its own clone of the send channel so
    the channel only closes once every producer has finished.
    """
    async with limit, sender:
        query = {
            "search_terms": "expeditors",
            "geo_location_terms": "NJ",
            "page": num,
            "sort": "",
            "source": "real-user",
        }
        resp = await client.get('https://www.yellowpages.com/search',
                                params=query)
        page = BeautifulSoup(resp.text, 'lxml')
        # One (name, relative-URL) pair per business listing on the page.
        for link in page.select('.business-name'):
            entry = (link.span.text, link['href'])
            nurse.start_soon(scrape, client, entry, sender.clone())
async def main():
    """Wire up the HTTP client, the result channel, and the page workers."""
    ua = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0'
    }
    async with httpx.AsyncClient(timeout=None) as client, \
            trio.open_nursery() as nurse:
        client.headers.update(ua)
        # Zero-capacity channel: producers block until rec() consumes.
        sender, receiver = trio.open_memory_channel(0)
        nurse.start_soon(rec, receiver)
        # Close this original handle once all workers hold their clones;
        # the channel ends when the last clone is closed.
        async with sender:
            for page_num in range(1, 2):
                nurse.start_soon(worker, client, page_num,
                                 sender.clone(), nurse)
async def rec(receiver):
    """Drain the result channel and write each row to result.csv.

    Runs until every sender clone has been closed, which ends the
    ``async for`` loop and lets the file close cleanly.
    """
    # buffering=1 -> line-buffered, so rows reach disk as they arrive.
    # encoding='utf-8' added explicitly: business names can contain
    # non-ASCII and the platform default (e.g. cp1252 on Windows) would
    # raise UnicodeEncodeError on them.
    with open('result.csv', 'w', buffering=1, newline='',
              encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Business Name', 'Website',
                         'Years In Business', 'Phone'])
        async with receiver:
            async for value in receiver:
                writer.writerow(value)
# Entry point: trio drives the whole async pipeline.
if __name__ == "__main__":
    trio.run(main)