Python 3.x 选择带有关键字的页面并进行抓取_Python 3.x_Beautifulsoup

Python 3.x 选择带有关键字的页面并进行抓取

python-3.x

Python 3.x 选择带有关键字的页面并将其刮除,python-3.x,beautifulsoup,Python 3.x,Beautifulsoup,我试图在一个网站的外国版本上刮取一个项目的标题。运行Python脚本后，cli将启动，但根本不返回任何内容在iPython中，要获得标题，title=soup.find（'a'，{'class'：'vip'}）。文本本身工作得很好，但在Pycharm中，它在完整的代码中不起作用，即使我去了我的设置下载了当前解释器的BeautifulSoup包知道为什么吗？谢谢 #!/usr/bin/python3 import csv import time import requests from b

我想从某个网站的外国版本上抓取商品的标题。运行 Python 脚本后，命令行程序可以启动，但完全没有返回任何内容。

在 iPython 中，单独执行下面这行代码可以正常获取标题：

title = soup.find('a', {'class': 'vip'}).text

但在 PyCharm 中运行完整代码时它却不起作用，即使我已经在设置中为当前解释器安装了 BeautifulSoup 包。
知道为什么吗？谢谢
#!/usr/bin/python3

import csv
import time
import requests
from bs4 import BeautifulSoup

# Search keyword typed by the user. It is read at module level, so the prompt
# appears as soon as the file is run (or imported); main() reads this global
# when it builds the search URL.
product_category = input("Enter your product category: ")


def get_page(url):
    """Download ``url`` and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``BeautifulSoup`` tree built with the ``html.parser`` backend, or
        ``None`` when the server answers with a non-OK status (the status
        code is printed in that case).
    """
    response = requests.get(url)
    if not response.ok:
        print('Server responded:', response.status_code)
        # Return explicitly: no parse tree exists on this path, and falling
        # through to a shared ``return soup`` raised UnboundLocalError.
        return None
    return BeautifulSoup(response.text, 'html.parser')


def get_detail_data(soup):
    """Extract title, price, currency and units sold from an item page.

    Every field is looked up independently; a field whose markup is missing
    is reported as an empty string instead of aborting the whole record.

    Args:
        soup: Parsed item page exposing ``find`` and ``find_all``. An object
            without those methods (for example ``None``) yields a record of
            empty strings.

    Returns:
        dict with the keys ``'title'``, ``'price'``, ``'currency'`` and
        ``'total sold'``.
    """
    # title: find() gives None when nothing matches, so .text raises
    # AttributeError, which is the "not found" signal handled here.
    try:
        title = soup.find('a', {'class': 'vip'}).text
    except AttributeError:
        title = ''

    # price: keep the last matching element's ``content`` attribute (what the
    # previous loop ended up with), but fall back to '' when there is no
    # match instead of leaking the empty result list into the record.
    try:
        offers = soup.find_all('span', {'itemprop': 'price'})
        price = offers[-1].get('content') if offers else ''
    except AttributeError:
        price = ''

    # currency: use find() (first match). The result list returned by
    # find_all() has no .text, so the old lookup could never succeed.
    try:
        currency = soup.find('span', {'class': 'bold'}).text
    except AttributeError:
        currency = ''

    # items sold: first space-separated token of the stripped element text.
    try:
        sold_text = soup.find('div', {'class': 'hotness-signal red'}).text
        items_sold = sold_text.strip().split(' ')[0]
    except AttributeError:
        items_sold = ''

    return {
        'title': title,
        'price': price,
        'currency': currency,
        'total sold': items_sold,
    }


def get_index_data(soup):
    """Collect the item-page URLs listed on a search results page.

    Args:
        soup: Parsed results page exposing ``find_all``. An object without
            that method (for example ``None``) yields an empty list.

    Returns:
        list of the ``href`` values of the ``a.s-item__link`` anchors, in the
        order ``find_all`` returns them. Anchors with a missing or empty
        ``href`` are skipped so callers never receive ``None`` as a URL.
    """
    try:
        links = soup.find_all('a', class_='s-item__link')
    except AttributeError:
        # No usable parse tree: nothing to report.
        return []

    hrefs = (anchor.get('href') for anchor in links)
    return [href for href in hrefs if href]


def write_csv(data, url):
    """Append one scraped item to ``output.csv`` in the working directory.

    The header row is written only when the file is new or empty, so repeated
    calls produce one header followed by one row per item.

    Args:
        data: Mapping with the keys ``'title'``, ``'price'``, ``'currency'``
            and ``'total sold'``.
        url: Address of the item page; stored in the last column.
    """
    # newline='' hands line-ending control to the csv module (otherwise blank
    # rows appear on Windows); UTF-8 keeps non-ASCII titles writable
    # regardless of the platform's default encoding.
    with open('output.csv', 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        # In append mode the stream starts at the end of the file, so offset
        # 0 means there is no content (and therefore no header) yet.
        if csvfile.tell() == 0:
            writer.writerow(['Title', 'Price', 'Currency', 'Sales Volume', 'URL'])
        writer.writerow(
            [data['title'], data['price'], data['currency'], data['total sold'], url]
        )


def main():
    """Scrape the first eBay.fr results page for ``product_category``.

    Builds and prints the search URL, collects the item links from the
    results page, then for each link waits 7 seconds, fetches the item page,
    extracts its details, prints them and appends them to the CSV file.
    """
    # Imported locally so this function carries its own dependency.
    from urllib.parse import quote_plus

    # URL-encode the keyword so spaces, '&', '#' or accented characters in
    # the user's input cannot corrupt the query string.
    url = f"https://www.ebay.fr/sch/i.html?_nkw={quote_plus(product_category)}&_pgn=1"
    print(url)
    products = get_index_data(get_page(url))

    for link in products:
        # Pause between item requests.
        time.sleep(7)
        data = get_detail_data(get_page(link))
        print(data)
        write_csv(data, link)


# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
     main()

看起来 .fr 站点使用了不同的 HTML 标记，因此您需要相应地修改类名/属性。
例如：
import re
import csv
import time
import requests
from bs4 import BeautifulSoup

# Search keyword typed by the user. It is read at module level, so the prompt
# appears as soon as the file is run (or imported); main() reads this global
# when it builds the search URL.
product_category = input("Enter your product category: ")


def get_page(url):
    """Fetch ``url`` and return its body parsed as HTML.

    Args:
        url: Address of the page to request.

    Returns:
        A ``BeautifulSoup`` object created with the ``html.parser`` backend,
        or ``None`` if the response status is not OK (the status code is
        printed in that case).
    """
    response = requests.get(url)
    if response.ok:
        return BeautifulSoup(response.text, 'html.parser')

    print('Server responded:', response.status_code)
    # Nothing was parsed on this path; returning None avoids the
    # UnboundLocalError the shared ``return soup`` used to raise here.
    return None


def get_detail_data(soup):
    """Extract title, price, currency and units sold from an eBay.fr item page.

    Every field is looked up independently; a field whose markup is missing
    is reported as an empty string instead of aborting the whole record.

    Args:
        soup: Parsed item page exposing ``select_one`` and ``find_all``. An
            object without those methods (for example ``None``) yields a
            record of empty strings.

    Returns:
        dict with the keys ``'title'``, ``'price'``, ``'currency'`` and
        ``'total sold'``.
    """
    # title: remove every <span> inside the heading before reading its text.
    # select_one() gives None when nothing matches, so the attribute access
    # raises AttributeError, which is the "not found" signal.
    try:
        heading = soup.select_one('h1[itemprop="name"]')
        for span in heading.select('span'):
            span.extract()
        title = heading.get_text(strip=True)
    except AttributeError:
        title = ''

    # price: keep the last matching element's ``content`` attribute (what the
    # previous loop ended up with), but fall back to '' when there is no
    # match instead of leaking the empty result list into the record.
    try:
        offers = soup.find_all('span', {'itemprop': 'price'})
        price = offers[-1].get('content') if offers else ''
    except AttributeError:
        price = ''

    # currency: subscripting None raises TypeError; a matched element
    # lacking the attribute raises KeyError.
    try:
        currency = soup.select_one('span[itemprop="priceCurrency"][content]')["content"]
    except (AttributeError, TypeError, KeyError):
        currency = ''

    # items sold: first run of digits in the element text; IndexError means
    # the text contained no digits at all.
    try:
        items_sold = re.findall(r'\d+', soup.select_one('.soldwithfeedback').text)[0]
    except (AttributeError, IndexError):
        items_sold = ''

    return {
        'title': title,
        'price': price,
        'currency': currency,
        'total sold': items_sold,
    }


def get_index_data(soup):
    """Return the ``href`` of every result-title link on a search page.

    Args:
        soup: Parsed results page exposing ``select``.

    Returns:
        list with one entry per element matched by ``.sresult h3 a``, in
        match order; an entry is ``None`` when the element has no ``href``.
    """
    urls = []
    for anchor in soup.select('.sresult h3 a'):
        urls.append(anchor.get('href'))
    return urls


def write_csv(data, url):
    """Append one scraped item as a row of ``output.csv``.

    The column header is emitted once, when the file is new or empty, rather
    than before every data row.

    Args:
        data: Mapping with the keys ``'title'``, ``'price'``, ``'currency'``
            and ``'total sold'``.
        url: Address of the item page; written as the final column.
    """
    header = ['Title', 'Price', 'Currency', 'Sales Volume', 'URL']
    row = [data['title'], data['price'], data['currency'], data['total sold'], url]

    # newline='' lets the csv module manage line endings (prevents blank rows
    # on Windows); UTF-8 keeps accented titles writable on every platform.
    with open('output.csv', 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        # An append-mode stream is positioned at the end of the file, so a
        # zero offset means the file has no content yet.
        if csvfile.tell() == 0:
            writer.writerow(header)
        writer.writerow(row)


def main():
    """Scrape the first eBay.fr results page for ``product_category``.

    Builds and prints the search URL, collects the item links from the
    results page, then for each link waits half a second, fetches the item
    page, extracts its details and prints them.
    """
    # Imported locally so this function carries its own dependency.
    from urllib.parse import quote_plus

    # URL-encode the keyword so spaces, '&', '#' or accented characters in
    # the user's input cannot corrupt the query string.
    url = f"https://www.ebay.fr/sch/i.html?_nkw={quote_plus(product_category)}&_pgn=1"
    print(url)
    products = get_index_data(get_page(url))

    for link in products:
        # Pause between item requests.
        time.sleep(0.5)
        data = get_detail_data(get_page(link))
        print(data)
        # CSV output is intentionally disabled so results are only printed to
        # the screen; to re-enable it, call write_csv(data, link) here.


# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
     main()

尽管我的两个浏览器都启用了 JS，我的命令行仍然返回“启用 Javascript”。就目前而言，.com 版本没有任何问题（运行得非常好），只有国际站点（.fr、.es）我这边无法抓取。我不想用 eBay API 来做这件事。太好了，Andrej，谢谢！接下来我会尝试继续完善我的代码。:)
Enter your product category: ddr4
https://www.ebay.fr/sch/i.html?_nkw=ddr4&_pgn=1
{'title': '16 Go 8 Go 4 Go DDR3 DDR4 1333 1600 1866 2133 RAM 2400 2666 MHz pour HyperX FURY Lot', 'price': '19.74', 'currency': 'USD', 'total sold': '1'}
{'title': '4 Go 8 Go 16 Go DDR4 2133 2400 2666 Mhz pour HyperX FURY DIMM Desktop Mémoire RAM Lot', 'price': '23.87', 'currency': 'USD', 'total sold': '93'}
{'title': '8 Go DDR4 2133 MHz pour HyperX FURY CL15 288 Pin DIMM PC4-17000 Desktop RAM RL1US', 'price': '39.96', 'currency': 'USD', 'total sold': '17'}
{'title': '16 Go G. Skill DDR4 Trident 3200 MHz Z PC4-25600 CL16 1.35 V Double Kit (2x8GB)', 'price': '70.0', 'currency': 'GBP', 'total sold': ''}
{'title': 'DDR4 4 Go 8 Go 16 Go Desktop 2666 MHz Desktop DIMM Mémoire RAM pour Kingston HyperX Fury R1US', 'price': '24.13', 'currency': 'USD', 'total sold': '19'}
{'title': 'Micron 8GB RAM DDR4 1Rx8 PC4-2400T-UAB-10', 'price': '23.0', 'currency': 'EUR', 'total sold': ''}
{'title': 'PATRIOT Viper Blackout 16 Go DDR4 3000 (2x8)', 'price': '54.99', 'currency': 'GBP', 'total sold': ''}
{'title': 'Samsung 8GB RAM DDR4 1Rx8 PC4-2133P SO-DIMM', 'price': '21.0', 'currency': 'EUR', 'total sold': ''}
{'title': 'Kingston 8 Go DDR4 2133 MHz Desktop PC RAM ~~ PC4 17000 Mémoire 2133P 288 broches 2Rx8', 'price': '31.99', 'currency': 'GBP', 'total sold': ''}

...and so on.