Python bs4中的数据刮取问题_Python_Beautifulsoup_Cmd_Python Requests_Repl.it

Python bs4 中的数据抓取问题

python cmd

Python bs4中的数据刮取问题,python,beautifulsoup,cmd,python-requests,repl.it,Python,Beautifulsoup,Cmd,Python Requests,Repl.it,我想从这个网站上抓取数据：https://sephora.ae" 我已经写了这个代码 import requests from bs4 import BeautifulSoup import json def sephora(URL): # opening our output file in append mode File = open("out.csv", "a") print("function start&

我想从这个网站上抓取数据：https://sephora.ae 。我已经写了这段代码：

import requests
from bs4 import BeautifulSoup
import json



def sephora(URL):
    """Fetch the page at URL, extract product fields and append them to out.csv.

    One semicolon-separated line is written per call, in this order: URL,
    brand, product name, category, variant URL, description, price, price
    currency, image URL, image title, item code, size.

    Args:
        URL: Address passed unchanged to ``requests.get`` and echoed into the
            first output column.

    NOTE(review): every lookup below uses ``find_all``, which returns a list of
    matches rather than a single tag. Subscripting that list with a string
    (e.g. ``[...]["content"]``) raises ``TypeError``, which the surrounding
    ``except AttributeError`` handlers do not catch, so the function aborts
    before anything is written.
    NOTE(review): the file handle opened here is never closed inside this
    function.
    """
    # opening our output file in append mode
    File = open("out.csv", "a")
    print("function start")
    # specifying user agent, You can use other user agents
    # available on the internet
    print(URL)
    # The outer parentheses are redundant: this is a plain dict, not a tuple.
    HEADERS = ({'User-Agent': 
           'Mozilla/5.0 (X11; Linux x86_64)                  AppleWebKit/537.36 (KHTML, like Gecko)                     Chrome/44.0.2403.157 Safari/537.36', 
                           'Accept-Language': 'en-US, en;q=0.5'}) 
 
    
    # Making the HTTP Request 
    webpage = requests.get(URL, headers=HEADERS)  
    
    # Creating the Soup Object containing all data 
    soup = BeautifulSoup(webpage.content, 'html.parser')
    print("a")



    # Brand name. find_all returns a list here, so `brand` ends up holding the
    # whole list of matching tags, not a text value.
    try:
        brand = soup.find_all('span', {"class": "brand-name"})
    except AttributeError:
        brand = 'NA'

    # Product name, read from the <meta itemprop="name"> tag's content attribute.
    # NOTE(review): indexing the find_all result with "content" raises TypeError.
    try:
        prname = soup.find_all('meta', {"itemprop": "name"})["content"].text
    except AttributeError:
        prname = 'NA'

    # Price, read from <meta itemprop="price">.
    try:
        price = soup.find_all('meta', {"itemprop": "price"})["content"].text
    except AttributeError:
        price = 'NA'

    # Price currency, read from <meta itemprop="priceCurrency">.
    try:
        price_cur = soup.find_all(
            'meta', {"itemprop": "priceCurrency"})["content"].text
    except AttributeError:
        price_cur = 'NA'

    # Item code, read from <meta itemprop="sku">.
    try:
        item_code = soup.find_all('meta', {"itemprop": "sku"})["content"].text
    except AttributeError:
        item_code = 'NA'

    # Variant URL, read from <meta itemprop="url">.
    try:
        urll = soup.find_all('meta', {"itemprop": "url"})["content"].text
    except AttributeError:
        urll = 'NA'

    # Category, taken from the span with class "product-name".
    # NOTE(review): `.text` on the find_all result raises AttributeError, so
    # this presumably always falls back to 'NA' - confirm.
    try:
        category = soup.find_all('span', {"class": "product-name"}).text
    except AttributeError:
        category = 'NA'

    # Size, taken from the alt attribute of the variation image.
    try:
        size = soup.find_all('img', {"class": "variation-image"})["alt"].text
    except AttributeError:
        size = 'NA'

    # Image data: the anchor's data-lgimg attribute is parsed as JSON and its
    # 'url', 'title' and 'alt' keys are read. None of this is guarded by try/except.
    # NOTE(review): the second dict is passed as find_all's third positional
    # argument (`recursive` in the bs4 signature), not as an attribute filter -
    # confirm against the BeautifulSoup docs.
    image_tags = soup.find_all("a", {"class": "variation-display-name"},
                               {"rel": "nofollow"})
    imagess = image_tags["data-lgimg"]
    res = json.loads(imagess)
    img_src = res['url']
    img_title = res['title']
    img_alt = res['alt']

    # Description: like `brand`, this keeps the list of matching tags rather
    # than their text.
    try:
        desc_div = soup.find_all('div', {"class": "product-description-box"})
    except AttributeError:
        desc_div = 'NA'
    
    # print(img_src)
    print(img_alt)
    # print(Item_code)
    # print(Size)
    # Twelve fields are written per row, separated by ';' with no quoting.
    File.write(f"{URL};")
    File.write(f"{brand};")
    File.write(f"{prname};")
    File.write(f"{category};")
    File.write(f"{urll};")
    File.write(f"{desc_div};")
    # File.write(f"{ingredients};")
    File.write(f"{price};")
    File.write(f"{price_cur};")
    File.write(f"{img_src};")
    File.write(f"{img_title};")
    File.write(f"{item_code};")
    File.write(f"{size}\n")


# Script entry point: reset out.csv to a single header line, then call
# sephora() once for every line of url.txt.
if __name__ == '__main__':
    # opening our url file to access URLs
    print("start")
    # NOTE(review): this handle is never closed.
    file = open("url.txt", "r")
    # NOTE(review): 11 column names, while sephora() writes 12 fields per row
    # (no name for the category column).
    header = "URL;BRAND;NAME;VARIANT LINK;DESCRIPTION;PRICE;PRICE CUR;IMG;TITLE;ITEMCODE;SIZE"
    # Mode "w" truncates any previous output before the header is written.
    File = open("out.csv", "w")
    File.write(f"{header}\n")
    File.close()

    # readlines() keeps each line's trailing newline, so every URL handed to
    # sephora() still ends with "\n".
    URLs = file.readlines()
   
    for links in URLs:
        sephora(links)
    # File was already closed above; this second close has no effect.
    File.close()

但是在 out.csv 中，当我使用 repl.it 运行时，只有表头出现，没有其他内容。当我使用 cmd 运行时，则会出现下面这个错误：

C:\Users\Admin\Desktop\sephora>python main.py
Traceback (most recent call last):
  File "C:\Users\Admin\Desktop\sephora\main.py", line 137, in <module>
    sephora(links)
  File "C:\Users\Admin\Desktop\sephora\main.py", line 33, in sephora
    prname = soup.find_all('meta', {"itemprop": "name"})["content"]
TypeError: list indices must be integers or slices, not str

C:\Users\Admin\Desktop\sephora>python main.py
回溯（最近一次调用在最后）：
  文件 "C:\Users\Admin\Desktop\sephora\main.py"，第137行，在 <module> 中
    sephora(links)
  文件 "C:\Users\Admin\Desktop\sephora\main.py"，第33行，在 sephora 中
    prname = soup.find_all('meta', {"itemprop": "name"})["content"]
TypeError：列表索引必须是整数或切片，而不是 str

请帮我解决这个问题

我正在使用此URL启动

首先，感谢您的提问。我认为您需要把 HEADERS（请求头）的类型从 tuple 改为 dict：

HEADERS = {'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36', 'Accept-Language': 'en-US, en;q=0.5'}

然后

soup.find_all('meta', {"itemprop": "name"})

返回元素列表

你需要把：

prname = soup.find_all('meta', {"itemprop": "name"})["content"].text

改为

prname = soup.find_all('meta', {"itemprop": "name"})[0].text

把

category = soup.find_all('span', {"class": "product-name"}).text

改为

category = soup.find_all('span', {"class": "product-name"})[0].text

把

size = soup.find_all('img', {"class": "variation-image"})["alt"].text

改为

size = soup.find_all('img', {"class": "variation-image"})[0]["alt"]

诸如此类

文档：

您似乎对 BeautifulSoup 的一些概念和方法有些混淆，特别是“获取属性”和“获取文本/内容”之间的区别。另外还需要理解

.find_all()

与

.find()

的区别。

.find_all()

会返回所有匹配元素的列表。因此，如果想从该列表中取出某一项，就需要使用索引。

.find()

只返回它找到的第一个元素，即第一个具有您所查找的标签和属性的元素。

尝试一下：

import requests
from bs4 import BeautifulSoup
import json



def _first_text(soup, tag, attrs, strip=False):
    """Return the text of the first ``tag`` matching ``attrs``, or 'NA' if none exists."""
    node = soup.find(tag, attrs)
    if node is None:
        return 'NA'
    text = node.text
    return text.strip() if strip else text


def _first_attr(soup, tag, attrs, attr):
    """Return attribute ``attr`` of the first ``tag`` matching ``attrs``, or 'NA'."""
    node = soup.find(tag, attrs)
    if node is None:
        # find() returns None when nothing matches; subscripting None would
        # raise TypeError, which an `except AttributeError` does not catch.
        return 'NA'
    # Tag.get returns the default instead of raising KeyError when the
    # attribute is absent.
    return node.get(attr, 'NA')


def sephora(URL):
    """Scrape one product page and append its fields as a row of out.csv.

    The row is semicolon-delimited and holds, in order: URL, brand, product
    name, category, variant URL, description, price, price currency, image
    URL, image title, item code, size. Any field that cannot be found on the
    page is written as 'NA'.

    Args:
        URL: Page address. Surrounding whitespace (such as the newline kept
            by ``readlines()``) is stripped; a blank value is skipped.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
    """
    # Local import so this block does not depend on a file-level import.
    import csv

    url = URL.strip()
    if not url:
        return

    print("function start")
    print(url)
    # specifying user agent, You can use other user agents
    # available on the internet
    HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
               'Accept-Language': 'en-US, en;q=0.5'}

    # Without a timeout requests can block forever on an unresponsive server.
    webpage = requests.get(url, headers=HEADERS, timeout=30)

    # Creating the Soup Object containing all data
    soup = BeautifulSoup(webpage.content, 'html.parser')
    print("a")

    brand = _first_text(soup, 'span', {"class": "brand-name"})
    prname = _first_attr(soup, 'meta', {"itemprop": "name"}, "content")
    price = _first_attr(soup, 'meta', {"itemprop": "price"}, "content")
    price_cur = _first_attr(soup, 'meta', {"itemprop": "priceCurrency"}, "content")
    item_code = _first_attr(soup, 'meta', {"itemprop": "sku"}, "content")
    urll = _first_attr(soup, 'meta', {"itemprop": "url"}, "content")
    category = _first_text(soup, 'span', {"class": "product-name"})
    size = _first_attr(soup, 'img', {"class": "variation-image"}, "alt")
    desc_div = _first_text(soup, 'div', {"class": "product-description-box"},
                           strip=True)

    # Image data lives as JSON in the anchor's data-lgimg attribute.
    # Both filters must go in the single attrs dict: a second positional dict
    # is bound to find()'s `recursive` parameter and filters nothing. The
    # class-only lookup is kept as a fallback so pages that matched before
    # still match.
    image_tag = soup.find("a", {"class": "variation-display-name",
                                "rel": "nofollow"})
    if image_tag is None:
        image_tag = soup.find("a", {"class": "variation-display-name"})

    img_src = img_title = img_alt = 'NA'
    raw_image = image_tag.get("data-lgimg") if image_tag is not None else None
    if raw_image:
        try:
            res = json.loads(raw_image)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            res = None
        if isinstance(res, dict):
            img_src = res.get('url', 'NA')
            img_title = res.get('title', 'NA')
            img_alt = res.get('alt', 'NA')

    print(img_alt)

    row = [url, brand, prname, category, urll, desc_div, price, price_cur,
           img_src, img_title, item_code, size]
    # csv.writer quotes any field containing ';', quotes or newlines, so free
    # text such as the description cannot split a row. The file is opened in
    # ordinary text mode with "\n" as terminator so line endings match a
    # header line written with a plain write(). The with-block closes the
    # handle even if writing fails.
    with open("out.csv", "a", encoding="utf-8") as out:
        csv.writer(out, delimiter=";", lineterminator="\n").writerow(row)


# Script entry point: reset out.csv to a single header line, then scrape
# every URL listed in url.txt (one per line).
if __name__ == '__main__':
    print("start")
    # Column names in the same order sephora() writes its twelve fields;
    # CATEGORY sits between NAME and VARIANT LINK.
    header = "URL;BRAND;NAME;CATEGORY;VARIANT LINK;DESCRIPTION;PRICE;PRICE CUR;IMG;TITLE;ITEMCODE;SIZE"
    # Mode "w" truncates output left over from a previous run.
    with open("out.csv", "w") as File:
        File.write(f"{header}\n")

    # opening our url file to access URLs; trailing newlines are stripped and
    # blank lines dropped so only usable addresses are requested.
    with open("url.txt", "r") as file:
        URLs = [line.strip() for line in file if line.strip()]

    for links in URLs:
        sephora(links)

那样的话还是只输出了表头，先生。
我也用了同样的 HEADERS。
我可以把 repl 分享给你吗？@RuslanZanevskiy
抱歉，我没有找到任何 itemprop 等于 price、name 等的 meta 标签。我只找到了 class 为“price-sales-price-sales-standard”的 span 标签和 class 为“product-name-product-name-bold”的 span 标签。可能你需要修改这一部分。
总的来说，问题在于这个网站在我这边打不开，所以我没法在自己的电脑上检查代码。
是的，如果能看到 repl.it 就好了。
先生，即使照着你的做法改了，还是不行。你能在 repl 上帮我看一下吗？
当然可以。我的用户名和这里的一样。
什么叫“不行”？我这边运行正常。
你能过来看一下吗，先生？我给你演示：即使在控制台里打印，也没有任何输出。

start
function start
https://www.sephora.ae/en/p/color-lip-last-lipstick-P1074023.html

先生，只输出了这么多。