Python bs4中的数据刮取问题

Python bs4中的数据刮取问题,python,beautifulsoup,cmd,python-requests,repl.it,Python,Beautifulsoup,Cmd,Python Requests,Repl.it,我想从这个网站上抓取数据:https://sephora.ae" 我已经写了这个代码 import requests from bs4 import BeautifulSoup import json def sephora(URL): # opening our output file in append mode File = open("out.csv", "a") print("function start&

我想从这个网站上抓取数据:https://sephora.ae" 我已经写了这个代码

import requests
from bs4 import BeautifulSoup
import json



def sephora(URL):
    # opening our output file in append mode
    File = open("out.csv", "a")
    print("function start")
    # specifying user agent, You can use other user agents
    # available on the internet
    print(URL)
    HEADERS = ({'User-Agent': 
           'Mozilla/5.0 (X11; Linux x86_64)                  AppleWebKit/537.36 (KHTML, like Gecko)                     Chrome/44.0.2403.157 Safari/537.36', 
                           'Accept-Language': 'en-US, en;q=0.5'}) 
 
    
    # Making the HTTP Request 
    webpage = requests.get(URL, headers=HEADERS)  
    
    # Creating the Soup Object containing all data 
    soup = BeautifulSoup(webpage.content, 'html.parser')
    print("a")



    #brand name
    try:
        brand = soup.find_all('span', {"class": "brand-name"})
    except AttributeError:
        brand = 'NA'

    #Product Name
    try:
        prname = soup.find_all('meta', {"itemprop": "name"})["content"].text
    except AttributeError:
        prname = 'NA'

    #price
    try:
        price = soup.find_all('meta', {"itemprop": "price"})["content"].text
    except AttributeError:
        price = 'NA'

    #price currency
    try:
        price_cur = soup.find_all(
            'meta', {"itemprop": "priceCurrency"})["content"].text
    except AttributeError:
        price_cur = 'NA'

    #item code
    try:
        item_code = soup.find_all('meta', {"itemprop": "sku"})["content"].text
    except AttributeError:
        item_code = 'NA'

    #variant url
    try:
        urll = soup.find_all('meta', {"itemprop": "url"})["content"].text
    except AttributeError:
        urll = 'NA'

    #category
    try:
        category = soup.find_all('span', {"class": "product-name"}).text
    except AttributeError:
        category = 'NA'

    #size
    try:
        size = soup.find_all('img', {"class": "variation-image"})["alt"].text
    except AttributeError:
        size = 'NA'

    #image_src
    image_tags = soup.find_all("a", {"class": "variation-display-name"},
                               {"rel": "nofollow"})
    imagess = image_tags["data-lgimg"]
    res = json.loads(imagess)
    img_src = res['url']
    img_title = res['title']
    img_alt = res['alt']

    #description
    try:
        desc_div = soup.find_all('div', {"class": "product-description-box"})
    except AttributeError:
        desc_div = 'NA'
    
    # print(img_src)
    print(img_alt)
    # print(Item_code)
    # print(Size)
    File.write(f"{URL};")
    File.write(f"{brand};")
    File.write(f"{prname};")
    File.write(f"{category};")
    File.write(f"{urll};")
    File.write(f"{desc_div};")
    # File.write(f"{ingredients};")
    File.write(f"{price};")
    File.write(f"{price_cur};")
    File.write(f"{img_src};")
    File.write(f"{img_title};")
    File.write(f"{item_code};")
    File.write(f"{size}\n")


if __name__ == '__main__':
    # opening our url file to access URLs
    print("start")
    file = open("url.txt", "r")
    header = "URL;BRAND;NAME;VARIANT LINK;DESCRIPTION;PRICE;PRICE CUR;IMG;TITLE;ITEMCODE;SIZE"
    File = open("out.csv", "w")
    File.write(f"{header}\n")
    File.close()

    URLs = file.readlines()
   
    for links in URLs:
        sephora(links)
    File.close()

但是在out.csv中,当我使用repl.it运行时,只有标题出现,没有其他内容。 当我使用cmd运行时,就会出现这个错误

C:\Users\Admin\Desktop\sephora>python main.py
Traceback (most recent call last):
  File "C:\Users\Admin\Desktop\sephora\main.py", line 137, in <module>
    sephora(links)
  File "C:\Users\Admin\Desktop\sephora\main.py", line 33, in sephora
    prname = soup.find_all('meta', {"itemprop": "name"})["content"]
TypeError: list indices must be integers or slices, not str
C:\Users\Admin\Desktop\sephora>python main.py
回溯(最近一次呼叫最后一次):
文件“C:\Users\Admin\Desktop\sephora\main.py”,第137行,在
丝芙兰(链接)
文件“C:\Users\Admin\Desktop\sephora\main.py”,第33行,丝芙兰
prname=soup.find_all('meta',{“itemprop”:“name”})[“content”]
TypeError:列表索引必须是整数或片,而不是str
请帮我解决这个问题

我正在使用此URL启动


首先感谢您,我认为您需要将标题类型从tuple更改为dict:

HEADERS = {'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36', 'Accept-Language': 'en-US, en;q=0.5'}
然后
soup.find_all('meta',{“itemprop”:“name”})
返回元素列表

你需要改变:

  • prname=soup.find_all('meta',{“itemprop”:“name”})[“content”]。text
  • prname=soup.find_all('meta',{“itemprop”:“name”})[0]。text

  • category=soup.find_all('span',{“class”:“product name”})。text
  • to
    category=soup.find_all('span',{“class”:“product name”})[0]。text

  • size=soup.find_all('img',{“class”:“variation image”})[“alt”].text
  • size=soup.find_all('img',{“class”:“variation image”})[0][“alt”]

    诸如此类


    文档:

    您似乎对BeautifulSoup的一些概念/方法感到困惑。特别是抓取属性,而不是抓取文本/内容。还有一点是理解
    .find_all()
    .find()
    的区别
    .find_all()
    将返回所有这些元素的列表。因此,如果您想从该列表中获取特定的项,则需要使用索引
    .find()
    将只返回它找到的第一个元素,其中包含您要查找的特定标记和属性

    尝试一下:

    import requests
    from bs4 import BeautifulSoup
    import json
    
    
    
    def sephora(URL):
        # opening our output file in append mode
        File = open("out.csv", "a")
        print("function start")
        # specifying user agent, You can use other user agents
        # available on the internet
        print(URL)
        HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36', 
                               'Accept-Language': 'en-US, en;q=0.5'}
     
        
        # Making the HTTP Request 
        webpage = requests.get(URL, headers=HEADERS)  
        
        # Creating the Soup Object containing all data 
        soup = BeautifulSoup(webpage.content, 'html.parser')
        print("a")
    
    
    
        #brand name
        try:
            brand = soup.find('span', {"class": "brand-name"}).text
        except AttributeError:
            brand = 'NA'
    
        #Product Name
        try:
            prname = soup.find('meta', {"itemprop": "name"})["content"]
        except AttributeError:
            prname = 'NA'
    
        #price
        try:
            price = soup.find('meta', {"itemprop": "price"})["content"]
        except AttributeError:
            price = 'NA'
    
        #price currency
        try:
            price_cur = soup.find(
                'meta', {"itemprop": "priceCurrency"})["content"]
        except AttributeError:
            price_cur = 'NA'
    
        #item code
        try:
            item_code = soup.find('meta', {"itemprop": "sku"})["content"]
        except AttributeError:
            item_code = 'NA'
    
        #variant url
        try:
            urll = soup.find('meta', {"itemprop": "url"})["content"]
        except AttributeError:
            urll = 'NA'
    
        #category
        try:
            category = soup.find('span', {"class": "product-name"}).text
        except AttributeError:
            category = 'NA'
    
        #size
        try:
            size = soup.find('img', {"class": "variation-image"})["alt"]
        except AttributeError:
            size = 'NA'
    
        #image_src
        image_tags = soup.find("a", {"class": "variation-display-name"},
                                   {"rel": "nofollow"})
        imagess = image_tags["data-lgimg"]
        res = json.loads(imagess)
        img_src = res['url']
        img_title = res['title']
        img_alt = res['alt']
    
        #description
        try:
            desc_div = soup.find('div', {"class": "product-description-box"}).text.strip()
        except AttributeError:
            desc_div = 'NA'
        
        # print(img_src)
        print(img_alt)
        # print(Item_code)
        # print(Size)
        File.write(f"{URL};")
        File.write(f"{brand};")
        File.write(f"{prname};")
        File.write(f"{category};")
        File.write(f"{urll};")
        File.write(f"{desc_div};")
        # File.write(f"{ingredients};")
        File.write(f"{price};")
        File.write(f"{price_cur};")
        File.write(f"{img_src};")
        File.write(f"{img_title};")
        File.write(f"{item_code};")
        File.write(f"{size}\n")
    
    
    if __name__ == '__main__':
        # opening our url file to access URLs
        print("start")
        file = open("url.txt", "r")
        header = "URL;BRAND;NAME;VARIANT LINK;DESCRIPTION;PRICE;PRICE CUR;IMG;TITLE;ITEMCODE;SIZE"
        File = open("out.csv", "w")
        File.write(f"{header}\n")
        File.close()
    
        URLs = file.readlines()
       
        for links in URLs:
            sephora(links)
        File.close()
    

    然后也只有头球来了,sirI也用了同样的头球。我可以和你分享回复吗@RuslanZanevskiysorry,但我没有找到任何itemprop等于price name等的meta标记。我只找到类为“price-sales-price-sales-standard”的snap标记和类为“product-name-product-name-bold”的snap标记。可能你需要改变这一点一般来说,问题是这个网站没有为我加载,我无法检查我的电脑上的代码是的,这将是很高兴看到repl.it不工作,先生,即使在遵循你的。你能帮我回复一下吗?当然可以。我的屏幕名和这里一样什么“不起作用”?我这边很好,你能过来吗,先生??我将向您展示,即使在控制台中打印之后,也没有输出。
    start function starthttps://www.sephora.ae/en/p/color-lip-last-lipstick-P1074023.html
    先生,只有这么多钱