Python bs4 中的数据抓取问题
标签:python, beautifulsoup, cmd, python-requests, repl.it
我想从这个网站抓取数据:https://sephora.ae
我已经写了下面这段代码:
import requests
from bs4 import BeautifulSoup
import json
# Scrape one product page and append its fields to out.csv.
#
# Fetches URL with requests, parses the response with BeautifulSoup, looks up
# brand, name, price, currency, SKU, variant URL, category, size, image data
# and description, then writes them as one ';'-separated line.
#
# NOTE(review): the indentation of this listing was lost when the page was
# extracted; everything down to the last File.write() is presumably the
# function body.
# NOTE(review): this is the question's original code. Most lookups below use
# find_all(), which returns a list of matches, so indexing the result with a
# string key fails -- that is the TypeError reported in the traceback that
# follows this listing.
def sephora(URL):
# opening our output file in append mode
# NOTE(review): this handle is never closed inside the function.
File = open("out.csv", "a")
print("function start")
# specifying user agent, You can use other user agents
# available on the internet
print(URL)
# The outer parentheses only group the expression; HEADERS is a plain dict.
HEADERS = ({'User-Agent':
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
'Accept-Language': 'en-US, en;q=0.5'})
# Making the HTTP Request
webpage = requests.get(URL, headers=HEADERS)
# Creating the Soup Object containing all data
soup = BeautifulSoup(webpage.content, 'html.parser')
print("a")
#brand name
# NOTE(review): brand ends up holding the whole result list (not the tag's
# text), and that list's string form is what gets written to the CSV.
try:
brand = soup.find_all('span', {"class": "brand-name"})
except AttributeError:
brand = 'NA'
#Product Name
# NOTE(review): list["content"] raises TypeError, which the
# `except AttributeError` below does not catch.
try:
prname = soup.find_all('meta', {"itemprop": "name"})["content"].text
except AttributeError:
prname = 'NA'
#price
# Same list-indexed-by-string pattern as the product name above.
try:
price = soup.find_all('meta', {"itemprop": "price"})["content"].text
except AttributeError:
price = 'NA'
#price currency
try:
price_cur = soup.find_all(
'meta', {"itemprop": "priceCurrency"})["content"].text
except AttributeError:
price_cur = 'NA'
#item code
try:
item_code = soup.find_all('meta', {"itemprop": "sku"})["content"].text
except AttributeError:
item_code = 'NA'
#variant url
try:
urll = soup.find_all('meta', {"itemprop": "url"})["content"].text
except AttributeError:
urll = 'NA'
#category
# NOTE(review): presumably `.text` on the result list raises AttributeError,
# so category would always fall back to 'NA' -- confirm against the bs4
# version in use.
try:
category = soup.find_all('span', {"class": "product-name"}).text
except AttributeError:
category = 'NA'
#size
try:
size = soup.find_all('img', {"class": "variation-image"})["alt"].text
except AttributeError:
size = 'NA'
#image_src
# NOTE(review): the second dict is passed positionally as find_all()'s third
# parameter (`recursive` per the bs4 documentation), so it is presumably not
# applied as an attribute filter -- confirm.
image_tags = soup.find_all("a", {"class": "variation-display-name"},
{"rel": "nofollow"})
# NOTE(review): image_tags is a list here, so the string index below fails;
# none of the image lines are wrapped in try/except.
imagess = image_tags["data-lgimg"]
# The attribute value is decoded as JSON and its 'url', 'title' and 'alt'
# keys are read.
res = json.loads(imagess)
img_src = res['url']
img_title = res['title']
img_alt = res['alt']
#description
# desc_div holds the list of matching <div> elements, not their text.
try:
desc_div = soup.find_all('div', {"class": "product-description-box"})
except AttributeError:
desc_div = 'NA'
# print(img_src)
print(img_alt)
# print(Item_code)
# print(Size)
# One output row: 12 values separated by ';' and ended by a newline. Values
# are written as-is, without quoting or escaping.
File.write(f"{URL};")
File.write(f"{brand};")
File.write(f"{prname};")
File.write(f"{category};")
File.write(f"{urll};")
File.write(f"{desc_div};")
# File.write(f"{ingredients};")
File.write(f"{price};")
File.write(f"{price_cur};")
File.write(f"{img_src};")
File.write(f"{img_title};")
File.write(f"{item_code};")
File.write(f"{size}\n")
# Script entry point: writes the CSV header to out.csv, then calls sephora()
# once for every line of url.txt.
# NOTE(review): indentation was lost in extraction; the lines below are
# presumably the body of this if-block.
if __name__ == '__main__':
# opening our url file to access URLs
print("start")
# NOTE(review): this handle is never closed.
file = open("url.txt", "r")
# NOTE(review): 11 column names, while sephora() writes 12 values per row
# (there is no CATEGORY column between NAME and VARIANT LINK).
header = "URL;BRAND;NAME;VARIANT LINK;DESCRIPTION;PRICE;PRICE CUR;IMG;TITLE;ITEMCODE;SIZE"
# Mode "w" truncates any previous out.csv before the header is written.
File = open("out.csv", "w")
File.write(f"{header}\n")
File.close()
# readlines() keeps each line's trailing newline, so it is handed to
# sephora() as part of the URL.
URLs = file.readlines()
for links in URLs:
sephora(links)
# NOTE(review): File was already closed above; closing it again has no effect.
File.close()
但是在out.csv中,当我使用repl.it运行时,只有标题出现,没有其他内容。
当我使用cmd运行时,就会出现这个错误
C:\Users\Admin\Desktop\sephora>python main.py
Traceback (most recent call last):
File "C:\Users\Admin\Desktop\sephora\main.py", line 137, in <module>
sephora(links)
File "C:\Users\Admin\Desktop\sephora\main.py", line 33, in sephora
prname = soup.find_all('meta', {"itemprop": "name"})["content"]
TypeError: list indices must be integers or slices, not str
C:\Users\Admin\Desktop\sephora>python main.py
(上述错误信息的中文翻译:)
回溯(最近一次调用在最后):
文件 "C:\Users\Admin\Desktop\sephora\main.py",第 137 行,在 <module> 中
sephora(links)
文件 "C:\Users\Admin\Desktop\sephora\main.py",第 33 行,在 sephora 中
prname = soup.find_all('meta', {"itemprop": "name"})["content"]
TypeError:列表索引必须是整数或切片,而不是 str
请帮我解决这个问题
我正在使用此URL启动
首先,我认为您需要把 HEADERS(请求头)的类型从 tuple 改为 dict(注:`({...})` 外层的圆括号只起分组作用,并不会生成 tuple,HEADERS 本来就是 dict;去掉括号只是让写法更清晰):
HEADERS = {'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36', 'Accept-Language': 'en-US, en;q=0.5'}
其次,soup.find_all('meta', {"itemprop": "name"}) 返回的是元素列表,不能直接用字符串作下标,需要先用整数索引取出其中的某个元素。
你需要把:
prname = soup.find_all('meta', {"itemprop": "name"})["content"].text
改为(meta 标签的值保存在 content 属性中,而不是标签文本里):
prname = soup.find_all('meta', {"itemprop": "name"})[0]["content"]
把:
category = soup.find_all('span', {"class": "product-name"}).text
改为:
category = soup.find_all('span', {"class": "product-name"})[0].text
把:
size = soup.find_all('img', {"class": "variation-image"})["alt"].text
改为:
size = soup.find_all('img', {"class": "variation-image"})[0]["alt"]
其余各处依此类推。
文档:
您似乎对 BeautifulSoup 的一些概念和方法有所混淆,尤其是"获取标签的属性"与"获取标签的文本内容"之间的区别。另外还需要理解 .find_all() 与 .find() 的区别:
.find_all() 会返回所有匹配元素组成的列表,因此如果想取得其中某一项,需要使用整数索引;
.find() 只返回它找到的第一个带有指定标签和属性的元素。
请尝试下面的代码:
import requests
from bs4 import BeautifulSoup
import json
def _meta_content(soup, itemprop):
    """Return the ``content`` of the first ``<meta itemprop=...>`` tag, or 'NA'."""
    tag = soup.find('meta', {"itemprop": itemprop})
    # find() returns None when nothing matches; subscripting None raises
    # TypeError, which an ``except AttributeError`` would not catch.
    if tag is None:
        return 'NA'
    return tag.get("content", 'NA')


def _class_text(soup, name, css_class):
    """Return the text of the first ``name`` tag with class ``css_class``, or 'NA'."""
    tag = soup.find(name, {"class": css_class})
    return 'NA' if tag is None else tag.text


def _image_info(soup):
    """Return ``(url, title, alt)`` decoded from the variant link's data-lgimg JSON.

    Any missing piece (no link, no attribute, invalid JSON, missing key)
    yields 'NA' for the affected values instead of raising.
    """
    # The original call passed {"rel": "nofollow"} as a third positional
    # argument, which find() takes as ``recursive`` rather than as an attribute
    # filter, so it never restricted the match. It is omitted here so the
    # matched element stays the same.
    link = soup.find("a", {"class": "variation-display-name"})
    raw = link.get("data-lgimg") if link is not None else None
    if not raw:
        return 'NA', 'NA', 'NA'
    try:
        info = json.loads(raw)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return 'NA', 'NA', 'NA'
    if not isinstance(info, dict):
        return 'NA', 'NA', 'NA'
    return info.get('url', 'NA'), info.get('title', 'NA'), info.get('alt', 'NA')


def sephora(URL):
    """Scrape one product page and append its fields to ``out.csv``.

    Args:
        URL: Address of the product page. Surrounding whitespace (such as the
            newline left by ``readlines()``) is stripped; a blank value is
            skipped.

    Returns:
        None. One ';'-separated row of 12 values is appended to ``out.csv``:
        URL, brand, name, category, variant URL, description, price, currency,
        image URL, image title, item code, size. Values that cannot be found
        on the page are written as 'NA'.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched,
            including when the server does not answer within the timeout.
    """
    # Local import so this function is self-contained with respect to the
    # file's existing import block.
    import csv

    print("function start")
    # A trailing newline would otherwise end up in the request and split the
    # CSV row in two.
    URL = URL.strip()
    if not URL:
        return
    print(URL)

    # specifying user agent, You can use other user agents
    # available on the internet
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36',
        'Accept-Language': 'en-US, en;q=0.5',
    }
    # Without a timeout requests waits indefinitely for an unresponsive
    # server, which looks like the script silently stopping.
    webpage = requests.get(URL, headers=HEADERS, timeout=30)
    soup = BeautifulSoup(webpage.content, 'html.parser')
    print("a")

    brand = _class_text(soup, 'span', "brand-name")
    prname = _meta_content(soup, "name")
    price = _meta_content(soup, "price")
    price_cur = _meta_content(soup, "priceCurrency")
    item_code = _meta_content(soup, "sku")
    urll = _meta_content(soup, "url")
    category = _class_text(soup, 'span', "product-name")

    size_img = soup.find('img', {"class": "variation-image"})
    size = 'NA' if size_img is None else size_img.get("alt", 'NA')

    img_src, img_title, img_alt = _image_info(soup)
    desc_div = _class_text(soup, 'div', "product-description-box").strip()
    print(img_alt)

    row = [URL, brand, prname, category, urll, desc_div, price, price_cur,
           img_src, img_title, item_code, size]
    # csv.writer quotes values containing ';', quotes or line breaks, so free
    # text such as the description cannot break the row layout. "\n" matches
    # the line ending used for the header row; UTF-8 avoids encoding errors on
    # non-ASCII product text. The file is opened only once the row is ready
    # and is always closed by the with-block.
    with open("out.csv", "a", encoding="utf-8") as File:
        csv.writer(File, delimiter=";", lineterminator="\n").writerow(row)
# Script entry point: start out.csv afresh with a header row, then scrape
# every URL listed (one per line) in url.txt.
if __name__ == '__main__':
    print("start")
    # Read the URLs first so a missing url.txt fails before out.csv is
    # truncated. Each line is stripped of its trailing newline and blank lines
    # are dropped, so sephora() only ever receives usable addresses.
    with open("url.txt", "r") as file:
        URLs = [line.strip() for line in file if line.strip()]
    # CATEGORY added: sephora() writes 12 values per row (category sits
    # between NAME and VARIANT LINK), but the header listed only 11 columns,
    # shifting every later column by one.
    header = "URL;BRAND;NAME;CATEGORY;VARIANT LINK;DESCRIPTION;PRICE;PRICE CUR;IMG;TITLE;ITEMCODE;SIZE"
    # The with-block closes the file before sephora() reopens it in append
    # mode, so the header is guaranteed to be flushed first.
    with open("out.csv", "w") as File:
        File.write(f"{header}\n")
    for links in URLs:
        sephora(links)
评论:
- 先生,照做之后还是只输出了表头(header)。
- 我用的也是同样的 header。我可以把我的 repl 分享给你吗?@RuslanZanevskiy
- 抱歉,我没有找到任何 itemprop 等于 price、name 等的 meta 标签。我只找到 class 为 "price-sales-price-sales-standard" 的 span 标签和 class 为 "product-name-product-name-bold" 的 span 标签,可能你需要改用它们。
- 总的来说,问题在于这个网站在我这里无法加载,所以我没法在自己的电脑上检查这段代码。
- 是的,如果能看一下那个 repl 就好了。
- 先生,即使按照你的方法做了,还是不行。你能帮我看一下我的 repl 吗?
- 当然可以。我的用户名和这里的一样。
- 具体是什么"不起作用"?在我这边运行正常。
- 先生,您能过来看一下吗?我会演示给您看:即使在控制台里加了打印语句,也没有任何输出。
start
function start
https://www.sephora.ae/en/p/color-lip-last-lipstick-P1074023.html
先生,控制台只输出了这么多。