Python 使用bs4在元素内单独取出元素
网站数据结构如下: 瓦片 -> a -> div
def grabData(url):
    """Fetch the Shopify Exchange listing page at *url* and print each
    shop tile's store name.

    Side effects only (prints to stdout); returns None.
    """
    from urllib.request import urlopen as uReq
    from bs4 import BeautifulSoup as soup

    # Open the connection and grab the page; close even if read() fails
    # (the original leaked the connection on error).
    uClient = uReq(url)
    try:
        page_html = uClient.read()
    finally:
        uClient.close()

    # HTML parsing.
    page_soup = soup(page_html, "html.parser")

    # Each listing is rendered as one "tile" div.
    tiles = page_soup.find_all('div', {'class': 'category-tile-grid-item'})
    for tile in tiles:
        content_container = tile.find_all('div', {'class': 'shop-tile__content--border-bottom'})
        # Guard against tiles missing the expected container (the original
        # indexed [0] unconditionally and could raise IndexError).
        if content_container:
            print(content_container[0].text)

if __name__ == '__main__':
    grabData('https://exchange.shopify.com/shops?sale_price=1000&shopify_shop_created_at=6m%2C0m')
->div
def grabData(url):
    """Print the store name found in every listing tile on the page at *url*."""
    from urllib.request import urlopen as uReq
    from bs4 import BeautifulSoup as soup
    import pandas as pd  # retained from original; unused here
    import numpy as np   # retained from original; unused here

    # Download the raw page.
    connection = uReq(url)
    markup = connection.read()
    connection.close()

    # Parse and walk each listing tile, printing the first bordered
    # content block (the store name).
    document = soup(markup, "html.parser")
    for tile in document.find_all('div', {'class': 'category-tile-grid-item'}):
        borders = tile.find_all('div', {'class': 'shop-tile__content--border-bottom'})
        print(borders[0].text)

if __name__ == '__main__':
    grabData('https://exchange.shopify.com/shops?sale_price=1000&shopify_shop_created_at=6m%2C0m')
--->内容
------>p
------>p
------>p
->div
def grabData(url):
    """Scrape the given Shopify Exchange page and, for every listing tile,
    print the text of its first bordered content block."""
    from urllib.request import urlopen as uReq
    from bs4 import BeautifulSoup as soup
    import pandas as pd  # unused, kept from original
    import numpy as np   # unused, kept from original

    my_url = url
    # Fetch the page bytes.
    client = uReq(my_url)
    html_bytes = client.read()
    client.close()

    # Parse, then print the store name out of each tile.
    parsed = soup(html_bytes, "html.parser")
    all_tiles = parsed.findAll('div', {'class': 'category-tile-grid-item'})
    for current in all_tiles:
        inner = current.findAll('div', {'class': 'shop-tile__content--border-bottom'})
        name_text = inner[0].text
        print(name_text)

if __name__ == '__main__':
    grabData('https://exchange.shopify.com/shops?sale_price=1000&shopify_shop_created_at=6m%2C0m')
->div
def grabData(url):
    """Download *url*, locate every category tile, and print the store
    name held in the tile's border-bottom content div."""
    from urllib.request import urlopen as uReq
    from bs4 import BeautifulSoup as soup
    import pandas as pd  # unused import carried over from original
    import numpy as np   # unused import carried over from original

    # Open the connection, grab the page, then release the handle.
    handle = uReq(url)
    raw_html = handle.read()
    handle.close()

    tree = soup(raw_html, "html.parser")
    # One print per tile: the first matching content container's text.
    for item in tree.findAll('div', {'class': 'category-tile-grid-item'}):
        print(item.findAll('div', {'class': 'shop-tile__content--border-bottom'})[0].text)

if __name__ == '__main__':
    grabData('https://exchange.shopify.com/shops?sale_price=1000&shopify_shop_created_at=6m%2C0m')
我希望分别检索段落值。我该怎么做呢？
编辑:
我已经按照orhan solak更新了我的代码
我现在想弄清楚如何在metrics容器中获得收入和其他价值。我该怎么做呢?为什么选择流量而不是收入?我建议您使用特定的Xpath提取数据。我检查了您的代码,Xpath有点错误。我已经重新配置了你的代码。我把类名赋给了一个变量。之后,我将它们结合起来创建了一个精确的xpath。在下面的代码示例中;我创建了7个列表,分别包含价格、名称、网站类型、收入、流量、利润和库存价值
输出:
24
24
24
24
24
24
24
['$1,150USD', '$3,000USD', '$2,500USD', '$1,000USD', '$2,300USD']
['www.cosmicdetail.co.uk', 'prestige-timepiece.com', 'gomommyboutique.com', 'sunnysx.com', 'squishywishy.com']
['Automotive', 'Fashion and apparel', 'Toys and games', 'Fashion and apparel', 'Gifts and collectibles']
['$56', '$961', '$70', '$1.3K', '$403']
['111', '7.5K', '454', '2.8K', '2.6K']
['$50', '$1.0K', '$700', '$500', '$100']
['$1.8K', '', '', '', '']
"normalize-space(//div[@class='grid grid--equal-height']//div[@class='grid__item grid__item--tablet-up-third grid__item--desktop-up-quarter grid__item--wide-up-quarter gutter-bottom category-tile-grid-item layout-flex'][24]//div[@class='shop-tile__content shop-tile__metrics-container']//div[@class='shop-tile__metric'][1]//span[@class='shop-tile__metric__value text-bold'][1])"
DIODE BRAND
diodebrand.us
Art and photography
Private listing #572131
URL Hidden
Gifts and collectibles
检查清单:
print(price_list[:5])
print(name_list[:5])
print(website_type_list[:5])
print(revenue_list[:5])
print(traffic_list[:5])
print(profit_list[:5])
print(inventory_value_list[:5])
输出:
24
24
24
24
24
24
24
['$1,150USD', '$3,000USD', '$2,500USD', '$1,000USD', '$2,300USD']
['www.cosmicdetail.co.uk', 'prestige-timepiece.com', 'gomommyboutique.com', 'sunnysx.com', 'squishywishy.com']
['Automotive', 'Fashion and apparel', 'Toys and games', 'Fashion and apparel', 'Gifts and collectibles']
['$56', '$961', '$70', '$1.3K', '$403']
['111', '7.5K', '454', '2.8K', '2.6K']
['$50', '$1.0K', '$700', '$500', '$100']
['$1.8K', '', '', '', '']
"normalize-space(//div[@class='grid grid--equal-height']//div[@class='grid__item grid__item--tablet-up-third grid__item--desktop-up-quarter grid__item--wide-up-quarter gutter-bottom category-tile-grid-item layout-flex'][24]//div[@class='shop-tile__content shop-tile__metrics-container']//div[@class='shop-tile__metric'][1]//span[@class='shop-tile__metric__value text-bold'][1])"
DIODE BRAND
diodebrand.us
Art and photography
Private listing #572131
URL Hidden
Gifts and collectibles
如果要在控制台上尝试,请检查revenue的xpath:
revenue_xpath
输出:
24
24
24
24
24
24
24
['$1,150USD', '$3,000USD', '$2,500USD', '$1,000USD', '$2,300USD']
['www.cosmicdetail.co.uk', 'prestige-timepiece.com', 'gomommyboutique.com', 'sunnysx.com', 'squishywishy.com']
['Automotive', 'Fashion and apparel', 'Toys and games', 'Fashion and apparel', 'Gifts and collectibles']
['$56', '$961', '$70', '$1.3K', '$403']
['111', '7.5K', '454', '2.8K', '2.6K']
['$50', '$1.0K', '$700', '$500', '$100']
['$1.8K', '', '', '', '']
"normalize-space(//div[@class='grid grid--equal-height']//div[@class='grid__item grid__item--tablet-up-third grid__item--desktop-up-quarter grid__item--wide-up-quarter gutter-bottom category-tile-grid-item layout-flex'][24]//div[@class='shop-tile__content shop-tile__metrics-container']//div[@class='shop-tile__metric'][1]//span[@class='shop-tile__metric__value text-bold'][1])"
DIODE BRAND
diodebrand.us
Art and photography
Private listing #572131
URL Hidden
Gifts and collectibles
我建议您使用特定的Xpath来提取数据。我检查了您的代码,Xpath有点错误。我已经重新配置了你的代码。我把类名赋给了一个变量。之后,我将它们结合起来创建了一个精确的xpath。在下面的代码示例中;我创建了7个列表,分别包含价格、名称、网站类型、收入、流量、利润和库存价值
输出:
24
24
24
24
24
24
24
['$1,150USD', '$3,000USD', '$2,500USD', '$1,000USD', '$2,300USD']
['www.cosmicdetail.co.uk', 'prestige-timepiece.com', 'gomommyboutique.com', 'sunnysx.com', 'squishywishy.com']
['Automotive', 'Fashion and apparel', 'Toys and games', 'Fashion and apparel', 'Gifts and collectibles']
['$56', '$961', '$70', '$1.3K', '$403']
['111', '7.5K', '454', '2.8K', '2.6K']
['$50', '$1.0K', '$700', '$500', '$100']
['$1.8K', '', '', '', '']
"normalize-space(//div[@class='grid grid--equal-height']//div[@class='grid__item grid__item--tablet-up-third grid__item--desktop-up-quarter grid__item--wide-up-quarter gutter-bottom category-tile-grid-item layout-flex'][24]//div[@class='shop-tile__content shop-tile__metrics-container']//div[@class='shop-tile__metric'][1]//span[@class='shop-tile__metric__value text-bold'][1])"
DIODE BRAND
diodebrand.us
Art and photography
Private listing #572131
URL Hidden
Gifts and collectibles
检查清单:
print(price_list[:5])
print(name_list[:5])
print(website_type_list[:5])
print(revenue_list[:5])
print(traffic_list[:5])
print(profit_list[:5])
print(inventory_value_list[:5])
输出:
24
24
24
24
24
24
24
['$1,150USD', '$3,000USD', '$2,500USD', '$1,000USD', '$2,300USD']
['www.cosmicdetail.co.uk', 'prestige-timepiece.com', 'gomommyboutique.com', 'sunnysx.com', 'squishywishy.com']
['Automotive', 'Fashion and apparel', 'Toys and games', 'Fashion and apparel', 'Gifts and collectibles']
['$56', '$961', '$70', '$1.3K', '$403']
['111', '7.5K', '454', '2.8K', '2.6K']
['$50', '$1.0K', '$700', '$500', '$100']
['$1.8K', '', '', '', '']
"normalize-space(//div[@class='grid grid--equal-height']//div[@class='grid__item grid__item--tablet-up-third grid__item--desktop-up-quarter grid__item--wide-up-quarter gutter-bottom category-tile-grid-item layout-flex'][24]//div[@class='shop-tile__content shop-tile__metrics-container']//div[@class='shop-tile__metric'][1]//span[@class='shop-tile__metric__value text-bold'][1])"
DIODE BRAND
diodebrand.us
Art and photography
Private listing #572131
URL Hidden
Gifts and collectibles
如果要在控制台上尝试,请检查revenue的xpath:
revenue_xpath
输出:
24
24
24
24
24
24
24
['$1,150USD', '$3,000USD', '$2,500USD', '$1,000USD', '$2,300USD']
['www.cosmicdetail.co.uk', 'prestige-timepiece.com', 'gomommyboutique.com', 'sunnysx.com', 'squishywishy.com']
['Automotive', 'Fashion and apparel', 'Toys and games', 'Fashion and apparel', 'Gifts and collectibles']
['$56', '$961', '$70', '$1.3K', '$403']
['111', '7.5K', '454', '2.8K', '2.6K']
['$50', '$1.0K', '$700', '$500', '$100']
['$1.8K', '', '', '', '']
"normalize-space(//div[@class='grid grid--equal-height']//div[@class='grid__item grid__item--tablet-up-third grid__item--desktop-up-quarter grid__item--wide-up-quarter gutter-bottom category-tile-grid-item layout-flex'][24]//div[@class='shop-tile__content shop-tile__metrics-container']//div[@class='shop-tile__metric'][1]//span[@class='shop-tile__metric__value text-bold'][1])"
DIODE BRAND
diodebrand.us
Art and photography
Private listing #572131
URL Hidden
Gifts and collectibles
要单独获取段落,您可以执行以下操作:
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Target listing page (Shopify Exchange search results).
weblink = 'https://exchange.shopify.com/shops?sale_price=1000&shopify_shop_created_at=6m%2C0m'

def grabData(url):
    """Print title, shop URL and category for every listing tile on *url*.

    A missing field prints as an empty string instead of raising
    AttributeError (the original crashed when ``find`` returned None for
    a tile lacking one of the three elements).
    """
    res = urlopen(url)
    soup = BeautifulSoup(res, "html.parser")

    def _text(node, cls):
        # Text of the first descendant with class *cls*, or '' if absent.
        found = node.find(class_=cls)
        return found.text if found is not None else ''

    for items in soup.find_all(class_="category-tile-grid-item"):
        title = _text(items, "shop-tile__title")
        name = _text(items, "shop-tile__url")
        category = _text(items, "shop-tile__category")
        print("{}\n{}\n{}\n".format(title, name, category))

if __name__ == '__main__':
    grabData(weblink)
输出:
24
24
24
24
24
24
24
['$1,150USD', '$3,000USD', '$2,500USD', '$1,000USD', '$2,300USD']
['www.cosmicdetail.co.uk', 'prestige-timepiece.com', 'gomommyboutique.com', 'sunnysx.com', 'squishywishy.com']
['Automotive', 'Fashion and apparel', 'Toys and games', 'Fashion and apparel', 'Gifts and collectibles']
['$56', '$961', '$70', '$1.3K', '$403']
['111', '7.5K', '454', '2.8K', '2.6K']
['$50', '$1.0K', '$700', '$500', '$100']
['$1.8K', '', '', '', '']
"normalize-space(//div[@class='grid grid--equal-height']//div[@class='grid__item grid__item--tablet-up-third grid__item--desktop-up-quarter grid__item--wide-up-quarter gutter-bottom category-tile-grid-item layout-flex'][24]//div[@class='shop-tile__content shop-tile__metrics-container']//div[@class='shop-tile__metric'][1]//span[@class='shop-tile__metric__value text-bold'][1])"
DIODE BRAND
diodebrand.us
Art and photography
Private listing #572131
URL Hidden
Gifts and collectibles
要单独获取段落,您可以执行以下操作:
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Shopify Exchange search-results page to scrape.
weblink = 'https://exchange.shopify.com/shops?sale_price=1000&shopify_shop_created_at=6m%2C0m'

def grabData(url):
    """For each listing tile on *url*, print its title, shop URL and
    category, one per line, with a blank line between tiles."""
    page = BeautifulSoup(urlopen(url), "html.parser")
    for tile in page.find_all(class_="category-tile-grid-item"):
        # Pull the three labelled fields out of this tile in order.
        fields = [tile.find(class_=css).text
                  for css in ("shop-tile__title", "shop-tile__url", "shop-tile__category")]
        print("{}\n{}\n{}\n".format(*fields))

if __name__ == '__main__':
    grabData(weblink)
输出:
24
24
24
24
24
24
24
['$1,150USD', '$3,000USD', '$2,500USD', '$1,000USD', '$2,300USD']
['www.cosmicdetail.co.uk', 'prestige-timepiece.com', 'gomommyboutique.com', 'sunnysx.com', 'squishywishy.com']
['Automotive', 'Fashion and apparel', 'Toys and games', 'Fashion and apparel', 'Gifts and collectibles']
['$56', '$961', '$70', '$1.3K', '$403']
['111', '7.5K', '454', '2.8K', '2.6K']
['$50', '$1.0K', '$700', '$500', '$100']
['$1.8K', '', '', '', '']
"normalize-space(//div[@class='grid grid--equal-height']//div[@class='grid__item grid__item--tablet-up-third grid__item--desktop-up-quarter grid__item--wide-up-quarter gutter-bottom category-tile-grid-item layout-flex'][24]//div[@class='shop-tile__content shop-tile__metrics-container']//div[@class='shop-tile__metric'][1]//span[@class='shop-tile__metric__value text-bold'][1])"
DIODE BRAND
diodebrand.us
Art and photography
Private listing #572131
URL Hidden
Gifts and collectibles
非常感谢你这么做。我决定以这种方式实施它,并对其进行扩展。我的问题是,例如,我如何获得收入而不是流量?为什么它选择流量而不是收入,因为收入在你申报的方式中是第一位的。非常感谢你!不客气!我再次更新了代码。我还添加了所有的表格参数和网站类型。再次非常感谢!很抱歉打扰您,您有什么办法可以帮助我吗:非常感谢您的帮助。我决定以这种方式实施它,并对其进行扩展。我的问题是,例如,我如何获得收入而不是流量?为什么它选择流量而不是收入,因为收入在你申报的方式中是第一位的。非常感谢你!不客气!我再次更新了代码。我还添加了所有的表格参数和网站类型。再次非常感谢!很抱歉打扰您-您有什么办法可以在这里帮助我: