如何使用python将列表拆分为表格式_Python_Selenium Webdriver_Web Scraping_Beautifulsoup

如何使用python将列表拆分为表格式

python selenium-webdriver web-scraping

如何使用python将列表拆分为表格式,python,selenium-webdriver,web-scraping,beautifulsoup,Python,Selenium Webdriver,Web Scraping,Beautifulsoup,我从网站上的多个表格中提取数据，并将其转换为列表格式，plz帮助我将列表拆分为表格格式。我尝试过split（），但没有用 from bs4 import BeautifulSoup from selenium import webdriver import time class ProductTracker: def __init__(self, url): self.url = url self.user_agent = {'User-Agent':

我从网站上的多个表格中提取了数据，并把它们转换成了列表格式。请帮我把列表拆分成表格格式。我尝试过 split()，但没有用。

from bs4 import BeautifulSoup
from selenium import webdriver
import time

class ProductTracker:
    """Scrape section titles, column headers and cell values from a page.

    The page is loaded once in Chrome when the object is created; the
    ``product_*`` methods then read from the parsed HTML in ``self.soup``.
    Each method returns one flat list covering every matching element on
    the page.
    """

    def __init__(self, url):
        """Load *url* in Chrome and parse the resulting page source.

        Args:
            url: Address of the page to scrape.
        """
        self.url = url
        # Not sent by this class (the page is fetched through Selenium);
        # kept so existing readers of the attribute keep working.
        self.user_agent = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'}
        driver = webdriver.Chrome()
        try:
            driver.get(url)
            # Fixed pause before reading the source -- presumably to let the
            # page finish rendering; TODO confirm an explicit wait is not
            # more appropriate.
            time.sleep(5)
            # The attribute spelling ("responce") is kept for compatibility.
            self.responce = driver.page_source
        finally:
            # quit() ends the whole WebDriver session even when loading
            # fails; close() would only close the current window.
            driver.quit()
        self.soup = BeautifulSoup(self.responce, 'lxml')

    def product_header(self):
        """Return the text of every ``h3.search-table-view__heading`` element.

        Returns:
            A list of heading strings, empty when nothing matches.
        """
        headings = self.soup.find_all('h3', {'class': 'search-table-view__heading'})
        return [heading.text for heading in headings]

    def product_tableheader(self):
        """Return the label of every ``th`` whose class starts with
        ``search-table-view__cell``.

        The label is read from ``div > a > span`` when that chain exists,
        otherwise from the ``div`` itself. Cells without a ``div`` are
        skipped.

        Returns:
            A list of stripped label strings, empty when nothing matches.
        """
        cells = self.soup.find_all(
            'th',
            class_=lambda value: value and value.startswith("search-table-view__cell"))
        labels = []
        for cell in cells:
            try:
                label = cell.div.a.span.text
            except AttributeError:
                # A missing tag in the chain comes back as None, so the next
                # attribute access raises AttributeError: fall back to the div.
                try:
                    label = cell.div.text
                except AttributeError:
                    continue
            labels.append(label.strip())
        return labels

    def product_tablevalues(self):
        """Return the text of every data cell on the page as one flat list.

        Rows are the ``tr`` elements whose class starts with
        ``search-table-view__web-parent-table-row``; cells are their ``td``
        elements whose class starts with
        ``search-table-view__web-parent-table-row-cell``.

        Returns:
            A list of cell strings with every run of whitespace collapsed
            to a single space, empty when nothing matches.
        """
        rows = self.soup.find_all('tr', class_=lambda value: value and value.startswith(
            "search-table-view__web-parent-table-row"))
        values = []
        for row in rows:
            cells = row.find_all('td', class_=lambda value: value and value.startswith(
                "search-table-view__web-parent-table-row-cell"))
            # split()/join normalises any amount of whitespace, instead of
            # depending on the exact indentation width used in the markup.
            values.extend(" ".join(cell.text.split()) for cell in cells)
        return values
# Load the radial ball bearings category page once, then print the three
# flat lists (section titles, column headers, cell values) one after another.
material = ProductTracker(
    url="https://www.grainger.com/category/power-transmission/bearings/ball-bearings/radial-ball-bearings"
)
for extract in (
    material.product_header,
    material.product_tableheader,
    material.product_tablevalues,
):
    print(extract())

以下是我的输出，我已从输出中删除了一些数据：

['NTN Single Row Radial Ball Bearings, Metric Series', 'BL Single Row Radial Ball Bearings, Metric Series', 'BL Single Row Radial Ball Bearings, Inch Series', 'DAYTON Single Row Radial Ball Bearings, Metric Series',........'TORRINGTON BEARINGS Single Row Radial Ball Bearings, Metric Series']
['Bore Dia.', 'Outside Dia.', 'Width', 'Seal/Shield Type', 'Item #', 'Price', 'Bore Dia.', 'Outside Dia.', 'Width', 'Seal/Shield Type', 'Item #', 'Price', 'Bore Dia.', 'Outside Dia.', 'Width', 'Seal/Shield Type', 'Item #', 'Price']
['4 mm', '13 mm', '5 mm', 'Double Shielded', '5U557', 'Regular Price $15.22 / each', '5 mm', '16 mm', '5 mm', 'Double Shielded', '5U592', 'Regular Price $10.10 / each', '6 mm', '19 mm', '6 mm', 'Select Seal/Shield Type', 'Multiple Items', '$9.10 - $14.65', '7 mm', '19 mm', '6 mm', 'Select Seal/Shield Type', 'Multiple Items', '$5.75 - $9.90', '7 mm', '22 mm', '7 mm', 'Select Seal/Shield Type', 'Multiple Items', '$8.50 - $15.05', '8 mm', '22 mm', '7 mm', 'Select Seal/Shield Type', 'Multiple Items', '$8.15, '9 mm', '24 mm', '7 mm', 'Double Shielded', '5U530', 'Regular Price $8.65 / each', '0.4375 in', '1.125 in', .............'0.375 in', 'Select Seal/Shield Type', 'Multiple Items', '$13.25 - $13.85', '0.4375 in', '0.9062 in', '0.3125 in', 'Select Seal/Shield Type', 'Multiple Items', '$11.25 - $11.80']

我需要把这些列表拆分开，并整理成类似表格的形式，例如：

['NTN Single Row Radial Ball Bearings, Metric Series']

['Bore Dia.', 'Outside Dia.', 'Width', 'Seal/Shield Type', 'Item #', 'Price']
['4 mm','3 mm','5 mm','Double Shielded','5U557','Regular Price$15.22 / each']
['5 mm','16 mm','5 mm','Double Shielded','5U592','Regular Price$10.10 / each']

['BL Single Row Radial Ball Bearings, Metric Series']

['Bore Dia.', 'Outside Dia.', 'Width', 'Seal/Shield Type', 'Item #', 'Price']
['10 mm','15 mm','12 mm','Double Shielded','5U559','Regular Price$16.22 / each']
['54 mm','21 mm','9 mm','Double Shielded','5U598','Regular Price$10.10 / each']

也许是这样的

import re
from bs4 import BeautifulSoup as bs

# Parse the page currently loaded in the Selenium driver.
# NOTE(review): `driver` is not created in this snippet; it must already
# exist and have the target page loaded.
soup = bs(driver.page_source, 'lxml')

# Collapses any run of whitespace (newlines included) to a single space.
# Written as a raw string so that `\s` is a regex escape, not an invalid
# Python string escape, and compiled once instead of once per cell.
whitespace = re.compile(r'\s+')

for section in soup.select('.search-table-view__web-parent'):
    title = [section.select_one('.search-table-view__heading').text]
    header_cells = section.select_one(' .search-table-view__web-parent-table-head-row').select('th')
    # [:-1] drops the last column -- presumably a non-data column; verify
    # against the page markup.
    header_row = [cell.text.strip() for cell in header_cells][:-1]

    print(title)
    print()
    print(header_row)

    for row in section.select('.search-table-view__web-parent-table-row'):
        print([whitespace.sub(' ', cell.text.strip()) for cell in row.select('td')][:-1])

    print()

由于需要按区块（section）循环处理，下面重写了代码，加入了一个用于遍历各个区块的方法调用：

from bs4 import BeautifulSoup
from selenium import webdriver
import time, re

class ProductTracker:
    """Scrape a page section by section and print each section as a table.

    The page is loaded once in Chrome when the object is created.
    ``loop_sections`` walks every ``.search-table-view__web-parent`` element
    and prints its title, header row and data rows.
    """

    # Collapses any run of whitespace (newlines included) to a single space.
    # Raw string: `\s` is a regex escape, not a Python string escape.
    _WHITESPACE = re.compile(r'\s+')

    def __init__(self, url):
        """Load *url* in Chrome and parse the resulting page source.

        Args:
            url: Address of the page to scrape.
        """
        self.url = url
        # Not sent by this class (the page is fetched through Selenium);
        # kept so existing readers of the attribute keep working.
        self.user_agent = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'}
        driver = webdriver.Chrome()
        try:
            driver.get(url)
            # Fixed pause before reading the source -- presumably to let the
            # page finish rendering; TODO confirm.
            time.sleep(5)
            self.response = driver.page_source
        finally:
            # quit() ends the whole WebDriver session even when loading
            # fails; close() would only close the current window.
            driver.quit()
        self.soup = BeautifulSoup(self.response, 'lxml')

    def loop_sections(self):
        """Print title, header row and data rows for every section.

        Any exception raised while walking the sections is reported as
        ``error <exception>`` on stdout and ends the loop.
        """
        try:
            for section in self.soup.select('.search-table-view__web-parent'):
                print(self.product_header(section))
                print()
                print(self.product_tableheader(section))
                # Prints the rows itself; the returned list is not needed here.
                self.product_tablevalues(section)
                print()
        except Exception as e:
            print("error", e)
            return

    def product_header(self, section):
        """Return the section heading wrapped in a one-element list.

        Args:
            section: Parsed element that contains the table.

        Returns:
            ``[heading_text]``, or ``None`` (after printing the error) when
            the section has no ``.search-table-view__heading`` element.
        """
        try:
            return [section.select_one('.search-table-view__heading').text]
        except AttributeError as e:
            # select_one() returned None: the heading is missing.
            print("error", e)
            return None

    def product_tableheader(self, section):
        """Return the stripped text of the section's header cells.

        The last ``th`` is dropped -- presumably a non-data column; verify
        against the page markup.

        Args:
            section: Parsed element that contains the table.

        Returns:
            A list of header strings, or ``None`` (after printing the error)
            when the header row is missing.
        """
        try:
            head_row = section.select_one(' .search-table-view__web-parent-table-head-row')
            return [cell.text.strip() for cell in head_row.select('th')][:-1]
        except AttributeError as e:
            print("error", e)
            return None

    def product_tablevalues(self, section):
        """Print every data row of the section and return the rows.

        Each row is a list of cell strings with whitespace runs collapsed to
        one space; the last ``td`` of every row is dropped, matching the
        header handling.

        Args:
            section: Parsed element that contains the table.

        Returns:
            A list of rows (each a list of strings) so callers can reuse the
            data, or ``None`` (after printing the error) on failure.
        """
        rows = []
        try:
            for row in section.select('.search-table-view__web-parent-table-row'):
                cells = [self._WHITESPACE.sub(' ', cell.text.strip()) for cell in row.select('td')][:-1]
                print(cells)
                rows.append(cells)
        except AttributeError as e:
            print("error", e)
            return None
        return rows
        
# Load the radial ball bearings category page, then print every section's
# title, header row and data rows.
material = ProductTracker(
    url="https://www.grainger.com/category/power-transmission/bearings/ball-bearings/radial-ball-bearings"
)
material.loop_sections()

也许是这样的

import re
from bs4 import BeautifulSoup as bs

# Parse the page currently loaded in the Selenium driver.
# NOTE(review): `driver` is not created in this snippet; it must already
# exist and have the target page loaded.
soup = bs(driver.page_source, 'lxml')

# Collapses any run of whitespace (newlines included) to a single space.
# Written as a raw string so that `\s` is a regex escape, not an invalid
# Python string escape, and compiled once instead of once per cell.
whitespace = re.compile(r'\s+')

for section in soup.select('.search-table-view__web-parent'):
    title = [section.select_one('.search-table-view__heading').text]
    header_cells = section.select_one(' .search-table-view__web-parent-table-head-row').select('th')
    # [:-1] drops the last column -- presumably a non-data column; verify
    # against the page markup.
    header_row = [cell.text.strip() for cell in header_cells][:-1]

    print(title)
    print()
    print(header_row)

    for row in section.select('.search-table-view__web-parent-table-row'):
        print([whitespace.sub(' ', cell.text.strip()) for cell in row.select('td')][:-1])

    print()

由于需要按区块（section）循环处理，下面重写了代码，加入了一个用于遍历各个区块的方法调用：

from bs4 import BeautifulSoup
from selenium import webdriver
import time, re

class ProductTracker:
    """Scrape a page section by section and print each section as a table.

    The page is loaded once in Chrome when the object is created.
    ``loop_sections`` walks every ``.search-table-view__web-parent`` element
    and prints its title, header row and data rows.
    """

    # Collapses any run of whitespace (newlines included) to a single space.
    # Raw string: `\s` is a regex escape, not a Python string escape.
    _WHITESPACE = re.compile(r'\s+')

    def __init__(self, url):
        """Load *url* in Chrome and parse the resulting page source.

        Args:
            url: Address of the page to scrape.
        """
        self.url = url
        # Not sent by this class (the page is fetched through Selenium);
        # kept so existing readers of the attribute keep working.
        self.user_agent = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'}
        driver = webdriver.Chrome()
        try:
            driver.get(url)
            # Fixed pause before reading the source -- presumably to let the
            # page finish rendering; TODO confirm.
            time.sleep(5)
            self.response = driver.page_source
        finally:
            # quit() ends the whole WebDriver session even when loading
            # fails; close() would only close the current window.
            driver.quit()
        self.soup = BeautifulSoup(self.response, 'lxml')

    def loop_sections(self):
        """Print title, header row and data rows for every section.

        Any exception raised while walking the sections is reported as
        ``error <exception>`` on stdout and ends the loop.
        """
        try:
            for section in self.soup.select('.search-table-view__web-parent'):
                print(self.product_header(section))
                print()
                print(self.product_tableheader(section))
                # Prints the rows itself; the returned list is not needed here.
                self.product_tablevalues(section)
                print()
        except Exception as e:
            print("error", e)
            return

    def product_header(self, section):
        """Return the section heading wrapped in a one-element list.

        Args:
            section: Parsed element that contains the table.

        Returns:
            ``[heading_text]``, or ``None`` (after printing the error) when
            the section has no ``.search-table-view__heading`` element.
        """
        try:
            return [section.select_one('.search-table-view__heading').text]
        except AttributeError as e:
            # select_one() returned None: the heading is missing.
            print("error", e)
            return None

    def product_tableheader(self, section):
        """Return the stripped text of the section's header cells.

        The last ``th`` is dropped -- presumably a non-data column; verify
        against the page markup.

        Args:
            section: Parsed element that contains the table.

        Returns:
            A list of header strings, or ``None`` (after printing the error)
            when the header row is missing.
        """
        try:
            head_row = section.select_one(' .search-table-view__web-parent-table-head-row')
            return [cell.text.strip() for cell in head_row.select('th')][:-1]
        except AttributeError as e:
            print("error", e)
            return None

    def product_tablevalues(self, section):
        """Print every data row of the section and return the rows.

        Each row is a list of cell strings with whitespace runs collapsed to
        one space; the last ``td`` of every row is dropped, matching the
        header handling.

        Args:
            section: Parsed element that contains the table.

        Returns:
            A list of rows (each a list of strings) so callers can reuse the
            data, or ``None`` (after printing the error) on failure.
        """
        rows = []
        try:
            for row in section.select('.search-table-view__web-parent-table-row'):
                cells = [self._WHITESPACE.sub(' ', cell.text.strip()) for cell in row.select('td')][:-1]
                print(cells)
                rows.append(cells)
        except AttributeError as e:
            print("error", e)
            return None
        return rows
        
# Load the radial ball bearings category page, then print every section's
# title, header row and data rows.
material = ProductTracker(
    url="https://www.grainger.com/category/power-transmission/bearings/ball-bearings/radial-ball-bearings"
)
material.loop_sections()

链接页面的底部并不是表格。请以实际表格的形式展示所需的输出；如果可能，请附上 url，否则请附上相关的 html。 @QHarr 我需要的输出已经在示例中给出了，我尝试了很多次，但还是无法把它拆分开。 很抱歉，QHarr 先生，您的输出与我的示例输出不同，您能帮帮我吗？ 修订后的输出是否更接近要求了？ 非常感谢，QHarr 先生。 我尝试把这个输出写入 excel，但出现了属性错误（AttributeError），您能帮帮我吗？