在 Python 中使用 Selenium 抓取动态表格_Python_Selenium_Dynamic Tables

在 Python 中使用 Selenium 抓取动态表格

python selenium

python中使用selenium的动态表刮取,python,selenium,dynamic-tables,Python,Selenium,Dynamic Tables,我正在尝试访问此网站上的数据：。到目前为止，我的代码在两个下拉菜单中循环，但表是动态命名的，从中获取数据时遇到了困难。我试图通过“output_data_table”上面的类访问数据，但遇到了问题 # importing libraries from selenium import webdriver import time from selenium.webdriver.support.ui import Select import lxml.html driver = webdriv

我正在尝试访问此网站上的数据：http://surge.srcc.lsu.edu/s1.html 。到目前为止，我的代码可以循环遍历两个下拉菜单，但表格是动态命名的，我在从中获取数据时遇到了困难。我尝试通过“output_data_table”上层的类来访问数据，但遇到了问题。

# importing libraries
from selenium import webdriver
import time
from selenium.webdriver.support.ui import Select
import lxml.html



# Start a Firefox session and load the page that is going to be scraped.
start_url = "http://surge.srcc.lsu.edu/s1.html"
driver = webdriver.Firefox()
driver.get(start_url)

# definition for switching frames
def frame_switch(css_selector):
    """Switch the driver's context into the frame matched by ``css_selector``.

    Operates on the module-level ``driver``. The frame element is located with
    ``find_element(By.CSS_SELECTOR, ...)`` because the ``find_element_by_*``
    helpers were deprecated in Selenium 4 and removed in later 4.x releases.

    Args:
        css_selector: CSS selector identifying the frame element to enter.
    """
    # Imported locally so the function does not rely on a new file-level import.
    from selenium.webdriver.common.by import By

    frame = driver.find_element(By.CSS_SELECTOR, css_selector)
    driver.switch_to.frame(frame)

# Enter the element matched by the CSS selector "iframe" before locating the
# drop-downs below.
frame_switch("iframe")

# NOTE(review): html_source is not used anywhere else in this snippet.
html_source = driver.page_source
# NOTE(review): the find_element_by_* helpers are deprecated in Selenium 4 --
# confirm which Selenium version this is meant to run on.
nameSelect = Select(driver.find_element_by_xpath('//select[@id="storm_name"]'))
stormCount = len(nameSelect.options)

# Both loops start at index 1, so option 0 of each drop-down is skipped
# (presumably a placeholder entry -- TODO confirm).
for i in range(1, stormCount):
    print("starting loop on option storm " + nameSelect.options[i].text)
    nameSelect.select_by_index(i)
    # Fixed 3-second pause after choosing a storm; the year drop-down is
    # looked up again afterwards.
    time.sleep(3)


    yearSelect = Select(driver.find_element_by_xpath('//select[@id="year"]'))
    yearCount = len(yearSelect.options)
    for j in range(1, yearCount):
        print("starting loop on option year " + yearSelect.options[j].text)
        yearSelect.select_by_index(j)


        # NOTE(review): page_source is read immediately after selecting the
        # year, with no wait. If the table is filled in by JavaScript it may
        # not be present in this HTML yet.
        root = lxml.html.fromstring(driver.page_source)

        # Disabled alternative kept from the original:
        #table=driver.find_element_by_id("output_data_table")

        for row in root.xpath('.//table[@id="output_data_table"]//tr'):
            # needs dynamic table name
            # Text nodes of every <td> in this row, in document order.
            cells = row.xpath('.//td/text()')
            # NOTE(review): cells[4] is never read ('4th' maps to cells[5]) --
            # confirm whether skipping it is intentional. A row yielding fewer
            # than 9 <td> text nodes raises IndexError here.
            dict_value = {'0th': cells[0],
                  '1st': cells[1],
                  '2nd': cells[2],
                  '3rd': cells[3],
                  '4th': cells[5],
                  '5th': cells[6],
                  '6th': cells[7],
                  '7th': cells[8]}
            print(dict_value)

似乎在调用“root = lxml.html.fromstring(driver.page_source)”之前必须先等待一段时间。

如果不等待，得到的 HTML 源代码中就不会包含由 JavaScript 生成的表格。请在该调用之前加上“time.sleep(10)”。

这样似乎就能解决问题。我用 BeautifulSoup 写了一个简单的示例：

from selenium import webdriver
import time, re
from selenium.webdriver.support.ui import Select
import lxml.html
from bs4 import BeautifulSoup

# Start a Firefox session and load the page that is going to be scraped.
start_url = "http://surge.srcc.lsu.edu/s1.html"
driver = webdriver.Firefox()
driver.get(start_url)

# definition for switching frames
def frame_switch(css_selector):
    """Switch the driver's context into the frame matched by ``css_selector``.

    Operates on the module-level ``driver``. The frame element is located with
    ``find_element(By.CSS_SELECTOR, ...)`` because the ``find_element_by_*``
    helpers were deprecated in Selenium 4 and removed in later 4.x releases.

    Args:
        css_selector: CSS selector identifying the frame element to enter.
    """
    # Imported locally so the function does not rely on a new file-level import.
    from selenium.webdriver.common.by import By

    frame = driver.find_element(By.CSS_SELECTOR, css_selector)
    driver.switch_to.frame(frame)

# Imported here so the snippet keeps working without editing the import list
# at the top; the find_element_by_* helpers used previously are gone from
# current Selenium 4 releases.
from selenium.webdriver.common.by import By

# Enter the element matched by the CSS selector "iframe" before locating the
# drop-downs below.
frame_switch("iframe")

nameSelect = Select(driver.find_element(By.XPATH, '//select[@id="storm_name"]'))
stormCount = len(nameSelect.options)

# Both loops start at index 1, so option 0 of each drop-down is skipped
# (presumably a placeholder entry -- TODO confirm).
for i in range(1, stormCount):
    print("starting loop on option storm " + nameSelect.options[i].text)
    nameSelect.select_by_index(i)
    # Fixed pause after choosing a storm; the year drop-down is looked up
    # again afterwards (presumably it is refreshed per storm -- TODO confirm).
    time.sleep(3)

    yearSelect = Select(driver.find_element(By.XPATH, '//select[@id="year"]'))
    yearCount = len(yearSelect.options)
    for j in range(1, yearCount):
        print("starting loop on option year " + yearSelect.options[j].text)
        yearSelect.select_by_index(j)

        # Wait before reading page_source: without the pause the HTML is
        # captured before the JavaScript-generated table is present.
        time.sleep(10)

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # get the needed table body: the second <tbody> that carries a class
        # attribute. Looked up once and reused for both outputs below.
        get_table = soup.find_all("tbody", {"class": re.compile(".*")})[1]
        print(get_table.prettify())

        # print out the text of each table row
        for row in get_table.find_all("tr"):
            print(row.getText())

似乎在调用“root = lxml.html.fromstring(driver.page_source)”之前必须先等待一段时间。

如果不等待，得到的 HTML 源代码中就不会包含由 JavaScript 生成的表格。请在该调用之前加上“time.sleep(10)”。

这样似乎就能解决问题。我用 BeautifulSoup 写了一个简单的示例：

from selenium import webdriver
import time, re
from selenium.webdriver.support.ui import Select
import lxml.html
from bs4 import BeautifulSoup

# Start a Firefox session and load the page that is going to be scraped.
start_url = "http://surge.srcc.lsu.edu/s1.html"
driver = webdriver.Firefox()
driver.get(start_url)

# definition for switching frames
def frame_switch(css_selector):
    """Switch the driver's context into the frame matched by ``css_selector``.

    Operates on the module-level ``driver``. The frame element is located with
    ``find_element(By.CSS_SELECTOR, ...)`` because the ``find_element_by_*``
    helpers were deprecated in Selenium 4 and removed in later 4.x releases.

    Args:
        css_selector: CSS selector identifying the frame element to enter.
    """
    # Imported locally so the function does not rely on a new file-level import.
    from selenium.webdriver.common.by import By

    frame = driver.find_element(By.CSS_SELECTOR, css_selector)
    driver.switch_to.frame(frame)

# Imported here so the snippet keeps working without editing the import list
# at the top; the find_element_by_* helpers used previously are gone from
# current Selenium 4 releases.
from selenium.webdriver.common.by import By

# Enter the element matched by the CSS selector "iframe" before locating the
# drop-downs below.
frame_switch("iframe")

nameSelect = Select(driver.find_element(By.XPATH, '//select[@id="storm_name"]'))
stormCount = len(nameSelect.options)

# Both loops start at index 1, so option 0 of each drop-down is skipped
# (presumably a placeholder entry -- TODO confirm).
for i in range(1, stormCount):
    print("starting loop on option storm " + nameSelect.options[i].text)
    nameSelect.select_by_index(i)
    # Fixed pause after choosing a storm; the year drop-down is looked up
    # again afterwards (presumably it is refreshed per storm -- TODO confirm).
    time.sleep(3)

    yearSelect = Select(driver.find_element(By.XPATH, '//select[@id="year"]'))
    yearCount = len(yearSelect.options)
    for j in range(1, yearCount):
        print("starting loop on option year " + yearSelect.options[j].text)
        yearSelect.select_by_index(j)

        # Wait before reading page_source: without the pause the HTML is
        # captured before the JavaScript-generated table is present.
        time.sleep(10)

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # get the needed table body: the second <tbody> that carries a class
        # attribute. Looked up once and reused for both outputs below.
        get_table = soup.find_all("tbody", {"class": re.compile(".*")})[1]
        print(get_table.prettify())

        # print out the text of each table row
        for row in get_table.find_all("tr"):
            print(row.getText())

现在的问题到底是什么？谢谢。似乎在调用“root = lxml.html.fromstring(driver.page_source)”之前必须先等待。如果不等待，得到的 HTML 源代码中就不会包含由 JavaScript 生成的表格。请在它前面加上“time.sleep(10)”。现在的问题到底是什么？谢谢。似乎在调用“root = lxml.html.fromstring(driver.page_source)”之前必须先等待。如果不等待，得到的 HTML 源代码中就不会包含由 JavaScript 生成的表格。请在它前面加上“time.sleep(10)”。