Python 网页抓取后追加 Excel 工作表_Python_Excel_Selenium_Beautifulsoup

Python 网页抓取后追加 Excel 工作表

python excel selenium

Python 网页抓取后追加 Excel 工作表,python,excel,selenium,beautifulsoup,Python,Excel,Selenium,Beautifulsoup,我已经成功地从网站上抓取了数据。我制作了一个excel文件，其中包含一种商品的结果。在抓取第二种商品的数据后，我无法将另一张表添加到同一excel文件中。任何帮助都将不胜感激。先谢谢你。这是我的代码：- from selenium import webdriver import time, re from selenium.webdriver.support.ui import Select from bs4 import BeautifulSoup import pandas as pd …

我已经成功地从网站上抓取了数据。我制作了一个excel文件，其中包含一种商品的结果。在抓取第二种商品的数据后，我无法将另一张表添加到同一excel文件中。任何帮助都将不胜感激。先谢谢你。这是我的代码：-

from selenium import webdriver 
import time, re
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
import time

# Question script: scrape the "Daily Variation" report for Rice and then for
# Wheat, writing each result to a sheet of the same Excel workbook.
# NOTE: only the "wheat" sheet survives, because the second ExcelWriter below
# opens the file in the default write mode and replaces the workbook.

# Start Chrome through a locally installed chromedriver binary.
chrome_path = r"C:\Users\user\Desktop\chromedriver_win32\chromedriver.exe"
driver = webdriver.Chrome(chrome_path)

# Open the report menu page.
driver.get("https://fcainfoweb.nic.in/Reports/Report_Menu_Web.aspx")

# `html_source` is captured here but not read again anywhere in this script.
html_source = driver.page_source
# `results` collects every scraped table; it is only appended to below.
results=[]

# --- first commodity: Rice ---

# Pick the report type radio button, then the "Daily Variation" option.
driver.find_element_by_xpath("""//*[@id="ctl00_MainContent_Rbl_Rpt_type_1"]""").click()
element_variation = driver.find_element_by_id ("ctl00_MainContent_Ddl_Rpt_Option1")
drp_variation = Select(element_variation)
drp_variation.select_by_visible_text("Daily Variation")

# Type the date range into the "from" and "to" fields.
driver.find_element_by_id("ctl00_MainContent_Txt_FrmDate").send_keys("01/05/2020")
driver.find_element_by_id("ctl00_MainContent_Txt_ToDate").send_keys("27/05/2020")

# Choose the commodity from the list.
element_commodity = driver.find_element_by_id ("ctl00_MainContent_Lst_Commodity")
drp_commodity = Select(element_commodity)
drp_commodity.select_by_visible_text("Rice")

# Submit the form to load the result page.
driver.find_element_by_xpath("""//*[@id="ctl00_MainContent_btn_getdata1"]""").click()

# `soup` is built but not used afterwards; pandas parses the page on its own.
soup = BeautifulSoup(driver.page_source, 'html.parser')
table = pd.read_html(driver.page_source)[2] # the report is the table at index 2 of the parsed list
print(len(table))
print(table)

results.append(table)
driver.back()
time.sleep(1)
# Create the workbook and write the Rice table to it.
with pd.ExcelWriter(r'C:\Users\user\Desktop\python.xlsx') as writer:
 table.to_excel(writer, sheet_name = "rice", index=False) # Rice results on sheet named rice
 # NOTE(review): redundant - leaving the `with` block already saves the workbook.
 writer.save() 

# --- second commodity: Wheat (same steps as above) ---

driver.find_element_by_xpath("""//*[@id="btn_back"]""").click()
driver.find_element_by_xpath("""//*[@id="ctl00_MainContent_Rbl_Rpt_type_1"]""").click()
element_variation = driver.find_element_by_id ("ctl00_MainContent_Ddl_Rpt_Option1")
drp_variation = Select(element_variation)
drp_variation.select_by_visible_text("Daily Variation")

driver.find_element_by_id("ctl00_MainContent_Txt_FrmDate").send_keys("01/05/2020")
driver.find_element_by_id("ctl00_MainContent_Txt_ToDate").send_keys("27/05/2020")

element_commodity = driver.find_element_by_id ("ctl00_MainContent_Lst_Commodity")
drp_commodity = Select(element_commodity)
drp_commodity.select_by_visible_text("Wheat")

driver.find_element_by_xpath("""//*[@id="ctl00_MainContent_btn_getdata1"]""").click()

# `soup` is again unused; the table comes from pandas.
soup = BeautifulSoup(driver.page_source, 'html.parser')
table = pd.read_html(driver.page_source)[2] # the report is the table at index 2 of the parsed list
print(len(table))
print(table)

results.append(table)
driver.back()
time.sleep(1)

# No `mode` is given, so this writer opens the same path in the default write
# mode and replaces the workbook created above - the "rice" sheet is lost.
# Appending requires mode='a' (with the openpyxl engine).
with pd.ExcelWriter(r'C:\Users\user\Desktop\python.xlsx') as writer:
 table.to_excel(writer, sheet_name = "wheat", index=False) # Wheat results on sheet named wheat
 # NOTE(review): redundant - leaving the `with` block already saves the workbook.
 writer.save()

对于某些类型的文件，您可能必须将所有数据读取到内存中，添加新数据，然后将所有数据再次保存到文件中。对于其他一些文件，您必须使用“附加”模式

请参阅 `ExcelWriter` 的文档，它提供了选项 `mode="a"`，用于追加到现有文件：

# Create (or overwrite) the workbook and write the first sheet.
# Leaving the `with` block saves the file, so no explicit save call is needed.
with pd.ExcelWriter(r'C:\Users\user\Desktop\python.xlsx') as writer:
    table.to_excel(writer, sheet_name="rice", index=False)

# Reopen the same workbook in append mode to add a second sheet.
# Append mode is not supported by the xlsxwriter engine, so openpyxl is
# requested explicitly (it must be installed).
with pd.ExcelWriter(r'C:\Users\user\Desktop\python.xlsx', engine='openpyxl', mode='a') as writer:
    table.to_excel(writer, sheet_name="wheat", index=False)

或者，您可以在同一个 `with` 块中写入两个工作表，而无需使用追加模式：
# One writer can receive several sheets, so append mode is not needed here.
with pd.ExcelWriter(r'C:\Users\user\Desktop\python.xlsx') as writer:
    table.to_excel(writer, sheet_name="rice", index=False)
    table.to_excel(writer, sheet_name="wheat", index=False)
    # No explicit writer.save(): the workbook is saved when the `with` block exits.


顺便说一句：我发现追加（append）模式不适用于引擎 `xlsxwriter`，我不得不使用引擎 `openpyxl`（这也意味着需要用 `pip` 安装模块 `openpyxl`）。
我发现可用的引擎有问题

完整工作代码
from selenium import webdriver 
from selenium.webdriver.support.ui import Select
import pandas as pd
import time

# --- functions ---

def get_data(start_date, end_date, product):
    """Fetch the `Daily Variation` report for one commodity as a DataFrame.

    Drives the module-level `driver`, which must already be showing the
    report menu page. The function fills in the form, reads the result
    table, then clicks the page's back button so it can be called again
    for the next product.

    Args:
        start_date: Text typed into the "from" date field,
            e.g. '01/05/2020'.
        end_date: Text typed into the "to" date field, e.g. '27/05/2020'.
        product: Visible text of the commodity to select, e.g. 'Rice'.

    Returns:
        The DataFrame parsed from the table at index 2 of the result page.

    Raises:
        IndexError: If the result page contains fewer than three tables.
    """
    # Local imports keep this function self-contained.
    from io import StringIO

    # `find_element(By.ID, ...)` works on Selenium 3 and 4, whereas the
    # `find_element_by_id` shortcut was removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    # select `Variation Report`
    driver.find_element(By.ID, 'ctl00_MainContent_Rbl_Rpt_type_1').click()

    # select `Daily Variation`
    element_variation = driver.find_element(By.ID, 'ctl00_MainContent_Ddl_Rpt_Option1')
    Select(element_variation).select_by_visible_text('Daily Variation')

    # select `product` before the dates because the `end_date` field opens
    # a calendar which blocks the `product` list
    element_commodity = driver.find_element(By.ID, 'ctl00_MainContent_Lst_Commodity')
    Select(element_commodity).select_by_visible_text(product)

    # type `start_date` and `end_date`
    driver.find_element(By.ID, 'ctl00_MainContent_Txt_FrmDate').send_keys(start_date)
    driver.find_element(By.ID, 'ctl00_MainContent_Txt_ToDate').send_keys(end_date)

    # click button `Get Data`
    driver.find_element(By.ID, 'ctl00_MainContent_btn_getdata1').click()

    time.sleep(3)  # the result page sometimes needs time to load

    # Wrap the HTML in StringIO: passing a literal HTML string to
    # `read_html` is deprecated in recent pandas versions.
    tables = pd.read_html(StringIO(driver.page_source))

    # The report is the table at index 2 of the parsed list.
    if len(tables) < 3:
        raise IndexError(
            f"expected at least 3 tables on the result page for {product!r}, "
            f"found {len(tables)}"
        )
    table = tables[2]

    print(len(table))
    print(table)

    # go back to the report menu
    driver.find_element(By.ID, 'btn_back').click()

    time.sleep(3)  # the menu page sometimes needs time to load

    return table

# --- main ---

# Browser session shared with `get_data` through the module-level name `driver`.
driver = webdriver.Firefox()

try:
    driver.get('https://fcainfoweb.nic.in/Reports/Report_Menu_Web.aspx')

    start_date = '01/05/2020'
    end_date = '27/05/2020'

    for number, product in enumerate(('Rice', 'Wheat', 'Tomato', 'Sugar')):
        table = get_data(start_date, end_date, product)

        # The first product creates (or overwrites) the workbook; every
        # later product is appended to the existing file.
        mode = 'w' if number == 0 else 'a'

        # Append mode does not work with the `xlsxwriter` engine, so
        # `openpyxl` is requested explicitly.
        with pd.ExcelWriter('output.xlsx', engine='openpyxl', mode=mode) as writer:
            table.to_excel(writer, sheet_name=product, index=False)
finally:
    # Always close the browser, even when scraping or writing fails.
    driver.quit()

对于某些类型的文件，您可能必须将所有数据读取到内存中，添加新数据，然后将所有数据再次保存到文件中。对于其他一些文件，您必须使用“附加”模式
请参阅 `ExcelWriter` 的文档，它提供了选项 `mode="a"`，用于追加到现有文件：
# Create (or overwrite) the workbook and write the first sheet.
# Leaving the `with` block saves the file, so no explicit save call is needed.
with pd.ExcelWriter(r'C:\Users\user\Desktop\python.xlsx') as writer:
    table.to_excel(writer, sheet_name="rice", index=False)

# Reopen the same workbook in append mode to add a second sheet.
# Append mode is not supported by the xlsxwriter engine, so openpyxl is
# requested explicitly (it must be installed).
with pd.ExcelWriter(r'C:\Users\user\Desktop\python.xlsx', engine='openpyxl', mode='a') as writer:
    table.to_excel(writer, sheet_name="wheat", index=False)

或者，您可以在同一个 `with` 块中写入两个工作表，而无需使用追加模式：
# One writer can receive several sheets, so append mode is not needed here.
with pd.ExcelWriter(r'C:\Users\user\Desktop\python.xlsx') as writer:
    table.to_excel(writer, sheet_name="rice", index=False)
    table.to_excel(writer, sheet_name="wheat", index=False)
    # No explicit writer.save(): the workbook is saved when the `with` block exits.


顺便说一句：我发现追加（append）模式不适用于引擎 `xlsxwriter`，我不得不使用引擎 `openpyxl`（这也意味着需要用 `pip` 安装模块 `openpyxl`）。
我发现可用的引擎有问题

完整工作代码
from selenium import webdriver 
from selenium.webdriver.support.ui import Select
import pandas as pd
import time

# --- functions ---

def get_data(start_date, end_date, product):
    """Fetch the `Daily Variation` report for one commodity as a DataFrame.

    Drives the module-level `driver`, which must already be showing the
    report menu page. The function fills in the form, reads the result
    table, then clicks the page's back button so it can be called again
    for the next product.

    Args:
        start_date: Text typed into the "from" date field,
            e.g. '01/05/2020'.
        end_date: Text typed into the "to" date field, e.g. '27/05/2020'.
        product: Visible text of the commodity to select, e.g. 'Rice'.

    Returns:
        The DataFrame parsed from the table at index 2 of the result page.

    Raises:
        IndexError: If the result page contains fewer than three tables.
    """
    # Local imports keep this function self-contained.
    from io import StringIO

    # `find_element(By.ID, ...)` works on Selenium 3 and 4, whereas the
    # `find_element_by_id` shortcut was removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    # select `Variation Report`
    driver.find_element(By.ID, 'ctl00_MainContent_Rbl_Rpt_type_1').click()

    # select `Daily Variation`
    element_variation = driver.find_element(By.ID, 'ctl00_MainContent_Ddl_Rpt_Option1')
    Select(element_variation).select_by_visible_text('Daily Variation')

    # select `product` before the dates because the `end_date` field opens
    # a calendar which blocks the `product` list
    element_commodity = driver.find_element(By.ID, 'ctl00_MainContent_Lst_Commodity')
    Select(element_commodity).select_by_visible_text(product)

    # type `start_date` and `end_date`
    driver.find_element(By.ID, 'ctl00_MainContent_Txt_FrmDate').send_keys(start_date)
    driver.find_element(By.ID, 'ctl00_MainContent_Txt_ToDate').send_keys(end_date)

    # click button `Get Data`
    driver.find_element(By.ID, 'ctl00_MainContent_btn_getdata1').click()

    time.sleep(3)  # the result page sometimes needs time to load

    # Wrap the HTML in StringIO: passing a literal HTML string to
    # `read_html` is deprecated in recent pandas versions.
    tables = pd.read_html(StringIO(driver.page_source))

    # The report is the table at index 2 of the parsed list.
    if len(tables) < 3:
        raise IndexError(
            f"expected at least 3 tables on the result page for {product!r}, "
            f"found {len(tables)}"
        )
    table = tables[2]

    print(len(table))
    print(table)

    # go back to the report menu
    driver.find_element(By.ID, 'btn_back').click()

    time.sleep(3)  # the menu page sometimes needs time to load

    return table

# --- main ---

# Browser session shared with `get_data` through the module-level name `driver`.
driver = webdriver.Firefox()

try:
    driver.get('https://fcainfoweb.nic.in/Reports/Report_Menu_Web.aspx')

    start_date = '01/05/2020'
    end_date = '27/05/2020'

    for number, product in enumerate(('Rice', 'Wheat', 'Tomato', 'Sugar')):
        table = get_data(start_date, end_date, product)

        # The first product creates (or overwrites) the workbook; every
        # later product is appended to the existing file.
        mode = 'w' if number == 0 else 'a'

        # Append mode does not work with the `xlsxwriter` engine, so
        # `openpyxl` is requested explicitly.
        with pd.ExcelWriter('output.xlsx', engine='openpyxl', mode=mode) as writer:
            table.to_excel(writer, sheet_name=product, index=False)
finally:
    # Always close the browser, even when scraping or writing fails.
    driver.quit()

对于某些类型的文件，您可能必须将所有数据读取到内存中，添加新数据，然后再将所有数据重新保存到文件中。对于其他一些文件，您必须使用“追加”模式，`ExcelWriter` 可能也具有类似的功能。您是否查看过文档中的 `ExcelWriter` 和 `to_excel`？
对于某些类型的文件，您可能需要将所有数据读取到内存中，添加新数据，然后再次将所有数据保存到文件中。对于其他一些文件，您必须使用“追加”模式，`ExcelWriter` 可能也具有类似的功能。您是否查看过 `ExcelWriter` 和 `to_excel` 的文档？
我发现如果 pandas 使用模块 `xlsxwriter`，追加（append）模式不起作用，我必须安装 `openpyxl` 并使用 `ExcelWriter(..., engine='openpyxl')`。
顺便说一句：我把你的代码改成了更短的版本，因为你重复了一些行。我更改了完整的代码——现在我使用 `for` 循环来获取不同的产品。
这太棒了！代码现在很简洁。非常感谢您的帮助。
我发现如果 pandas 使用模块 `xlsxwriter`，追加（append）模式不起作用，我必须安装 `openpyxl` 并使用 `ExcelWriter(..., engine='openpyxl')`。
顺便说一句：我把你的代码改成了更短的版本，因为你重复了一些行。我更改了完整的代码——现在我使用 `for` 循环来获取不同的产品。
这太棒了！代码现在很简洁。非常感谢您的帮助。