Python Redfin脚本正在输出重复的行
我在Selenium中构建了一个webscraper,用于在redfin.com上获取redfin估算数据。我遇到的问题是,当我将刮取的数据输出到csv上时,它会每隔一段时间多次复制行值,我不知道如何修复它 这是我的密码:Python Redfin脚本正在输出重复的行,python,pandas,csv,export-to-csv,Python,Pandas,Csv,Export To Csv,我在Selenium中构建了一个webscraper,用于在redfin.com上获取redfin估算数据。我遇到的问题是,当我将刮取的数据输出到csv上时,它会每隔一段时间多次复制行值,我不知道如何修复它 这是我的密码: from selenium import webdriver from selenium.webdriver.remote import webelement from selenium.webdriver.support.ui import WebDriverWait fr
from selenium import webdriver
from selenium.webdriver.remote import webelement
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, InvalidElementStateException
import pandas as pd
import time
from bs4 import BeautifulSoup
import os
from datetime import datetime
input_file = ".\\pa-property-value-tools\\input\\addresses.xlsx"
input_df = pd.read_excel(input_file)
input_df['Address'] = input_df['Address'].astype(str)
output_df = pd.DataFrame(columns=['Account','Address', 'redfin_estimate'])
driver = webdriver.Chrome('C:\\Users\\user\\Downloads\\chromedriver_win32 (1)\\chromedriver.exe')
#driver = webdriver.Firefox(executable_path = 'C:\\Users\\Morgan.weiss\\Downloads\\geckodriver-v0.24.0-win64\\geckodriver.exe')
def append_date_timestamp(filepath, extension):
return (
filepath + "-" + datetime.now().strftime("%Y-%m-%d %H-%M-%S") + "." + extension
)
def get_redfin_estimate(address):
driver.get('https://www.redfin.com/')
print(address)
driver.find_element_by_name('searchInputBox').clear()
driver.find_element_by_name('searchInputBox').send_keys(address)
time.sleep(3)
try:
pop_up = driver.find_element_by_css_selector("div[data-rf-test-name='expanded-results']")
if pop_up:
types = pop_up.find_elements_by_class_name("expanded-row-content")
for ele in types:
val = ele.find_element_by_class_name("expanded-type")
if val.text == "ADDRESSES":
ele.find_element_by_css_selector("div[data-rf-test-name='item-row-active']").click()
else:
return ('N/A', 'N/A')
except:
pass
soup = BeautifulSoup(driver.page_source, 'html.parser')
try:
price1 = soup.find('div', {'class', 'avm'}).div.text
print(price1)
url = driver.current_url if driver.current_url else 'N/A'
return(price1, url)
except AttributeError:
try:
time.sleep(3)
price2 = soup.find('span',class_='avmLabel').find_next('span', class_='value').text
print(price2)
url = driver.current_url if driver.current_url else 'N/A'
return(price2, url)
except:
return('N/A', 'N/A')
outputfile = append_date_timestamp(".\\pa-property-value-tools\\output\\output", "csv")
count = 0
exception = 0
wait_after = 10
current_date = datetime.now().strftime("%Y-%m-%d")
driver.get('https://www.redfin.com/')
time.sleep(100)
for row in input_df.itertuples():
try:
count += 1
estimate,url_source = get_redfin_estimate(row.Address)
output_df = output_df.append({
'Account': row.Account,
'Address': row.Address,
'redfin_estimate':estimate,
'url':url_source,
'date_pulled':current_date
},
ignore_index=True,
)
if count % wait_after == 0:
# if file does not exist write header
if not os.path.isfile(outputfile):
output_df.to_csv(outputfile, index=False)
else: # else it exists so append without writing the header
output_df.to_csv(outputfile, mode='a', index=False, header=False)
#output_df = pd.DataFrame(columns=['Account','Address', 'redfin_estimate', 'url', 'date_pulled'])
print("Waiting 20 seconds for every " + str(wait_after) + " calls")
time.sleep(20)
time.sleep(1)
except (NoSuchElementException,InvalidElementStateException) as e:
print(e)
exception += 1
print(exception)
continue
print(exception)
if count % wait_after > 0:
output_df.to_csv(outputfile, mode='a', index=False, header=False)
driver.quit()
我认为导致这个问题的原因是:
outputfile = append_date_timestamp(".\\pa-property-value-tools\\output\\output", "csv")
count = 0
exception = 0
wait_after = 10
current_date = datetime.now().strftime("%Y-%m-%d")
driver.get('https://www.redfin.com/')
time.sleep(100)
for row in input_df.itertuples():
try:
count += 1
estimate,url_source = get_redfin_estimate(row.Address)
output_df = output_df.append({
'Account': row.Account,
'Address': row.Address,
'redfin_estimate':estimate,
'url':url_source,
'date_pulled':current_date
},
ignore_index=True,
)
if count % wait_after == 0:
# if file does not exist write header
if not os.path.isfile(outputfile):
output_df.to_csv(outputfile, index=False)
else: # else it exists so append without writing the header
output_df.to_csv(outputfile, mode='a', index=False, header=False)
#output_df = pd.DataFrame(columns=['Account','Address', 'redfin_estimate', 'url', 'date_pulled'])
print("Waiting 20 seconds for every " + str(wait_after) + " calls")
time.sleep(20)
time.sleep(1)
except (NoSuchElementException,InvalidElementStateException) as e:
print(e)
exception += 1
print(exception)
continue
print(exception)
if count % wait_after > 0:
output_df.to_csv(outputfile, mode='a', index=False, header=False)
我不知道是什么问题,任何建议都非常感谢
编辑:
对于标记为有问题的代码。代码所做的是计算我们遍历地址的次数。如果我们已经通过了10,那么我们将它们输出到csv中。我们对每个呼叫都有一个随机等待时间,这样我们就不会被ip地址阻塞。问题出在这些代码行中,因为某种原因,我得到了重复的代码。在写入csv文件后,您似乎没有重置
输出_df
您将在此处附加到数据帧:
output_df = output_df.append({
'Account': row.Account,
'Address': row.Address,
'redfin_estimate':estimate,
'url':url_source,
'date_pulled':current_date
},
ignore_index=True,
)
然后再次使用mode='a'
将output_df
的内容附加到csv文件中:
output_df.to_csv(outputfile, mode='a', index=False, header=False)
这就是为什么要多次写入行的原因
写入csv文件后重置数据帧应可解决此问题:
output_df.to_csv(outputfile, mode='a', index=False, header=False)
output_df = pd.DataFrame()
请添加一个简短的描述,说明您的代码在高级别上的作用。例如,转到redfin,打开abc页面,从abc中提取表格并存储在电子表格中等。你越容易让他人帮助你,你就越有可能更快地得到答案。祝你好运