Python 第6/7页后的Handling Cookie弹出窗口
我在这个网站上的一些研究员的帮助下建立了一个房地产数据的网站 它工作得很好,但是在is爬网到第6/7页或furhter之后,一个典型的cookie警告弹出,并且似乎破坏了我在CSV文件中的输出 有办法处理弹出窗口吗Python 第6/7页后的Handling Cookie弹出窗口,python,selenium,beautifulsoup,Python,Selenium,Beautifulsoup,我在这个网站上的一些研究员的帮助下建立了一个房地产数据的网站 它工作得很好,但是在is爬网到第6/7页或furhter之后,一个典型的cookie警告弹出,并且似乎破坏了我在CSV文件中的输出 有办法处理弹出窗口吗 from selenium import webdriver from bs4 import BeautifulSoup import re import time import requests import pandas as pd #open('output.csv', 'w'
from selenium import webdriver
from bs4 import BeautifulSoup
import re
import time
import requests
import pandas as pd
#open('output.csv', 'w').close()
browser = webdriver.Chrome('C:/Users/907133/Pythonstuff/chromedriver')
browser.set_window_position(0,0)
def jaap_spider(max_pages):
page = 1
while page <= max_pages:
url = 'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{}'.format(page)
#browser.delete_all_cookies()
browser.get(url)
#session = requests.Session()
#res1 = session.post(url, post_data)
#res2 = session.get(url1)
time.sleep(15)
#input('Press Enter after bypassing Captcha')
soup = BeautifulSoup(browser.page_source, 'html.parser')
info = soup.find_all('div', {'class':'property-info'})
inside = soup.find_all('a', {'class': 'property-inner'},{'href'})
# Make empty lists with header lines
outputlist_l1 = [['street', 'address', 'price', 'pricetag']]
outputlist_l2 = [['soort', 'bouwjaar', 'woonoppervlakte', 'inhoud', 'perceel']]
for huis in info:
street = huis.find('h2')
street = ' '.join(street.get_text(separator='\r\n', strip=True).split()[:+3])
address = huis.find('div')
address = address.find('div').text.strip()
price = huis.find('div', {'class': 'price-info'})
price = price.find('div').text.strip()
price = re.findall(r'\d', price)
price = ''.join(price)
pricetag = huis.find('div', {'class': 'property-price'})
pricetag = pricetag.find('span').text.strip()
outputlist_l1.append([street, address, price, pricetag])
for items in inside:
#browser.delete_all_cookies()
href = items.get('href')
url1 = href.format(page)
browser.get(url1)
kenmerken = BeautifulSoup(browser.page_source, 'html.parser')
details = kenmerken.find_all ('div', {'class':'detail-tab-content kenmerken'})
try:
tr = details[0].find_all ('td', {'class': 'value'})
except IndexError:
size_space = 'Unknown'
for inhoud in tr:
soort = tr[0].get_text(separator='\n', strip=True)
bouwjaar = tr[1].get_text(separator='\n', strip=True)
woonoppervlakte = tr[2].get_text(separator='\n', strip=True)
inhoud = tr[3].get_text(separator='\n', strip=True)
perceel = tr[4].get_text(separator='\n', strip=True)
l2 = ('{},{},{},{},{}'.format(soort, bouwjaar, woonoppervlakte, inhoud, perceel))
outputlist_l2.append([soort, bouwjaar, woonoppervlakte, inhoud, perceel])
page += 1
# Merge outputlist_l1 with outputlist_l2
outputlist = [a + b for a, b in zip(outputlist_l1, outputlist_l2)]
# transform to Pandas dataframe and export as csv
#saveFile = open('output.csv', 'a')
df = pd.DataFrame(outputlist[1:], columns=outputlist[0])
df.to_csv('output.csv', index=False)
#saveFile.close()
jaap_spider(15)
从selenium导入webdriver
从bs4导入BeautifulSoup
进口稀土
导入时间
导入请求
作为pd进口熊猫
#打开('output.csv','w')。关闭()
browser=webdriver.Chrome('C:/Users/907133/Pythonstuff/chromedriver')
浏览器。设置窗口位置(0,0)
def jaap_爬行器(最大页数):
页码=1
而页面要克服弹出问题,只需在加载页面后检查是否有可用的弹出窗口。如果是,请点击该按钮。希望能有所帮助
page = 1
while page <= max_pages:
url = 'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{}'.format(page)
browser.get(url)
time.sleep(10)
#Check here if there popup available
if len(browser.find_elements_by_xpath("//a[@class='CookiesOK']"))>0:
browser.find_element_by_xpath("//a[@class='CookiesOK']").click()
time.sleep(5)
#input('Press Enter after bypassing Captcha')
soup = BeautifulSoup(browser.page_source, 'html.parser')
info = soup.find_all('div', {'class':'property-info'})
inside = soup.find_all('a', {'class': 'property-inner'},{'href'})
page=1
而第0页:
浏览器。通过xpath(//a[@class='CookiesOK'])查找元素。单击()
时间。睡眠(5)
#输入('跳过验证码后按Enter键')
soup=BeautifulSoup(browser.page_源代码'html.parser')
info=soup.find_all('div',{'class':'property-info'})
inside=soup.find_all('a',{'class':'property inner'},{'href'})
这很有效!但有时它会再次出现,这意味着脚本不会把它放在一边。我能在网站上面添加的脚本中做些什么?