使用Selenium重复执行特定流程的Python网页抓取(web scraping)
标签:python、selenium、google-chrome、web-scraping
我试图从这个网站(https://de.statista.com/companydb/suche)上抓取公司的联系方式。我可以使用以下代码执行此操作:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
import pandas as pd
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
# Scrape the contact box of the first two companies listed on page 1 of the
# Statista company database and print the collected text.
company_list = []  # contact text collected from each visited company page

# Name link of a company in the result table; tr[2] is the first company row.
COMPANY_LINK_XPATH = "//*[@id='content']/section[3]/div/div/form/div/div[2]/div[2]/table/tr[{row}]/td[1]/a"
# Contact box on a company detail page.
CONTACT_BOX_XPATH = "/html/body/div[3]/div[4]/section[6]/div/div[2]/div[2]/div/div"

# NOTE(review): recent Selenium 4 releases no longer accept the driver path as
# a positional argument (a Service object is expected instead) - confirm
# against the installed Selenium version.
driver = webdriver.Chrome('/Users/rieder/Anaconda3/chromedriver_win32/chromedriver.exe')  # define driver
try:
    driver.get('https://de.statista.com/companydb/suche?idCountry=276&idBranch=0&revenueFrom=-1000000000000000000&revenueTo=1000000000000000000&employeesFrom=0&employeesTo=100000000&sortMethod=revenueDesc&p=1')  # open website
    driver.find_element(By.ID, "cookiesNotificationConfirm").click()  # accept cookies

    # First company: open its page and read the contact box.
    driver.find_element(By.XPATH, COMPANY_LINK_XPATH.format(row=2)).click()
    contact_data = WebDriverWait(driver, 20).until(
        EC.visibility_of_all_elements_located((By.XPATH, CONTACT_BOX_XPATH)))
    company_list.extend(cn.text for cn in contact_data)

    driver.back()  # navigate to the result list again
    time.sleep(5)  # wait for the pop-up window to appear
    # Deny the website's pop-up (the original line ended in a stray comma).
    driver.find_element(By.XPATH, "/html/body/div[15]/div[3]/div[3]/div[1]/button[1]").click()
    time.sleep(5)  # wait for the pop-up to vanish

    # Second company: same procedure on the next table row.
    driver.find_element(By.XPATH, COMPANY_LINK_XPATH.format(row=3)).click()
    contact_data2 = WebDriverWait(driver, 20).until(
        EC.visibility_of_all_elements_located((By.XPATH, CONTACT_BOX_XPATH)))
    company_list.extend(cn.text for cn in contact_data2)
finally:
    # Always close the browser, even when a lookup above fails.
    driver.quit()

print(company_list)  # show the list
我的输出是:
['GUTex GmbH\nGerhard-Unland-Str. 1\n26683\nSaterland\nDeutschland', 'Robert Bosch GmbH\nRobert-Bosch-Platz 1\n70839\nGerlingen\nDeutschland']
问题:
我希望我的代码对第1页上的整个列表执行此操作,然后转到下一页再重复一遍,如此继续,直到列表中有100个地址为止。我想用一个while循环来实现,但是我用来查找地址的XPath写得过于具体,所以它总是循环处理相同的公司。
提前非常感谢。

请尝试使用以下代码提取一页的数据。您可以在此基础上修改代码,以便继续遍历后续页面的记录。
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
# Scrape "<company name> : <contact text>" for every company listed on page 1
# of the Statista company database and print the result.
company_list = []  # one "name : contact" string per company

# First column of the company result table, and the name link inside it.
COMPANY_CELL_XPATH = '//table[@class="zebraTable zebraTable--companies"]//td[1]'
COMPANY_LINK_XPATH = COMPANY_CELL_XPATH + '/a'
CONTACT_BOX_XPATH = '//*[@id="contactInformation"]//div[@class="companyContactBox"]'

driver = webdriver.Chrome()  # define driver
try:
    driver.get('https://de.statista.com/companydb/suche?idCountry=276&idBranch=0&revenueFrom=-1000000000000000000&revenueTo=1000000000000000000&employeesFrom=0&employeesTo=100000000&sortMethod=revenueDesc&p=1')  # open website

    # Accept cookies only when the banner is present; reuse the element that
    # was found instead of looking it up a second time.
    cookie_buttons = driver.find_elements(By.ID, "cookiesNotificationConfirm")
    if cookie_buttons:
        cookie_buttons[0].click()

    WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, COMPANY_CELL_XPATH)))
    # Count the links (not the cells) so that every index used below is valid.
    link_count = len(driver.find_elements(By.XPATH, COMPANY_LINK_XPATH))

    # Visit each company page in turn, capture its contact box, then return
    # to the result list and continue with the next row.
    for i in range(link_count):
        WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, COMPANY_CELL_XPATH)))
        # Look the links up again on every pass: the page was navigated away
        # from and back, so earlier element references cannot be reused.
        elements = driver.find_elements(By.XPATH, COMPANY_LINK_XPATH)
        company_name = elements[i].text
        elements[i].click()  # open the i-th company page
        WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, CONTACT_BOX_XPATH)))
        contact_data = driver.execute_script("return document.getElementsByClassName('companyContactBox')[0].innerText")
        company_list.append(company_name + " : " + contact_data)
        driver.back()  # navigate to previous site
finally:
    # Always close the browser, even when a wait above times out.
    driver.quit()

print(company_list)
多亏了上面Dilip Meghwal的评论,我完成了我的代码:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
# Walk through the Statista company result pages, collect the contact box text
# of every listed company until at least 1000 entries are gathered, then write
# them to an Excel file.
company_list = []  # contact text of every visited company
count = 25  # not referenced anywhere in this script

# First column of the company result table, and the name link inside it.
COMPANY_CELL_XPATH = '//table[@class="zebraTable zebraTable--companies"]//td[1]'
COMPANY_LINK_XPATH = COMPANY_CELL_XPATH + '/a'
CONTACT_BOX_XPATH = '//*[@id="contactInformation"]//div[@class="companyContactBox"]'
NEXT_PAGE_XPATH = "//*[@id='content']/section[3]/div/div/form/div/div[2]/div[2]/div[2]/div/button[2]"

# Chrome preference value 2 blocks notification prompts.
chrome_options = webdriver.ChromeOptions()
prefs = {"profile.default_content_setting_values.notifications" : 2}
chrome_options.add_experimental_option("prefs",prefs)
# `options=` replaces the deprecated `chrome_options=` keyword.
# NOTE(review): recent Selenium 4 releases no longer accept the driver path as
# a positional argument (a Service object is expected instead) - confirm
# against the installed Selenium version.
driver = webdriver.Chrome('/Users/rieder/Anaconda3/chromedriver_win32/chromedriver.exe', options=chrome_options)  # define driver
try:
    driver.get('https://de.statista.com/companydb/suche?idCountry=276&idBranch=0&revenueFrom=-1000000000000000000&revenueTo=1000000000000000000&employeesFrom=0&employeesTo=100000000&sortMethod=revenueDesc&p=1')  # open website

    # Accept cookies only when the banner is present.
    cookie_buttons = driver.find_elements(By.ID, "cookiesNotificationConfirm")
    if cookie_buttons:
        cookie_buttons[0].click()

    while len(company_list) < 1000:
        WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, COMPANY_CELL_XPATH)))
        # Count the links (not the cells) so that every index used below is valid.
        link_count = len(driver.find_elements(By.XPATH, COMPANY_LINK_XPATH))
        if not link_count:
            # A page without company links would never grow the list; stop
            # instead of paging forever.
            break

        # Visit each company page, capture its contact box, then return to the
        # result list and continue with the next row.
        for i in range(link_count):
            WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, COMPANY_CELL_XPATH)))
            # Look the links up again on every pass: the page was navigated
            # away from and back, so earlier element references cannot be reused.
            elements = driver.find_elements(By.XPATH, COMPANY_LINK_XPATH)
            elements[i].click()  # open the i-th company page
            WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, CONTACT_BOX_XPATH)))
            contact_data = driver.execute_script("return document.getElementsByClassName('companyContactBox')[0].innerText")
            company_list.append(contact_data)
            driver.back()  # navigate to previous site

        # NOTE(review): the original indentation was lost; the pause is
        # presumed to run once per page, right before paging - confirm.
        time.sleep(5)
        driver.find_element(By.XPATH, NEXT_PAGE_XPATH).click()  # go to the next result page
finally:
    # Always close the browser, even when a wait or lookup above fails.
    driver.quit()

# Flatten each multi-line contact block into one comma-separated line.
company_list = [w.replace('\n', ', ') for w in company_list]
print(company_list)
df_company_name = pd.DataFrame(company_list, columns =['Name'])
df_company_name.to_excel("company_name.xlsx")
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

# Walk through the Statista company result pages, collect the contact box text
# of every listed company until at least 1000 entries are gathered, then write
# them to an Excel file.
company_list = []  # contact text of every visited company
count = 25  # not referenced anywhere in this script

# First column of the company result table, and the name link inside it.
COMPANY_CELL_XPATH = '//table[@class="zebraTable zebraTable--companies"]//td[1]'
COMPANY_LINK_XPATH = COMPANY_CELL_XPATH + '/a'
CONTACT_BOX_XPATH = '//*[@id="contactInformation"]//div[@class="companyContactBox"]'
NEXT_PAGE_XPATH = "//*[@id='content']/section[3]/div/div/form/div/div[2]/div[2]/div[2]/div/button[2]"

# Chrome preference value 2 blocks notification prompts.
chrome_options = webdriver.ChromeOptions()
prefs = {"profile.default_content_setting_values.notifications" : 2}
chrome_options.add_experimental_option("prefs",prefs)
# `options=` replaces the deprecated `chrome_options=` keyword.
# NOTE(review): recent Selenium 4 releases no longer accept the driver path as
# a positional argument (a Service object is expected instead) - confirm
# against the installed Selenium version.
driver = webdriver.Chrome('/Users/rieder/Anaconda3/chromedriver_win32/chromedriver.exe', options=chrome_options)  # define driver
try:
    driver.get('https://de.statista.com/companydb/suche?idCountry=276&idBranch=0&revenueFrom=-1000000000000000000&revenueTo=1000000000000000000&employeesFrom=0&employeesTo=100000000&sortMethod=revenueDesc&p=1')  # open website

    # Accept cookies only when the banner is present.
    cookie_buttons = driver.find_elements(By.ID, "cookiesNotificationConfirm")
    if cookie_buttons:
        cookie_buttons[0].click()

    while len(company_list) < 1000:
        WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, COMPANY_CELL_XPATH)))
        # Count the links (not the cells) so that every index used below is valid.
        link_count = len(driver.find_elements(By.XPATH, COMPANY_LINK_XPATH))
        if not link_count:
            # A page without company links would never grow the list; stop
            # instead of paging forever.
            break

        # Visit each company page, capture its contact box, then return to the
        # result list and continue with the next row.
        for i in range(link_count):
            WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, COMPANY_CELL_XPATH)))
            # Look the links up again on every pass: the page was navigated
            # away from and back, so earlier element references cannot be reused.
            elements = driver.find_elements(By.XPATH, COMPANY_LINK_XPATH)
            elements[i].click()  # open the i-th company page
            WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, CONTACT_BOX_XPATH)))
            contact_data = driver.execute_script("return document.getElementsByClassName('companyContactBox')[0].innerText")
            company_list.append(contact_data)
            driver.back()  # navigate to previous site

        # NOTE(review): the original indentation was lost; the pause is
        # presumed to run once per page, right before paging - confirm.
        time.sleep(5)
        driver.find_element(By.XPATH, NEXT_PAGE_XPATH).click()  # go to the next result page
finally:
    # Always close the browser, even when a wait or lookup above fails.
    driver.quit()

# Flatten each multi-line contact block into one comma-separated line.
company_list = [w.replace('\n', ', ') for w in company_list]
print(company_list)
df_company_name = pd.DataFrame(company_list, columns =['Name'])
df_company_name.to_excel("company_name.xlsx")
这非常有效。我只是加入了一个while循环,并做了一些小改动,比如在Chrome中禁止弹出窗口。