使用 Selenium 重复执行特定流程的 Python 网页抓取_Python_Selenium_Google Chrome_Web Scraping

使用 Selenium 重复执行特定流程的 Python 网页抓取

python selenium google-chrome web-scraping

使用selenium重复特定过程的python Web垃圾处理,python,selenium,google-chrome,web-scraping,Python,Selenium,Google Chrome,Web Scraping,我试图从这个网站上搜集公司的联系方式：我可以使用以下代码执行此操作： from selenium import webdriver from selenium.webdriver.common.keys import Keys from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait import pandas as pd import time fr

我试图从这个网站上搜集公司的联系方式：

我可以使用以下代码执行此操作：

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
import pandas as pd
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

# Walkthrough from the question: open the search results, visit the first two
# companies by hand-written absolute XPaths and collect their contact blocks.

company_list = []  # one text block per contact element found

SEARCH_URL = 'https://de.statista.com/companydb/suche?idCountry=276&idBranch=0&revenueFrom=-1000000000000000000&revenueTo=1000000000000000000&employeesFrom=0&employeesTo=100000000&sortMethod=revenueDesc&p=1'

# Links in the result table; the row index is baked into each path (tr[2], tr[3]).
FIRST_COMPANY_LINK = "//*[@id='content']/section[3]/div/div/form/div/div[2]/div[2]/table/tr[2]/td[1]/a"
SECOND_COMPANY_LINK = "//*[@id='content']/section[3]/div/div/form/div/div[2]/div[2]/table/tr[3]/td[1]/a"

# Contact block on a company's detail page.
CONTACT_BLOCK = "/html/body/div[3]/div[4]/section[6]/div/div[2]/div[2]/div/div"

# Button that declines the pop-up shown after navigating back to the results.
POPUP_DENY_BUTTON = "/html/body/div[15]/div[3]/div[3]/div[1]/button[1]"

driver = webdriver.Chrome('/Users/rieder/Anaconda3/chromedriver_win32/chromedriver.exe')
driver.get(SEARCH_URL)

# Accept the cookie notice.
cookie_button = driver.find_element_by_id("cookiesNotificationConfirm")
cookie_button.click()

# Every wait below allows up to 20 seconds.
wait = WebDriverWait(driver, 20)
contact_visible = EC.visibility_of_all_elements_located((By.XPATH, CONTACT_BLOCK))

# --- first company ---
first_link = driver.find_element_by_xpath(FIRST_COMPANY_LINK)
first_link.click()
first_contact = wait.until(contact_visible)
company_list.extend(block.text for block in first_contact)

# Return to the result list.
driver.back()

# Pause so the pop-up has time to appear, decline it, then pause again so it
# has time to vanish before the next click.
time.sleep(5)
deny_button = driver.find_element_by_xpath(POPUP_DENY_BUTTON)
deny_button.click()
time.sleep(5)

# --- second company ---
second_link = driver.find_element_by_xpath(SECOND_COMPANY_LINK)
second_link.click()
second_contact = wait.until(contact_visible)
company_list.extend(block.text for block in second_contact)

print(company_list)

我的输出是：

['GUTex GmbH\nGerhard-Unland-Str. 1\n26683\nSaterland\nDeutschland', 'Robert Bosch GmbH\nRobert-Bosch-Platz 1\n70839\nGerlingen\nDeutschland']

问题:

我希望我的代码能对第 1 页上的整个列表执行上述操作，然后翻到下一页再重复一遍，如此循环，直到列表中有 100 个地址为止。我打算用一个 while 循环来实现，但我用来定位公司链接的 XPath 写得太具体了（行号是写死的），所以循环时总是访问同一家公司。

提前致谢！

回答：请尝试用以下代码提取一页的数据，然后再自行补充翻到下一页、继续遍历记录的代码。

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

# Collect "<company name> : <contact text>" for every company listed on the
# first page of the search results.

RESULTS_URL = 'https://de.statista.com/companydb/suche?idCountry=276&idBranch=0&revenueFrom=-1000000000000000000&revenueTo=1000000000000000000&employeesFrom=0&employeesTo=100000000&sortMethod=revenueDesc&p=1'
NAME_CELL_XPATH = '//table[@class="zebraTable zebraTable--companies"]//td[1]'
NAME_LINK_XPATH = '//table[@class="zebraTable zebraTable--companies"]//td[1]/a'
CONTACT_BOX_XPATH = '//*[@id="contactInformation"]//div[@class="companyContactBox"]'
READ_CONTACT_JS = "return document.getElementsByClassName('companyContactBox')[0].innerText"

company_list = []

driver = webdriver.Chrome()
driver.get(RESULTS_URL)

# Accept the cookie notice only when the button is actually in the page.
if driver.find_elements_by_id("cookiesNotificationConfirm"):
    driver.find_element_by_id("cookiesNotificationConfirm").click()

# Every wait below allows up to 20 seconds.
wait = WebDriverWait(driver, 20)
table_ready = EC.element_to_be_clickable((By.XPATH, NAME_CELL_XPATH))
contact_ready = EC.element_to_be_clickable((By.XPATH, CONTACT_BOX_XPATH))

wait.until(table_ready)
row_total = len(driver.find_elements_by_xpath(NAME_CELL_XPATH))

# Visit each company by position. The links are looked up again on every pass
# because the script leaves the result page and comes back each time.
for position in range(row_total):
    wait.until(table_ready)
    link = driver.find_elements_by_xpath(NAME_LINK_XPATH)[position]
    label = link.text
    link.click()

    # On the detail page: wait for the contact box, then read its text via JS.
    wait.until(contact_ready)
    address = driver.execute_script(READ_CONTACT_JS)
    company_list.append(" : ".join([label, address]))

    driver.back()  # return to the result list

print(company_list)

多亏了上面 Dilip Meghwal 的回答，我完成了自己的代码：

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

# Scrape the contact box of every company in the search results, page after
# page, then write the collected addresses to an Excel file.

# Paging continues until at least this many addresses are stored. The check
# runs once per result page, so the final list may overshoot by part of a page.
TARGET_COUNT = 1000

SEARCH_URL = 'https://de.statista.com/companydb/suche?idCountry=276&idBranch=0&revenueFrom=-1000000000000000000&revenueTo=1000000000000000000&employeesFrom=0&employeesTo=100000000&sortMethod=revenueDesc&p=1'
COMPANY_CELL_XPATH = '//table[@class="zebraTable zebraTable--companies"]//td[1]'
COMPANY_LINK_XPATH = '//table[@class="zebraTable zebraTable--companies"]//td[1]/a'
CONTACT_BOX_XPATH = '//*[@id="contactInformation"]//div[@class="companyContactBox"]'
NEXT_PAGE_XPATH = "//*[@id='content']/section[3]/div/div/form/div/div[2]/div[2]/div[2]/div/button[2]"
READ_CONTACT_JS = "return document.getElementsByClassName('companyContactBox')[0].innerText"

company_list = []  # one raw contact-box text per visited company

chrome_options = webdriver.ChromeOptions()
# Notification content setting; presumably 2 means "block", so the site's
# notification pop-up never has to be dismissed by hand -- confirm against
# the Chrome preference documentation.
prefs = {"profile.default_content_setting_values.notifications": 2}
chrome_options.add_experimental_option("prefs", prefs)

driver = webdriver.Chrome('/Users/rieder/Anaconda3/chromedriver_win32/chromedriver.exe', chrome_options=chrome_options)

try:
    driver.get(SEARCH_URL)

    # Accept the cookie notice only when it is shown. The scraping loop below
    # must NOT depend on this branch: it has to run whether or not the banner
    # appeared.
    cookie_buttons = driver.find_elements(By.ID, "cookiesNotificationConfirm")
    if cookie_buttons:
        cookie_buttons[0].click()

    wait = WebDriverWait(driver, 20)  # every wait allows up to 20 seconds
    table_ready = EC.element_to_be_clickable((By.XPATH, COMPANY_CELL_XPATH))
    contact_ready = EC.element_to_be_clickable((By.XPATH, CONTACT_BOX_XPATH))

    while len(company_list) < TARGET_COUNT:
        wait.until(table_ready)
        row_count = len(driver.find_elements(By.XPATH, COMPANY_CELL_XPATH))

        # Index-based on purpose: the script leaves the result page for every
        # company, so the link elements are looked up again on each pass
        # instead of being reused from a previous page load.
        for i in range(row_count):
            wait.until(table_ready)
            links = driver.find_elements(By.XPATH, COMPANY_LINK_XPATH)
            links[i].click()

            # Detail page: wait for the contact box, then read its text via JS.
            wait.until(contact_ready)
            contact_data = driver.execute_script(READ_CONTACT_JS)
            company_list.append(contact_data)

            driver.back()  # return to the result list

        # Pause before paging; presumably gives the result list time to settle
        # after the last driver.back().
        time.sleep(5)
        driver.find_element(By.XPATH, NEXT_PAGE_XPATH).click()
finally:
    # Always close the browser and its driver process, even when a wait times
    # out or an element is missing.
    driver.quit()

# Flatten each multi-line address into a single comma-separated line.
company_list = [w.replace('\n', ', ') for w in company_list]

print(company_list)

df_company_name = pd.DataFrame(company_list, columns=['Name'])
df_company_name.to_excel("company_name.xlsx")

从selenium导入webdriver
从selenium.webdriver.support.ui导入WebDriverWait
从selenium.webdriver.common.by导入
从selenium.webdriver.support将预期的_条件导入为EC
作为pd进口熊猫
导入时间
公司列表=[]创建空列表
计数=25
chrome\u options=webdriver.ChromeOptions（）
prefs={“profile.default\u content\u setting\u value.notifications”：2}
chrome_选项。添加_实验_选项（“prefs”，prefs）
driver=webdriver.Chrome（'/Users/rieder/Anaconda3/chromedriver\u win32/chromedriver.exe'，Chrome\u options=Chrome\u options）#定义驱动程序
司机，上车https://de.statista.com/companydb/suche?idCountry=276&idBranch=0&revenueFrom=-10000000000000000&revenueTo=10000000000000000000000&employeesFrom=0&employeesTo=100000000&sortMethod=revenueDesc&p=1'）#打开网站
如果len（driver.find\u elements\u by\u id（“cookiesNotificationConfirm”）>0：
驱动程序。按id（“cookiesNotificationConfirm”）查找元素。单击（）接受饼干
而len（公司名单）<1000：
WebDriverWait（driver，20）.until（EC.element可点击（（By.XPATH，//table[@class=“zebraTable zebraTable--companys”]//td[1]））
elementsSize=len（driver.find_elements_by_xpath（'//table[@class=“zebraTable zebraTable--companys”]//td[1]'））
#要遍历公司列表，请单击公司名称，然后在导航页面上捕获地址
#回到上一页，重复同样的内容。
对于范围内的i（元素大小）：
WebDriverWait（driver，20）.until（EC.element可点击（（By.XPATH，//table[@class=“zebraTable zebraTable--companys”]//td[1]））
elements=driver。通过xpath查找元素（'//table[@class=“zebraTable zebraTable--companys”]//td[1]/a'）
公司名称=元素[i]。文本
元素[i]。单击（）#单击第一个公司名称链接
WebDriverWait（driver，20）.until（EC.element可点击（（By.XPATH，'/*[@id=“contactInformation”]//div[@class=“companyContactBox”]]））#从之前选择的公司获取联系人数据
联系人\u data=driver.execute\u脚本（“return document.getElementsByClassName（'companyContactBox'）[0].innerText”）
#打印（联系人信息）
公司列表。追加（联系人数据）
driver.back（）#导航到上一个站点
时间。睡眠（5）
驱动程序。通过xpath（“//*[@id='content']/section[3]/div/div/form/div/div[2]/div[2]/div[2]/div[2]/div/button[2]”查找元素。单击（）；
公司列表=[w.替换公司列表中的w（'\n'，'，'）]
打印（公司清单）
df_company_name=pd.DataFrame（company_list，columns=['name']）
df_company_name.to_excel（“company_name.xlsx”）

这段代码运行得很好。我只是加了一个 while 循环，并做了一些小改动，比如在 Chrome 中禁止弹出通知窗口。