Python 通过Xpath循环获取元素错误
我想为Pinterest做一个网页抓取工具(scraper)。我可以获取几乎所有的数据,但每个pin都有一个名为“查看更多”的按钮,点击该按钮后才会显示“board name”和“author name”数据 逻辑:Python 通过Xpath循环获取元素错误,python,selenium,web-scraping,Python,Selenium,Web Scraping,我想为Pinterest做一个网页抓取工具(scraper)。我可以获取几乎所有的数据,但每个pin都有一个名为“查看更多”的按钮,点击该按钮后才会显示“board name”和“author name”数据 逻辑: moreButtons = driver.find_elements_by_xpath('//button[@data-test-id="seemoretoggle"]') for moreBtn in moreButtons: moreBtn.click() source_
# Click every "See more" toggle on the page, then capture the page HTML.
# `driver` is the Selenium WebDriver created elsewhere in the script.
moreButtons = driver.find_elements_by_xpath('//button[@data-test-id="seemoretoggle"]')
for moreBtn in moreButtons:
    moreBtn.click()
source_data = driver.page_source
# Click every "See more" toggle on the page, then capture the page HTML.
# `driver` is the Selenium WebDriver created elsewhere in the script.
moreButtons = driver.find_elements_by_xpath('//button[@data-test-id="seemoretoggle"]')
for moreBtn in moreButtons:
    moreBtn.click()
source_data = driver.page_source
获取看板(board)名称的代码:
# Pin Length - Total Pins
# `driver` is the Selenium WebDriver created elsewhere in the script.
total_pins = driver.find_elements_by_class_name("Grid__Item")

# Pin Board Names
# The pin's position is spliced into an absolute XPath; XPath positions are
# 1-based, so the index runs from 1 to len(total_pins) inclusive.
for i in range(1, len(total_pins) + 1):
    temp_xpath = "/html/body/div[1]/div[1]/div[1]/div/div/div/div/div[1]/div/div/div/div[" + str(i) + "]/div/div/div[2]/div[2]/h4/a[1]"
    temp = driver.find_element_by_xpath(temp_xpath)
    print(temp.text)
# Scrape the board name of every pin on a page using a headless PhantomJS
# browser driven by Selenium.
# NOTE(review): `webdriver`, `url` and `pin_Board_Names` are not defined in
# this fragment; they are assumed to be set up earlier in the script.
from selenium.common.exceptions import NoSuchElementException, WebDriverException

driver = webdriver.PhantomJS(executable_path='phantomjs.exe')
print("Ghost Headless Driver Invoked")
try:
    # NOTE: driver.implicitly_wait(5) can be enabled here so that a missing
    # element is waited for (seconds) before the next operation.
    driver.get(url)  # grab the url

    # Scrolling till the end of page
    print("Started Scrolling ... ")
    scroll_script = "window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;"
    match = True  # change to 'False' for making this work..
    lenOfPage = driver.execute_script(scroll_script)
    # Keep scrolling until the page height stops growing.
    while not match:
        lastCount = lenOfPage
        lenOfPage = driver.execute_script(scroll_script)
        if lastCount == lenOfPage:
            match = True
    source_data = driver.page_source  # page source code as html

    # Get all pins , number of pins collected
    total_pins = []
    try:
        total_pins = driver.find_elements_by_class_name("Grid__Item")
    except WebDriverException:
        print("Unable to load pins")
    print("Total Pins: " + str(len(total_pins)))

    # get number of 'see more' buttons collected - for error checking
    moreButtons = driver.find_elements_by_xpath('//button[@data-test-id="seemoretoggle"]')
    print("Dynamic Elements: " + str(len(moreButtons)))
    print("Display: Dynamic Elements ... ")

    # clicking all 'See More' buttons
    for moreBtn in moreButtons:
        moreBtn.click()

    # Pin Board Names
    print("Extracting Board Names ... ")
    xpath_prefix = "/html/body/div[1]/div[1]/div[1]/div/div/div/div/div[1]/div/div/div/div["
    # Two layouts are tried in order for the board-name link of each pin.
    xpath_suffixes = (
        "]/div/div/div[2]/div[2]/h4/a[1]",
        "]/div/div/div[2]/div/h4/a[1]",
    )
    # XPath positions are 1-based, hence range(1, n + 1).
    for i in range(1, len(total_pins) + 1):
        for suffix in xpath_suffixes:
            try:
                temp = driver.find_element_by_xpath(xpath_prefix + str(i) + suffix)
            except NoSuchElementException:
                continue  # this layout is absent; try the next one
            pin_Board_Names.append(temp.text)
            # print("Board_No: " + str(i) + " > " + temp.text)
            break
        else:
            # Neither layout matched: report the pin instead of crashing.
            print("Board_No: " + str(i) + " not found!")
finally:
    # quit driver, even if scraping failed, so the browser process is not leaked
    driver.quit()
输出:
Just old
Tiny House interior
SimpleLivingMama.com
no element at pin number: 4
SimpleLivingMama.com
Books for Pre-Schoolers
Stuff to Try
Baby & Toddler Milestones
Toys For Boys & Girls
House
OT
Make Extra Money
Shoes
Old photos
Crafts
for baby
There's A Book About That
Geek
Real DIY
Recycle & Repurpose
Crafts
Preschool Activities
Wild West Project
#BossMoms
no element at pin number: 24
#BossMoms
Crazy for DIY
Money Saving Tips
Painting Furniture
The home I want
screen door ideas
DIY Home
Little girl rooms
Container Home Desing
Bentley Joseph Adams
some truth bombs
New house!
Advice and Wisdom-Words
no element at pin number: 37
Advice and Wisdom-Words
House ideas
Houses
no element at pin number: 40
Houses
no element at pin number: 41
Houses
Fine Motor Activities for Kids
crafts
decorating ideas
mama
Barn Homes
For the Home
no element at pin number: 48
For the Home
我检查了输出中提示找不到元素的那些 pin,但网页上确实有对应的看板(board)名称。
编辑3:注意,在第 47 个 pin 之后,它总是提示找不到元素,不管列表有多大。我还检查了所有按钮的 XPath 是否都在 moreButtons 中,以及它们是否有效。
提前感谢您的帮助!
在 @AnkDasCo 的帮助下,我们在评论中找到了解决方案。这里有两个问题:
# Pin Length - Total Pins
# `driver` is the Selenium WebDriver created elsewhere in the script.
total_pins = driver.find_elements_by_class_name("Grid__Item")

# Pin Board Names
# The pin's position is spliced into an absolute XPath; XPath positions are
# 1-based, so the index runs from 1 to len(total_pins) inclusive.
for i in range(1, len(total_pins) + 1):
    temp_xpath = "/html/body/div[1]/div[1]/div[1]/div/div/div/div/div[1]/div/div/div/div[" + str(i) + "]/div/div/div[2]/div[2]/h4/a[1]"
    temp = driver.find_element_by_xpath(temp_xpath)
    print(temp.text)
# Scrape the board name of every pin on a page using a headless PhantomJS
# browser driven by Selenium.
# NOTE(review): `webdriver`, `url` and `pin_Board_Names` are not defined in
# this fragment; they are assumed to be set up earlier in the script.
from selenium.common.exceptions import NoSuchElementException, WebDriverException

driver = webdriver.PhantomJS(executable_path='phantomjs.exe')
print("Ghost Headless Driver Invoked")
try:
    # NOTE: driver.implicitly_wait(5) can be enabled here so that a missing
    # element is waited for (seconds) before the next operation.
    driver.get(url)  # grab the url

    # Scrolling till the end of page
    print("Started Scrolling ... ")
    scroll_script = "window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;"
    match = True  # change to 'False' for making this work..
    lenOfPage = driver.execute_script(scroll_script)
    # Keep scrolling until the page height stops growing.
    while not match:
        lastCount = lenOfPage
        lenOfPage = driver.execute_script(scroll_script)
        if lastCount == lenOfPage:
            match = True
    source_data = driver.page_source  # page source code as html

    # Get all pins , number of pins collected
    total_pins = []
    try:
        total_pins = driver.find_elements_by_class_name("Grid__Item")
    except WebDriverException:
        print("Unable to load pins")
    print("Total Pins: " + str(len(total_pins)))

    # get number of 'see more' buttons collected - for error checking
    moreButtons = driver.find_elements_by_xpath('//button[@data-test-id="seemoretoggle"]')
    print("Dynamic Elements: " + str(len(moreButtons)))
    print("Display: Dynamic Elements ... ")

    # clicking all 'See More' buttons
    for moreBtn in moreButtons:
        moreBtn.click()

    # Pin Board Names
    print("Extracting Board Names ... ")
    xpath_prefix = "/html/body/div[1]/div[1]/div[1]/div/div/div/div/div[1]/div/div/div/div["
    # Two layouts are tried in order for the board-name link of each pin.
    xpath_suffixes = (
        "]/div/div/div[2]/div[2]/h4/a[1]",
        "]/div/div/div[2]/div/h4/a[1]",
    )
    # XPath positions are 1-based, hence range(1, n + 1).
    for i in range(1, len(total_pins) + 1):
        for suffix in xpath_suffixes:
            try:
                temp = driver.find_element_by_xpath(xpath_prefix + str(i) + suffix)
            except NoSuchElementException:
                continue  # this layout is absent; try the next one
            pin_Board_Names.append(temp.text)
            # print("Board_No: " + str(i) + " > " + temp.text)
            break
        else:
            # Neither layout matched: report the pin instead of crashing.
            print("Board_No: " + str(i) + " not found!")
finally:
    # quit driver, even if scraping failed, so the browser process is not leaked
    driver.quit()
# Start of the scraper script: open the page in headless PhantomJS, scroll,
# and collect the pins and their "See more" buttons.
# NOTE(review): `webdriver` and `url` are not defined in this fragment; they
# are assumed to be set up earlier in the script.
from selenium.common.exceptions import WebDriverException

driver = webdriver.PhantomJS(executable_path='phantomjs.exe')
print("Ghost Headless Driver Invoked")
# NOTE: driver.implicitly_wait(5) can be enabled here so that a missing
# element is waited for (seconds) before the next operation.
driver.get(url)  # grab the url

# Scrolling till the end of page
print("Started Scrolling ... ")
scroll_script = "window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;"
match = True  # change to 'False' for making this work..
lenOfPage = driver.execute_script(scroll_script)
# Keep scrolling until the page height stops growing.
while not match:
    lastCount = lenOfPage
    lenOfPage = driver.execute_script(scroll_script)
    if lastCount == lenOfPage:
        match = True
source_data = driver.page_source  # page source code as html

# Get all pins , number of pins collected
total_pins = []
try:
    total_pins = driver.find_elements_by_class_name("Grid__Item")
except WebDriverException:
    print("Unable to load pins")
print("Total Pins: " + str(len(total_pins)))

# get number of 'see more' buttons collected - for error checking
moreButtons = driver.find_elements_by_xpath('//button[@data-test-id="seemoretoggle"]')
print("Dynamic Elements: " + str(len(moreButtons)))
print("Display: Dynamic Elements ... ")

# clicking all 'See More' buttons
# (index for the click loop; the loop itself is cut off in this excerpt)
i = 0
while i(代码在此处被截断)
你知道在网页上能看到多少个元素吗?
所有元素。我使用的滚动脚本是可以正常工作的。
你是否(以编程方式)等待元素加载完成,并在代码中确认它确实存在?
我认为在代码中执行 .click() 时就会这样做。这是一张包含按钮以及按下后显示的 div 的图片:
当元素引发异常时,可以对该元素使用 WebDriverWait 并让它重试吗?请告诉我进展如何。我认为当元素对 Selenium 不可见时就会发生这种情况。