Javascript Selenium/python:在每次滚动后从动态加载的网页中提取文本
我正在使用Selenium/python自动向下滚动社交媒体网站并抓取帖子。在滚动一定次数(下面的代码)后,我当前在一次“点击”中提取所有文本,但我只想在每次滚动后提取新加载的文本 例如,如果页面最初包含文本“A,B,C”,那么在第一次滚动之后它显示“D,E,F”,我想存储“A,B,C”,然后滚动,然后存储“D,E,F”等等 我想提取的具体项目是帖子的日期和消息文本,这可以分别通过css选择器Javascript Selenium/python:在每次滚动后从动态加载的网页中提取文本,javascript,python,css,selenium,selenium-webdriver,Javascript,Python,Css,Selenium,Selenium Webdriver,我正在使用Selenium/python自动向下滚动社交媒体网站并抓取帖子。在滚动一定次数(下面的代码)后,我当前在一次“点击”中提取所有文本,但我只想在每次滚动后提取新加载的文本 例如,如果页面最初包含文本“A,B,C”,那么在第一次滚动之后它显示“D,E,F”,我想存储“A,B,C”,然后滚动,然后存储“D,E,F”等等 我想提取的具体项目是帖子的日期和消息文本,这可以分别通过css选择器'.message date'和'.message body'获得(例如日期=驱动程序。通过css_选择
'.message date'
和'.message body'
获得(例如日期=驱动程序。通过css_选择器('.message date')
)
有人能建议如何在每次滚动后仅提取新加载的文本吗?
以下是我当前的代码(在我完成滚动后提取所有日期/消息):
只需在滚动后睡觉,确保像使用真正的机器一样使用selenium,这需要等待页面加载新内容。 我建议您使用selenium中的wait函数,或者在代码中简单地添加sleep函数来加载内容
time.sleep(5)
您可以将消息的数量存储在一个变量中,并使用
xpath
和position()
获取新添加的帖子
dates = []
messages = []
num_of_posts = 1
for i in range(1, ScrollNumber):
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(3)
dates.extend(driver.find_elements_by_xpath('(//div[@class="message-date"])[position()>=' + str(num_of_posts) + ']'))
messages.extend(driver.find_elements_by_xpath('(//div[contains(@class, "message-body")])[position()>=' + str(num_of_posts) + ']'))
num_of_posts = len(dates)
这并不是您真正想要的(它不是一个selenium解决方案),它实际上是一个使用页面在后台使用的API的解决方案。在我看来,使用selenium而不是API是一种过度使用 下面是使用API的脚本:
import re
import requests
STREAM_ID_REGEX = re.compile(r"data-stream='symbol-(?P<id>\d+)'")
CSRF_TOKEN_REGEX = re.compile(r'<meta name="csrf-token" content="(?P<csrf>[^"]+)" />')
URL = 'https://stocktwits.com/symbol/USDJPY?q=%24USDjpy'
API_URL = 'https://stocktwits.com/streams/poll?stream=symbol&substream=top&stream_id={stream_id}&item_id={stream_id}'
def get_necessary_info():
res = requests.get(URL)
# Extract stream_id
match = STREAM_ID_REGEX.search(res.text)
stream_id = match.group('id')
# Extract CSRF token
match = CSRF_TOKEN_REGEX.search(res.text)
csrf_token = match.group('csrf')
return stream_id, csrf_token
def get_messages(stream_id, csrf_token, max_messages=100):
base_url = API_URL.format(stream_id=stream_id)
# Required headers
headers = {
'x-csrf-token': csrf_token,
'x-requested-with': 'XMLHttpRequest',
}
messages = []
more = True
max_value = None
while more:
# Pagination
if max_value:
url = '{}&max={}'.format(base_url, max_value)
else:
url = base_url
# Get JSON response
res = requests.get(url, headers=headers)
data = res.json()
# Add returned messages
messages.extend(data['messages'])
# Check if there are more messages
more = data['more']
if more:
max_value = data['max']
# Check if we have enough messages
if len(messages) >= max_messages:
break
return messages
def main():
stream_id, csrf_token = get_necessary_info()
messages = get_messages(stream_id, csrf_token)
for message in messages:
print(message['created_at'], message['body'])
if __name__ == '__main__':
main()
我在facebook上的帖子也有同样的问题。 为此,我在一个列表中保存了post ID(或者任何对post唯一的值,甚至是散列),然后当您再次进行查询时,您需要检查该ID是否在您的列表中
此外,您可以删除已解析的DOM,因此只有新的DOM才会存在。正如其他人所说,如果您可以通过直接点击API来完成所需的操作,这是您的最佳选择。如果您必须使用硒,请参阅下面我的解决方案 为了我的需要,我做了类似于下面的事情
- 我正在利用CSS路径的
特性在加载元素时单独查找元素:nth-child()
- 我还使用selenium的显式等待功能(通过
包,explicit
)高效地等待元素加载pip install explicit
from __future__ import print_function
from itertools import count
import sys
import time
from explicit import waiter, CSS
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait as Wait
# The CSS selectors we will use
POSTS_BASE_CSS = 'ol.stream-list > li' # All li elements
POST_BASE_CSS = POSTS_BASE_CSS + ":nth-child({0})" # li child element at index {0}
POST_DATE_CSS = POST_BASE_CSS + ' div.message-date' # li child element at {0} with div.message-date
POST_BODY_CSS = POST_BASE_CSS + ' div.message-body' # li child element at {0} with div.message-date
class Post(object):
def __init__(self, driver, post_index):
self.driver = driver
self.date_css = POST_DATE_CSS.format(post_index)
self.text_css = POST_BODY_CSS.format(post_index)
@property
def date(self):
return waiter.find_element(self.driver, self.date_css, CSS).text
@property
def text(self):
return waiter.find_element(self.driver, self.text_css, CSS).text
def get_posts(driver, url, max_screen_scrolls):
""" Post object generator """
driver.get(url)
screen_scroll_count = 0
# Wait for the initial posts to load:
waiter.find_elements(driver, POSTS_BASE_CSS, CSS)
for index in count(1):
# Evaluate if we need to scroll the screen, or exit the generator
# If there is no element at this index, it means we need to scroll the screen
if len(driver.find_elements_by_css_selector('ol.stream-list > :nth-child({0})'.format(index))) == 0:
if screen_scroll_count >= max_screen_scrolls:
# Break if we have already done the max scrolls
break
# Get count of total posts on page
post_count = len(waiter.find_elements(driver, POSTS_BASE_CSS, CSS))
# Scroll down
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
screen_scroll_count += 1
def posts_load(driver):
""" Custom explicit wait function; waits for more posts to load in """
return len(waiter.find_elements(driver, POSTS_BASE_CSS, CSS)) > post_count
# Wait until new posts load in
Wait(driver, 20).until(posts_load)
# The list elements have sponsored ads and scripts mixed in with the posts we
# want to scrape. Check if they have a div.message-date element and continue on
# if not
includes_date_css = POST_DATE_CSS.format(index)
if len(driver.find_elements_by_css_selector(includes_date_css)) == 0:
continue
yield Post(driver, index)
def main():
url = "https://stocktwits.com/symbol/USDJPY?q=%24USDjpy"
max_screen_scrolls = 4
driver = webdriver.Chrome()
try:
for post_num, post in enumerate(get_posts(driver, url, max_screen_scrolls), 1):
print("*" * 40)
print("Post #{0}".format(post_num))
print("\nDate: {0}".format(post.date))
print("Text: {0}\n".format(post.text[:34]))
finally:
driver.quit() # Use try/finally to make sure the driver is closed
if __name__ == "__main__":
main()
全面披露:
我是
explicit
包的创建者。您可以直接使用显式等待轻松重写上述内容,但以牺牲可读性为代价。这正是您想要的。但是,我不会这样刮网站…它只会变得越来越慢,运行时间越长。RAM的使用也将失控
import time
from hashlib import md5
import selenium.webdriver.support.ui as ui
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
URL = 'https://stocktwits.com/symbol/USDJPY?q=%24USDjpy'
CSS = By.CSS_SELECTOR
driver.get(URL)
def scrape_for(max_seconds=300):
found = set()
end_at = time.time() + max_seconds
wait = ui.WebDriverWait(driver, 5, 0.5)
while True:
# find elements
elms = driver.find_elements(CSS, 'li.messageli')
for li in elms:
# get the information we need about each post
text = li.find_element(CSS, 'div.message-content')
key = md5(text.text.encode('ascii', 'ignore')).hexdigest()
if key in found:
continue
found.add(key)
try:
date = li.find_element(CSS, 'div.message-date').text
except NoSuchElementException as e:
date = None
yield text.text, date
if time.time() > end_at:
raise StopIteration
driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
wait.until(EC.invisibility_of_element_located(
(CSS, 'div#more-button-loading')))
raise StopIteration
for twit in scrape_for(60):
print(twit)
driver.quit()
为什么在后台使用API时会有这么多麻烦<代码>https://stocktwits.com/streams/poll?stream=symbol&max=92297722&stream_id=674&substream=top&item_id=674谢谢您提供的信息,但您介意详细说明我如何使用API下载数据吗?那个网址对我不起作用。关于为什么我一开始没有使用API,a)我不知道怎么做(我是API新手)和b)我听说API只能用于下载过去30天的数据,而我至少需要一年的帖子。你需要学习如何使用Chrome开发者工具或Firebug来查找网络请求。您可以先检索页面,然后从页面中找出API url。问题是,如果页面滚动添加新数据,它总是通过调用API来添加数据,API将返回JSON或HTML或其他格式的数据。所以,若您可以使用滚动手动获取年份数据,那个么您也可以通过API获取它。这是因为即使是网页也必须获取数据来显示您的Hanks,我将对此进行研究。在我提供的示例代码中,我已经在使用
time.sleep()
,您是否建议我以不同的方式使用此函数来解决我的特定问题(这是根据我原始问题第二段中的示例,仅在每个卷轴后提取新加载的帖子)?如果是这样的话,你能详细说明一下如何实现这一点吗?使用python很重要吗?亲爱的朋友,我理解这个问题,让我用php查看该页面。我会在第二天告诉你,那太好了,谢谢。我只熟悉R和python,但会使用任何可用的语言。你用php作为光盘检查过吗上面提到@kiamoz?@jruf003你能告诉我我的答案有什么问题吗?我想你指的是被否决?大概这是因为你的帖子没有直接回答我的问题,但我不是投票否决你的人。事实上,我投票支持你是因为你读了字里行间的内容,给了我一个很好的解决方案(谢谢)。谢谢,这真的很有帮助。当另一个用户首先回答问题时,我给了他奖金,但是你的代码也很好用。
from __future__ import print_function
from itertools import count
import sys
import time
from explicit import waiter, CSS
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait as Wait
# The CSS selectors we will use
POSTS_BASE_CSS = 'ol.stream-list > li' # All li elements
POST_BASE_CSS = POSTS_BASE_CSS + ":nth-child({0})" # li child element at index {0}
POST_DATE_CSS = POST_BASE_CSS + ' div.message-date' # li child element at {0} with div.message-date
POST_BODY_CSS = POST_BASE_CSS + ' div.message-body' # li child element at {0} with div.message-date
class Post(object):
def __init__(self, driver, post_index):
self.driver = driver
self.date_css = POST_DATE_CSS.format(post_index)
self.text_css = POST_BODY_CSS.format(post_index)
@property
def date(self):
return waiter.find_element(self.driver, self.date_css, CSS).text
@property
def text(self):
return waiter.find_element(self.driver, self.text_css, CSS).text
def get_posts(driver, url, max_screen_scrolls):
""" Post object generator """
driver.get(url)
screen_scroll_count = 0
# Wait for the initial posts to load:
waiter.find_elements(driver, POSTS_BASE_CSS, CSS)
for index in count(1):
# Evaluate if we need to scroll the screen, or exit the generator
# If there is no element at this index, it means we need to scroll the screen
if len(driver.find_elements_by_css_selector('ol.stream-list > :nth-child({0})'.format(index))) == 0:
if screen_scroll_count >= max_screen_scrolls:
# Break if we have already done the max scrolls
break
# Get count of total posts on page
post_count = len(waiter.find_elements(driver, POSTS_BASE_CSS, CSS))
# Scroll down
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
screen_scroll_count += 1
def posts_load(driver):
""" Custom explicit wait function; waits for more posts to load in """
return len(waiter.find_elements(driver, POSTS_BASE_CSS, CSS)) > post_count
# Wait until new posts load in
Wait(driver, 20).until(posts_load)
# The list elements have sponsored ads and scripts mixed in with the posts we
# want to scrape. Check if they have a div.message-date element and continue on
# if not
includes_date_css = POST_DATE_CSS.format(index)
if len(driver.find_elements_by_css_selector(includes_date_css)) == 0:
continue
yield Post(driver, index)
def main():
url = "https://stocktwits.com/symbol/USDJPY?q=%24USDjpy"
max_screen_scrolls = 4
driver = webdriver.Chrome()
try:
for post_num, post in enumerate(get_posts(driver, url, max_screen_scrolls), 1):
print("*" * 40)
print("Post #{0}".format(post_num))
print("\nDate: {0}".format(post.date))
print("Text: {0}\n".format(post.text[:34]))
finally:
driver.quit() # Use try/finally to make sure the driver is closed
if __name__ == "__main__":
main()
import time
from hashlib import md5
import selenium.webdriver.support.ui as ui
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
URL = 'https://stocktwits.com/symbol/USDJPY?q=%24USDjpy'
CSS = By.CSS_SELECTOR
driver.get(URL)
def scrape_for(max_seconds=300):
found = set()
end_at = time.time() + max_seconds
wait = ui.WebDriverWait(driver, 5, 0.5)
while True:
# find elements
elms = driver.find_elements(CSS, 'li.messageli')
for li in elms:
# get the information we need about each post
text = li.find_element(CSS, 'div.message-content')
key = md5(text.text.encode('ascii', 'ignore')).hexdigest()
if key in found:
continue
found.add(key)
try:
date = li.find_element(CSS, 'div.message-date').text
except NoSuchElementException as e:
date = None
yield text.text, date
if time.time() > end_at:
raise StopIteration
driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
wait.until(EC.invisibility_of_element_located(
(CSS, 'div#more-button-loading')))
raise StopIteration
for twit in scrape_for(60):
print(twit)
driver.quit()