Python 解析向下滚动的整个网页的 HTML 代码(python / selenium / web-scraping / beautifulsoup / urllib)
问题:普通的 requests 抓取不会给我向下滚动到底之后的整个网页,而只有它最初加载的一部分。编辑:预期输出如下。
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import urllib,sys,requests
# Python 2-only hack: re-expose sys.setdefaultencoding (removed from the
# visible namespace by site.py) and force UTF-8 as the implicit str/unicode
# conversion encoding so that joining tweet text does not raise
# UnicodeDecodeError.
# NOTE(review): reload() and sys.setdefaultencoding do not exist in
# Python 3, and this technique is discouraged even on Python 2 — prefer
# explicit .encode()/.decode() at the I/O boundary.
reload(sys)
sys.setdefaultencoding("utf-8")
class wait_for_more_than_n_elements_to_be_present(object):
    """Custom Selenium wait condition.

    Truthy once the number of elements matched by ``locator`` strictly
    exceeds ``count`` — used below to detect that scrolling caused more
    tweets to be lazy-loaded.
    """

    def __init__(self, locator, count):
        # locator: (strategy, selector) tuple, e.g. (By.CSS_SELECTOR, "li")
        # count: element count that must be exceeded for the wait to end
        self.locator = locator
        self.count = count

    def __call__(self, driver):
        try:
            # Use the public find_elements API rather than the private
            # EC._find_elements helper, which is not a stable interface.
            elements = driver.find_elements(*self.locator)
            return len(elements) > self.count
        except StaleElementReferenceException:
            # The DOM is mid-re-render; report "not yet satisfied" so
            # WebDriverWait polls again instead of aborting.
            return False
def return_html_code(url):
    """Open *url* in Firefox, scroll until no more tweets load, and
    return the fully rendered page source.

    The browser is always closed, even if the initial wait or the
    scroll loop raises (e.g. TimeoutException before any tweet loads) —
    the original version leaked the window on any exception.
    """
    driver = webdriver.Firefox()
    try:
        driver.maximize_window()
        driver.get(url)
        # initial wait for the tweets to load
        wait = WebDriverWait(driver, 10)
        wait.until(EC.visibility_of_element_located(
            (By.CSS_SELECTOR, "li[data-item-id]")))
        # scroll down to the last tweet until there is no more tweets loaded
        while True:
            tweets = driver.find_elements_by_css_selector("li[data-item-id]")
            number_of_tweets = len(tweets)
            print(number_of_tweets)
            # Bringing the last tweet into view triggers Twitter's
            # infinite-scroll lazy loading.
            driver.execute_script("arguments[0].scrollIntoView();", tweets[-1])
            try:
                # A timeout here means no new tweets appeared: the whole
                # timeline has been loaded.
                wait.until(wait_for_more_than_n_elements_to_be_present(
                    (By.CSS_SELECTOR, "li[data-item-id]"), number_of_tweets))
            except TimeoutException:
                break
        return driver.page_source
    finally:
        driver.close()
url = 'https://twitter.com/thecoolstacks'

# using selenium browser: render the page so lazily-loaded tweets exist
html_source = return_html_code(url)

# Explicit parser: without one, bs4 picks whichever parser is installed
# (non-deterministic across machines) and emits a warning.
soup_selenium = BeautifulSoup(html_source, "html.parser")
print(soup_selenium)

# Collect the plain text of every tweet on the fully-scrolled page.
text_tweet = []
alltweets_selenium = soup_selenium.find_all(attrs={'data-item-type': 'tweet'})
for tweet in alltweets_selenium:
    # Text of tweet
    html_tweet = tweet.find_all(
        "p", class_="TweetTextSize TweetTextSize--16px js-tweet-text tweet-text")
    # Guard against promoted/withheld items that have no text node —
    # the original crashed with IndexError on html_tweet[0] here.
    if html_tweet:
        text_tweet.append(''.join(html_tweet[0].findAll(text=True)))
print(text_tweet)
我仍然坚持使用 或者,下面是如何使用
selenium
解决问题:
- 使用并定义等待tweets加载到滚动上
- 通过
scrollIntoView()
import requests
from bs4 import BeautifulSoup

url = 'https://twitter.com/thecoolstacks'

# Plain HTTP GET returns only the initial HTML — the tweets present
# before any scrolling — which is exactly why this approach was
# insufficient for the question.
req = requests.get(url)
# Explicit parser avoids bs4's "no parser specified" warning.
soup = BeautifulSoup(req.content, "html.parser")
alltweets = soup.find_all(attrs={'data-item-type': 'tweet'})
print(alltweets[0])
这将根据需要向下滚动,以加载该频道中的所有现有tweet
以下是HTML解析片段,提取推文:
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
class wait_for_more_than_n_elements_to_be_present(object):
    """Custom Selenium wait condition.

    Truthy once the number of elements matched by ``locator`` strictly
    exceeds ``count`` — used below to detect that scrolling caused more
    tweets to be lazy-loaded.
    """

    def __init__(self, locator, count):
        # locator: (strategy, selector) tuple, e.g. (By.CSS_SELECTOR, "li")
        # count: element count that must be exceeded for the wait to end
        self.locator = locator
        self.count = count

    def __call__(self, driver):
        try:
            # Use the public find_elements API rather than the private
            # EC._find_elements helper, which is not a stable interface.
            elements = driver.find_elements(*self.locator)
            return len(elements) > self.count
        except StaleElementReferenceException:
            # The DOM is mid-re-render; report "not yet satisfied" so
            # WebDriverWait polls again instead of aborting.
            return False
# Target profile whose timeline should be fully expanded by scrolling.
url = "https://twitter.com/ndtv"
driver = webdriver.Firefox()
driver.maximize_window()
driver.get(url)
# initial wait for the tweets to load
wait = WebDriverWait(driver, 10)
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "li[data-item-id]")))
# scroll down to the last tweet until there is no more tweets loaded
while True:
    tweets = driver.find_elements_by_css_selector("li[data-item-id]")
    number_of_tweets = len(tweets)
    # Bringing the last tweet into view triggers Twitter's infinite-scroll
    # lazy loading of the next batch.
    driver.execute_script("arguments[0].scrollIntoView();", tweets[-1])
    try:
        # Wait until strictly more tweets than before are present; a
        # timeout means nothing new loaded, i.e. the timeline is complete.
        wait.until(wait_for_more_than_n_elements_to_be_present((By.CSS_SELECTOR, "li[data-item-id]"), number_of_tweets))
    except TimeoutException:
        break
它打印:
# Grab the source BEFORE closing the browser, then parse and print the
# text of each tweet on the fully-scrolled page.
page_source = driver.page_source
driver.close()

# Explicit parser: without one, bs4 picks whichever parser is installed
# (non-deterministic across machines) and emits a warning.
soup = BeautifulSoup(page_source, "html.parser")
for tweet in soup.select("div.tweet div.content"):
    print(tweet.p.text)
我仍然坚持使用 或者,下面是如何使用
selenium
解决问题:
- 使用并定义等待tweets加载到滚动上
- 通过
scrollIntoView()
import requests
from bs4 import BeautifulSoup

url = 'https://twitter.com/thecoolstacks'

# Plain HTTP GET returns only the initial HTML — the tweets present
# before any scrolling — which is exactly why this approach was
# insufficient for the question.
req = requests.get(url)
# Explicit parser avoids bs4's "no parser specified" warning.
soup = BeautifulSoup(req.content, "html.parser")
alltweets = soup.find_all(attrs={'data-item-type': 'tweet'})
print(alltweets[0])
这将根据需要向下滚动,以加载该频道中的所有现有tweet
以下是HTML解析片段,提取推文:
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
class wait_for_more_than_n_elements_to_be_present(object):
    """Custom Selenium wait condition.

    Truthy once the number of elements matched by ``locator`` strictly
    exceeds ``count`` — used below to detect that scrolling caused more
    tweets to be lazy-loaded.
    """

    def __init__(self, locator, count):
        # locator: (strategy, selector) tuple, e.g. (By.CSS_SELECTOR, "li")
        # count: element count that must be exceeded for the wait to end
        self.locator = locator
        self.count = count

    def __call__(self, driver):
        try:
            # Use the public find_elements API rather than the private
            # EC._find_elements helper, which is not a stable interface.
            elements = driver.find_elements(*self.locator)
            return len(elements) > self.count
        except StaleElementReferenceException:
            # The DOM is mid-re-render; report "not yet satisfied" so
            # WebDriverWait polls again instead of aborting.
            return False
# Target profile whose timeline should be fully expanded by scrolling.
url = "https://twitter.com/ndtv"
driver = webdriver.Firefox()
driver.maximize_window()
driver.get(url)
# initial wait for the tweets to load
wait = WebDriverWait(driver, 10)
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "li[data-item-id]")))
# scroll down to the last tweet until there is no more tweets loaded
while True:
    tweets = driver.find_elements_by_css_selector("li[data-item-id]")
    number_of_tweets = len(tweets)
    # Bringing the last tweet into view triggers Twitter's infinite-scroll
    # lazy loading of the next batch.
    driver.execute_script("arguments[0].scrollIntoView();", tweets[-1])
    try:
        # Wait until strictly more tweets than before are present; a
        # timeout means nothing new loaded, i.e. the timeline is complete.
        wait.until(wait_for_more_than_n_elements_to_be_present((By.CSS_SELECTOR, "li[data-item-id]"), number_of_tweets))
    except TimeoutException:
        break
它打印:
# Grab the source BEFORE closing the browser, then parse and print the
# text of each tweet on the fully-scrolled page.
page_source = driver.page_source
driver.close()

# Explicit parser: without one, bs4 picks whichever parser is installed
# (non-deterministic across machines) and emits a warning.
soup = BeautifulSoup(page_source, "html.parser")
for tweet in soup.select("div.tweet div.content"):
    print(tweet.p.text)
为什么不使用Twitter API呢?我很确定推特主页在滚动时是动态加载的,所以我认为BS无法做到这一点。使用
chrome
和devtools
,滚动时新的ajax调用处理down@alecxe很抱歉没提那件事。我试过了。不使用的原因:Twitter API不允许访问搜索查询的历史数据,并且对特定用户的推文限制为3200条。@I请不要使用任何其他软件包或您建议的解决方法。为什么不使用Twitter API呢?我很确定推特主页在您滚动时会动态加载,所以我认为BS不会这样做使用chrome
和devtools
,新的 ajax 调用会在滚动时被处理。@alecxe 很抱歉没提那件事。我试过了。不使用的原因:Twitter API 不允许访问搜索查询的历史数据,并且对特定用户的 tweet 限制为 3200 条。@i 请使用您建议的任何其他软件包或解决方法。太好了!非常感谢。我希望使用 driver.page_source
将 html_code
转换为 soup,使用 soup = BeautifulSoup(driver.page_source)
。但是 soup 不包含整页的源代码。你能告诉我哪里出错了吗?请检查上面的疑问。@AbhishekBhatia 当然,你是在 while 循环完成后调用 soup = BeautifulSoup(driver.page_source)
吗?@AbhishekBhatia 是的,在关闭浏览器之前获取页面源代码——即在调用 driver.close() 之前
。再次感谢您的帮助!你的建议奏效了,但在解析 soup 中的 html 代码时,我仍然面临一个奇怪的问题。请检查以上代码。理想情况下,它应该返回我所有的推文,但现在 text_tweet 并不
包含所有的 tweet(我使用的是简单的 web 抓取)。太棒了!非常感谢。我希望使用 driver.page_source
将 html_code
转换为 soup,使用 soup = BeautifulSoup(driver.page_source)
。但是 soup 不包含整页的源代码。你能告诉我哪里出错了吗?请检查上面的疑问。@AbhishekBhatia 当然,你是在 while 循环完成后调用 soup = BeautifulSoup(driver.page_source)
吗?@AbhishekBhatia 是的,在关闭浏览器之前获取页面源代码——即在调用 driver.close() 之前
。再次感谢您的帮助!你的建议奏效了,但在解析 soup 中的 html 代码时,我仍然面临一个奇怪的问题。请检查以上代码。理想情况下,它应该返回我所有的推文,但现在 text_tweet 并不
包含所有的 tweet(我使用的是简单的 web 抓取)。