Python: parsing the HTML of an entire web page scrolled all the way down

I am scraping tweets from a Twitter page with requests and BeautifulSoup:

import requests
from bs4 import BeautifulSoup

url = 'https://twitter.com/thecoolstacks'
req = requests.get(url)
soup = BeautifulSoup(req.content)
alltweets = soup.find_all(attrs={'data-item-type': 'tweet'})
print alltweets[0]

This does not give me the whole page scrolled down to the end, which is what I want; it only returns the first part of it.
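A quick way to confirm the page is loaded incrementally (a minimal check, not from the original post): count the tweet nodes in the static HTML that requests receives, with no JavaScript executed.

import requests
from bs4 import BeautifulSoup

req = requests.get('https://twitter.com/thecoolstacks')
soup = BeautifulSoup(req.content)
# only the server-rendered tweets are present; everything loaded on scroll is missing
print len(soup.find_all(attrs={'data-item-type': 'tweet'}))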

Edit:

Expected output:

from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import sys
reload(sys)                      # Python 2 only: re-expose sys.setdefaultencoding
sys.setdefaultencoding("utf-8")  # avoid UnicodeEncodeError when printing tweet text

class wait_for_more_than_n_elements_to_be_present(object):
    # custom expected condition: truthy once more than `count` elements match `locator`
    def __init__(self, locator, count):
        self.locator = locator
        self.count = count

    def __call__(self, driver):
        try:
            elements = EC._find_elements(driver, self.locator)
            return len(elements) > self.count
        except StaleElementReferenceException:
            return False

def return_html_code(url):
    driver = webdriver.Firefox()
    driver.maximize_window()
    driver.get(url)
    # initial wait for the tweets to load
    wait = WebDriverWait(driver, 10)
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "li[data-item-id]")))
    # scroll down to the last tweet until there is no more tweets loaded
    while True:
        tweets = driver.find_elements_by_css_selector("li[data-item-id]")
        number_of_tweets = len(tweets)
        print number_of_tweets
        driver.execute_script("arguments[0].scrollIntoView();", tweets[-1])
        try:
            wait.until(wait_for_more_than_n_elements_to_be_present((By.CSS_SELECTOR, "li[data-item-id]"), number_of_tweets))
        except TimeoutException:
            break
    html_full_source = driver.page_source
    driver.close()
    return html_full_source


url='https://twitter.com/thecoolstacks'
#using selenium browser
html_source=return_html_code(url)
soup_selenium = BeautifulSoup(html_source)
print soup_selenium
text_tweet=[]
alltweets_selenium = soup_selenium.find_all(attrs={'data-item-type' : 'tweet'})
for tweet in alltweets_selenium:
    #Text of tweet
    html_tweet= tweet.find_all("p", class_="TweetTextSize TweetTextSize--16px js-tweet-text tweet-text")
    text_tweet.append(''.join(html_tweet[0].findAll(text=True)))    
print text_tweet

I would still insist on using the Twitter API.
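For completeness, a minimal sketch of the API route with tweepy (not part of the original answer; the credential strings are placeholders you would replace with your own keys):

import tweepy

# placeholders - substitute your own credentials
auth = tweepy.OAuthHandler('CONSUMER_KEY', 'CONSUMER_SECRET')
auth.set_access_token('ACCESS_TOKEN', 'ACCESS_TOKEN_SECRET')
api = tweepy.API(auth)

# user_timeline is paginated; Cursor walks the pages for you,
# subject to the ~3200-tweet cap mentioned in the comments below
for status in tweepy.Cursor(api.user_timeline, screen_name='thecoolstacks').items():
    print status.text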

Alternatively, here is how you can approach the problem with selenium:

  • use an explicit wait with a custom expected condition that waits for more tweets to load on scroll
  • scroll into view of the last loaded tweet via scrollIntoView()

Implementation:


from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


class wait_for_more_than_n_elements_to_be_present(object):
    def __init__(self, locator, count):
        self.locator = locator
        self.count = count

    def __call__(self, driver):
        try:
            elements = EC._find_elements(driver, self.locator)
            return len(elements) > self.count
        except StaleElementReferenceException:
            return False


url = "https://twitter.com/ndtv"
driver = webdriver.Firefox()
driver.maximize_window()
driver.get(url)

# initial wait for the tweets to load
wait = WebDriverWait(driver, 10)
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "li[data-item-id]")))

# scroll down to the last tweet until there is no more tweets loaded
while True:
    tweets = driver.find_elements_by_css_selector("li[data-item-id]")
    number_of_tweets = len(tweets)

    driver.execute_script("arguments[0].scrollIntoView();", tweets[-1])

    try:
        wait.until(wait_for_more_than_n_elements_to_be_present((By.CSS_SELECTOR, "li[data-item-id]"), number_of_tweets))
    except TimeoutException:
        break
This scrolls down as much as needed to load all of the existing tweets in the channel.

Here is the HTML-parsing snippet that extracts the tweets:

from bs4 import BeautifulSoup

page_source = driver.page_source
driver.close()

soup = BeautifulSoup(page_source)
for tweet in soup.select("div.tweet div.content"):
    print tweet.p.text

It prints the text of every tweet loaded on the page.
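Worth noting: select("div.tweet div.content") matches on individual class names, so it tolerates extra or reordered classes in the markup, whereas find_all with a multi-class string such as class_="TweetTextSize ... tweet-text" only matches tags whose class attribute is exactly that string.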



Comments:

Why not use the Twitter API? I am pretty sure the Twitter page is loaded dynamically as you scroll, so BeautifulSoup alone cannot get at it; watching Chrome devtools, new AJAX calls are made as you scroll down.

@alecxe Sorry for not mentioning that. I tried it. The reasons for not using it: the Twitter API does not allow access to historical data for search queries, and it caps the tweets retrievable for a particular user at 3200. Please feel free to suggest any other package or workaround.

Great, thanks a lot! I was hoping to convert the html code from driver.page_source to soup using soup = BeautifulSoup(driver.page_source), but the soup does not contain the full page source. Can you tell me where I am going wrong? Please check the doubt above.

@AbhishekBhatia Sure, are you calling soup = BeautifulSoup(driver.page_source) after the while loop has finished?

@AbhishekBhatia Yes, get the page source before closing the browser - before calling driver.close().

Thanks again for the help! Your suggestion worked, but I still face a strange issue when parsing the html code in the soup. Please check the code above: ideally it should return all the tweets, but right now text_tweet does not contain all of the tweets that I get with simple web scraping.
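On that last point, a possible culprit (a sketch, not from the original thread): find_all("p", class_="TweetTextSize TweetTextSize--16px js-tweet-text tweet-text") requires the class attribute to be exactly that string, so tweets whose class list differs even slightly are silently skipped. Matching on the single stable class is more forgiving, assuming tweet text was kept in p.tweet-text elements in the markup of the time:

from bs4 import BeautifulSoup

soup = BeautifulSoup(html_source)  # html_source from return_html_code(url) above
# match any <p> that carries the tweet-text class, whatever else sits next to it
text_tweet = [p.get_text() for p in soup.select('p.tweet-text')]
print len(text_tweet)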