Python: parsing the HTML of an entire web page scrolled all the way down

I am scraping tweets from a Twitter page with requests and BeautifulSoup:

import requests
from bs4 import BeautifulSoup

url = 'https://twitter.com/thecoolstacks'
req = requests.get(url)
soup = BeautifulSoup(req.content)
alltweets = soup.find_all(attrs={'data-item-type': 'tweet'})
print alltweets[0]

This does not give me the whole page scrolled down to the end, which is what I want; it only returns the first part of it.
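A quick way to confirm the page is loaded incrementally (a minimal check, not from the original post): count the tweet nodes in the static HTML that requests receives, with no JavaScript executed.

import requests
from bs4 import BeautifulSoup

req = requests.get('https://twitter.com/thecoolstacks')
soup = BeautifulSoup(req.content)
# only the server-rendered tweets are present; everything loaded on scroll is missing
print len(soup.find_all(attrs={'data-item-type': 'tweet'}))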

Edit:

Expected output:

from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import sys
reload(sys)                      # Python 2 only: re-expose sys.setdefaultencoding
sys.setdefaultencoding("utf-8")  # avoid UnicodeEncodeError when printing tweet text

class wait_for_more_than_n_elements_to_be_present(object):
    # custom expected condition: truthy once more than `count` elements match `locator`
    def __init__(self, locator, count):
        self.locator = locator
        self.count = count

    def __call__(self, driver):
        try:
            elements = EC._find_elements(driver, self.locator)
            return len(elements) > self.count
        except StaleElementReferenceException:
            return False

def return_html_code(url):
    driver = webdriver.Firefox()
    driver.maximize_window()
    driver.get(url)
    # initial wait for the tweets to load
    wait = WebDriverWait(driver, 10)
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "li[data-item-id]")))
    # scroll down to the last tweet until there is no more tweets loaded
    while True:
        tweets = driver.find_elements_by_css_selector("li[data-item-id]")
        number_of_tweets = len(tweets)
        print number_of_tweets
        driver.execute_script("arguments[0].scrollIntoView();", tweets[-1])
        try:
            wait.until(wait_for_more_than_n_elements_to_be_present((By.CSS_SELECTOR, "li[data-item-id]"), number_of_tweets))
        except TimeoutException:
            break
    html_full_source = driver.page_source
    driver.close()
    return html_full_source


url='https://twitter.com/thecoolstacks'
#using selenium browser
html_source=return_html_code(url)
soup_selenium = BeautifulSoup(html_source)
print soup_selenium
text_tweet=[]
alltweets_selenium = soup_selenium.find_all(attrs={'data-item-type' : 'tweet'})
for tweet in alltweets_selenium:
    #Text of tweet
    html_tweet= tweet.find_all("p", class_="TweetTextSize TweetTextSize--16px js-tweet-text tweet-text")
    text_tweet.append(''.join(html_tweet[0].findAll(text=True)))    
print text_tweet

I would still insist on using the Twitter API.
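For completeness, a minimal sketch of the API route with tweepy (not part of the original answer; the credential strings are placeholders you would replace with your own keys):

import tweepy

# placeholders - substitute your own credentials
auth = tweepy.OAuthHandler('CONSUMER_KEY', 'CONSUMER_SECRET')
auth.set_access_token('ACCESS_TOKEN', 'ACCESS_TOKEN_SECRET')
api = tweepy.API(auth)

# user_timeline is paginated; Cursor walks the pages for you,
# subject to the ~3200-tweet cap mentioned in the comments below
for status in tweepy.Cursor(api.user_timeline, screen_name='thecoolstacks').items():
    print status.text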

Alternatively, here is how you can approach the problem with selenium:

  • use an explicit wait with a custom expected condition that waits for more tweets to load on scroll
  • scroll into view of the last loaded tweet via scrollIntoView()

Implementation:


from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


class wait_for_more_than_n_elements_to_be_present(object):
    def __init__(self, locator, count):
        self.locator = locator
        self.count = count

    def __call__(self, driver):
        try:
            elements = EC._find_elements(driver, self.locator)
            return len(elements) > self.count
        except StaleElementReferenceException:
            return False


url = "https://twitter.com/ndtv"
driver = webdriver.Firefox()
driver.maximize_window()
driver.get(url)

# initial wait for the tweets to load
wait = WebDriverWait(driver, 10)
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "li[data-item-id]")))

# scroll down to the last tweet until there is no more tweets loaded
while True:
    tweets = driver.find_elements_by_css_selector("li[data-item-id]")
    number_of_tweets = len(tweets)

    driver.execute_script("arguments[0].scrollIntoView();", tweets[-1])

    try:
        wait.until(wait_for_more_than_n_elements_to_be_present((By.CSS_SELECTOR, "li[data-item-id]"), number_of_tweets))
    except TimeoutException:
        break
This scrolls down as much as needed to load all of the existing tweets in the channel.

Here is the HTML-parsing snippet that extracts the tweets:

from bs4 import BeautifulSoup

page_source = driver.page_source
driver.close()

soup = BeautifulSoup(page_source)
for tweet in soup.select("div.tweet div.content"):
    print tweet.p.text

It prints the text of every tweet loaded on the page.
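Worth noting: select("div.tweet div.content") matches on individual class names, so it tolerates extra or reordered classes in the markup, whereas find_all with a multi-class string such as class_="TweetTextSize ... tweet-text" only matches tags whose class attribute is exactly that string.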



Comments:

Why not use the Twitter API? I am pretty sure the Twitter page is loaded dynamically as you scroll, so BeautifulSoup alone cannot get at it; watching Chrome devtools, new AJAX calls are made as you scroll down.

@alecxe Sorry for not mentioning that. I tried it. The reasons for not using it: the Twitter API does not allow access to historical data for search queries, and it caps the tweets retrievable for a particular user at 3200. Please feel free to suggest any other package or workaround.

Great, thanks a lot! I was hoping to convert the html code from driver.page_source to soup using soup = BeautifulSoup(driver.page_source), but the soup does not contain the full page source. Can you tell me where I am going wrong? Please check the doubt above.

@AbhishekBhatia Sure, are you calling soup = BeautifulSoup(driver.page_source) after the while loop has finished?

@AbhishekBhatia Yes, get the page source before closing the browser - before calling driver.close().

Thanks again for the help! Your suggestion worked, but I still face a strange issue when parsing the html code in the soup. Please check the code above: ideally it should return all the tweets, but right now text_tweet does not contain all of the tweets that I get with simple web scraping.
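On that last point, a possible culprit (a sketch, not from the original thread): find_all("p", class_="TweetTextSize TweetTextSize--16px js-tweet-text tweet-text") requires the class attribute to be exactly that string, so tweets whose class list differs even slightly are silently skipped. Matching on the single stable class is more forgiving, assuming tweet text was kept in p.tweet-text elements in the markup of the time:

from bs4 import BeautifulSoup

soup = BeautifulSoup(html_source)  # html_source from return_html_code(url) above
# match any <p> that carries the tweet-text class, whatever else sits next to it
text_tweet = [p.get_text() for p in soup.select('p.tweet-text')]
print len(text_tweet)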