Python 无头(headless)浏览器滚动
输出:Python 无头卷轴,python,selenium,selenium-webdriver,xvfb,Python,Selenium,Selenium Webdriver,Xvfb,输出: from selenium import webdriver from selenium.webdriver.common.by import By from selenium.common.exceptions import StaleElementReferenceException, TimeoutException from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.sup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import urllib,requests,unidecode,lxml,pdb
from pyvirtualdisplay import Display
from xvfbwrapper import Xvfb
class wait_for_more_than_n_elements_to_be_present(object):
    """Wait condition that holds once more than ``count`` elements match ``locator``.

    Instances are callables taking a driver, which is the shape
    ``WebDriverWait.until`` expects: it calls the condition repeatedly until
    a truthy value is returned or the wait times out.
    """

    def __init__(self, locator, count):
        """Remember the lookup target and the threshold.

        locator -- ``(by, value)`` pair, e.g. ``(By.CSS_SELECTOR, "li")``.
        count   -- number of matching elements that must be exceeded.
        """
        self.locator = locator
        self.count = count

    def __call__(self, driver):
        """Return True if ``driver`` currently finds more than ``count`` matches.

        A ``StaleElementReferenceException`` raised during the lookup is
        treated as "not yet" (False) so the surrounding wait keeps polling.
        """
        try:
            # Use the public lookup API rather than the private
            # ``EC._find_elements`` helper, which is not part of selenium's
            # documented interface.
            elements = driver.find_elements(*self.locator)
            return len(elements) > self.count
        except StaleElementReferenceException:
            return False
def return_html_code(url):
    """Load ``url`` in Firefox under a virtual X display and return its HTML.

    The page is scrolled to the last tweet repeatedly; scrolling stops when
    no additional ``li[data-item-id]`` elements appear within the wait
    timeout. The page source captured at that point is returned.

    url -- address of the page to open.

    Returns the page source as reported by ``driver.page_source``.

    Raises ``TimeoutException`` if no tweet element becomes visible during
    the initial wait. The browser and the virtual display are shut down even
    when an error occurs.
    """
    tweet_locator = (By.CSS_SELECTOR, "li[data-item-id]")

    print(url)  # added in edit 1
    vdisplay = Xvfb()
    vdisplay.start()
    try:
        driver = webdriver.Firefox()
        try:
            driver.maximize_window()
            driver.get(url)

            # initial wait for the tweets to load
            wait = WebDriverWait(driver, 240)
            wait.until(EC.visibility_of_element_located(tweet_locator))

            # scroll down to the last tweet until there is no more tweets loaded
            while True:
                tweets = driver.find_elements(*tweet_locator)
                if not tweets:
                    # Nothing to scroll to; avoid IndexError on tweets[-1].
                    break

                # The original referenced this name without ever defining it.
                number_of_tweets = len(tweets)
                print(number_of_tweets)  # added in edit 1

                driver.execute_script("arguments[0].scrollIntoView(true);", tweets[-1])
                try:
                    wait.until(
                        wait_for_more_than_n_elements_to_be_present(
                            tweet_locator, number_of_tweets
                        )
                    )
                except TimeoutException:
                    # No new tweets arrived within the timeout: assume the end.
                    break

            # Hand the HTML back to the caller instead of discarding it.
            return driver.page_source
        finally:
            # quit() ends the whole browser session, not just the window.
            driver.quit()
    finally:
        vdisplay.stop()
# Run the scraper for `url` and keep whatever it returns. `url` is not defined
# in the visible snippet -- presumably it is set by the surrounding script.
html_full=return_html_code(url)
我用上面的代码在无头(headless)模式下对页面进行无限滚动。但不知为何,它似乎会提前停止滚动。
参考资料-
编辑1:
https://twitter.com/search?q=Error%20Check&src=typd&lang=en
20
39
56
74
运行 @alecxe 的代码时,两次运行显示了不同的输出,而按日期检查表明还有更多的推文:
$ phantomjs --version
2.1.1
编辑2:
运行 @alecxe 代码的更新版本时,在大约 7000 条推文之后,它显示了以下错误:
https://twitter.com/search?q=Error%20Check&src=typd&lang=en
20
40
59
76
95
114
133
152
171
191
211
231
249
267
Date of most old tweet: 12 Jan 2016
https://twitter.com/search?q=Error%20Check&src=typd&lang=en
20
40
59
76
95
114
133
152
171
191
211
231
249
267
287
303
317
337
356
373
388
400
418
437
457
476
492
Date of most old tweet: 8 Jan 2016
Traceback (most recent call last):
File "twitter_script.py", line 82, in <module>
search_twitter('Alcoholics Anonymous')
File "twitter_script.py", line 76, in search_twitter
db_name=write_data_to_db(*get_twitter_data(query))
File "twitter_script.py", line 24, in get_twitter_data
html_full=return_html_code(url)
File "c:\Users\sony\Desktop\social_network_extract_old\social_network_extract\scrollDownHtmlCode.py", line 48, in return_html_code
html_full_source=driver.page_source
File "c:\Anaconda\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 464, in page_source
return self.execute(Command.GET_PAGE_SOURCE)['value']
File "c:\Anaconda\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 199, in execute
response = self.command_executor.execute(driver_command, params)
File "c:\Anaconda\lib\site-packages\selenium\webdriver\remote\remote_connection.py", line 395, in execute
return self._request(command_info[0], url, body=data)
File "c:\Anaconda\lib\site-packages\selenium\webdriver\remote\remote_connection.py", line 463, in _request
resp = opener.open(request, timeout=self._timeout)
File "c:\Anaconda\lib\urllib2.py", line 431, in open
response = self._open(req, data)
File "c:\Anaconda\lib\urllib2.py", line 449, in _open
'_open', req)
File "c:\Anaconda\lib\urllib2.py", line 409, in _call_chain
result = func(*args)
File "c:\Anaconda\lib\urllib2.py", line 1227, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "c:\Anaconda\lib\urllib2.py", line 1200, in do_open
r = h.getresponse(buffering=True)
File "c:\Anaconda\lib\httplib.py", line 1136, in getresponse
response.begin()
File "c:\Anaconda\lib\httplib.py", line 453, in begin
version, status, reason = self._read_status()
File "c:\Anaconda\lib\httplib.py", line 409, in _read_status
line = self.fp.readline(_MAXLINE + 1)
File "c:\Anaconda\lib\socket.py", line 480, in readline
data = self._sock.recv(self._rbufsize)
socket.error: [Errno 10054] An existing connection was forcibly closed by the remote host
编辑3:
正在为不同的url尝试相同的代码
Traceback (most recent call last):
File "twitter_script.py", line 82, in <module>
search_twitter('Alcoholics Anonymous')
File "twitter_script.py", line 76, in search_twitter
db_name=write_data_to_db(*get_twitter_data(query))
File "twitter_script.py", line 24, in get_twitter_data
html_full=return_html_code(url)
File "c:\Users\sony\Desktop\social_network_extract_old\social_network_extract\scrollDownHtmlCode.py", line 48, in return_html_code
html_full_source=driver.page_source
File "c:\Anaconda\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 464, in page_source
return self.execute(Command.GET_PAGE_SOURCE)['value']
File "c:\Anaconda\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 199, in execute
response = self.command_executor.execute(driver_command, params)
File "c:\Anaconda\lib\site-packages\selenium\webdriver\remote\remote_connection.py", line 395, in execute
return self._request(command_info[0], url, body=data)
File "c:\Anaconda\lib\site-packages\selenium\webdriver\remote\remote_connection.py", line 463, in _request
resp = opener.open(request, timeout=self._timeout)
File "c:\Anaconda\lib\urllib2.py", line 431, in open
response = self._open(req, data)
File "c:\Anaconda\lib\urllib2.py", line 449, in _open
'_open', req)
File "c:\Anaconda\lib\urllib2.py", line 409, in _call_chain
result = func(*args)
File "c:\Anaconda\lib\urllib2.py", line 1227, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "c:\Anaconda\lib\urllib2.py", line 1200, in do_open
r = h.getresponse(buffering=True)
File "c:\Anaconda\lib\httplib.py", line 1136, in getresponse
response.begin()
File "c:\Anaconda\lib\httplib.py", line 453, in begin
version, status, reason = self._read_status()
File "c:\Anaconda\lib\httplib.py", line 409, in _read_status
line = self.fp.readline(_MAXLINE + 1)
File "c:\Anaconda\lib\socket.py", line 480, in readline
data = self._sock.recv(self._rbufsize)
socket.error: [Errno 10054] An existing connection was forcibly closed by the remote host
https://twitter.com/search?q=Alcoholics%20Anonymous%20Drunk%20since%3A2006-03-24%20until%3A2006-04-23&src=typd&lang=en
Traceback (most recent call last):
File "twitter_script.py", line 64, in <module>
search_twitter('Alcoholics Anonymous Drunk')
File "twitter_script.py", line 58, in search_twitter
db_name=write_data_to_db(*get_twitter_data(query))
File "twitter_script.py", line 31, in get_twitter_data
html_full=return_html_code(url)
File "c:\Users\sony\Desktop\social_network_extract_old\social_network_extract\scrollDownHtmlCode.py", line 30, in return_html_code
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "li[data-item-id]")))
File "c:\Anaconda\lib\site-packages\selenium\webdriver\support\wait.py", line 80, in until
raise TimeoutException(message, screen, stacktrace)
selenium.common.exceptions.TimeoutException: Message:
Screenshot: available via screen
编辑4:
https://twitter.com/search?q=Alcoholics%20Anonymous%20Drunk%20since%3A2006-03-24%20until%3A2006-04-23&src=typd&lang=en
Traceback (most recent call last):
File "twitter_script.py", line 64, in <module>
search_twitter('Alcoholics Anonymous Drunk')
File "twitter_script.py", line 58, in search_twitter
db_name=write_data_to_db(*get_twitter_data(query))
File "twitter_script.py", line 31, in get_twitter_data
html_full=return_html_code(url)
File "c:\Users\sony\Desktop\social_network_extract_old\social_network_extract\scrollDownHtmlCode.py", line 30, in return_html_code
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "li[data-item-id]")))
File "c:\Anaconda\lib\site-packages\selenium\webdriver\support\wait.py", line 80, in until
raise TimeoutException(message, screen, stacktrace)
selenium.common.exceptions.TimeoutException: Message:
Screenshot: available via screen
ubuntu@ip-172-31-38-123:~/social_network_extract_proxy$ cat error.txt
Traceback (most recent call last):
File "twitter_script.py", line 70, in <module>
search_twitter('alcoholics anonymous')
File "twitter_script.py", line 64, in search_twitter
db_name=write_data_to_db(*get_twitter_data(query))
File "twitter_script.py", line 37, in get_twitter_data
html_full=return_html_code(url)
File "/home/ubuntu/social_network_extract_proxy/firefox_driver_code.py", line 35, in return_html_code
driver=webdriver.Firefox(firefox_profile=profile)
File "/home/ubuntu/anaconda2/lib/python2.7/site-packages/selenium/webdriver/firefox/webdriver.py", line 79, in __init__
self.binary, timeout),
File "/home/ubuntu/anaconda2/lib/python2.7/site-packages/selenium/webdriver/firefox/extension_connection.py", line 49, in __init__
self.binary.launch_browser(self.profile)
File "/home/ubuntu/anaconda2/lib/python2.7/site-packages/selenium/webdriver/firefox/firefox_binary.py", line 68, in launch_browser
self._wait_until_connectable()
File "/home/ubuntu/anaconda2/lib/python2.7/site-packages/selenium/webdriver/firefox/firefox_binary.py", line 106, in _wait_until_connectable
% (self.profile.path))
selenium.common.exceptions.WebDriverException: Message: Can't load the profile. Profile Dir: /tmp/tmpvFoPrE If you specified a log_file in the FirefoxBinary constructor, check it for details.
上述错误是在运行一段时间之后出现的。以下是让它在无头(headless)模式下对我起作用的几项改动:
- 切换到 PhantomJS
- 在把最后一条推文滚动到视图中之前,先滚动到页面顶部(重复多次以提高可靠性)
ubuntu@ip-172-31-38-123:~/social_network_extract_proxy$ cat error.txt
Traceback (most recent call last):
File "twitter_script.py", line 70, in <module>
search_twitter('alcoholics anonymous')
File "twitter_script.py", line 64, in search_twitter
db_name=write_data_to_db(*get_twitter_data(query))
File "twitter_script.py", line 37, in get_twitter_data
html_full=return_html_code(url)
File "/home/ubuntu/social_network_extract_proxy/firefox_driver_code.py", line 35, in return_html_code
driver=webdriver.Firefox(firefox_profile=profile)
File "/home/ubuntu/anaconda2/lib/python2.7/site-packages/selenium/webdriver/firefox/webdriver.py", line 79, in __init__
self.binary, timeout),
File "/home/ubuntu/anaconda2/lib/python2.7/site-packages/selenium/webdriver/firefox/extension_connection.py", line 49, in __init__
self.binary.launch_browser(self.profile)
File "/home/ubuntu/anaconda2/lib/python2.7/site-packages/selenium/webdriver/firefox/firefox_binary.py", line 68, in launch_browser
self._wait_until_connectable()
File "/home/ubuntu/anaconda2/lib/python2.7/site-packages/selenium/webdriver/firefox/firefox_binary.py", line 106, in _wait_until_connectable
% (self.profile.path))
selenium.common.exceptions.WebDriverException: Message: Can't load the profile. Profile Dir: /tmp/tmpvFoPrE If you specified a log_file in the FirefoxBinary constructor, check it for details.
您是否可以提供您正在使用的twitter页面,并在每次迭代中打印出
推文数量
——在停止之前您看到加载了多少?谢谢。@alecxe我现在添加了这些问题,请现在检查。谢谢您的建议。@alecxe url:https://twitter.com/search?q=Error%20Check&src=typd&lang=en
和tweets检索:74
@alecxe @Abhishek - 使用无限滚动,这最终成功了吗?您是否能够使用 driver.page_source
复制/保存“无休止”长页面的HTML代码?我吃了一些
import time
def return_html_code(url):
dcap = dict(webdriver.DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36"
driver = webdriver.PhantomJS(desired_capabilities=dcap)
driver.maximize_window()
driver.get(url)
# initial wait for the tweets to load
wait = WebDriverWait(driver, 30)
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "li[data-item-id]")))
# scroll down to the last tweet until there is no more tweets loaded
while True:
tweets = driver.find_elements_by_css_selector("li[data-item-id]")
number_of_tweets = len(tweets)
print(number_of_tweets)
# move to the top and then to the bottom 5 times in a row
for _ in range(5):
driver.execute_script("window.scrollTo(0, 0)")
driver.execute_script("arguments[0].scrollIntoView(true);", tweets[-1])
time.sleep(0.5)
try:
wait.until(wait_for_more_than_n_elements_to_be_present((By.CSS_SELECTOR, "li[data-item-id]"), number_of_tweets))
except TimeoutException:
break