Python 3.x: is there a way to make Selenium work asynchronously?
My goal is to collect as many profile links as possible on Khan Academy, then scrape some specific data from each profile and write it to a CSV file. My problem is simple: the script is far too slow. Here is the script:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException,StaleElementReferenceException,NoSuchElementException
from bs4 import BeautifulSoup
import re
from requests_html import HTMLSession
session = HTMLSession()
r = session.get('https://www.khanacademy.org/computing/computer-programming/programming#intro-to-programming')
r.html.render(sleep=5)
soup=BeautifulSoup(r.html.html,'html.parser')
# first step: find all courses links and put them in a list
courses_links = soup.find_all(class_='link_1uvuyao-o_O-nodeStyle_cu2reh-o_O-nodeStyleIcon_4udnki')
list_courses = {}
for links in courses_links:
    courses = links.extract()
    link_course = courses['href']
    title_course = links.find(class_='nodeTitle_145jbuf')
    span_title_course = title_course.span
    text_span = span_title_course.text.strip()
    final_link_course = 'https://www.khanacademy.org' + link_course
    list_courses[text_span] = final_link_course
# second step: run the script below for each course link in our list
for courses_step in list_courses.values():
    # part 1: make selenium click the "show more" button over and over so we can scrape as many profile links as possible
    driver = webdriver.Chrome()
    driver.get(courses_step)
    while True:  # might want to change that to do some testing
        try:
            showmore = WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.CLASS_NAME, 'button_1eqj1ga-o_O-shared_1t8r4tr-o_O-default_9fm203')))
            showmore.click()
        except TimeoutException:
            break
        except StaleElementReferenceException:
            break
    # part 2: once the page is fully loaded, scrape all profile links and put them in a list
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # find the profile links
    driver.quit()
    profiles = soup.find_all(href=re.compile("/profile/kaid"))
    profile_list = []
    for links in profiles:
        links_no_list = links.extract()
        text_link = links_no_list['href']
        text_link_nodiscussion = text_link[:-10]
        final_profile_link = 'https://www.khanacademy.org' + text_link_nodiscussion
        profile_list.append(final_profile_link)
    # remove duplicate profile links
    profile_list = list(set(profile_list))
    # print the number of profiles we got from this course link
    print('in this link:')
    print(courses_step)
    print('we have this number of profiles:')
    print(len(profile_list))
    # create the csv file (note: opened with "w" inside the course loop, so each course overwrites the previous file)
    filename = "khan_withprojectandvotes.csv"
    f = open(filename, "w")
    headers = "link, date_joined, points, videos, questions, votes, answers, flags, project_request, project_replies, comments, tips_thx, last_date, number_project, projet_votes, projets_spins, topq_votes, topa_votes, sum_badges, badge_lvl1, badge_lvl2, badge_lvl3, badge_lvl4, badge_lvl5, badge_challenge\n"
    f.write(headers)
    # part 3: for each profile link, scrape the specific data and store it in the csv
    for link in profile_list:
        # print each profile link we are about to scrape
        print("Scraping ", link)
        session = HTMLSession()
        r = session.get(link)
        r.html.render(sleep=5)
        soup = BeautifulSoup(r.html.html, 'html.parser')
        badge_list = soup.find_all(class_='badge-category-count')
        badgelist = []
        if len(badge_list) != 0:
            for number in badge_list:
                text_num = number.text.strip()
                badgelist.append(text_num)
            number_badges = sum(list(map(int, badgelist)))
            number_badges = str(number_badges)
            badge_challenge = str(badgelist[0])
            badge_lvl5 = str(badgelist[1])
            badge_lvl4 = str(badgelist[2])
            badge_lvl3 = str(badgelist[3])
            badge_lvl2 = str(badgelist[4])
            badge_lvl1 = str(badgelist[5])
        else:
            number_badges = 'NA'
            badge_challenge = 'NA'
            badge_lvl5 = 'NA'
            badge_lvl4 = 'NA'
            badge_lvl3 = 'NA'
            badge_lvl2 = 'NA'
            badge_lvl1 = 'NA'
        user_info_table = soup.find('table', class_='user-statistics-table')
        if user_info_table is not None:
            dates, points, videos = [tr.find_all('td')[1].text for tr in user_info_table.find_all('tr')]
        else:
            dates = points = videos = 'NA'
        user_socio_table = soup.find_all('div', class_='discussion-stat')
        data = {}
        for gettext in user_socio_table:
            category = gettext.find('span')
            category_text = category.text.strip()
            number = category.previousSibling.strip()
            data[category_text] = number
        full_data_keys = ['questions', 'votes', 'answers', 'flags raised', 'project help requests', 'project help replies', 'comments', 'tips and thanks']  # might change answers to answer because when it's 1 it's putting NA instead
        for header_value in full_data_keys:
            if header_value not in data.keys():
                data[header_value] = 'NA'
        user_calendar = soup.find('div', class_='streak-calendar-scroll-container')
        if user_calendar is not None:
            last_activity = user_calendar.find('span', class_='streak-cell filled')
            try:
                last_activity_date = last_activity['title']
            except TypeError:
                last_activity_date = 'NA'
        else:
            last_activity_date = 'NA'
        session = HTMLSession()
        linkq = link + 'discussion/questions'
        r = session.get(linkq)
        r.html.render(sleep=5)
        soup = BeautifulSoup(r.html.html, 'html.parser')
        topq_votes = soup.find(class_='text_12zg6rl-o_O-LabelXSmall_mbug0d-o_O-votesSum_19las6u')
        if topq_votes is not None:
            topq_votes = topq_votes.text.strip()
            topq_votes = re.findall(r'\d+', topq_votes)
            topq_votes = topq_votes[0]
            # print(topq_votes)
        else:
            topq_votes = '0'
        session = HTMLSession()
        linka = link + 'discussion/answers'
        r = session.get(linka)
        r.html.render(sleep=5)
        soup = BeautifulSoup(r.html.html, 'html.parser')
        topa_votes = soup.find(class_='text_12zg6rl-o_O-LabelXSmall_mbug0d-o_O-votesSum_19las6u')
        if topa_votes is not None:
            topa_votes = topa_votes.text.strip()
            topa_votes = re.findall(r'\d+', topa_votes)
            topa_votes = topa_votes[0]
        else:
            topa_votes = '0'
        # click the "show more" button on each profile's projects section until it disappears, then scrape the data
        with webdriver.Chrome() as driver:
            wait = WebDriverWait(driver, 10)
            driver.get(link + 'projects')
            while True:
                try:
                    showmore = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '[class^="showMore"] > a')))
                    driver.execute_script("arguments[0].click();", showmore)
                except Exception:
                    break
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            driver.quit()
        project = soup.find_all(class_='title_1usue9n')
        prjct_number = str(len(project))
        votes_spins = soup.find_all(class_='stats_35behe')
        list_votes = []
        for votes in votes_spins:
            numbvotes = votes.text.strip()
            numbvotes = re.split(r'\s', numbvotes)
            list_votes.append(numbvotes[0])
        prjct_votes = str(sum(list(map(int, list_votes))))
        list_spins = []
        for spins in votes_spins:
            numspins = spins.text.strip()
            numspins = re.split(r'\s', numspins)
            list_spins.append(numspins[3])
        number_spins = list(map(int, list_spins))
        number_spins = [0 if i < 0 else i for i in number_spins]
        prjct_spins = str(sum(number_spins))
        f.write(link + "," + dates + "," + points.replace(",", "") + "," + videos + "," + data['questions'] + "," + data['votes'] + "," + data['answers'] + "," + data['flags raised'] + "," + data['project help requests'] + "," + data['project help replies'] + "," + data['comments'] + "," + data['tips and thanks'] + "," + last_activity_date + "," + prjct_number + "," + prjct_votes + "," + prjct_spins + "," + topq_votes + "," + topa_votes + "," + number_badges + "," + badge_lvl1 + ',' + badge_lvl2 + ',' + badge_lvl3 + ',' + badge_lvl4 + ',' + badge_lvl5 + ',' + badge_challenge + ',' + "\n")
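As for the title question: Selenium's API is blocking, so it cannot simply be awaited, but the per-profile work can run in parallel threads, each with its own WebDriver instance (a single driver is not thread-safe). Here is a minimal sketch of that approach, assuming the per-profile scraping above is factored out into a function; scrape_profile and its body are placeholders, not part of the original script:

from concurrent.futures import ThreadPoolExecutor, as_completed
from selenium import webdriver

def scrape_profile(link):
    # hypothetical wrapper: move the badge/stats/project scraping from the
    # loop above into here; each call opens and closes its own driver
    driver = webdriver.Chrome()
    try:
        driver.get(link + 'projects')
        return link, len(driver.page_source)  # placeholder result
    finally:
        driver.quit()

# run a handful of browsers in parallel; more workers means more RAM and CPU
with ThreadPoolExecutor(max_workers=4) as executor:
    futures = {executor.submit(scrape_profile, link): link for link in profile_list}
    for future in as_completed(futures):
        try:
            print(future.result())
        except Exception as exc:
            print(futures[future], 'failed:', exc)

Independently of any parallelism, the three render(sleep=5) calls alone add roughly 15 seconds per profile, so trimming those fixed waits is likely to help at least as much.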
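A faster alternative is to skip browser rendering entirely: the course page embeds its data as JSON inside one of its script tags, so plain requests plus json can read it without Selenium or requests_html: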
import requests
import bs4
import json

URL = "https://www.khanacademy.org/computing/computer-programming/programming#intro-to-programming"
BASE_URL = "https://www.khanacademy.org"

response = requests.get(URL)
soup = bs4.BeautifulSoup(response.content, 'lxml')
script = soup.find_all('script')[18]  # index of the script tag that holds the page data (brittle, may change)
script = script.text.strip()
# strip the JavaScript assignment wrapper so only the JSON object remains
script = script.split('{window["./javascript/app-shell-package/app-entry.js"] = ')[1]
script = script[:-2]
json_content = json.loads(script)
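From there it is a matter of walking the parsed structure. A quick way to explore it (a hypothetical snippet, not part of the original code) is to pretty-print the top level:

# inspect the top level of the parsed data to find where the course info lives
print(list(json_content.keys()))
print(json.dumps(json_content, indent=2)[:1000])  # first 1000 characters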