Is there a way to make Selenium work asynchronously in Python 3.x?

Tags: python-3.x, selenium, asynchronous, web-scraping, thread-safety

My goal is to collect as many profile links as possible from Khan Academy, then scrape some specific data from each profile and write it all to a CSV file.

My problem is simple: the script is far too slow.

Here is the script:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException,StaleElementReferenceException,NoSuchElementException
from bs4 import BeautifulSoup
import re
from requests_html import HTMLSession

session = HTMLSession()
r = session.get('https://www.khanacademy.org/computing/computer-programming/programming#intro-to-programming')
r.html.render(sleep=5)
soup=BeautifulSoup(r.html.html,'html.parser')

# first step: find all courses links and put them in a list
courses_links = soup.find_all(class_='link_1uvuyao-o_O-nodeStyle_cu2reh-o_O-nodeStyleIcon_4udnki')
list_courses={}

for links in courses_links:
    courses = links.extract()
    link_course = courses['href']
    title_course= links.find(class_='nodeTitle_145jbuf')
    span_title_course=title_course.span
    text_span=span_title_course.text.strip()
    final_link_course ='https://www.khanacademy.org'+link_course
    list_courses[text_span]=final_link_course


# second step: loop the script down below with each course link in our list

for courses_step in list_courses.values():
    # part 1: have selenium click the "show more" button repeatedly so we can then scrape as many profile links as possible
    driver = webdriver.Chrome()
    driver.get(courses_step)
    while True: # might want to change that to do some testing
        try:
            showmore=WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.CLASS_NAME,'button_1eqj1ga-o_O-shared_1t8r4tr-o_O-default_9fm203')))
            showmore.click()
        except TimeoutException:
            break
        except StaleElementReferenceException:
            break

    # part 2: once the page is fully loaded, scrape all profile links and put them in a list

    soup=BeautifulSoup(driver.page_source,'html.parser')
    #find the profile links
    driver.quit()
    profiles = soup.find_all(href=re.compile("/profile/kaid"))
    profile_list=[]
    for links in profiles:
        links_no_list = links.extract()
        text_link = links_no_list['href']
        text_link_nodiscussion = text_link[:-10]
        final_profile_link ='https://www.khanacademy.org'+text_link_nodiscussion
        profile_list.append(final_profile_link)

    #remove profile link duplicates
    profile_list=list(set(profile_list))

    #print number of profiles we got in the course link
    print('in this link:')
    print(courses_step)
    print('we have this number of profiles:')
    print(len(profile_list))

    #create the csv file
    filename = "khan_withprojectandvotes.csv"
    f = open(filename, "w")
    headers = "link, date_joined, points, videos, questions, votes, answers, flags, project_request, project_replies, comments, tips_thx, last_date, number_project, projet_votes, projets_spins, topq_votes, topa_votes, sum_badges, badge_lvl1, badge_lvl2, badge_lvl3, badge_lvl4, badge_lvl5, badge_challenge\n"
    f.write(headers)

    #part 3: for each profile link, scrape the specific data and store them into the csv
    for link in profile_list:
        #print each profile link we are about to scrape
        print("Scraping ",link)

        session = HTMLSession()
        r = session.get(link)
        r.html.render(sleep=5)

        soup=BeautifulSoup(r.html.html,'html.parser')

        badge_list=soup.find_all(class_='badge-category-count')
        badgelist=[]
        if len(badge_list) != 0:
            for number in badge_list:
                text_num=number.text.strip()
                badgelist.append(text_num)
            number_badges=sum(list(map(int, badgelist)))
            number_badges=str(number_badges)
            badge_challenge=str(badgelist[0])
            badge_lvl5=str(badgelist[1])
            badge_lvl4=str(badgelist[2])
            badge_lvl3=str(badgelist[3])
            badge_lvl2=str(badgelist[4])
            badge_lvl1=str(badgelist[5])

        else:
            number_badges='NA'
            badge_challenge='NA'
            badge_lvl5='NA'
            badge_lvl4='NA'
            badge_lvl3='NA'
            badge_lvl2='NA'
            badge_lvl1='NA'


        user_info_table=soup.find('table', class_='user-statistics-table')
        if user_info_table is not None:
            dates,points,videos=[tr.find_all('td')[1].text for tr in user_info_table.find_all('tr')]
        else:
            dates=points=videos='NA'

        user_socio_table=soup.find_all('div', class_='discussion-stat')
        data = {}
        for gettext in user_socio_table:
            category = gettext.find('span')
            category_text = category.text.strip()
            number = category.previousSibling.strip()
            data[category_text] = number

        full_data_keys=['questions','votes','answers','flags raised','project help requests','project help replies','comments','tips and thanks'] #might change answers to answer because when it's 1 it's putting NA instead
        for header_value in full_data_keys:
            if header_value not in data.keys():
                data[header_value]='NA'

        user_calendar = soup.find('div',class_='streak-calendar-scroll-container')
        if user_calendar is not None:
            last_activity = user_calendar.find('span',class_='streak-cell filled')
            try:
                last_activity_date = last_activity['title']
            except TypeError:
                last_activity_date='NA'
        else:
            last_activity_date='NA'

        session = HTMLSession()
        linkq=link+'discussion/questions'
        r = session.get(linkq)
        r.html.render(sleep=5)
        soup=BeautifulSoup(r.html.html,'html.parser')

        topq_votes=soup.find(class_='text_12zg6rl-o_O-LabelXSmall_mbug0d-o_O-votesSum_19las6u')
        if topq_votes is not None:
            topq_votes=topq_votes.text.strip()
            topq_votes=re.findall(r'\d+', topq_votes)
            topq_votes=topq_votes[0]
            #print(topq_votes)
        else:
            topq_votes='0'


        session = HTMLSession()
        linka=link+'discussion/answers'
        r = session.get(linka)
        r.html.render(sleep=5)
        soup=BeautifulSoup(r.html.html,'html.parser')

        topa_votes=soup.find(class_='text_12zg6rl-o_O-LabelXSmall_mbug0d-o_O-votesSum_19las6u')
        if topa_votes is not None:
            topa_votes=topa_votes.text.strip()
            topa_votes=re.findall(r'\d+', topa_votes)
            topa_votes=topa_votes[0]
        else:
            topa_votes='0'

# keep clicking the "show more" button on each profile's projects page, then scrape the data
        with webdriver.Chrome() as driver:
            wait = WebDriverWait(driver,10)
            driver.get(link+'projects')

            while True:
                try:
                    showmore = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR,'[class^="showMore"] > a')))
                    driver.execute_script("arguments[0].click();",showmore)
                except Exception:
                    break

            soup = BeautifulSoup(driver.page_source,'html.parser')
            driver.quit()
            project = soup.find_all(class_='title_1usue9n')
            prjct_number = str(len(project))
            votes_spins=soup.find_all(class_='stats_35behe')
            list_votes=[]
            for votes in votes_spins:
                numbvotes=votes.text.strip()
                numbvotes=re.split(r'\s',numbvotes)
                list_votes.append(numbvotes[0])
                prjct_votes=str(sum(list(map(int, list_votes))))
            list_spins=[]
            for spins in votes_spins:
                numspins=spins.text.strip()
                numspins=re.split(r'\s',numspins)
                list_spins.append(numspins[3])
                number_spins=list(map(int, list_spins))
            number_spins = [0 if i < 0 else i for i in number_spins]
            prjct_spins=str(sum(number_spins))

        f.write(link + "," + dates + "," + points.replace("," , "") + "," + videos + "," + data['questions'] + "," + data['votes'] + "," + data['answers'] + "," + data['flags raised'] + "," + data['project help requests'] + "," + data['project help replies'] + "," + data['comments'] + "," + data['tips and thanks'] + "," + last_activity_date + "," + prjct_number + "," + prjct_votes + "," + prjct_spins + "," + topq_votes + "," + topa_votes + "," + number_badges + "," + badge_lvl1 + ',' + badge_lvl2 + ',' + badge_lvl3 + ',' + badge_lvl4 + ',' + badge_lvl5 + ',' + badge_challenge + ',' + "\n")
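The snippet below avoids Selenium and the slow render() step entirely: the data Khan Academy renders client-side is already embedded as a JSON blob inside one of the page's <script> tags, so the page can be fetched with plain requests and that blob parsed with json.
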
import requests
import bs4
import json

URL = "https://www.khanacademy.org/computing/computer-programming/programming#intro-to-programming"
BASE_URL = "https://www.khanacademy.org"

response = requests.get(URL)
soup = bs4.BeautifulSoup(response.content, 'lxml')

# the page data is embedded as JSON in one of the <script> tags
script = soup.find_all('script')[18]
# unicode() does not exist in Python 3; decode the text instead
script = script.text.encode('utf-8').decode('utf-8', errors='ignore').strip()
# keep only the JSON object assigned to the app-entry key, dropping the trailing characters
script = script.split('{window["./javascript/app-shell-package/app-entry.js"] = ')[1]
script = script[:-2]
json_content = json.loads(script)
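
Once json_content is parsed, the course and profile links can be read straight out of the data structure instead of rendering every page in a browser. The exact key layout of that blob isn't shown here, so a generic recursive search for 'href' values is a safe way to pull links out of it; a small sketch that assumes nothing about the JSON's shape:

def find_hrefs(node, found=None):
    # recursively collect every string value stored under an 'href' key
    if found is None:
        found = []
    if isinstance(node, dict):
        for key, value in node.items():
            if key == 'href' and isinstance(value, str):
                found.append(value)
            else:
                find_hrefs(value, found)
    elif isinstance(node, list):
        for item in node:
            find_hrefs(item, found)
    return found

course_links = [BASE_URL + href for href in find_hrefs(json_content) if href.startswith('/')]
print(len(course_links), 'links found in the embedded JSON')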