Python 带无头Web驱动程序的Selenium Web抓取_Python_Selenium

Python 带无头Web驱动程序的Selenium Web抓取

python selenium

Python 带无头Web驱动程序的Selenium Web抓取,python,selenium,Python,Selenium,我需要用硒刮掉一个网站。以下是相同的代码： #!/usr/bin/env python # -*- coding: utf-8 -*- from selenium import webdriver import time import pandas as pd from selenium.common.exceptions import NoSuchElementException from xlrd import open_workbook import logging import si

我需要用 Selenium 抓取一个网站。以下是我的代码：

#!/usr/bin/env python
# -*- coding: utf-8 -*-

from selenium import webdriver
import time 
import pandas as pd
from selenium.common.exceptions import NoSuchElementException
from xlrd import open_workbook
import logging
import signal


# Parallel per-event columns: position i of every list describes the same event.

# Columns filled from the conference listing page.
names, links, pics, descs, types = [], [], [], [], []
views, no_speakers, location, dates = [], [], [], []

# Columns filled from each event's own detail page.
organization, summ, twitter, facebook, contact = [], [], [], [], []
emails, website_link, venue, official_address = [], [], [], []
speakers, fees, at_tr = [], [], []

people = []  # declared here; not referenced by the rest of the visible code

prev_links = []  # event links that were already stored before this run
update = []  # links of events whose stored data changed during this run
index = -1  # module-level default position marker

def _sync_listing_field(target, position, value, url):
    """Store ``value`` at ``target[position]`` if it differs and flag ``url``.

    Nothing happens when the stored entry already equals ``value``; otherwise
    the entry is overwritten and ``url`` is added (once) to the module-level
    ``update`` list.
    """
    if target[position] == value:
        return
    target[position] = value
    if url not in update:
        update.append(url)


def _read_listing_row(row):
    """Scrape one ``sec_conf_main`` row of the listing page.

    Returns ``(title, href, fields)``. ``fields`` is a list of
    ``(target_list, values)`` pairs, where ``target_list`` is the module-level
    column the values belong to and ``values`` holds every value scraped for
    that column from this row.
    """
    conf = row.find_element_by_class_name('conf_summery')
    title_anchor = conf.find_element_by_class_name('c_name').find_element_by_tag_name('a')
    title = title_anchor.get_attribute('title')

    link = row.find_element_by_class_name('conf_logo').find_element_by_tag_name('a')
    href = link.get_attribute('href')
    picture = link.find_element_by_tag_name('img').get_attribute('src')
    description = row.find_element_by_class_name('conf_desc').text

    # The <strong> counters are read as (views, speakers) pairs; an unpaired
    # trailing counter raises IndexError.
    counters = conf.find_elements_by_tag_name('strong')
    view_counts = []
    speaker_counts = []
    for k in range(0, len(counters), 2):
        view_counts.append(counters[k].text)
        speaker_counts.append(counters[k + 1].text)

    specialties = ','.join(
        tag.get_attribute('title')
        for tag in conf.find_elements_by_class_name('spel')
    )

    # A 'c_summery' line containing an <img> is stored as the location; one
    # containing a <span> is stored as the date range.
    places = []
    periods = []
    for line in conf.find_elements_by_class_name('c_summery'):
        try:
            line.find_element_by_tag_name('img')
        except NoSuchElementException:
            pass
        else:
            places.append(line.text)
        try:
            line.find_element_by_tag_name('span')
        except NoSuchElementException:
            pass
        else:
            # Keep the text before the first '|'. partition() returns the
            # whole text when there is no '|', whereas slicing with
            # find()'s -1 would silently drop the last character.
            periods.append(line.text.partition('|')[0])

    fields = [
        (links, [href]),
        (pics, [picture]),
        (descs, [description]),
        (views, view_counts),
        (no_speakers, speaker_counts),
        (types, [specialties]),
        (location, places),
        (dates, periods),
    ]
    return title, href, fields


def main_url(url):
    """Scrape the conference listing at ``url`` into the module-level columns.

    Opens Chrome through the chromedriver at the hard-coded path, loads
    ``url``, clicks the ``view_more`` element until that fails, and then reads
    every ``sec_conf_main`` row. A row whose title is already in ``names`` has
    its stored values compared and refreshed in place (changed events are
    noted in ``update``); any other row is appended as a new event.

    The browser is shut down even when scraping raises.
    """
    driver = webdriver.Chrome('C:/Program Files/chromedriver.exe')
    try:
        driver.get(url)
        time.sleep(5)  # give the DOM time to load
        while True:
            try:
                driver.find_element_by_id('view_more').click()
                time.sleep(3)
            except Exception:
                # Best effort: any failure to find or click the button is
                # treated as "no more events to load".
                break

        for row in driver.find_elements_by_class_name('sec_conf_main'):
            title, href, fields = _read_listing_row(row)
            if title in names:
                position = names.index(title)
                # Compare plain values (the href string, not the WebElement)
                # so that unchanged events are not reported as updated.
                for target, values in fields:
                    for value in values:
                        _sync_listing_field(target, position, value, href)
            else:
                names.append(title)
                for target, values in fields:
                    target.extend(values)
    finally:
        driver.quit()

def _sync_detail_field(target, position, value, url):
    """Store ``value`` at ``target[position]`` if it differs and flag ``url``.

    Nothing happens when the stored entry already equals ``value``; otherwise
    the entry is overwritten and ``url`` is added (once) to the module-level
    ``update`` list.
    """
    if target[position] == value:
        return
    target[position] = value
    if url not in update:
        update.append(url)


def _speaker_names(driver):
    """Return the text of the ``h5/a`` link inside each ``speaker_single_inn`` card.

    The lookup is relative to each card (``.//``). An XPath starting with
    ``//`` is evaluated from the document root even when called on an
    element, so it would yield the same first match for every card.
    Raises NoSuchElementException when a card has no such link.
    """
    return [
        card.find_element_by_xpath('.//h5/a').text
        for card in driver.find_elements_by_class_name('speaker_single_inn')
    ]


def _ticket_texts(container):
    """Return the text of every table row that contains a ``ticketname_inn`` element."""
    texts = []
    for table_row in container.find_elements_by_xpath('//table/tbody/tr'):
        try:
            table_row.find_element_by_class_name('ticketname_inn')
        except NoSuchElementException:
            continue
        texts.append(table_row.text)
    return texts


def _refresh_event_details(driver, url):
    """Compare the loaded event page with the stored row for ``url`` and update it.

    A field whose element is missing is set to its placeholder text without
    being reported in ``update``; a field whose scraped value differs from
    the stored one is overwritten and ``url`` is recorded in ``update``.
    """
    index = links.index(url)

    try:
        org = driver.find_element_by_class_name('speakers')
        _sync_detail_field(organization, index, ' '.join(org.text.split()[3:]), url)
    except NoSuchElementException:
        organization[index] = 'No Organization Given.'

    try:
        summary = driver.find_element_by_class_name('conf_head_summary')
        _sync_detail_field(summ, index, summary.find_element_by_tag_name('p').text, url)
    except NoSuchElementException:
        summ[index] = 'No Conference Summary Given.'

    try:
        tw = driver.find_element_by_class_name('TW')
        _sync_detail_field(twitter, index, tw.get_attribute('title'), url)
    except NoSuchElementException:
        twitter[index] = 'No Twitter Link'

    try:
        fb = driver.find_element_by_class_name('FB')
        _sync_detail_field(facebook, index, fb.get_attribute('title'), url)
    except NoSuchElementException:
        facebook[index] = 'No Facebook Link'

    try:
        # NOTE(review): the '//table/...' XPath is absolute, so it is resolved
        # from the document root rather than inside the 'marB20' element.
        phone = driver.find_element_by_class_name('marB20').find_element_by_xpath('//table/tbody/tr[1]/td[3]').text
        if phone:
            _sync_detail_field(contact, index, phone, url)
        elif contact[index] != phone:
            contact[index] = 'No Contact Number Given.'
    except NoSuchElementException:
        contact[index] = 'No Contact Number Given.'

    try:
        addresses = [el.text for el in driver.find_elements_by_class_name('emailFruser')]
        _sync_detail_field(emails, index, ','.join(addresses), url)
    except NoSuchElementException:
        emails[index] = 'No email.'

    try:
        web = driver.find_element_by_id('cRegistraionpopup5').get_attribute('href')
        _sync_detail_field(website_link, index, web, url)
    except NoSuchElementException:
        website_link[index] = 'No Website Link'

    try:
        _sync_detail_field(venue, index, driver.find_element_by_class_name('conf_venue1').text, url)
    except NoSuchElementException:
        venue[index] = 'No Venue Given.'

    try:
        _sync_detail_field(official_address, index, driver.find_element_by_class_name('hotel-detail').text, url)
    except NoSuchElementException:
        official_address[index] = 'No Official Address Given. '

    try:
        speaker_list = _speaker_names(driver)
        if speaker_list:
            _sync_detail_field(speakers, index, ','.join(speaker_list), url)
        else:
            # Keep the placeholder; do not fall through and overwrite it
            # with an empty string.
            speakers[index] = 'No Speakers'
    except NoSuchElementException:
        speakers[index] = 'No Speakers'

    try:
        ticket_table = driver.find_element_by_class_name('mobScroll')
        # Compare with the same ';' separator that is used for storing.
        _sync_detail_field(fees, index, ';'.join(_ticket_texts(ticket_table)), url)
    except NoSuchElementException:
        fees[index] = 'No Fees Given'

    try:
        attendee_info = [el.text for el in driver.find_elements_by_class_name('r-speaker-info')]
        if attendee_info:
            _sync_detail_field(at_tr, index, ','.join(attendee_info), url)
        else:
            at_tr[index] = 'No Attenders or Trackers Given.'
    except NoSuchElementException:
        at_tr[index] = 'No Attenders or Trackers Given'


def _append_event_details(driver):
    """Append one value per detail column for the event page loaded in ``driver``.

    Every column receives exactly one entry: the scraped value, or the
    column's placeholder text when the element is missing.
    """
    try:
        org = driver.find_element_by_class_name('speakers')
        organization.append(' '.join(org.text.split()[3:]))
    except NoSuchElementException:
        organization.append('No Organization Given.')

    try:
        summary = driver.find_element_by_class_name('conf_head_summary')
        summ.append(summary.find_element_by_tag_name('p').text)
    except NoSuchElementException:
        summ.append('No Conference Summary Given.')

    try:
        twitter.append(driver.find_element_by_class_name('TW').get_attribute('title'))
    except NoSuchElementException:
        twitter.append('No Twitter Link')

    try:
        facebook.append(driver.find_element_by_class_name('FB').get_attribute('title'))
    except NoSuchElementException:
        facebook.append('No Facebook Link')

    try:
        # NOTE(review): absolute XPath, resolved from the document root.
        phone = driver.find_element_by_class_name('marB20').find_element_by_xpath('//table/tbody/tr[1]/td[3]').text
        contact.append(phone if phone else 'No Contact Number Given.')
    except NoSuchElementException:
        contact.append('No Contact Number Given.')

    try:
        addresses = [el.text for el in driver.find_elements_by_class_name('emailFruser')]
        # Same ',' separator as the refresh path, so a later run does not
        # report a spurious change for events with several addresses.
        emails.append(','.join(addresses))
    except NoSuchElementException:
        emails.append('No email.')

    try:
        website_link.append(driver.find_element_by_id('cRegistraionpopup5').get_attribute('href'))
    except NoSuchElementException:
        website_link.append('No Website Link')

    try:
        venue.append(driver.find_element_by_class_name('conf_venue1').text)
    except NoSuchElementException:
        venue.append('No Venue Given.')

    try:
        official_address.append(driver.find_element_by_class_name('hotel-detail').text)
    except NoSuchElementException:
        official_address.append('No Official Address Given. ')

    try:
        speaker_list = _speaker_names(driver)
        speakers.append(','.join(speaker_list) if speaker_list else 'No Speakers Given.')
    except NoSuchElementException:
        speakers.append('No Speakers')

    try:
        ticket_table = driver.find_element_by_class_name('mobScroll')
        fees.append(';'.join(_ticket_texts(ticket_table)))
    except NoSuchElementException:
        fees.append('No Fees Given')

    try:
        attendee_info = [el.text for el in driver.find_elements_by_class_name('r-speaker-info')]
        at_tr.append(','.join(attendee_info) if attendee_info else 'No Attenders or Trackers Given')
    except NoSuchElementException:
        at_tr.append('No Attenders or Trackers Given')


def each_event(item):
    """Scrape the detail page of one event into the module-level columns.

    ``item`` is the event's URL. When it is listed in ``prev_links`` the
    stored row at ``links.index(item)`` is refreshed in place and the URL is
    recorded in ``update`` if anything changed; otherwise one new value is
    appended to every detail column.

    The browser is shut down even when scraping raises.
    """
    driver = webdriver.Chrome('C:/Program Files/chromedriver.exe')
    try:
        driver.get(item)
        time.sleep(5)  # give the DOM time to load
        if item in prev_links:
            _refresh_event_details(driver, item)
        else:
            _append_event_details(driver)
    finally:
        driver.quit()

def main():
    """Load stored events, re-scrape the site, rewrite the sheet and log a summary.

    Reads every data row of the first sheet of 'EMedEvents.xlsx' into the
    module-level columns, remembers the stored links in ``prev_links``,
    scrapes the listing page and then each event page, writes all columns
    back to the same file, and logs how many events were read, added and
    updated.
    """
    workbook_path = 'EMedEvents.xlsx'
    sheet = open_workbook(workbook_path).sheet_by_index(0)

    # Sheet column n is loaded into the n-th list below.
    stored_columns = (
        names, dates, types, location, descs, views, no_speakers, pics, links,
        organization, summ, twitter, facebook, contact, emails, website_link,
        venue, official_address, speakers, fees, at_tr,
    )
    if not pd.read_excel(workbook_path).empty:
        # Row 0 is skipped; data rows start at 1.
        for row_number in range(1, sheet.nrows):
            for column_number, column in enumerate(stored_columns):
                column.append(sheet.cell(row_number, column_number).value)

    prev_links.extend(links)

    main_url("https://www.emedevents.com/india-medical-conferences")
    for event_link in links:
        each_event(event_link)

    table = {
        'Event Name': names,
        'Event Dates': dates,
        'Specialty': types,
        'Event Location': location,
        'Description': descs,
        'Views': views,
        'Speakers': no_speakers,
        'Picture Source': pics,
        'Event Link': links,
        'Organized By': organization,
        'Conference Summary': summ,
        'Twitter Link': twitter,
        'Facebook Link': facebook,
        'Contact Number': contact,
        'Email': emails,
        'Website Link': website_link,
        'Venue': venue,
        'Official Address': official_address,
        'Speaking': speakers,
        'Fees': fees,
        'Attenders and Trackers': at_tr,
    }
    pd.DataFrame.from_dict(table).to_excel(workbook_path, header=True, index=False)

    log_name = 'error_' + str(time.time()) + '.log'
    logging.basicConfig(filename=log_name, level=logging.INFO)
    logging.info('%d events were read from the excel sheet', len(prev_links))
    logging.info('%d events were added to the excel sheet', len(links) - len(prev_links))
    logging.info('Following are the links of the events that were updated:')
    for changed_link in update:
        logging.info(changed_link)

# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()

我不知道现在该怎么办。为了让代码在 Ubuntu 平台上运行，我需要对代码做哪些更改？

提前谢谢

你需要安装一个特定于操作系统的chromedriver和Chrome。嘿，科尔，我下载了chromedriver文件，并使用以下代码在linux操作系统上运行脚本：

options = Options()
options.add_argument('--headless')
options.add_argument('--no-sandbox')  # 绕过操作系统安全模型
options.add_argument('start-maximized')
options.add_argument('disable-infobar')
options.add_argument('--disable-extensions')
driver = webdriver.Chrome('/usr/local/bin/chromedriver', chrome_options=options)

运行此脚本后，我出现以下错误：

selenium.common.exceptions.WebDriverException: Message: Service /usr/local/bin/chromedriver unexpectedly exited. Status code was: 127

我非常确定 libfontconfig 已经安装，因此库没有问题。那么，出现了什么错误？我错过了什么吗？——就安装而言，你是否安装了 Chrome 或 Chromium？我想这可能是问题所在。关于 libfontconfig，我在谷歌上找到了这个问题。——我已经安装了 chromedriver。我是否还需要安装 Chrome，并安装链接中建议的所有额外库？

UserWarning: Selenium support for PhantomJS has been deprecated, please use headless versions of Chrome or Firefox instead
  warnings.warn('Selenium support for PhantomJS has been deprecated, please use headless '