Python 爬虫速度优化

标签:python, python-3.x, beautifulsoup。问题:如何加快商品解析过程?本例中大约有 40000 个产品(类别 → 分页 → 产品),目前整个过程花费大约十个小时。代码如下。

如何加快商品解析过程

在这个例子中,大约有 40000 个产品,层级结构为:类别 → 分页 → 产品

现在,在这个过程中花费了大约十个小时

import csv
from urllib.request import urlopen

import requests
from bs4 import BeautifulSoup as bs


# Russian specification labels on the product page -> output record keys.
_SPEC_FIELDS = {
    'ISBN': 'code',
    'Издательство': 'Publish',
    'Авторы': 'Authors',
    'Серия': 'Series',
    'Переплет': 'Blinding',
    'Количество страниц': 'Count_Page',
    'Ширина': 'Width',
    'Высота': 'Height',
    'Дата последнего тиража': 'Date',
}

# All keys of one product record, in output order.
_RECORD_KEYS = (
    'name', 'article', 'price', 'oldprice', 'code', 'Publish', 'Authors',
    'Series', 'Blinding', 'Count_Page', 'Width', 'Height', 'Date',
    'Category1', 'Category2', 'Category3', 'image1', 'image2', 'image3',
)


def BB_parse(base_url, headers):
    """Scrape every product reachable from *base_url*.

    Walks the category submenu, expands each category's pagination,
    collects every product-page URL, then parses one flat record per
    product.  Fields that cannot be found on a page keep the sentinel
    string 'none' (matching the original output format).

    Parameters
    ----------
    base_url : str
        Catalog start page, scheme included.
    headers : dict
        HTTP headers (e.g. User-Agent) sent with every request.

    Returns
    -------
    list[dict]
        One record per product with the keys in ``_RECORD_KEYS``.
    """
    # One Session for the whole crawl: reuses TCP connections instead of
    # opening a fresh one per URL (the old code created a Session per
    # category and fetched each category page twice).
    session = requests.Session()

    category_urls = _collect_category_urls(session, base_url, headers)
    page_urls = _collect_page_urls(session, category_urls, headers)
    book_urls = _collect_book_urls(session, page_urls, headers)

    pricelist = []
    for book_url in book_urls:
        response = session.get(book_url, headers=headers)
        if response.status_code != 200:
            # The old for-else printed the last status unconditionally;
            # report failures per page instead and keep going.
            print('ERROR. Status_code = ' + str(response.status_code))
            continue
        record = _parse_book(bs(response.content, 'lxml'))
        if record is not None:
            pricelist.append(record)
        print(len(pricelist))
    return pricelist


def _collect_category_urls(session, base_url, headers):
    """Return [base_url] plus every sub-category URL found in the menu."""
    urls = [base_url]
    seen = set(urls)  # set membership: O(1) instead of O(n) list scans
    response = session.get(base_url, headers=headers)
    if response.status_code == 200:
        soup = bs(response.content, 'lxml')
        for submenu in soup.find_all('ul', attrs={'class': 'submenu'}):
            for parent in submenu.find_all('li', attrs={'class': 'parent'}):
                for inner in parent.find_all('ul', attrs={'class': 'submenu'}):
                    for link in inner.find_all('a', href=True):
                        url = f"https://{link['href']}"
                        if url not in seen:
                            seen.add(url)
                            urls.append(url)
    return urls


def _collect_page_urls(session, category_urls, headers):
    """Expand each category URL into its ?PAGEN_1=i pagination URLs.

    A category whose listing has no pagination block is kept as-is (the
    old code silently dropped such categories).
    """
    page_urls = []
    seen = set()
    for url in category_urls:
        response = session.get(url, headers=headers)
        if response.status_code != 200:
            continue
        soup = bs(response.content, 'lxml')
        last_page = 1
        for block in soup.find_all('div', {'class': 'pagination'}):
            # Second-to-last anchor of the pagination widget holds the
            # highest page number; anything unexpected means "one page".
            try:
                last_page = int([tag.text for tag in block.find_all('a')][-2])
            except (ValueError, IndexError):
                pass
        if last_page <= 1:
            candidates = [url]
        else:
            candidates = [f'{url}?PAGEN_1={i}' for i in range(1, last_page + 1)]
        for page_url in candidates:
            if page_url not in seen:
                seen.add(page_url)
                page_urls.append(page_url)
    return page_urls


def _collect_book_urls(session, page_urls, headers):
    """Collect every distinct product-page URL from the listing pages."""
    book_urls = []
    seen = set()
    for page_url in page_urls:
        response = session.get(page_url, headers=headers)
        if response.status_code != 200:
            continue
        soup = bs(response.content, 'lxml')
        for card in soup.find_all('div', {'class': 'catalog-item-card'}):
            title = card.find('a', {'class': 'item-title'})
            if title is None or not title.has_attr('href'):
                continue
            book_url = f"https://{title['href']}"
            if book_url not in seen:
                seen.add(book_url)
                book_urls.append(book_url)
    return book_urls


def _nth_text(parent, name, attrs, index=0):
    """Text of the index-th tag matching (name, attrs), or None if absent."""
    found = parent.find_all(name, attrs)
    return found[index].text if index < len(found) else None


def _parse_book(soup):
    """Parse one product page into a record dict, or None if it has no
    content wrapper.

    Every field is reset for every product — the old code kept the
    previous product's values whenever a field was missing.
    """
    wrapper = soup.find('div', {'class': 'content-wrapper'})
    if wrapper is None:
        return None
    record = dict.fromkeys(_RECORD_KEYS, 'none')

    article = _nth_text(wrapper, 'div', {'class': 'article'})
    if article is not None:
        record['article'] = article.strip().replace("Артикул: ", "")

    headings = wrapper.select('h1')
    if headings:
        record['name'] = headings[0].text

    price = _nth_text(wrapper, 'span', {'class': 'catalog-detail-item-price'})
    if price is not None:
        record['price'] = (price.replace("Цена:", "").replace("тг.", "")
                           .replace("за шт", "").strip())

    oldprice = _nth_text(wrapper, 'span',
                         {'class': 'catalog-detail-item-price-old'})
    if oldprice is not None:
        record['oldprice'] = (oldprice.replace("Цена:", "").replace("тг.", "")
                              .replace("за шт", "").replace(" ", "").strip())

    # Breadcrumb entries 1..3 (entry 0 is the site root).
    crumbs = wrapper.find_all('span', {'itemprop': 'title'})
    for slot, key in enumerate(('Category1', 'Category2', 'Category3'), start=1):
        if slot < len(crumbs):
            record[key] = crumbs[slot].text

    pictures = wrapper.find_all('a', {'rel': 'lightbox'})
    for slot, key in enumerate(('image1', 'image2', 'image3')):
        if slot < len(pictures) and pictures[slot].has_attr('href'):
            record[key] = pictures[slot]['href']

    # Specification table: label ('name') / value ('val') pairs.  The
    # lists are fetched once — the old code re-ran find_all('val') on
    # every iteration, which was quadratic.
    labels = soup.find_all(class_='name')
    values = soup.find_all(class_='val')
    for label, value in zip(labels, values):
        key = _SPEC_FIELDS.get(label.text)
        if key is not None:
            record[key] = value.text

    return record

def files_writer(pricelist):
    """Write the scraped records to ``Book24.csv``.

    Parameters
    ----------
    pricelist : list[dict]
        Records produced by ``BB_parse``; each must carry all nineteen
        field keys.

    Notes
    -----
    ``newline=''`` is required by the csv module (without it Windows
    output gets a blank row between records) and an explicit UTF-8
    encoding keeps the Cyrillic headers readable on any platform.
    """
    fields = ('name', 'article', 'price', 'oldprice', 'code', 'Publish',
              'Authors', 'Series', 'Blinding', 'Count_Page', 'Width',
              'Height', 'Date', 'Category1', 'Category2', 'Category3',
              'image1', 'image2', 'image3')
    with open('Book24.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        # Header spelling ('Картикнка') kept as-is for compatibility
        # with existing consumers of the file.
        writer.writerow(('Наименование', 'Артикул', 'Цена', 'Старая цена',
                         'Код ISBN', 'Издательство', 'Авторы', 'Серия',
                         'Переплет', 'Количество страниц', 'Ширина',
                         'Высота', 'Дата последнего тиража',
                         'Родительская категория', 'Категория',
                         'Подкатегория', 'Картикнка_1', 'Картикнка_2',
                         'Картикнка_3'))
        for record in pricelist:
            writer.writerow(tuple(record[key] for key in fields))


if __name__ == '__main__':
    # base_url and headers were previously undefined at module level,
    # so the script crashed with a NameError before making a request.
    # TODO(review): point base_url at the real catalog start page.
    base_url = 'https://www.book24.kz/'
    headers = {
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/90.0 Safari/537.36'),
        'Accept': '*/*',
    }
    pricelist = BB_parse(base_url, headers)
    files_writer(pricelist)

您是否在最后创建了一个表?最大的瓶颈在哪里?您是否尝试过把代码拆分成不同的函数并对其做性能分析(profile)?你需要先弄清楚瓶颈所在。由于时间几乎都花在网络等待上,你可能会从多线程(concurrent.futures)或异步请求(asyncio/aiohttp)中受益。也许 Scrapy 框架会有所帮助。