Python: data not being stored when exporting to an Excel worksheet

Tags: python, excel, pandas, web-crawler, data-export

I am simply trying to web-crawl a website and export the data to Excel using pandas. The result prints only the column headers and none of the rows, even though I .append() to data (a list). I have done a lot of googling and have finally come here for advice. I have updated my previous question, since it may not have satisfied the minimal-reproducible-example requirements for asking.

A summary of the code below (it crawls tourist-attraction listings):

1. Hard-code the static variables
2. Go to the website
3. Loop through all the prefectures
4. Work out how many posts there are to page through
5. Collect the URL links
6. For-loop into each page, extract the data, and keep appending the information to data = []
7. Finally, export and print to Excel

That was my ideal plan... it seems so close, yet so far.
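
For reference, the data flow I am aiming for, as a minimal standalone sketch (with made-up rows instead of the real crawl): append one list per record to data, then build the DataFrame once at the end.

import pandas as pd

data = []
for i in range(3):
    # one crawled record per iteration, as a plain list
    data.append([i, f"title-{i}"])

# build the DataFrame only after data has been filled
df = pd.DataFrame(data, columns=["Id", "Title"])
print(df)  # prints three rows plus the header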

import os
import sys
import time
import math
import urllib.request
import numpy as np
import pandas as pd
from Stamprally import StamprallyInfo
from selenium import webdriver as wd
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select


prefectureNameList = ["海外"]

# ,"北海道地方", "北海道", "東北地方", "青森県", "岩手県", "宮城県", "秋田県", "山形県", "福島県", "関東地方", "茨城県", "栃木県", "群馬県", "埼玉県", "千葉県", "東京都", "神奈川県", "中部地方", "新潟県", "富山県", "石川県", "福井県", "山梨県", "長野県", "岐阜県", "静岡県",
#                       "愛知県", "三重県", "近畿地方", "滋賀県", "京都府", "大阪府", "兵庫県", "奈良県", "和歌山県", "中国地方", "鳥取県", "島根県", "岡山県", "広島県", "山口県", "四国地方", "徳島県", "香川県", "愛媛県", "高知県", "九州・沖縄地方", "福岡県", "佐賀県", "長崎県", "熊本県", "大分県", "宮崎県", "鹿児島県", "沖縄県"]


data = []
contentAggregator = []
df = pd.DataFrame(data, columns=["Total List Number", "Prefecture", "ListLink", "Location Tag", "Event Tag", "Available Period",
                  "Available StartDate", "End Date", "Last Updated", "Main Image URL", "Title", "innerWebSiteURL", "mainText"])
main_url = 'https://stamprally.org/'
driver = wd.Chrome(executable_path='chromedriver.exe')
driver.get(main_url)

prefectureValueStorage = driver.find_element_by_xpath(
    "//*[@id='header_search_cat1']/option[2]").get_attribute('value')

# [x.get_attribute('value') for x in driver.find_elements_by_xpath(
#     "//select[@name='search_cat1']/option[@class='level-1' or @class='level-0']")]

prefectureNameIterator = -1
# Loop through all the different prefectures
# for prefectureValue in prefectureValueStorage:
start = time.time()
prefectureNameIterator += 1
driver.get(
    f"https://stamprally.org/?search_keywords&search_keywords_operator=and&search_cat1={145}&search_cat2=0")

print("START OF PREFECTURE " + prefectureNameList[prefectureNameIterator])

# Calculate How Many Times To Run Page Loop
imageDownloadCounter = 1
totalList = driver.find_element_by_css_selector(
    'div.page_navi2.clearfix>p').text  # .get_attribute('text')
totalListNum = totalList.split("件中")
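# "件中" marks "out of N items" in the pager text, so index 0 holds the total count as a string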
# Add TotalListNum to the contentAggregator
contentAggregator.append(int(totalListNum[0]))
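# assumes 10 listings per results page: divide the total by 10 and round up to get the page count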
if int(totalListNum[0]) % 10 != 0:
    pageLoopCount = math.ceil((int(totalListNum[0])/10))
else:
    pageLoopCount = int(totalListNum[0])/10
# continue
currentpage = 0
while currentpage < pageLoopCount:
    currentpage += 1
    print("Current Page " + str(currentpage))

# ========================================================================================================================================================
# # Loop through all the Listings within the prefecture page
    driver.get(
        f"https://stamprally.org/?search_keywords&search_keywords_operator=and&search_cat1={145}&search_cat2=0&paged={currentpage}")
    # print("Loading Page %s" % currentpage)
    # ========================================================================================================================================================
    # Add prefectureName to the contentAggregator
    # contentAggregator.append(prefectureNameList[prefectureNameIterator])
    # Gather All List Links
    urlList = []
    currentUrlCounter = 0
    listURLContainer = driver.find_elements_by_css_selector(
        '#post_list2 > li > a')
    # Put all the lists in one Array
    for url in listURLContainer:
        urlList.append(url.get_attribute('href'))
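    # (the hrefs are copied out first because navigating away makes the original WebElements stale)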
    # Loop through all the links
    for listURL in listURLContainer:
        contentAggregator = []
# Add TotalListNum to the contentAggregator
        contentAggregator.append(int(totalListNum[0]))
# Add prefectureName to the contentAggregator
        contentAggregator.append(
            prefectureNameList[prefectureNameIterator])
        print('article Link: ')
        print(urlList[currentUrlCounter])
    # Add listLink to the contentAggregator
        contentAggregator.append(
            urlList[currentUrlCounter])
    # for Each Links in listURLContainer:
        driver.get(urlList[currentUrlCounter])
        currentUrlCounter += 1
        locationTag = [x.get_attribute('title') for x in driver.find_elements_by_xpath(
            "//*[@id='post_meta_top']/li[1]/a[@class='cat-category']")]
        print(locationTag)
    # Add locationTag to the contentAggregator
        contentAggregator.append(locationTag)

        eventTag = [x.get_attribute('title') for x in driver.find_elements_by_xpath(
            "//*[@id='post_meta_top']/li[2]/a[@class='cat-category2']")]
        contentAggregator.append(eventTag)
        print(eventTag)
 
        availablePeriod = (driver.find_element_by_css_selector(
            'div#post_date')).text.split("( ")
        availablePeriodFormatted = availablePeriod[0].replace("開催期間:", "")
        availableStartDate = availablePeriod[0].split(" ~ ")
        endDate = availableStartDate[1]
        availableStartDateFormatted = availableStartDate[0].replace(
            "開催期間:", "")
    # Select Latest Update Date
        lastUpdatedDate = driver.find_element_by_css_selector(
            'time.entry-date.updated').text
        print("Available Period:")
        print(availablePeriodFormatted)
    # Add Available Period to the contentAggregator
        contentAggregator.append(availablePeriodFormatted)
        print("Available StartDate:")
        print(availableStartDateFormatted)
    # Add Available StartDate to the contentAggregator
        contentAggregator.append(availableStartDateFormatted)
        print("End Date: ")
        print(endDate)
    # Add endDate to the contentAggregator
        contentAggregator.append(endDate)
        print("Last Updated:")
        print(lastUpdatedDate[6:])
    # Add lastUpdatedDate to the contentAggregator
        contentAggregator.append(lastUpdatedDate[6:])
# ========================================================================================================================================================
    # Download Main Post Image
        mainImageUrl = driver.find_element_by_css_selector(
            'img.attachment-post-thumbnail.size-post-thumbnail.wp-post-image').get_attribute('src')
    # Add mainImageUrl to the contentAggregator
        contentAggregator.append(mainImageUrl)
    # Save Post Main Title
        postTitle = driver.find_element_by_css_selector(
            'h2#post_title').text.replace(" 開催終了", "")
        print("Title: ")
        print(postTitle)
    # Add Title to the contentAggregator
        contentAggregator.append(postTitle)
    # Save Post Main Image
        urllib.request.urlretrieve(mainImageUrl, (str(
            prefectureNameList[prefectureNameIterator])+postTitle+str(imageDownloadCounter) + ".png"))
        imageDownloadCounter += 1
    # Get Inner Website Link
        innerWebSiteButtonURL = driver.find_element_by_css_selector(
            'div.post_content.clearfix > div >a').get_attribute('href')
        print("inner Website Button URL: " + innerWebSiteButtonURL)
    # Add innerWebSiteURL to the contentAggregator
        contentAggregator.append(innerWebSiteButtonURL)
    # Gather Main Post Text Content
        mainText = driver.find_elements_by_css_selector(
            'div.post_content.clearfix > p')
        mainContentText = []
    # Remove disclaimer text
        for mainContentDetail in mainText:
            mainContentText.append(mainContentDetail.text)
        mainContextTextCount = len(mainContentText)-1
        print(mainContentText[:mainContextTextCount])
    # Add Main Post Text Content to the contentAggregator
        contentAggregator.append(mainContentText[:mainContextTextCount])

    # ========================================================================================================================================================
        contentReorder = [1, 0, 10, 5, 6, 7, 8, 12, 3, 4, 9, 11, 2]
        contentAggregator = [contentAggregator[i] for i in contentReorder]
        print("=====================================================================================================================================================")
        print(contentAggregator)
        data.append(contentAggregator)
        print(data)
        print(pd.DataFrame(data, columns=["Total List Number", "Prefecture", "ListLink", "Location Tag", "Event Tag", "Available Period",
                                          "Available StartDate", "End Date", "Last Updated", "Main Image URL", "Title", "innerWebSiteURL", "mainText"]))
        end = time.time()
        print(end - start)


xlwriter = pd.ExcelWriter('StampRally_Crawler.xlsx')
df.to_excel(xlwriter, sheet_name="Stamprally.org Crawl Result")
xlwriter.close()
# ========================================================================================================================================================
# Close  Off
driver.close()
driver.quit()
sys.exit()
Answer:

You build df from data at the very top of the script, while data is still an empty list, and you never rebuild it after the loop calls data.append(...). A DataFrame copies the data it is given at construction time, so later appends to the list never reach it, and the export contains only the column headers. Create the DataFrame after the crawl has populated data, just before writing the Excel file:

df = pd.DataFrame(data, columns=["Total List Number", "Prefecture", "ListLink", "Location Tag", "Event Tag", "Available Period",
                                 "Available StartDate", "End Date", "Last Updated", "Main Image URL", "Title", "innerWebSiteURL", "mainText"])
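
For completeness, a minimal sketch of the corrected tail of the script, assuming the crawl loop above has filled data (index=False is an optional extra that drops pandas' row index from the sheet):

# after the crawl loop has populated data:
df = pd.DataFrame(data, columns=["Total List Number", "Prefecture", "ListLink", "Location Tag", "Event Tag", "Available Period",
                                 "Available StartDate", "End Date", "Last Updated", "Main Image URL", "Title", "innerWebSiteURL", "mainText"])
with pd.ExcelWriter('StampRally_Crawler.xlsx') as xlwriter:
    df.to_excel(xlwriter, sheet_name="Stamprally.org Crawl Result", index=False)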