Writing an Excel file with Python


I'm doing some web scraping with BeautifulSoup and Selenium, and I want to write my data to an Excel file. Here is my code:

# coding: utf-8

import requests
import bs4
from datetime import datetime
import re
import os
import urllib
import urllib2
from bs4 import BeautifulSoup
from selenium import webdriver
import time

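# Fetch the first page of Box Office Mojo's 2017 yearly box-office chart.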
initialpage = 'https://www.boxofficemojo.com/yearly/chart/?yr=2017&p=.htm'
res = requests.get(initialpage, timeout=None)
soup = bs4.BeautifulSoup(res.text, 'html.parser')



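# Collect the URLs of the remaining chart pages from the pagination links.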
pages = []
pagelinks=soup.select('a[href^="/yearly/chart/?page"]')

for i in range(int(len(pagelinks)/2)):
    pages.append(str(pagelinks[i])[9:-14])
    pages[i]=pages[i].replace("amp;","")
    pages[i]= "https://www.boxofficemojo.com" + pages[i]
    pages[i]=pages[i][:-1]



pages.insert(0, initialpage)
date_dic = {}
movie_links = []
titles = []
Domestic_Gross_Arr=[]
Genre_Arr=[]
Release_Date_Arr = []
Theaters_Arr=[]
Budget_Arr = []
Views_Arr = []
Edits_Arr = []
Editors_Arr = []


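# Walk every chart page, collecting movie detail links and theater counts.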
for i in range(int(len(pagelinks)/2 + 1)): 
    movie_count = 0
    res1 = requests.get(pages[i])
    souppage=bs4.BeautifulSoup(res1.text, 'html.parser')
    for j in souppage.select('tr > td > b > font > a'):
        link = j.get("href")[7:].split("&")
        str1 = "".join(link)
        final = "https://www.boxofficemojo.com/movies" + str1
        if "/?id" in final: 
            movie_links.append(final)
            movie_count += 1

    number_of_theaters=souppage.find("tr", bgcolor="#dcdcdc")
    for k in range(movie_count):
        #print(number_of_theaters.next_sibling.contents[4].text)
        Theaters_Arr.append(number_of_theaters.next_sibling.contents[4].text)
        number_of_theaters=number_of_theaters.next_sibling



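# Scrape details and download a poster image for the first two movies only.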
k=0
path = os.getcwd()  
path = path + '/movie_pictures'
os.makedirs(path)
os.chdir(path)
while(k < 2):
    j = movie_links[k]
    try:
        res1 = requests.get(j)
        soup1 = bs4.BeautifulSoup(res1.text, 'html.parser')

        c = soup1.select('td[width="35%"]')
        d=soup1.select('div[class="mp_box_content"]')

        genre = soup1.select('td[valign="top"]')[5].select('b')
        image = soup1.select('img')[6].get('src')
        budget = soup1.select('tr > td > b')
        domestic = str(c[0].select('b'))[4:-5]


        release = soup1.nobr.a
        title = soup1.select('title')[0].getText()[:-25]
        print ("-----------------------------------------")
        print ("Title: " +title)
        titles.append(title)
        print ("Domestic Gross: " +domestic)
        Domestic_Gross_Arr.append(domestic)
        print ("Genre: "+genre[0].getText())
        Genre_Arr.append(genre[0].getText())
        print ("Release Date: " +release.contents[0])
        Release_Date_Arr.append(release.contents[0])
        print ("Production Budget: " +budget[5].getText())
        Budget_Arr.append(budget[5].getText())

        year1=str(release.contents[0])[-4:]
        a,b=str(release.contents[0]).split(",")
        month1, day1=a.split(" ")
        datez= year1 + month1 + day1
        new_date= datetime.strptime(datez , "%Y%B%d")
        date_dic[title]=new_date       


        with open('pic' + str(k) + '.png', 'wb') as handle:
            response = requests.get(image, stream=True)

            if not response.ok:
                print(response)

            # Stream the image to disk chunk by chunk; the write must be
            # inside the loop, otherwise only the last chunk is saved.
            for block in response.iter_content(1024):
                if not block:
                    break
                handle.write(block)
    except Exception:
        print("Error Occurred, Page Or Data Not Available")
    k+=1



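# Step back one day at a time until we land in the previous month; this
# avoids having to special-case months of different lengths.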
def subtract_one_month(t):

    import datetime
    one_day = datetime.timedelta(days=1)
    one_month_earlier = t - one_day
    while one_month_earlier.month == t.month or one_month_earlier.day > t.day:
        one_month_earlier -= one_day
    year=str(one_month_earlier)[:4]

    day=str(one_month_earlier)[8:10]

    month=str(one_month_earlier)[5:7]

    newdate= year + "-" + month +"-" + day

    return newdate

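# Look up Wikipedia page view/edit counts for each title with Selenium,
# via the Wikimedia Pageviews tool.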
number_of_errors=0
browser = webdriver.Chrome("/Users/Gokce/Downloads/chromedriver")
browser.maximize_window() 
browser.implicitly_wait(20)
for i in titles:
    try:
        release_date = date_dic[i]
        i = i.replace(' ', '_')
        i = i.replace("2017", "2017_film")
    #end = datetime.strptime(release_date, '%B %d, %Y')


        end_date = release_date.strftime('%Y-%m-%d')
        start_date = subtract_one_month(release_date)
        url = "https://tools.wmflabs.org/pageviews/?project=en.wikipedia.org&platform=all-access&agent=user&start="+ start_date +"&end="+ end_date + "&pages=" + i


        browser.get(url)
        page_views_count = browser.find_element_by_css_selector(" .summary-column--container .legend-block--pageviews .linear-legend--counts:first-child span.pull-right ")
        page_edits_count = browser.find_element_by_css_selector(" .summary-column--container .legend-block--revisions .linear-legend--counts:first-child span.pull-right ")
        page_editors_count = browser.find_element_by_css_selector(" .summary-column--container .legend-block--revisions .legend-block--body .linear-legend--counts:nth-child(2) span.pull-right ")
        print (i)

        print ("Number of Page Views: " +page_views_count.text)
        Views_Arr.append(page_views_count.text)
        print ("Number of Edits: " +page_edits_count.text)
        Edits_Arr.append(page_edits_count.text)
        print ("Number of Editors: " +page_editors_count.text)
        Editors_Arr.append(page_editors_count.text)
    except Exception:
        print("Error Occurred for this page: " + str(i))
        number_of_errors += 1
        Views_Arr.append(-1)
        Edits_Arr.append(-1)
        Editors_Arr.append(-1)


time.sleep(5)
browser.quit()



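# Write everything collected above into an Excel file with xlsxwriter.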
import xlsxwriter
os.chdir("/home")
workbook = xlsxwriter.Workbook('WebScraping.xlsx')
worksheet = workbook.add_worksheet()

# Header row; the data rows below write nine columns, with the theater
# counts in column 5.
worksheet.write(0, 0, 'Title')
worksheet.write(0, 1, 'Genre')
worksheet.write(0, 2, 'Production Budget')
worksheet.write(0, 3, 'Domestic Gross')
worksheet.write(0, 4, 'Release Date')
worksheet.write(0, 5, 'Number of Theaters')
worksheet.write(0, 6, 'Number of Wikipedia Page Views')
worksheet.write(0, 7, 'Number of Wikipedia Edits')
worksheet.write(0, 8, 'Number of Wikipedia Editors')

row=1
for i in range(len(titles)):
    worksheet.write(row, 0, titles[i])
    worksheet.write(row, 1, Genre_Arr[i])
    worksheet.write(row, 2, Budget_Arr[i])
    worksheet.write(row, 3, Domestic_Gross_Arr[i])
    worksheet.write(row, 4, Release_Date_Arr[i])
    worksheet.write(row, 5, Theaters_Arr[i])
    worksheet.write(row, 6, Views_Arr[i])
    worksheet.write(row, 7, Edits_Arr[i])
    worksheet.write(row, 8, Editors_Arr[i])
    row += 1


workbook.close()

When I run the script, the scraping works, but workbook.close() fails with the following error:

---------------------------------------------------------------------------
IOError                                   Traceback (most recent call last)
<ipython-input-9-c99eea52d475> in <module>()
     27 
     28 
---> 29 workbook.close()

/Users/Gokce/anaconda2/lib/python2.7/site-packages/xlsxwriter/workbook.pyc in close(self)
    309         if not self.fileclosed:
    310             self.fileclosed = 1
--> 311             self._store_workbook()
    312 
    313     def set_size(self, width, height):

/Users/Gokce/anaconda2/lib/python2.7/site-packages/xlsxwriter/workbook.pyc in _store_workbook(self)
    638 
    639         xlsx_file = ZipFile(self.filename, "w", compression=ZIP_DEFLATED,
--> 640                             allowZip64=self.allow_zip64)
    641 
    642         # Add XML sub-files to the Zip file with their Excel filename.

/Users/Gokce/anaconda2/lib/python2.7/zipfile.pyc in __init__(self, file, mode, compression, allowZip64)
    754             modeDict = {'r' : 'rb', 'w': 'wb', 'a' : 'r+b'}
    755             try:
--> 756                 self.fp = open(file, modeDict[mode])
    757             except IOError:
    758                 if mode == 'a':

IOError: [Errno 45] Operation not supported: 'WebScraping.xlsx' 
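
The traceback shows the failure happening when xlsxwriter tries to create 'WebScraping.xlsx' in the current working directory, which the script changed to /home just beforehand. On macOS (which the /Users/... paths in the traceback suggest), /home is a read-only autofs mount point, so creating files there fails with Errno 45. A minimal sketch of a fix, assuming the workbook just needs to land somewhere writable, such as your real home directory:

import os
import xlsxwriter

# Build the output path in the user's home directory instead of /home,
# which is a read-only automount point on macOS.
out_path = os.path.join(os.path.expanduser('~'), 'WebScraping.xlsx')
workbook = xlsxwriter.Workbook(out_path)
worksheet = workbook.add_worksheet()
# ... the same worksheet.write() calls as above ...
workbook.close()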
Alternatively, you can build the file with openpyxl:

import openpyxl

# You could create a template spreadsheet ('Template.xlsx') that already
# contains the header row written by the worksheet.write() calls above.
wb = openpyxl.load_workbook('Template.xlsx')
wb.create_sheet('my_sheet')
ws = wb['my_sheet']

row = 1
for i in range(len(titles)):
    # openpyxl rows and columns are 1-indexed (column=0 raises a ValueError).
    ws.cell(row=row, column=1, value=titles[i])
    ws.cell(row=row, column=2, value=Genre_Arr[i])
    ws.cell(row=row, column=3, value=Budget_Arr[i])
    ws.cell(row=row, column=4, value=Domestic_Gross_Arr[i])
    ws.cell(row=row, column=5, value=Release_Date_Arr[i])
    ws.cell(row=row, column=6, value=Theaters_Arr[i])
    ws.cell(row=row, column=7, value=Views_Arr[i])
    ws.cell(row=row, column=8, value=Edits_Arr[i])
    ws.cell(row=row, column=9, value=Editors_Arr[i])
    row += 1
wb.save('my_file.xlsx')
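
If you would rather not maintain a Template.xlsx at all, the workbook can be built from scratch. A minimal sketch using openpyxl's ws.append(), which writes one row per call and sidesteps the 1-indexed cell bookkeeping (titles and the *_Arr lists are the ones collected above):

import openpyxl

wb = openpyxl.Workbook()
ws = wb.active
ws.title = 'my_sheet'

# Header row first; each append() call adds one row to the sheet.
ws.append(['Title', 'Genre', 'Production Budget', 'Domestic Gross',
           'Release Date', 'Number of Theaters',
           'Number of Wikipedia Page Views', 'Number of Wikipedia Edits',
           'Number of Wikipedia Editors'])

for i in range(len(titles)):
    ws.append([titles[i], Genre_Arr[i], Budget_Arr[i],
               Domestic_Gross_Arr[i], Release_Date_Arr[i], Theaters_Arr[i],
               Views_Arr[i], Edits_Arr[i], Editors_Arr[i]])

wb.save('my_file.xlsx')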