Python list index out of range


I am getting a list index out of range error and I am not sure why. My code is a web scraper that collects temperature data from a website. It all worked fine for months, until recently.

I have the following functions for reference. The important one is getDailyAve(), which is where the exception is raised.

Any ideas or suggestions would be greatly appreciated.

import sys
import urllib
from bs4 import BeautifulSoup
from urllib2 import urlopen, URLError
import webbrowser
import time
from collections import Counter
import numpy as np
import re
import csv
import datetime
from datetime import timedelta

DATE_FORMAT = '%Y/%m/%d'

def daterange(start, end):
      def convert(date):
            try:
                  date = datetime.datetime.strptime(date, DATE_FORMAT)
                  return date.date()
            except TypeError:
                  return date

      def get_date(n):
            return datetime.datetime.strftime(convert(start) + timedelta(days=n), DATE_FORMAT)

      days = (convert(end) - convert(start)).days
      if days <= 0:
            raise ValueError('The start date must be before the end date.')
      for n in range(0, days):
            yield get_date(n)

class SiteLocation:
  """class defining mine location parameters to lookup on weather search"""
  def __init__(self, city, state, zip, code):
    self.city = city
    self.state = state
    self.zip = zip
    self.code = code

def getDailyAve(url):
  url = urllib.urlopen(url)
  soup = BeautifulSoup(url.read(), 'lxml')
  form = soup.find("form",{"id": "archivedate"})
  table = form.find_next_sibling("table")
  rows = table.select("tr")[1:]

  time=[]
  temp=[]
  minutes=[]

  # handle no data case
  if soup.find(text="Archive data not available for this date."):
    print("Data not available, URL: '%s'" % url)
    return None

  # capture time and temps
  for row in rows:
    data = [td.text for td in row.find_all("td")]

    match = re.search(r"[+-]?(?<!\.)\b[0-9]+\b(?!\.[0-9])",data[2])
    if match:
      temp.append(match.group())
      time.append(data[0])
      minutes.append(data[0][-4:-2])

  common = Counter(minutes).most_common()[0][0]

  finalTimes = []
  finalTemps = []
  for i in range(0,len(time)):
    if minutes[i] == common:
      finalTimes.append(time[i])
      finalTemps.append(int(temp[i]))
  dailyAve = sum(finalTemps) / float(len(finalTimes))
  return dailyAve

def writeToCsv(list1, list2, list3, list4, list5, list6, list7, list8):
  with open('results.csv', 'wb') as csvfile:
    results = csv.writer(csvfile, delimiter=',')
    results.writerow(['T-SJ', 'T- RB', 'T-DS', 'T-JW', 'T-GB', 'D', 'M', 'Y'])
    for idx in range(0,len(list1)):
      results.writerow([str(list1[idx]), str(list2[idx]), str(list3[idx]), str(list4[idx]), str(list5[idx]), str(list6[idx]), str(list7[idx]), str(list8[idx])])

def buildURL(location, day, month, year):
  if day < 10:
    strDay = '0'+str(day)
  else:
    strDay = str(day)

  baseURL  = "http://www.weatherforyou.com/reports/index.php?forecast=pass&pass=archive&zipcode=" + location.zip + "&pands=" + location.city + "%2" + "C" + location.state + "&place=" + location.city + "&state=" + location.state + "&icao=" + location.code + "&country=us&month=" + str(month) + "&day=" + strDay + "&year=" + str(year) + "&dosubmit=Go"
  return baseURL

def main():
  loc1 = SiteLocation('Farmington','NM','87401','KFMN')
  loc2 = SiteLocation('Whitesville','WV','25209','KBKW')
  loc3 = SiteLocation('Rangely','CO','81648','KVEL')
  loc4 = SiteLocation('Brookwood','AL','35444','KTCL')
  loc5 = SiteLocation('Princeton','IN','47670','KAJG')

  start = '2016/08/31'
  end = datetime.date.today()

  dateRange = list(daterange(start, end))

  listDailyAve1 = []
  listDailyAve2 = []
  listDailyAve3 = []
  listDailyAve4 = []
  listDailyAve5 = []

  listDays = []
  listMonths = []
  listYears = []

  for idx in range(0,len(dateRange)):
    strDate = str(dateRange[idx]).split("/")
    year = strDate[0]
    month = strDate[1] 
    day = strDate[2]

    url1 = buildURL(loc1, day, month, year)
    url2 = buildURL(loc2, day, month, year)
    url3 = buildURL(loc3, day, month, year)
    url4 = buildURL(loc4, day, month, year)
    url5 = buildURL(loc5, day, month, year)

    dailyAve1 = getDailyAve(url1)
    dailyAve2 = getDailyAve(url2)
    dailyAve3 = getDailyAve(url3)
    dailyAve4 = getDailyAve(url4)
    dailyAve5 = getDailyAve(url5)

    listDailyAve1.append(dailyAve1)
    listDailyAve2.append(dailyAve2)
    listDailyAve3.append(dailyAve3)
    listDailyAve4.append(dailyAve4)
    listDailyAve5.append(dailyAve5)

    listDays.append(day)
    listMonths.append(month)
    listYears.append(year)

  writeToCsv(listDailyAve1, listDailyAve2, listDailyAve3, listDailyAve4,listDailyAve5, listDays, listMonths, listYears)

if __name__ == '__main__':
  status = main()
  sys.exit(status)
Here is the exception that is raised:

Traceback (most recent call last):
  File ".\weatherScrape2.py", line 147, in <module>
    status = main()
  File ".\weatherScrape2.py", line 128, in main
    dailyAve1 = getDailyAve(url1)
  File ".\weatherScrape2.py", line 61, in getDailyAve
    match = re.search(r"[+-]?(?<!\.)\b[0-9]+\b(?!\.[0-9])",data[2])
IndexError: list index out of range
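
As an aside (not from the original thread): the IndexError means that for some row, row.find_all("td") returned fewer than three cells, so data[2] does not exist. A quick diagnostic sketch, with a hypothetical helper name, that prints each row's cell count so the offending rows become visible:

from bs4 import BeautifulSoup

def dump_row_shapes(html):
    # Hypothetical diagnostic helper: show how many <td> cells each
    # data row has; data[2] raises IndexError whenever that count < 3.
    soup = BeautifulSoup(html, 'lxml')
    form = soup.find("form", {"id": "archivedate"})
    table = form.find_next_sibling("table")
    for i, row in enumerate(table.select("tr")[1:]):
        cells = [td.text for td in row.find_all("td")]
        print(i, len(cells), cells[:3])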
First, you need to handle the case where no data is available. Here is one way to do it:

# handle "no data" case
if soup.find(text="Archive data not available for this date."):
    print("Data not available, URL: '%s'." % url)
    return None
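
One ordering detail worth flagging (my note, not part of the original answer): this check should run before the form and table lookups, because on a no-data page soup.find("form", ...) can return None, and calling find_next_sibling() on None raises AttributeError. A minimal sketch of the safer order:

soup = BeautifulSoup(response.content, 'lxml')

# Bail out before touching the form/table: on the "no data" page
# soup.find("form", ...) may return None, and calling
# find_next_sibling() on None raises AttributeError.
if soup.find(text="Archive data not available for this date."):
    print("Data not available, URL: '%s'" % url)
    return None

form = soup.find("form", {"id": "archivedate"})
table = form.find_next_sibling("table")
rows = table.select("tr")[1:]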
Also, I think there is a problem in your logic for getting the rows. I would do it this way:

form = soup.find("form", {"id": "archivedate"})

table = form.find_next_sibling("table")
rows = table.select("tr")[1:]
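
To make that lookup concrete, here is a tiny self-contained demonstration on made-up markup (the HTML below is illustrative, not the real page, and assumes the lxml parser is installed):

from bs4 import BeautifulSoup

html = """
<form id="archivedate"></form>
<table>
  <tr><th>Time</th><th>Wind</th><th>Temp</th></tr>
  <tr><td>12:53 am</td><td>calm</td><td>71 F</td></tr>
</table>
"""
soup = BeautifulSoup(html, 'lxml')
form = soup.find("form", {"id": "archivedate"})
table = form.find_next_sibling("table")   # the table right after the form
rows = table.select("tr")[1:]             # [1:] drops the header row
print([td.text for td in rows[0].find_all("td")])
# -> ['12:53 am', 'calm', '71 F']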

Comments:

- Is it possible the temperatures drop below zero? In Belgium they do :) Can you provide a stack trace of where the error occurs? Also, your code is quite error-prone: this is not good design (sorry).
- @WillemVanOnsem My code should handle temperatures below zero, but yes, they do drop below zero.
- @WillemVanOnsem Traceback added. Can you make some suggestions for improving the code? There is still a lot to learn.
- Tried your changes, but I am still getting an out of range error. The updated code and exception are shown in my edit.
- @Pete Interesting, it works for me. Which URL did it fail on? Does copying and pasting the code I showed work for you?
- Interesting... it fails with my code, @Pete.
- See the update - I have reduced it to calling getDailyAve() with the URL you posted - it prints 70.125 (note that I am using requests instead of urllib2).
- I created a new file with your code snippet but received the same error. It throws IndexError: list index out of range at line 27, which is the re.search call.

Here is the complete snippet I am executing (for a single URL):
import requests
from bs4 import BeautifulSoup
from collections import Counter
import re


def getDailyAve(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'lxml')
    form = soup.find("form", {"id": "archivedate"})
    table = form.find_next_sibling("table")
    rows = table.select("tr")[1:]

    time = []
    temp = []
    minutes = []

    # handle no data case
    if soup.find(text="Archive data not available for this date."):
        print("Data not available, URL: '%s'" % url)
        return None

    # capture time and temps
    for row in rows:
        data = [td.text for td in row.find_all("td")]

        match = re.search(r"[+-]?(?<!\.)\b[0-9]+\b(?!\.[0-9])", data[2])
        if match:
            temp.append(match.group())
            time.append(data[0])
            minutes.append(data[0][-4:-2])

    common = Counter(minutes).most_common()[0][0]

    finalTimes = []
    finalTemps = []
    for i in range(0, len(time)):
        if minutes[i] == common:
            finalTimes.append(time[i])
            finalTemps.append(int(temp[i]))
    dailyAve = sum(finalTemps) / float(len(finalTimes))
    return dailyAve


print(getDailyAve("https://www.weatherforyou.com/reports/index.php?forecast=pass&pass=archive&zipcode=87401&pands=Farmington%2CNM&place=Farmington&state=NM&icao=KFMN&country=us&month=09&day=03&year=2016&dosubmit=Go"))
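
If the snippet above still raises IndexError at the re.search call, as the final comment reports, the table layout has likely changed so that some rows carry fewer than three cells. A defensive sketch under that assumption (a hypothetical variant reusing the imports above; a guess at a fix, not the accepted answer):

def getDailyAveSafe(url):
    # Hypothetical variant of getDailyAve() that skips rows lacking
    # a third <td> cell instead of indexing data[2] blindly.
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'lxml')

    # handle no data case before any table lookups
    if soup.find(text="Archive data not available for this date."):
        print("Data not available, URL: '%s'" % url)
        return None

    form = soup.find("form", {"id": "archivedate"})
    table = form.find_next_sibling("table")

    time, temp, minutes = [], [], []
    for row in table.select("tr")[1:]:
        data = [td.text for td in row.find_all("td")]
        if len(data) < 3:
            continue  # the short rows that made data[2] fail
        match = re.search(r"[+-]?(?<!\.)\b[0-9]+\b(?!\.[0-9])", data[2])
        if match:
            temp.append(match.group())
            time.append(data[0])
            minutes.append(data[0][-4:-2])

    if not minutes:
        return None  # no parsable rows at all

    common = Counter(minutes).most_common()[0][0]
    finalTemps = [int(temp[i]) for i in range(len(time)) if minutes[i] == common]
    return sum(finalTemps) / float(len(finalTemps))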