Python 使用 Beautiful Soup 和 pandas 填充缺失的日期_Python_Python 3.x_Pandas_Web Scraping_Beautifulsoup

Python 使用 Beautiful Soup 和 pandas 填充缺失的日期

python python-3.x pandas web-scraping

Python 使用 Beautiful Soup 和 pandas 填充缺失的日期,python,python-3.x,pandas,web-scraping,beautifulsoup,Python,Python 3.x,Pandas,Web Scraping,Beautifulsoup,我有一个网站，我从中抓取数据并保存为 CSV 文件。我勉强提取出了日期和价格。然而，日期是按周给出的格式，我需要将其转换为按天的日期格式，例如 5 个工作日的每日价格。（周一至周六）。我用 Python、pandas 和 Beautiful Soup 完成了这些。from urllib.request import urlopen from urllib.error import HTTPError from urllib.error import URLError from bs4 import BeautifulSoup from pandas import DataFrame i

我有一个网站，我从中抓取数据并保存为 CSV 文件。我勉强提取出了日期和价格。然而，日期是按周给出的格式，我需要将其转换为按天的日期格式，例如 5 个工作日的每日价格。（周一至周六）。我用 Python、pandas 和 Beautiful Soup 完成了这些。from urllib.request import urlopen

from urllib.error import HTTPError 
from urllib.error import URLError
from bs4 import BeautifulSoup
from pandas import DataFrame
import csv
import pandas as pd 
from urllib.request import urlopen

try:
    html = urlopen("https://www.eia.gov/dnav/ng/hist/rngwhhdD.htm")
except HTTPError as e:
    print(e)
except URLError:
    print("Server down or incorrect domain")
else:

我不太确定你想要哪一种日期，但我把两个日期都提取了出来，并分别命名为开始日期（Start Date）和结束日期（End Date）。

输入：

输出：

你现在的代码为每一行生成一个列表项（日期），又为每个单元格生成一个列表项（价格），但这两者的数量并不匹配。下面的脚本先查找表格（它是唯一带有 summary 属性的表），然后遍历每一行（tr）。接着从 Week 列（class 为 B6 的 td）中取出 “to” 之前的部分，并将其转换为 datetime。对于每个价格单元格（class 为 B3 的 td），脚本读取价格（或空字符串），记录当前日期，然后把日期加一天。

from urllib.error import HTTPError 
from urllib.error import URLError
from bs4 import BeautifulSoup
from pandas import DataFrame
import csv
import pandas as pd 
from urllib.request import urlopen
import datetime

# Download the gas price history page from eia.gov, expand every weekly table
# row into one (date, price) record per price cell, and save the result as
# "Gas Price.csv".
try:
    html = urlopen("https://www.eia.gov/dnav/ng/hist/rngwhhdD.htm")
except HTTPError as e:
    print(e)
    # Nothing was downloaded: stop here instead of failing further down with
    # a NameError on the never-assigned `res`.
    raise SystemExit(1)
except URLError:
    print("Server down or incorrect domain")
    raise SystemExit(1)
else:
    # Close the HTTP response as soon as the document has been parsed.
    with html:
        res = BeautifulSoup(html.read(), "html5lib")

# Keep the last <table> that carries a "summary" attribute.
table = None
for t in res.findAll("table"):
    if "summary" in t.attrs:
        table = t
if table is None:
    raise SystemExit("No table with a 'summary' attribute was found")

# NOTE: to stop at a cut-off date (e.g. today), break out of the loops below
# once `date` exceeds that cut-off.

price_list = []
date_list = []

# The first <tr> of the table is skipped.
for row in table.findAll("tr")[1:]:
    cells = row.findAll("td")
    # Skip rows that have no <td> at all and rows whose first cell is not the
    # week label (class "B6"). A missing class attribute yields None, which
    # is treated as "no classes".
    if not cells or "B6" not in (cells[0].get("class") or []):
        continue
    # Keep the part before " to " and drop every space so the text parses
    # with "%Y%b-%d" (e.g. "1997 Jan- 6 to Jan-10" -> "1997Jan-6").
    week_start = cells[0].getText().split(" to ")[0].strip().replace(" ", "")
    date = datetime.datetime.strptime(week_start, "%Y%b-%d")
    for cell in cells:
        # Only cells with class "B3" hold prices; class-less cells are skipped
        # instead of raising TypeError.
        if "B3" not in (cell.get("class") or []):
            continue
        price = cell.getText().strip()
        # Empty and "NA" cells are stored as an empty string.
        price = "" if price in ("", "NA") else float(price)
        price_list.append(price)
        date_list.append(date)
        # Each following price cell belongs to the next calendar day.
        date += datetime.timedelta(days=1)

# Both lists always have the same length, so one constructor call suffices.
df = pd.DataFrame({'Date': date_list, 'Price': price_list})
print(df)
df.to_csv(r"Gas Price.csv", index=False, header=True)

# Parse the page that was downloaded into `html`.
res = BeautifulSoup(html.read(), "html5lib")

# Collect and echo the text of every element whose class is one of
# "tbody", "td" or "B3".
price_list = []
for price_cell in res.findAll(class_=["tbody", "td", "B3"]):
    price_text = price_cell.getText()
    price_list.append(price_text)
    print(price_text)

# Same again for the elements whose class is one of "tbody", "td" or "B6".
date_list = []
for date_cell in res.findAll(class_=["tbody", "td", "B6"]):
    date_text = date_cell.getText()
    date_list.append(date_text)
    print(date_text)

# The two lists are built independently and may differ in length; concat
# along axis=1 aligns them on the row index.
df = pd.concat(
    [pd.DataFrame({'Date': date_list}), pd.DataFrame({'Price': price_list})],
    axis=1,
)
print(df)
df.to_csv("Gas Price.csv", index=False, header=True)

def add_week_bounds(frame):
    """Return a copy of *frame* with 'Start Date' and 'End Date' columns added.

    The 'Date' column must hold week labels such as '1997 Jan- 6 to Jan-10':
    a four-digit year, the first day, the word 'to', and the last day.
    Labels that do not match this layout produce NaT in both new columns.

    Args:
        frame: DataFrame with a string column named 'Date'.

    Returns:
        A new DataFrame; the input frame is left untouched.
    """
    # One regex pass pulls out the year and both day labels. Passing the
    # pattern to str.extract avoids str.replace, whose `regex` default
    # differs between pandas versions.
    week_pattern = (
        r'^\s*(?P<year>(?:19|20)\d\d)\s*'
        r'(?P<start>.+?)\s+to\s+(?P<end>.+?)\s*$'
    )
    parts = frame['Date'].str.extract(week_pattern)

    def _to_timestamp(day_labels):
        # 'Jan- 6' -> 'Jan-6', then parse with an explicit format instead of
        # relying on format inference.
        compact = day_labels.str.replace(' ', '', regex=False)
        return pd.to_datetime(parts['year'] + ' ' + compact, format='%Y %b-%d')

    start = _to_timestamp(parts['start'])
    end = _to_timestamp(parts['end'])

    # A week such as 'Dec-29 to Jan- 2' ends in the following year.
    # NOTE(review): assumes the year in the label belongs to the start date;
    # confirm against the source table.
    rolls_over = end < start
    end = end.where(~rolls_over, end + pd.DateOffset(years=1))

    result = frame.copy()
    result['Start Date'] = start
    result['End Date'] = end
    return result


df = pd.DataFrame({'Date': ['1997 Jan- 6 to Jan-10', '1997 Jan-13 to Jan-17'], 'Price': [3.80, 5.00] })

# Rebind df so the result is kept; no temporary helper columns are left behind.
df = add_week_bounds(df)

|   | Date                  | Price | Start Date | End Date   |
|---|-----------------------|-------|------------|------------|
| 0 | 1997 Jan- 6 to Jan-10 | 3.8   | 1997-01-06 | 1997-01-10 |
| 1 | 1997 Jan-13 to Jan-17 | 5.0   | 1997-01-13 | 1997-01-17 |

from urllib.error import HTTPError 
from urllib.error import URLError
from bs4 import BeautifulSoup
from pandas import DataFrame
import csv
import pandas as pd 
from urllib.request import urlopen
import datetime

# Download the gas price history page from eia.gov, expand every weekly table
# row into one (date, price) record per price cell, and save the result as
# "Gas Price.csv".
try:
    html = urlopen("https://www.eia.gov/dnav/ng/hist/rngwhhdD.htm")
except HTTPError as e:
    print(e)
    # Nothing was downloaded: stop here instead of failing further down with
    # a NameError on the never-assigned `res`.
    raise SystemExit(1)
except URLError:
    print("Server down or incorrect domain")
    raise SystemExit(1)
else:
    # Close the HTTP response as soon as the document has been parsed.
    with html:
        res = BeautifulSoup(html.read(), "html5lib")

# Keep the last <table> that carries a "summary" attribute.
table = None
for t in res.findAll("table"):
    if "summary" in t.attrs:
        table = t
if table is None:
    raise SystemExit("No table with a 'summary' attribute was found")

# NOTE: to stop at a cut-off date (e.g. today), break out of the loops below
# once `date` exceeds that cut-off.

price_list = []
date_list = []

# The first <tr> of the table is skipped.
for row in table.findAll("tr")[1:]:
    cells = row.findAll("td")
    # Skip rows that have no <td> at all and rows whose first cell is not the
    # week label (class "B6"). A missing class attribute yields None, which
    # is treated as "no classes".
    if not cells or "B6" not in (cells[0].get("class") or []):
        continue
    # Keep the part before " to " and drop every space so the text parses
    # with "%Y%b-%d" (e.g. "1997 Jan- 6 to Jan-10" -> "1997Jan-6").
    week_start = cells[0].getText().split(" to ")[0].strip().replace(" ", "")
    date = datetime.datetime.strptime(week_start, "%Y%b-%d")
    for cell in cells:
        # Only cells with class "B3" hold prices; class-less cells are skipped
        # instead of raising TypeError.
        if "B3" not in (cell.get("class") or []):
            continue
        price = cell.getText().strip()
        # Empty and "NA" cells are stored as an empty string.
        price = "" if price in ("", "NA") else float(price)
        price_list.append(price)
        date_list.append(date)
        # Each following price cell belongs to the next calendar day.
        date += datetime.timedelta(days=1)

# Both lists always have the same length, so one constructor call suffices.
df = pd.DataFrame({'Date': date_list, 'Price': price_list})
print(df)
df.to_csv(r"Gas Price.csv", index=False, header=True)