Python 如何使用matplotlib和pandas绘制datetime与值的线性趋势线?

Python 如何使用matplotlib和pandas绘制datetime与值的线性趋势线?,python,pandas,matplotlib,Python,Pandas,Matplotlib,我想产生一个线性最佳拟合的平均CPU使用率每天的趋势线 我的数据如下所示: host_df_means['cpu_usage_percent'] history_datetime 2020-03-03 9.727273 2020-03-04 9.800000 2020-03-05 9.727273 2020-03-06 10.818182 2020-03-07 9.500000 2020-03-08 10.909091 2020-03-09 15

我想为每天的平均 CPU 使用率生成一条线性最佳拟合趋势线。

我的数据如下所示:

host_df_means['cpu_usage_percent']
history_datetime
2020-03-03     9.727273
2020-03-04     9.800000
2020-03-05     9.727273
2020-03-06    10.818182
2020-03-07     9.500000
2020-03-08    10.909091
2020-03-09    15.000000
2020-03-10    14.333333
2020-03-11    15.333333
2020-03-12    16.000000
2020-03-13    21.000000
2020-03-14    28.833333
Name: cpu_usage_percent, dtype: float64
# Import the necessary libraries
import datetime

import matplotlib.pyplot as plot
from scipy import stats
import numpy as np

# Daily mean CPU usage (percent) recreated from the question,
# one [ISO date string, value] row per consecutive day.
data = [["2020-03-03",9.727273],
["2020-03-04",9.800000],
["2020-03-05",9.727273],
["2020-03-06",10.818182],
["2020-03-07",9.500000],
["2020-03-08",10.909091],
["2020-03-09",15.000000],
["2020-03-10",14.333333],
["2020-03-11",15.333333],
["2020-03-12",16.000000],
["2020-03-13",21.000000],
["2020-03-14",28.833333]]

fig, ax = plot.subplots()

# Split the rows into date labels and CPU usage values
dates = [row[0] for row in data]
usage = [row[1] for row in data]

# The samples are one per consecutive day, so the row position (0..n-1) is
# used as x for the regression; the date strings only serve as tick labels.
observed_x = np.arange(len(usage))
bestfit = stats.linregress(observed_x, usage)
slope, intercept = bestfit[0], bestfit[1]
equation = str(round(slope, 2)) + "x + " + str(round(intercept, 2))
ax.plot(observed_x, usage)
# Same least-squares line, obtained via np.polyfit/np.poly1d to show the alternative API
ax.plot(observed_x, np.poly1d(np.polyfit(observed_x, usage, 1))(observed_x), '--', label=equation)

# How many days ahead to predict
extension = 5

# Build the future tick labels with real date arithmetic so that month/year
# boundaries roll over correctly and the day stays zero-padded.
last_day = datetime.datetime.strptime(dates[-1], "%Y-%m-%d").date()
dates.extend(
    (last_day + datetime.timedelta(days=offset)).isoformat()
    for offset in range(1, extension + 1)
)

# Evaluate the fitted line (mx + c) over observed + future positions.
# The observed values in `usage` are left untouched.
extended_x = np.arange(len(dates))
ax.plot(extended_x, slope * extended_x + intercept, ':', label=str(extension) + " day prediction")

# Set date tick labels (rotated so they do not overlap) and legend
ax.set_xticks(extended_x)
ax.set_xticklabels(dates, rotation=45, ha="right")
ax.legend()
fig.tight_layout()

# Display plot
plot.show()
然后,我用以下代码绘制这些数据:

# Draw the cpu_usage_percent column as a line; keep the object returned by .plot()
plot = host_df_means['cpu_usage_percent'].plot()
# Fix the visible x-range to 2020-03-03 .. 2020-03-31
x_start = datetime.date(2020, 3, 3)
x_end = datetime.date(2020, 3, 31)
plot.set_xlim([x_start, x_end])
# Bare expression with trailing semicolon (presumably to suppress notebook output)
plot;
这会生成如下图表:

现在,我想添加一条延伸到未来的趋势线,如下所示:

host_df_means['cpu_usage_percent']
history_datetime
2020-03-03     9.727273
2020-03-04     9.800000
2020-03-05     9.727273
2020-03-06    10.818182
2020-03-07     9.500000
2020-03-08    10.909091
2020-03-09    15.000000
2020-03-10    14.333333
2020-03-11    15.333333
2020-03-12    16.000000
2020-03-13    21.000000
2020-03-14    28.833333
Name: cpu_usage_percent, dtype: float64
# Import the necessary libraries
import datetime

import matplotlib.pyplot as plot
from scipy import stats
import numpy as np

# Daily mean CPU usage (percent) recreated from the question,
# one [ISO date string, value] row per consecutive day.
data = [["2020-03-03",9.727273],
["2020-03-04",9.800000],
["2020-03-05",9.727273],
["2020-03-06",10.818182],
["2020-03-07",9.500000],
["2020-03-08",10.909091],
["2020-03-09",15.000000],
["2020-03-10",14.333333],
["2020-03-11",15.333333],
["2020-03-12",16.000000],
["2020-03-13",21.000000],
["2020-03-14",28.833333]]

fig, ax = plot.subplots()

# Split the rows into date labels and CPU usage values
dates = [row[0] for row in data]
usage = [row[1] for row in data]

# The samples are one per consecutive day, so the row position (0..n-1) is
# used as x for the regression; the date strings only serve as tick labels.
observed_x = np.arange(len(usage))
bestfit = stats.linregress(observed_x, usage)
slope, intercept = bestfit[0], bestfit[1]
equation = str(round(slope, 2)) + "x + " + str(round(intercept, 2))
ax.plot(observed_x, usage)
# Same least-squares line, obtained via np.polyfit/np.poly1d to show the alternative API
ax.plot(observed_x, np.poly1d(np.polyfit(observed_x, usage, 1))(observed_x), '--', label=equation)

# How many days ahead to predict
extension = 5

# Build the future tick labels with real date arithmetic so that month/year
# boundaries roll over correctly and the day stays zero-padded.
last_day = datetime.datetime.strptime(dates[-1], "%Y-%m-%d").date()
dates.extend(
    (last_day + datetime.timedelta(days=offset)).isoformat()
    for offset in range(1, extension + 1)
)

# Evaluate the fitted line (mx + c) over observed + future positions.
# The observed values in `usage` are left untouched.
extended_x = np.arange(len(dates))
ax.plot(extended_x, slope * extended_x + intercept, ':', label=str(extension) + " day prediction")

# Set date tick labels (rotated so they do not overlap) and legend
ax.set_xticks(extended_x)
ax.set_xticklabels(dates, rotation=45, ha="right")
ax.legend()
fig.tight_layout()

# Display plot
plot.show()

我会使用 scipy.stats 的线性回归函数(linregress)求出最佳拟合方程,用来预测未来的使用率;您也可以按照 Chris A 的建议使用 np.polyfit 和 np.poly1d。下面的代码同时演示了这两种方法:

host_df_means['cpu_usage_percent']
history_datetime
2020-03-03     9.727273
2020-03-04     9.800000
2020-03-05     9.727273
2020-03-06    10.818182
2020-03-07     9.500000
2020-03-08    10.909091
2020-03-09    15.000000
2020-03-10    14.333333
2020-03-11    15.333333
2020-03-12    16.000000
2020-03-13    21.000000
2020-03-14    28.833333
Name: cpu_usage_percent, dtype: float64
# Import the necessary libraries
import datetime

import matplotlib.pyplot as plot
from scipy import stats
import numpy as np

# Daily mean CPU usage (percent) recreated from the question,
# one [ISO date string, value] row per consecutive day.
data = [["2020-03-03",9.727273],
["2020-03-04",9.800000],
["2020-03-05",9.727273],
["2020-03-06",10.818182],
["2020-03-07",9.500000],
["2020-03-08",10.909091],
["2020-03-09",15.000000],
["2020-03-10",14.333333],
["2020-03-11",15.333333],
["2020-03-12",16.000000],
["2020-03-13",21.000000],
["2020-03-14",28.833333]]

fig, ax = plot.subplots()

# Split the rows into date labels and CPU usage values
dates = [row[0] for row in data]
usage = [row[1] for row in data]

# The samples are one per consecutive day, so the row position (0..n-1) is
# used as x for the regression; the date strings only serve as tick labels.
observed_x = np.arange(len(usage))
bestfit = stats.linregress(observed_x, usage)
slope, intercept = bestfit[0], bestfit[1]
equation = str(round(slope, 2)) + "x + " + str(round(intercept, 2))
ax.plot(observed_x, usage)
# Same least-squares line, obtained via np.polyfit/np.poly1d to show the alternative API
ax.plot(observed_x, np.poly1d(np.polyfit(observed_x, usage, 1))(observed_x), '--', label=equation)

# How many days ahead to predict
extension = 5

# Build the future tick labels with real date arithmetic so that month/year
# boundaries roll over correctly and the day stays zero-padded.
last_day = datetime.datetime.strptime(dates[-1], "%Y-%m-%d").date()
dates.extend(
    (last_day + datetime.timedelta(days=offset)).isoformat()
    for offset in range(1, extension + 1)
)

# Evaluate the fitted line (mx + c) over observed + future positions.
# The observed values in `usage` are left untouched.
extended_x = np.arange(len(dates))
ax.plot(extended_x, slope * extended_x + intercept, ':', label=str(extension) + " day prediction")

# Set date tick labels (rotated so they do not overlap) and legend
ax.set_xticks(extended_x)
ax.set_xticklabels(dates, rotation=45, ha="right")
ax.legend()
fig.tight_layout()

# Display plot
plot.show()

保持数据为 `pd.DataFrame`,诀窍是将日期转换为可用于执行线性回归的数值类型。

import datetime
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as stats
from io import StringIO

# Set up data as in question: one row per day, the date followed by the
# mean CPU usage in percent (includes the question's final 2020-03-14 row).

host_df_means = pd.read_csv(StringIO("""
        2020-03-03     9.727273
        2020-03-04     9.800000
        2020-03-05     9.727273
        2020-03-06    10.818182
        2020-03-07     9.500000
        2020-03-08    10.909091
        2020-03-09    15.000000
        2020-03-10    14.333333
        2020-03-11    15.333333
        2020-03-12    16.000000
        2020-03-13    21.000000
        2020-03-14    28.833333
        """),
        # Raw string: '\s' is not a valid escape in a normal string literal
        sep=r'\s+', header=None, parse_dates=[0], index_col=0)
host_df_means.columns = ['cpu_usage_percent']
host_df_means.index.name = 'history_datetime'

fig, ax = plt.subplots(1, 1)
ax.plot(host_df_means.index, host_df_means)
# Leave room on the right of the axis for the projected trend
x_max = datetime.date(2020, 3, 31)
ax.set_xlim([datetime.date(2020, 3, 3), x_max])

# To perform the linear regression we need the dates to be numeric.
# Keep the ordinals in a separate variable so the DataFrame retains its
# datetime index.
x_ordinal = [timestamp.toordinal() for timestamp in host_df_means.index]
# Perform linear regression (y = y0 + slope * ordinal)
slope, y0, r, p, stderr = stats.linregress(x_ordinal,
                                           host_df_means['cpu_usage_percent'])

# x co-ordinates for the start and end of the line: from the first sample to
# the end of the visible axis, so the trend extends into the future.
# They are kept as date objects because matplotlib's date axis (default epoch
# 1970-01-01 since matplotlib 3.3) does not use proleptic ordinals; plotting
# raw ordinals would place the line outside the fixed x-limits.
x_endpoints = [host_df_means.index[0].date(), x_max]

# Compute predicted values from linear regression
y_endpoints = [y0 + slope * day.toordinal() for day in x_endpoints]
# Overlay the line
ax.plot(x_endpoints, y_endpoints, c='r')
ax.set_xlabel('history_datetime')

您遇到的具体问题是什么?请检查并