The full code is available at the following link: https://mianbaoduo.com/o/bread/Ypmck5tq
import pandas as pd
import numpy as np
from datetime import datetime
# Configure matplotlib plotting
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
plt.style.use('fivethirtyeight')
from pylab import rcParams
rcParams['figure.figsize'] = 11, 9
# EDA helper class created for some higher-level analysis
class EDA():
df = pd.DataFrame()
def __init__(self, df):
'''
Creates EDA object for the DataFrame
Note: for time series data, set the index to the timestamp before creating this object.
:param df : DataFrame
'''
self.df = df
def missing_values(self):
'''
Checks missing values
:return DataFrame
'''
missing = self.df[self.df.isna().any(axis=1)]
print("Missing values data")
return missing
def duplicate_values(self):
'''
Checks duplicate rows (the first occurrence is treated as the original)
:return DataFrame
'''
duplicates = self.df[self.df.duplicated(subset=None, keep='first')]
print("Duplicate values data")
return duplicates
def duplicate_indices(self):
'''
Check whether the indices have any duplicates
:return DataFrame
'''
duplicate_indices = self.df[self.df.index.duplicated()]
print("Duplicate indices")
return duplicate_indices
def summary(self):
'''
Return summary/describe of DataFrame
:return DataFrame
'''
df = self.df.reset_index() # Reset to include the index
summary = df.describe(include='all').transpose()
print("Summary metrics")
return summary
def pandas_profiling(self):
'''
Generates a pandas-profiling report of the DataFrame
:return ProfileReport
'''
import pandas_profiling
return self.df.profile_report(style={'full_width': True})
def histogram_KDE(self, col):
'''
Pairwise plots with KDE diagonals, plus a distribution plot with a fitted normal for one column
:param col : str
Name of col to be plotted
:return seaborn plot
'''
import scipy.stats as st
sns.pairplot(self.df, diag_kind='kde')
plt.figure()
sns.distplot(self.df[col], kde=True, fit=st.norm)
def outliers(self, col):
'''
Checks outliers - anything outside the 5th to 95th percentile range
:param col : str
Name of col to be tested
:return DataFrame
'''
outliers = self.df[~self.df[col].between(self.df[col].quantile(.05), self.df[col].quantile(.95))]
print("Outliers")
return outliers
def missing_timeseries_points(self, freq='D'):
'''
Checks whether there's any missing data points in continuous time series data.
:param freq optional default = 'D' : str
Frequency compliant with pandas formatting
:return DatetimeIndex
'''
# First create the expected date range
date_range = pd.date_range(start=self.df.index.min(), end=self.df.index.max(), freq=freq)
# Now find expected timestamps that are absent from the dataset
missing_timeseries = date_range[~date_range.isin(self.df.index)]
print("Missing timeseries data")
return missing_timeseries
def corr_heatmap(self):
'''
Plots a correlation heatmap of the DataFrame's columns
:return matplotlib Plot
'''
fig, ax = plt.subplots(figsize=(10, 6))
corr = self.df.corr()
hm = sns.heatmap(round(corr, 2), annot=True, cmap="coolwarm", fmt='.2f', linewidths=.05)
fig.subplots_adjust(top=0.93)
title = fig.suptitle('Correlation Heatmap', fontsize=14)
plt.show()
def plot_time_series_seasonal_decomp(self, model='additive'):
'''
Plots seasonal decomposition of timeseries data
:param model optional default = 'additive' : str
Either 'additive' or 'multiplicative'
:return matplotlib Plot
'''
from statsmodels.tsa.seasonal import seasonal_decompose
decomposition = seasonal_decompose(self.df, model=model)
fig = decomposition.plot()
plt.show()
def time_series_ADF(self, col):
'''
Runs the Augmented Dickey-Fuller stationarity test on one column
:param col : str
Name of col to be tested
'''
from statsmodels.tsa.stattools import adfuller as ADF
series = self.df[col]  # ADF takes a Series, not a DataFrame
result = ADF(series)
print('ADF Statistic: %4.2f' % result[0])
print('P-value: %4.2f' % result[1])
# Load the data
data = pd.read_csv("15分钟电力负荷.csv")
data.set_index('Datetime',inplace=True)
data.index = pd.to_datetime(data.index)
data_copy = data.copy(deep=True)
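Before any modelling, the helper class above can be pointed at the raw series for a quick first pass. A minimal sketch, assuming the load column is named 'PJME_MW' (the name used further below) and that the raw file is on a 15-minute grid:
eda = EDA(data)
print(eda.missing_values())                         # rows containing NaNs
print(eda.duplicate_indices())                      # repeated timestamps
print(eda.missing_timeseries_points(freq='15min'))  # gaps on the 15-minute grid
print(eda.outliers('PJME_MW'))                      # values outside the 5th-95th percentile band
eda.time_series_ADF('PJME_MW')                      # stationarity check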
Statistical tests as a "smoke alarm"
For convenience, the 15-minute timestamps are aggregated to daily dates here; once the analysis is finished they are converted back.
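The aggregation step itself is not shown in the excerpt; a minimal sketch, assuming the daily figure should be the sum of the 15-minute readings (use .mean() instead if the series is an average load rather than energy):
# Aggregate the 15-minute readings to one value per day (sum is an assumption)
daily_data = data.resample('D').sum()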
cutoff = '2246-11-17'
daily_data = daily_data.sort_index()
train = daily_data[:cutoff]
test = daily_data['2243-11-17':]
def engineer_date_attributes(df):
'''
Pre-supposes input df has datetime as index.
:param df : DataFrame
:return DataFrame
'''
df['DOW'] = df.index.dayofweek
df['Month'] = df.index.month
df['WOY'] = df.index.weekofyear
df = df.reset_index()
df = df.set_index(['Month', 'DOW', 'WOY'])
return df
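As a quick sanity check, the helper can be applied to a copy of the daily frame to inspect the resulting (Month, DOW, WOY) MultiIndex:
# Inspect the engineered calendar index (illustration only)
print(engineer_date_attributes(daily_data.copy()).head())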
# Calculate the average load by DOW and Month
# Then get the 'shape' of the curve by day of week:
# that is, for every (Month, DOW, WOY), what percentage of the (Month, DOW) average it represents
current_df = train['1950-11-16':'2246-11-15']
current_df = engineer_date_attributes(current_df)
current_df['SHAPE_AVG'] = current_df.groupby(level=['Month', 'DOW'])['PJME_MW'].transform('mean')  # per-(Month, DOW) average, broadcast to every row
current_df['PCT_SHAPE_AVG'] = current_df['PJME_MW'] / current_df['SHAPE_AVG']
current_df = current_df.reset_index().groupby(by=['Month', 'DOW', 'WOY']).mean()
current_df.head(21)
#current_df.count()
future_df = pd.DataFrame()
future_df['Datetime'] = pd.date_range(start='2246-11-16', end='2252-11-15', freq='D')
future_df = future_df.set_index('Datetime')
future_df = engineer_date_attributes(future_df)
…… (the remaining steps are omitted here; see the full code at the link above)
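Purely as an illustration of the general idea described in the comments above, and not a reconstruction of the omitted code, a baseline frame of this shape could be assembled by joining the per-(Month, DOW, WOY) table onto the engineered future calendar and rebuilding an estimate as group average times percentage shape (the 'PJME_MW' column name is an assumption):
# Illustrative sketch only -- the steps omitted from the original post are not reproduced here
baseline_v2_prediction = future_df.join(current_df[['SHAPE_AVG', 'PCT_SHAPE_AVG']], how='left')
baseline_v2_prediction['PJME_MW'] = baseline_v2_prediction['SHAPE_AVG'] * baseline_v2_prediction['PCT_SHAPE_AVG']
baseline_v2_prediction = baseline_v2_prediction.reset_index()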
baseline_v2_prediction = baseline_v2_prediction.set_index('Datetime').drop(columns=['SHAPE_AVG', 'PCT_SHAPE_AVG'])
baseline_v2_prediction.head(14)
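To judge whether this calendar-shape baseline is worth beating, it can be scored against the held-out daily actuals wherever the forecast horizon overlaps them. A minimal sketch, assuming both frames carry a 'PJME_MW' column and share a daily 'Datetime' index:
# Score the baseline against held-out actuals on overlapping dates (illustrative sketch)
common = baseline_v2_prediction.index.intersection(test.index)
if len(common) > 0:
    errors = test.loc[common, 'PJME_MW'] - baseline_v2_prediction.loc[common, 'PJME_MW']
    mape = 100 * (errors.abs() / test.loc[common, 'PJME_MW']).mean()
    print('Baseline MAE: %.2f' % errors.abs().mean())
    print('Baseline MAPE: %.2f%%' % mape)
else:
    print('Forecast horizon does not overlap the held-out slice')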