第十届泰迪杯数据挖掘挑战赛-B题--第一题_python

详细代码请转链接
🍞正在为您运送作品详情https://mianbaoduo.com/o/bread/Ypmck5tq

内容目录

import pandas as pd
import numpy as np
from datetime import datetime

# Configure matplotlib plotting
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.plotting import register_matplotlib_converters
# Register pandas' date/time converters with matplotlib so datetime-indexed
# data can be plotted directly.
register_matplotlib_converters()
# Apply the built-in 'fivethirtyeight' style sheet to every figure created below.
plt.style.use('fivethirtyeight')

from pylab import rcParams
# Default figure size (width, height) for all subsequently created figures.
rcParams['figure.figsize'] = 11, 9

#加载创建的EDA助手函数以进行一些高级分析
class EDA():
    '''
    Bundle of exploratory-data-analysis helpers operating on one DataFrame.

    For time series data, make the timestamp the index before creating the
    object: the time-series helpers read ``self.df.index``.
    '''

    # Class-level placeholder kept for backward compatibility; every instance
    # replaces it with its own frame in __init__.
    df = pd.DataFrame()

    def __init__(self, df):
        '''
        Creates EDA object for the DataFrame

        Note for time series data, have the index be the timestamp prior to creating this Object.

        :param df : DataFrame
        '''
        self.df = df

    def missing_values(self):
        '''
        Select the rows that contain at least one missing value.

        :return DataFrame
        '''
        missing = self.df[self.df.isna().any(axis=1)]

        print("Missing values data")

        return missing

    def duplicate_values(self):
        '''
        Select rows that repeat an earlier row (the first occurrence is kept out).

        :return DataFrame
        '''
        duplicates = self.df[self.df.duplicated(subset=None, keep='first')]

        print("Duplicate values data")

        return duplicates

    def duplicate_indices(self):
        '''
        Select rows whose index label repeats an earlier index label.

        :return DataFrame
        '''
        duplicate_indices = self.df[self.df.index.duplicated()]

        print("Duplicate indices")

        return duplicate_indices

    def summary(self):
        '''
        Return summary/describe of DataFrame, one row per column.

        :return DataFrame
        '''
        df = self.df.reset_index()  # Reset so the index is described as well

        summary = df.describe(include='all').transpose()

        print("Summary metrics")

        return summary

    def pandas_profiling(self):
        '''
        Build a pandas-profiling report for the DataFrame.

        :return the object produced by ``DataFrame.profile_report``
        '''
        # The import is needed for its side effect: ``profile_report`` is not
        # a native DataFrame method and is provided by this package.
        import pandas_profiling  # noqa: F401

        # Return the report; previously it was built and then thrown away.
        return self.df.profile_report(style={'full_width': True})

    def histogram_KDE(self):
        '''
        Draw pairwise plots of the columns with a KDE on the diagonal.

        :return seaborn PairGrid
        '''
        # seaborn's keyword is ``diag_kind``; ``diag`` is not accepted.
        return sns.pairplot(self.df, diag_kind='kde')

    def outliers(self, col):
        '''
        Checks outliers - anything outside of the 5% to 95% quantile range

        :param col : str
            Name of col to be tested

        :return DataFrame
        '''
        lower = self.df[col].quantile(.05)
        upper = self.df[col].quantile(.95)
        outliers = self.df[~self.df[col].between(lower, upper)]

        print("Outliers")

        return outliers

    def missing_timeseries_points(self, freq='D'):
        '''
        Checks whether there's any missing data points in continuous time series data.

        :param freq optional default = 'D' : str
            Frequency compliant with pandas formatting

        :return DatetimeIndex
            Timestamps expected between the first and last index value at
            the given frequency that are absent from the index.
        '''
        print("Missing timeseries data")

        # An empty index has no min/max to span, so nothing can be missing.
        if len(self.df.index) == 0:
            return pd.DatetimeIndex([])

        # Every timestamp that should exist between the observed endpoints.
        date_range = pd.date_range(start=self.df.index.min(), end=self.df.index.max(), freq=freq)

        # Expected timestamps that the data does not contain.
        missing_timeseries = date_range.difference(self.df.index)

        return missing_timeseries

    def corr_heatmap(self, title='Wine Attributes Correlation Heatmap'):
        '''
        Show an annotated heatmap of the pairwise column correlations.

        :param title optional : str
            Figure title; the default preserves the original hard-coded text.
        '''
        fig, ax = plt.subplots(figsize=(10, 6))
        corr = self.df.corr()
        sns.heatmap(corr.round(2), annot=True, cmap="coolwarm", fmt='.2f', linewidths=.05)
        fig.subplots_adjust(top=0.93)  # leave room for the title
        fig.suptitle(title, fontsize=14)

        plt.show()

    def plot_time_series_seasonal_decomp(self, type='add'):
        '''
        Plots seasonal decomposition of timeseries data

        :param type optional default = 'add' : str
            Passed to ``seasonal_decompose`` as its ``model`` argument.
            It was previously ignored and 'multiplicative' was always used.

        :return matplotlib Plot
        '''
        from statsmodels.tsa.seasonal import seasonal_decompose
        decomposition = seasonal_decompose(self.df, model=type)

        fig = decomposition.plot()
        plt.show()

        return fig

    def time_series_ADF(self, col='KwH'):
        '''
        Runs the Augmented Dickey-Fuller test on one column and prints the
        test statistic and p-value.

        :param col optional default = 'KwH' : str
            Column to test (ADF takes a series, not a DataFrame)

        :return the result of ``adfuller`` (statistic first, p-value second)
        '''
        from statsmodels.tsa.stattools import adfuller as ADF

        series = self.df[col]

        result = ADF(series)

        # '%f4.2' printed the number followed by a literal "4.2".
        print('ADF Statistic: %.4f' % result[0])
        print('P-value %.4f' % result[1])

        return result

#加载数据
# Read the CSV and key the frame on its 'Datetime' column in one step.
data = pd.read_csv("15分钟电力负荷.csv").set_index('Datetime')

# Convert the index labels to pandas datetimes.
data.index = pd.to_datetime(data.index)

# Keep an independent deep copy of the loaded frame.
data_copy = data.copy(deep=True)

统计测试“烟雾报警器”

这里为了方便，把15分钟粒度的日期改成了按天的日期，后期分析完再改回来
cutoff = '2246-11-17'

# sort_index() returns a new frame rather than sorting in place, so rebind it;
# otherwise the label slices below run on the unsorted data.
daily_data = daily_data.sort_index()

# Everything up to and including the cutoff date is training data.
train = daily_data[:cutoff]
# NOTE(review): this start date is earlier than `cutoff`, so `test` overlaps
# `train` - confirm whether `cutoff` was meant here.
test = daily_data['2243-11-17':]

def engineer_date_attributes(df):
    '''
    Derive calendar attributes from a datetime index and re-index on them.

    Pre-supposes input df has datetime as index. The input is not modified;
    a new frame is returned.

    :param df : DataFrame

    return: Dataframe
        Indexed by (Month, DOW, WOY), where DOW is the day of week, and WOY
        is the ISO week of the year. The original index becomes an ordinary
        column.
    '''
    # Work on a copy so the caller's frame (often a slice of a larger one)
    # does not silently gain columns.
    df = df.copy()

    df['DOW'] = df.index.dayofweek
    df['Month'] = df.index.month
    # DatetimeIndex.weekofyear was removed in pandas 2.0; isocalendar().week
    # is its replacement. Converting to a plain array avoids index alignment,
    # which would fail when timestamps repeat.
    df['WOY'] = df.index.isocalendar().week.to_numpy(dtype='int64')
    df = df.reset_index()

    df = df.set_index(['Month', 'DOW', 'WOY'])

    return df

# Calculate the average of the current load by DOW and Month,
# then get the 'shape' of the curve by day of week.
# That is: for every Month | DOW | WOY, what fraction of the Month | DOW
# average that group's load represents.
current_df = train['1950-11-16':'2246-11-15'].copy()
current_df = engineer_date_attributes(current_df)
# transform('mean') returns one value per row, already aligned with
# current_df's (Month, DOW, WOY) index. Assigning the plain groupby mean
# (indexed only by Month, DOW) does not line up with that index.
current_df['SHAPE_AVG'] = current_df.groupby(level=['Month', 'DOW'])['PJME_MW'].transform('mean')
current_df['PCT_SHAPE_AVG'] = current_df['PJME_MW'] / current_df['SHAPE_AVG']
current_df = current_df.reset_index().groupby(by=['Month', 'DOW', 'WOY']).mean()

current_df.head(21)


# Build an empty daily frame for the forecast horizon and attach the same
# calendar attributes so it can be matched against current_df.
future_df = pd.DataFrame()
future_df['Datetime'] = pd.date_range(start='2246-11-16', end='2252-11-15', freq='D')
future_df = future_df.set_index('Datetime')
future_df = engineer_date_attributes(future_df)
。。。。。
。。。。。
。。。。。略


# Key the prediction frame on its timestamp column, then drop the two
# intermediate shape columns.
baseline_v2_prediction = (
    baseline_v2_prediction
    .set_index('Datetime')
    .drop(['SHAPE_AVG', 'PCT_SHAPE_AVG'], axis=1)
)

baseline_v2_prediction.head(14)

、、、、

欢迎分享，转载请注明来源：内存溢出

原文地址: https://outofmemory.cn/langs/737334.html

第十届泰迪杯数据挖掘挑战赛-B题--第一题

发表评论

评论列表（0条）