# Source: time-series missing-value filling (时间序列缺失值填充)
from datetime import datetime

import pandas as pd
def fill_source(source, start_time, end_time):
    """Fill gaps in an hourly time series using linear interpolation.

    The input is outer-merged with a complete hourly timestamp grid running
    from ``start_time`` to ``end_time`` (both inclusive), sorted by time, and
    the data column is then linearly interpolated. Values that interpolation
    leaves missing (e.g. rows before the first observation) are back-filled
    from the next valid value.

    Because the merge is an outer join, rows of ``source`` whose timestamps
    fall outside the requested range are kept in the result.

    Relies on the module-level names ``column_name_time``,
    ``column_name_data`` and ``time_format_sample_time``.

    @param source: DataFrame holding the time column and the data column.
        It is not modified; ``None`` or an empty frame is returned as-is.
    @param start_time: start of the range, str in ``time_format_sample_time`` format
    @param end_time: end of the range, str in ``time_format_sample_time`` format
    @return: a new DataFrame sorted by the time column with gaps filled
    @raise ValueError: if ``start_time`` or ``end_time`` does not match
        ``time_format_sample_time``
    """
    if source is None or len(source) == 0:
        return source

    # Parse the range boundaries.
    range_start = datetime.strptime(start_time, time_format_sample_time)
    range_end = datetime.strptime(end_time, time_format_sample_time)

    # Work on a copy so the caller's frame keeps its original time column.
    filled = source.copy()
    filled[column_name_time] = pd.to_datetime(filled[column_name_time])

    # Complete hourly grid. A Timedelta is used instead of the "H" alias,
    # which is deprecated in recent pandas releases.
    hourly_grid = pd.DataFrame(
        {
            column_name_time: pd.date_range(
                range_start, range_end, freq=pd.Timedelta(hours=1)
            )
        }
    )
    filled = pd.merge(
        filled, hourly_grid, on=column_name_time, how="outer"
    ).sort_values(column_name_time)

    # Linear interpolation, then back-fill whatever is still missing.
    # The result is assigned back explicitly: an in-place fillna on
    # ``frame[col]`` operates on a temporary and is not guaranteed to update
    # the frame.
    interpolated = filled[column_name_data].interpolate(method="linear")
    filled[column_name_data] = interpolated.bfill()
    return filled
# Demo: four unordered two-hourly samples, filled onto an hourly grid that
# extends one hour beyond the samples on each side.
time_list = [
    "2022-01-04 04:00:00",
    "2022-01-04 06:00:00",
    "2022-01-04 00:00:00",
    "2022-01-04 02:00:00",
]
data_list = [4, 6, 0, 2]
source = pd.DataFrame(
    {
        column_name_time: time_list,
        column_name_data: data_list,
    }
)
source = fill_source(
    source,
    "2022-01-03 23:00:00",
    "2022-01-04 07:00:00",
)
# Feel free to share; when reposting, please credit the source: 内存溢出
# Comments (0)