Multi-step Time Series Prediction Using LSTM
Dataset Description
The data was collected between December 2006 and November 2010; power consumption within a single household was recorded every minute.
It is a multivariate series composed of seven variables:
global_active_power: The total active power consumed by the household (kilowatts).
global_reactive_power: The total reactive power consumed by the household (kilowatts).
voltage: Average voltage (volts).
global_intensity: Average current intensity (amps).
sub_metering_1: Active energy for kitchen (watt-hours of active energy).
sub_metering_2: Active energy for laundry (watt-hours of active energy).
sub_metering_3: Active energy for climate control systems (watt-hours of active energy).
This multivariate time series of power-related variables can be used to model, and even forecast, future electricity consumption.
Download dataset
https://archive.ics.uci.edu/ml/machine-learning-databases/00235/household_power_consumption.zip
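If you prefer to fetch and extract the archive from Python instead of downloading it manually, here is a minimal sketch (it assumes the UCI link above is still live; the file names are those of the archive):

import urllib.request
import zipfile

URL = ('https://archive.ics.uci.edu/ml/machine-learning-databases/'
       '00235/household_power_consumption.zip')

# Download the zip archive and extract household_power_consumption.txt
urllib.request.urlretrieve(URL, 'household_power_consumption.zip')
with zipfile.ZipFile('household_power_consumption.zip') as zf:
    zf.extractall('.')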
Importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numpy import nan
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
Reading the dataset
data = pd.read_csv('household_power_consumption.txt', sep=';', parse_dates=True, low_memory=False)
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1844765 entries, 0 to 1844764
Data columns (total 9 columns):
# Column Dtype
--- ------ -----
0 Date object
1 Time object
2 Global_active_power object
3 Global_reactive_power object
4 Voltage object
5 Global_intensity object
6 Sub_metering_1 object
7 Sub_metering_2 object
8 Sub_metering_3 float64
dtypes: float64(1), object(8)
Merging the Date and Time columns into a single date_time column
data['date_time'] = data['Date'].str.cat(data['Time'], sep=' ')
data.drop(['Date', 'Time'], inplace=True, axis=1)
data.head()
Setting time as index
data.set_index(['date_time'], inplace=True)
data.head()
Replacing '?' markers with NaN
data.replace('?', nan, inplace=True)
data = data.astype('float')
Checking NaN values
np.isnan(data).sum()
Global_active_power 13513
Global_reactive_power 13513
Voltage 13513
Global_intensity 13513
Sub_metering_1 13513
Sub_metering_2 13513
Sub_metering_3 13514
dtype: int64
Imputing missing data
def fill_missing(values):
    # Impute each missing reading with the value at the same minute one day earlier
    one_day = 24 * 60
    for row in range(values.shape[0]):
        for col in range(values.shape[1]):
            if np.isnan(values[row, col]):
                values[row, col] = values[row - one_day, col]

# data is all-float here, so data.values exposes the underlying array and the
# fill applies in place (on newer pandas with copy-on-write you may need to
# fill an explicit array and reassign it to the dataframe)
fill_missing(data.values)
np.isnan(data).sum()
Global_active_power 0
Global_reactive_power 0
Voltage 0
Global_intensity 0
Sub_metering_1 0
Sub_metering_2 0
Sub_metering_3 0
Final cleaned dataframe
data.to_csv('cleaned_data.csv')
dataset = pd.read_csv('cleaned_data.csv', parse_dates=True, index_col='date_time', low_memory=False)
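One caveat: the timestamps are day-first (e.g. 16/12/2006 17:24:00), and parse_dates=True leaves the format to pandas' guesser. If the dates ever come back misparsed, an explicit, defensive variant of the line above is:

dataset = pd.read_csv('cleaned_data.csv', low_memory=False)
# Parse the day-first timestamps explicitly, then restore the index
dataset['date_time'] = pd.to_datetime(dataset['date_time'], dayfirst=True)
dataset = dataset.set_index('date_time')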
dataset.head()
Downsampling the data into day-wise bins by summing the values of the timestamps falling into each bin
data = dataset.resample('D').sum()
Line plots of each daily variable
fig, ax = plt.subplots(figsize=(18, 18))
for i in range(len(data.columns)):
    plt.subplot(len(data.columns), 1, i + 1)
    name = data.columns[i]
    plt.plot(data[name])
    plt.title(name, y=0, loc='right')
    plt.yticks([])
fig.tight_layout()
plt.show()
Yearly histograms of Global_active_power
years = ['2007', '2008', '2009', '2010']  # the full years in the data; this list was implied but never defined above
fig, ax = plt.subplots(figsize=(18, 18))
for i in range(len(years)):
    plt.subplot(len(years), 1, i + 1)
    year = years[i]
    active_power_data = data.loc[year]
    active_power_data['Global_active_power'].hist(bins=200)
    plt.title(year, y=0, loc='left')
fig.tight_layout()
plt.show()
Histograms of each daily variable
fig, ax = plt.subplots(figsize=(18, 18))
for i in range(len(data.columns)):
    plt.subplot(len(data.columns), 1, i + 1)
    name = data.columns[i]
    data[name].hist(bins=200)
    plt.title(name, y=0, loc='right')
    plt.yticks([])
fig.tight_layout()
plt.show()
We split the data at the end of 2009: everything up to 2009-12-31 becomes the training set, and the remaining data (the year 2010) becomes the test set.
data_train = data.loc[:'2009-12-31', 'Global_active_power']
data_train.head()
date_time
2006-12-16 1209.176
2006-12-17 3390.460
2006-12-18 2203.826
2006-12-19 1666.194
2006-12-20 2225.748
Freq: D, Name: Global_active_power, dtype: float64
data_test = data.loc['2010', 'Global_active_power']
data_test.head()date_time
2010-01-01 1224.252
2010-01-02 1693.778
2010-01-03 1298.728
2010-01-04 1687.440
2010-01-05 1320.158
Freq: D, Name: Global_active_power, dtype: float64
data_train.shape, data_test.shape
((1112,), (340,))
data_train = np.array(data_train)
Splitting the training data into weekly (7-day) windows: each sample uses the previous 7 days as input to predict the next 7 days
X_train, y_train = [], []
for i in range(7, len(data_train) - 7):
    X_train.append(data_train[i-7:i])   # the previous 7 days as input
    y_train.append(data_train[i:i+7])   # the next 7 days as target
X_train, y_train = np.array(X_train), np.array(y_train)  # converting the lists into numpy arrays
X_train.shape, y_train.shape
((1098, 7), (1098, 7))
Normalising the dataset between 0 and 1
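The same windowing idea generalizes to other input and output lengths. A small helper sketch (the names make_windows, n_in and n_out are mine, not from the original; note it keeps one final window that the loop above stops short of):

def make_windows(series, n_in=7, n_out=7):
    # Slide over the series, pairing n_in past steps with the n_out steps that follow
    X, y = [], []
    for i in range(n_in, len(series) - n_out + 1):
        X.append(series[i - n_in:i])
        y.append(series[i:i + n_out])
    return np.array(X), np.array(y)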
x_scaler = MinMaxScaler()
X_train = x_scaler.fit_transform(X_train)
y_scaler = MinMaxScaler()  # the targets need their own scaler; it is used later to invert the predictions
y_train = y_scaler.fit_transform(y_train)
X_train = X_train.reshape(1098, 7, 1)  # reshaping into 3 dimensions (samples, timesteps, features) for the LSTM model
X_train.shape
(1098, 7, 1)
Building a sequential model using Keras
reg = Sequential()
reg.add(LSTM(units=200, activation='relu', input_shape=(7, 1)))
reg.add(Dense(7))
reg.compile(loss='mse', optimizer='adam')
reg.fit(X_train, y_train, epochs=100)
At the end of the 100th epoch, the training loss was 0.0235.
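Training for a fixed 100 epochs is a judgment call. An optional refinement, not part of the original run, is to hold out part of the training windows and stop early when the validation loss stops improving:

from tensorflow.keras.callbacks import EarlyStopping

# Stop once val_loss has not improved for 10 epochs and keep the best weights
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reg.fit(X_train, y_train, epochs=100, validation_split=0.1, callbacks=[early_stop])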
data_test = np.array(data_test)
Splitting the test data into weekly (7-day) windows in the same way
X_test, y_test = [], []
for i in range(7, len(data_test) - 7):
    X_test.append(data_test[i-7:i])
    y_test.append(data_test[i:i+7])
X_test, y_test = np.array(X_test), np.array(y_test)
X_test = x_scaler.transform(X_test)  # reuse the scalers fitted on the training data
y_test = y_scaler.transform(y_test)
X_test.shape
(326, 7)
X_test = X_test.reshape(326, 7, 1)
X_test.shape
(326, 7, 1)
y_pred = reg.predict(X_test)
Bringing the predictions back to their original scale with the inverse transform
y_pred = y_scaler.inverse_transform(y_pred)
y_true = y_scaler.inverse_transform(y_test)
Evaluating the model
Since this is a regression problem, we use root mean squared error (RMSE): one score for each forecast day, plus an overall score across all seven days.
def evaluate_model(y_true, y_predicted):
    scores = []
    # Calculate an RMSE score for each forecast day
    for i in range(y_true.shape[1]):
        mse = mean_squared_error(y_true[:, i], y_predicted[:, i])
        rmse = np.sqrt(mse)
        scores.append(rmse)
    # Calculate a single RMSE score over the whole prediction
    total_score = 0
    for row in range(y_true.shape[0]):
        for col in range(y_predicted.shape[1]):
            total_score = total_score + (y_true[row, col] - y_predicted[row, col])**2
    total_score = np.sqrt(total_score / (y_true.shape[0] * y_predicted.shape[1]))
    return total_score, scores

evaluate_model(y_true, y_pred)
(223.66414385063564,
[23.755551511896517,
19.149462663780774,
196.17614003285703,
410.0865160205628,
209.69801685827352,
121.53376189597935,
289.56652857013654])
Standard deviation
np.std(y_true[0])
0.2449465129684969
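To see how the error grows with the forecast horizon, you can plot the per-day RMSE scores returned by evaluate_model (a small optional sketch):

total_score, scores = evaluate_model(y_true, y_pred)
# One RMSE per forecast day; error typically rises the further ahead we predict
plt.plot(range(1, 8), scores, marker='o')
plt.xlabel('Forecast day')
plt.ylabel('RMSE of daily Global_active_power')
plt.show()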
Result
Comparing the forecast error against the spread of the test data is a quick sanity check: an RMSE below the standard deviation of the series means the model does better than a trivial constant forecast, so we can say the model performs reasonably on the test dataset.
Hope you liked the analysis!
You can follow me on LinkedIn, GitHub, and Kaggle.
Github Link of this project
https://github.com/ratul442/Multi-step-Time-series-predicting-using-RNN-LSTM