This notebook presents a few examples of clustering time series with regular k-means as well as with time-series k-means based on DTW Barycenter Averaging (DBA-k-means).
REMARK: start with Dataset 1 and note that plain k-means appears to work (arguably it should not, since the two classes are the same series, only shifted), then move on to Dataset 4, where plain k-means fails and the DTW-based clustering succeeds.
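To see concretely why a pure shift matters for the distance measure, here is a minimal sketch (not part of the original notebook, assuming tslearn is installed) comparing the Euclidean and DTW distances between a sine wave and a shifted copy of itself:
# Sketch: the two series are identical up to a shift, so the point-wise (Euclidean)
# distance is large while the DTW distance stays substantially smaller.
import numpy as np
from tslearn.metrics import dtw
T = np.linspace(0, 4*np.pi, 100)
s1 = np.sin(T)
s2 = np.roll(s1, shift=10)          # same series, shifted by 10 samples
print(np.linalg.norm(s1 - s2))      # Euclidean distance: large
print(dtw(s1, s2))                  # DTW distance: substantially smaller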
# !conda install -c conda-forge tslearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import gzip
import pickle
import time
%matplotlib inline
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42
plt.rcParams['axes.facecolor'] = 'white'
plt.rcParams['figure.facecolor'] = 'white'
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans
from tslearn.clustering import TimeSeriesKMeans
# Dataset 0
df = pd.read_csv('water_consumption_profiles.csv')
df['date'] = df['date'].map(lambda x: pd.to_datetime(x, format='%Y-%m-%d'))
df = df.set_index('date')
df_ = df.groupby([df.index.month, df.index.weekday]).mean()
X = df_.values
y = df_.reset_index([1])['date'].map(lambda x: 0 if x < 5 else 1).values
N = len(y)
print(X.shape, y.shape)
plt.figure(figsize=(12, 4))
# plt.plot(X.T, 'k')
plt.plot(X[y == 0, :].T, '#1F77B4')
plt.plot(X[y == 1, :].T, '#FF7F0E')
plt.show()
# plt.figure(figsize=(12, 4))
# # plt.plot(X.T, 'k')
# plt.plot(X[y == 0, :][:1,:].T, '#1F77B4')
# plt.plot(X[y == 1, :][:1,:].T, '#FF7F0E')
# plt.show()
(84, 144) (84,)
shuffled_indices_ = np.random.permutation(N)
test_indices = shuffled_indices_[int(0.65*N):].copy()
train_indices = shuffled_indices_[:int(0.65*N)].copy()
X_train, y_train = X[train_indices, :], y[train_indices]
X_test, y_test = X[test_indices, :], y[test_indices]
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
(54, 144) (54,) (30, 144) (30,)
t = time.time()
kmeans = KMeans(n_clusters=2)
kmeans.fit(X_train)
print(time.time() - t)
0.013117551803588867
labels = kmeans.predict(X_train)
plt.figure(figsize=(12, 4))
# plt.plot(X.T, 'k')
plt.plot(X_train[labels == 0, :].T, '#1F77B4')
plt.plot(X_train[labels == 1, :].T, '#FF7F0E')
plt.show()
plt.figure(figsize=(12, 4))
# plt.plot(X.T, 'k')
plt.plot(kmeans.cluster_centers_[0, :].T, '#1F77B4')
plt.plot(kmeans.cluster_centers_[1, :].T, '#FF7F0E')
plt.show()
labels = kmeans.predict(X_test)
plt.figure(figsize=(12, 4))
# plt.plot(X.T, 'k')
plt.plot(X_test[labels == 0, :].T, '#1F77B4')
plt.plot(X_test[labels == 1, :].T, '#FF7F0E')
plt.show()
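The plots above are only a qualitative check; a quick quantitative comparison against the weekday (0) / weekend (1) labels could look like this (a sketch, assuming the variables from the cells above, and keeping in mind that the cluster indices 0/1 are assigned arbitrarily by k-means):
# Sketch: compare cluster assignments with the weekday/weekend labels.
print(confusion_matrix(y_train, kmeans.predict(X_train)))
print(confusion_matrix(y_test, kmeans.predict(X_test)))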
# # Dataset 1
# L = 100
# N1 = 100
# N2 = 100
# N = N1 + N2
# T = np.linspace(0, 4*np.pi, L)
# # X1 = np.sin(T + 0.00 * np.random.randn(N1, L)) + 0.15 * np.random.randn(N1, L)
# X1 = np.sin(T + 0.35 * np.random.randn(N1, L)) + 0.15 * np.random.randn(N1, L)
# y1 = 0 * np.ones(N1)
# # X2 = np.cos(T + 0.00 * np.random.randn(N2, L)) + 0.15 * np.random.randn(N2, L)
# X2 = np.cos(T + 0.35 * np.random.randn(N2, L)) + 0.15 * np.random.randn(N2, L)
# y2 = 1 * np.ones(N2)
# X = np.vstack([X1, X2])
# y = np.hstack([y1, y2])
# print(X.shape, y.shape)
# plt.figure(figsize=(12, 4))
# # plt.plot(X.T, 'k')
# plt.plot(X[y == 0, :].T, '#1F77B4')
# plt.plot(X[y == 1, :].T, '#FF7F0E')
# plt.show()
# plt.figure(figsize=(12, 4))
# # plt.plot(X.T, 'k')
# plt.plot(X[y == 0, :][:1,:].T, '#1F77B4')
# plt.plot(X[y == 1, :][:1,:].T, '#FF7F0E')
# plt.show()
# # Dataset 2
# L = 100
# N1 = 100
# N2 = 100
# N = N1 + N2
# T = np.linspace(0, 4*np.pi, L)
# # X1 = np.sin(T + 0.00 * np.random.randn(N1, L)) + 0.15 * np.random.randn(N1, L)
# X1 = np.sin(T + 0.35 * np.random.randn(N1, L)) + 0.15 * np.random.randn(N1, L)
# y1 = 0 * np.ones(N1)
# # X2 = np.cos(T + 0.00 * np.random.randn(N2, L)) + 0.15 * np.random.randn(N2, L)
# # X2 = np.cos(T + 0.35 * np.random.randn(N2, L)) + 0.15 * np.random.randn(N2, L)
# X2 = np.sin(T + 0.35 * np.random.randn(N2, L)) + 0.15 * np.random.randn(N2, L)
# X2 = np.clip(1.75 * X2, -1.0, 1.0)
# y2 = 1 * np.ones(N2)
# X = np.vstack([X1, X2])
# y = np.hstack([y1, y2])
# print(X.shape, y.shape)
# plt.figure(figsize=(12, 4))
# # plt.plot(X.T, 'k')
# plt.plot(X[y == 0, :].T, '#1F77B4')
# plt.plot(X[y == 1, :].T, '#FF7F0E')
# plt.show()
# plt.figure(figsize=(12, 4))
# # plt.plot(X.T, 'k')
# plt.plot(X[y == 0, :][:1,:].T, '#1F77B4')
# plt.plot(X[y == 1, :][:1,:].T, '#FF7F0E')
# plt.show()
# # Dataset 3
# L = 100
# N1a = 100
# N1b = 100
# N1 = N1a + N1b
# N2a = 100
# N2b = 100
# N2 = N2a + N2b
# N = N1 + N2
# T = np.linspace(0, 4*np.pi, L)
# X1a = np.sin(T + 0.00 * np.random.randn(N1a, L)) + 0.15 * np.random.randn(N1a, L)
# # X1a = np.sin(T + 0.35 * np.random.randn(N1a, L)) + 0.15 * np.random.randn(N1a, L)
# y1a = 0 * np.ones(N1a)
# X2a = np.cos(T + 0.00 * np.random.randn(N2a, L)) + 0.15 * np.random.randn(N2a, L)
# # X2a = np.cos(T + 0.35 * np.random.randn(N2a, L)) + 0.15 * np.random.randn(N2a, L)
# # X2a = np.sin(T + 0.35 * np.random.randn(N2a, L)) + 0.15 * np.random.randn(N2a, L)
# X2a = np.clip(1.75 * X2a, -1.0, 1.0)
# y2a = 1 * np.ones(N2a)
# X1b = np.cos(T + 0.00 * np.random.randn(N1b, L)) + 0.15 * np.random.randn(N1b, L)
# # X1b = np.cos(T + 0.35 * np.random.randn(N1b, L)) + 0.15 * np.random.randn(N1b, L)
# y1b = 0 * np.ones(N1b)
# X2b = np.sin(T + 0.00 * np.random.randn(N2b, L)) + 0.15 * np.random.randn(N2b, L)
# # X2a = np.sin(T + 0.35 * np.random.randn(N2b, L)) + 0.15 * np.random.randn(N2b, L)
# # X2a = np.cos(T + 0.35 * np.random.randn(N2b, L)) + 0.15 * np.random.randn(N2b, L)
# X2b = np.clip(1.75 * X2b, -1.0, 1.0)
# y2b = 1 * np.ones(N2b)
# X = np.vstack([X1a, X2a, X1b, X2b])
# y = np.hstack([y1a, y2a, y1b, y2b])
# print(X.shape, y.shape)
# plt.figure(figsize=(12, 4))
# # plt.plot(X.T, 'k')
# plt.plot(X[y == 0, :].T, '#1F77B4')
# plt.plot(X[y == 1, :].T, '#FF7F0E')
# plt.show()
# plt.figure(figsize=(12, 4))
# # plt.plot(X.T, 'k')
# plt.plot(X[y == 0, :][:1,:].T, '#1F77B4')
# plt.plot(X[y == 1, :][:1,:].T, '#FF7F0E')
# plt.show()
# Dataset 4
L = 100
N1 = 100
N2 = 100
N = N1 + N2
T = np.linspace(0, 4*np.pi, L)
X1 = 0.1 * (np.sin(2*T + 0.00 * np.random.randn(N1, L)) + 0.15 * np.random.randn(N1, L))
# X1 = 0.1 * (np.sin(2*T + 0.35 * np.random.randn(N1, L)) + 0.15 * np.random.randn(N1, L))
# X1[:, int(0.25*L):] = np.zeros((N1, L - int(0.25*L)))
X1[:, int(0.25*L):] = 0.025 * np.random.randn(N1, L - int(0.25*L))
# X1 = np.roll(X1, shift=int(0.25*L), axis=1)
# randomly shift each series so the pulse appears at a different position in every sample
for i in range(N1):
    X1[i, :] = np.roll(X1[i, :], shift=np.random.randint(int(0.75*L)), axis=0)
y1 = 0 * np.ones(N1)
X2 = 0.1 * (np.sin(2*T + np.pi + 0.00 * np.random.randn(N2, L)) + 0.15 * np.random.randn(N2, L))
# X2 = 0.1 * (np.sin(2*T + np.pi + 0.35 * np.random.randn(N2, L)) + 0.15 * np.random.randn(N2, L))
# X2[:, int(0.25*L):] = np.zeros((N2, L - int(0.25*L)))
X2[:, int(0.25*L):] = 0.025 * np.random.randn(N2, L - int(0.25*L))
# X2 = np.roll(X2, shift=int(0.25*L), axis=1)
# randomly shift each series so the pulse appears at a different position in every sample
for i in range(N2):
    X2[i, :] = np.roll(X2[i, :], shift=np.random.randint(int(0.75*L)), axis=0)
y2 = 1 * np.ones(N2)
X = np.vstack([X1, X2])
y = np.hstack([y1, y2])
print(X.shape, y.shape)
plt.figure(figsize=(12, 4))
# plt.plot(X.T, 'k')
plt.plot(X[y == 0, :].T, '#1F77B4')
plt.plot(X[y == 1, :].T, '#FF7F0E')
plt.show()
plt.figure(figsize=(12, 4))
# plt.plot(X.T, 'k')
plt.plot(X[y == 0, :][:1,:].T, '#1F77B4')
plt.plot(X[y == 1, :][:1,:].T, '#FF7F0E')
plt.show()
(200, 100) (200,)
shuffled_indices_ = np.random.permutation(N)
test_indices = shuffled_indices_[int(0.65*N):].copy()
train_indices = shuffled_indices_[:int(0.65*N)].copy()
X_train, y_train = X[train_indices, :], y[train_indices]
X_test, y_test = X[test_indices, :], y[test_indices]
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
(130, 100) (130,) (70, 100) (70,)
t = time.time()
kmeans = KMeans(n_clusters=2)
kmeans.fit(X_train)
print(time.time() - t)
0.012945890426635742
labels = kmeans.predict(X_train)
plt.figure(figsize=(12, 4))
# plt.plot(X.T, 'k')
plt.plot(X_train[labels == 0, :].T, '#1F77B4')
plt.plot(X_train[labels == 1, :].T, '#FF7F0E')
plt.show()
plt.figure(figsize=(12, 4))
# plt.plot(X.T, 'k')
plt.plot(kmeans.cluster_centers_[0, :].T, '#1F77B4')
plt.plot(kmeans.cluster_centers_[1, :].T, '#FF7F0E')
plt.show()
confusion_matrix(y_train, labels)
array([[33, 34], [30, 33]])
labels = kmeans.predict(X_test)
plt.figure(figsize=(12, 4))
# plt.plot(X.T, 'k')
plt.plot(X_test[labels == 0, :].T, '#1F77B4')
plt.plot(X_test[labels == 1, :].T, '#FF7F0E')
plt.show()
confusion_matrix(y_test, labels)
array([[18, 15], [19, 18]])
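Because k-means assigns cluster indices arbitrarily, the confusion matrices above are only meaningful up to a permutation of the columns; a label-permutation-invariant score such as the adjusted Rand index makes the chance-level quality of this clustering explicit. A minimal sketch, assuming the fitted kmeans and the train/test splits from the cells above:
# Sketch: adjusted Rand index is ~0 for chance-level clusterings, 1 for a perfect match.
from sklearn.metrics import adjusted_rand_score
print(adjusted_rand_score(y_train, kmeans.predict(X_train)))
print(adjusted_rand_score(y_test, kmeans.predict(X_test)))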
t = time.time()
# tskmeans = TimeSeriesKMeans(n_clusters=2, metric='euclidean', n_init=5)
tskmeans = TimeSeriesKMeans(n_clusters=2, metric='dtw', max_iter_barycenter=100, n_init=5)
# tskmeans = TimeSeriesKMeans(n_clusters=2, metric='softdtw', metric_params={"gamma": .01}, n_init=5)
tskmeans.fit(X_train)
print(time.time() - t)
13.5380220413208
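The DTW-based fit is roughly a thousand times slower than plain k-means here (about 13.5 s versus 0.01 s). Depending on the installed tslearn version, the n_jobs argument of TimeSeriesKMeans can spread the DTW computations over several cores; a sketch (not run above, reusing the import from earlier):
# Hypothetical variant: parallelize the DTW computations across all available cores.
# tskmeans = TimeSeriesKMeans(n_clusters=2, metric='dtw', max_iter_barycenter=100, n_init=5, n_jobs=-1)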
labels = tskmeans.predict(X_train)
plt.figure(figsize=(12, 4))
# plt.plot(X.T, 'k')
plt.plot(X_train[labels == 0, :].T, '#1F77B4')
plt.plot(X_train[labels == 1, :].T, '#FF7F0E')
plt.show()
confusion_matrix(y_train, labels)
array([[67, 0], [ 0, 63]])
plt.figure(figsize=(12, 4))
# plt.plot(X.T, 'k')
plt.plot(tskmeans.cluster_centers_[0, :, 0].T, '#1F77B4')
plt.plot(tskmeans.cluster_centers_[1, :, 0].T, '#FF7F0E')
plt.show()
labels = tskmeans.predict(X_test)
plt.figure(figsize=(12, 4))
# plt.plot(X.T, 'k')
plt.plot(X_test[labels == 0, :].T, '#1F77B4')
plt.plot(X_test[labels == 1, :].T, '#FF7F0E')
plt.show()
confusion_matrix(y_test, labels)
array([[33, 0], [ 0, 37]])
[1] F. Petitjean, A. Ketterlin, P. Gancarski, "A global averaging method for dynamic time warping, with applications to clustering". Pattern Recognition, 44(3), 2011, pp. 678-693.
[2] R. Tavenard, J. Faouzi, G. Vandewiele, F. Divo, G. Androz, C. Holtz, M. Payne, R. Yurchak, M. Russwurm, K. Kolar, E. Woods, "Tslearn, A Machine Learning Toolkit for Time Series Data". Journal of Machine Learning Research, 21(118), 2020, pp. 1-6.