Source code for UCTB.dataset.data_loader

import os
import copy
import datetime
import numpy as np

from dateutil.parser import parse
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr

from ..preprocess.time_utils import is_work_day_china, is_work_day_america, is_valid_date
from ..preprocess import MoveSample, SplitData, ST_MoveSample, Normalizer
from ..model_unit import GraphBuilder

from .dataset import DataSet


class GridTrafficLoader(object):

    def __init__(self,
                 dataset,
                 city=None,
                 data_range='all',
                 train_data_length='all',
                 test_ratio=0.1,
                 closeness_len=6,
                 period_len=7,
                 trend_len=4,
                 target_length=1,
                 normalize=True,
                 workday_parser=is_work_day_america,
                 data_dir=None,
                 **kwargs):

        self.dataset = DataSet(dataset, city, data_dir=data_dir)

        self.daily_slots = 24 * 60 / self.dataset.time_fitness

        if type(data_range) is str and data_range.lower() == 'all':
            data_range = [0, len(self.dataset.grid_traffic)]
        elif type(data_range) is float:
            data_range = [0, int(data_range * len(self.dataset.grid_traffic))]
        else:
            data_range = [int(data_range[0] * self.daily_slots), int(data_range[1] * self.daily_slots)]

        num_time_slots = data_range[1] - data_range[0]

        self.traffic_data = self.dataset.grid_traffic[data_range[0]:data_range[1], :].astype(np.float32)

        # external feature
        external_feature = []
        # weather
        if len(self.dataset.external_feature_weather) > 0:
            external_feature.append(self.dataset.external_feature_weather[data_range[0]:data_range[1]])
        # weekday feature
        weekday_feature = [[1 if workday_parser(parse(self.dataset.time_range[1])
                                                + datetime.timedelta(hours=e * self.dataset.time_fitness / 60)) else 0]
                           for e in range(data_range[0], num_time_slots + data_range[0])]
        # hour feature
        hour_feature = [[(parse(self.dataset.time_range[1])
                          + datetime.timedelta(hours=e * self.dataset.time_fitness / 60)).hour / 24.0]
                        for e in range(data_range[0], num_time_slots + data_range[0])]

        external_feature.append(weekday_feature)
        external_feature.append(hour_feature)
        external_feature = np.concatenate(external_feature, axis=-1).astype(np.float32)

        self.height, self.width = self.traffic_data.shape[1], self.traffic_data.shape[2]
        self.external_dim = external_feature.shape[1]

        if test_ratio > 1 or test_ratio < 0:
            raise ValueError('test_ratio should be a float between 0 and 1.')
        train_test_ratio = [1 - test_ratio, test_ratio]

        self.train_data, self.test_data = SplitData.split_data(self.traffic_data, train_test_ratio)
        self.train_ef, self.test_ef = SplitData.split_data(external_feature, train_test_ratio)

        # normalize the traffic data
        if normalize:
            self.normalizer = Normalizer(self.train_data)
            self.train_data = self.normalizer.min_max_normal(self.train_data)
            self.test_data = self.normalizer.min_max_normal(self.test_data)

        if train_data_length.lower() != 'all':
            train_day_length = int(train_data_length)
            self.train_data = self.train_data[-int(train_day_length * self.daily_slots):]
            self.train_ef = self.train_ef[-int(train_day_length * self.daily_slots):]

        # expand the test data so that the first test sample has enough history
        expand_start_index = len(self.train_data) - max(int(self.daily_slots * period_len),
                                                        int(self.daily_slots * 7 * trend_len),
                                                        closeness_len)

        self.test_data = np.vstack([self.train_data[expand_start_index:], self.test_data])
        self.test_ef = np.vstack([self.train_ef[expand_start_index:], self.test_ef])

        assert type(closeness_len) is int and closeness_len >= 0
        assert type(period_len) is int and period_len >= 0
        assert type(trend_len) is int and trend_len >= 0

        self.closeness_len = closeness_len
        self.period_len = period_len
        self.trend_len = trend_len

        # init move sample obj
        self.st_move_sample = ST_MoveSample(closeness_len=closeness_len,
                                            period_len=period_len,
                                            trend_len=trend_len,
                                            target_length=1,
                                            daily_slots=self.daily_slots)

        self.train_closeness, \
        self.train_period, \
        self.train_trend, \
        self.train_y = self.st_move_sample.move_sample(self.train_data)

        self.test_closeness, \
        self.test_period, \
        self.test_trend, \
        self.test_y = self.st_move_sample.move_sample(self.test_data)

        self.train_closeness = self.train_closeness.squeeze(-1)
        self.train_period = self.train_period.squeeze(-1)
        self.train_trend = self.train_trend.squeeze(-1)

        self.test_closeness = self.test_closeness.squeeze(-1)
        self.test_period = self.test_period.squeeze(-1)
        self.test_trend = self.test_trend.squeeze(-1)

        self.train_sequence_len = max((len(self.train_closeness), len(self.train_period), len(self.train_trend)))
        self.test_sequence_len = max((len(self.test_closeness), len(self.test_period), len(self.test_trend)))

        # align external features with the sampled sequences
        self.train_ef = self.train_ef[-self.train_sequence_len - target_length: -target_length]
        self.test_ef = self.test_ef[-self.test_sequence_len - target_length: -target_length]
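# ----------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the module): a minimal example of
# constructing a GridTrafficLoader, assuming the released 'Bike'/'NYC' dataset
# is available and the loader is exported from UCTB.dataset as in the package
# docs. The shape comments follow the sampling and squeeze logic above.
#
#   from UCTB.dataset import GridTrafficLoader
#
#   data_loader = GridTrafficLoader(dataset='Bike', city='NYC',
#                                   closeness_len=6, period_len=7, trend_len=4,
#                                   test_ratio=0.1)
#   # After squeeze(-1), roughly [num_samples, height, width, closeness_len]
#   print(data_loader.train_closeness.shape)
#   # Targets, roughly [num_samples, height, width, 1]
#   print(data_loader.train_y.shape)
# ----------------------------------------------------------------------------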
class NodeTrafficLoader(object):
    """The data loader that extracts and processes data from a :obj:`DataSet` object.

    Args:
        dataset (str): The path of the dataset pickle file, or the name of the dataset.
        city (:obj:`str` or ``None``): ``None`` if ``dataset`` is a file path, or a string of the name of
            the city. Default: ``None``
        data_range: The range of data extracted from ``self.dataset`` to be further used. If set to
            ``'all'``, all data in ``self.dataset`` will be used. If set to a float between 0.0 and 1.0,
            the relative former proportion of data in ``self.dataset`` will be used. If set to a list of
            two integers ``[start, end]``, the data from the *start* day to the (*end* - 1) day of data in
            ``self.dataset`` will be used. Default: ``'all'``
        train_data_length: The length of the train data. If set to ``'all'``, all data in the split train
            set will be used. If set to int, the latest ``train_data_length`` days of data will be used as
            the train set. Default: ``'all'``
        test_ratio (float): The ratio of the test set as data will be split into train set and test set.
            Default: 0.1
        closeness_len (int): The length of the closeness data history. The former consecutive
            ``closeness_len`` time slots of data will be used as closeness history. Default: 6
        period_len (int): The length of the period data history. The data of the exact same time slots in
            the former consecutive ``period_len`` days will be used as period history. Default: 7
        trend_len (int): The length of the trend data history. The data of the exact same time slots in
            the former consecutive ``trend_len`` weeks (every seven days) will be used as trend history.
            Default: 4
        target_length (int): The number of steps that need prediction by one piece of history data. Has to
            be 1 now. Default: 1
        graph (str): Types of graphs used in neural methods. Graphs should be a subset of
            { ``'Correlation'``, ``'Distance'``, ``'Interaction'``, ``'Line'``, ``'Neighbor'``,
            ``'Transfer'`` } concatenated by ``'-'``, and *dataset* should have data of the selected
            graphs. Default: ``'Correlation'``
        threshold_distance (float): Used in building the distance graph. If the distance of two nodes in
            meters is smaller than ``threshold_distance``, the corresponding position of the distance
            graph will be 1 and otherwise 0. Default: 1000
        threshold_correlation (float): Used in building the correlation graph. If the Pearson correlation
            coefficient is larger than ``threshold_correlation``, the corresponding position of the
            correlation graph will be 1 and otherwise 0. Default: 0
        threshold_interaction (float): Used in building the interaction graph. If, in the latest 12
            months, the number of interactions between two nodes is larger than ``threshold_interaction``,
            the corresponding position of the interaction graph will be 1 and otherwise 0. Default: 500
        normalize (bool): If ``True``, do min-max normalization on data. Default: ``True``
        workday_parser: Used to build external features to be used in neural methods.
            Default: ``is_work_day_america``
        with_lm (bool): If ``True``, the data loader will build graphs according to ``graph``.
            Default: ``True``
        with_tpe (bool): If ``True``, the data loader will build time position embeddings.
            Default: ``False``
        data_dir (:obj:`str` or ``None``): The dataset directory. If set to ``None``, a directory will be
            created. If ``dataset`` is a file path, ``data_dir`` should be ``None`` too. Default: ``None``

    Attributes:
        dataset (DataSet): The DataSet object storing basic data.
        daily_slots (int): The number of time slots in one single day.
        station_number (int): The number of nodes.
        external_dim (int): The number of dimensions of external features.
        train_closeness (np.ndarray): The closeness history of the train set data. When ``with_tpe`` is
            ``False``, its shape is [train_time_slot_num, ``station_number``, ``closeness_len``, 1]. On
            the dimension of ``closeness_len``, data are arranged from earlier time slots to later time
            slots. If ``closeness_len`` is set to 0, train_closeness will be an empty ndarray.
            ``train_period``, ``train_trend``, ``test_closeness``, ``test_period``, and ``test_trend``
            have similar shapes and constructions.
        train_y (np.ndarray): The train set data. Its shape is [train_time_slot_num, ``station_number``,
            1]. ``test_y`` has a similar shape and construction.
        LM (list): If ``with_lm`` is ``True``, the list of Laplacian matrices of the graphs listed in
            ``graph``.
    """

    def __init__(self,
                 dataset,
                 city=None,
                 data_range='all',
                 train_data_length='all',
                 test_ratio=0.1,
                 closeness_len=6,
                 period_len=7,
                 trend_len=4,
                 target_length=1,
                 graph='Correlation',
                 threshold_distance=1000,
                 threshold_correlation=0,
                 threshold_interaction=500,
                 normalize=True,
                 workday_parser=is_work_day_america,
                 with_lm=True,
                 with_tpe=False,
                 data_dir=None,
                 **kwargs):

        self.dataset = DataSet(dataset, city, data_dir=data_dir)

        self.daily_slots = 24 * 60 / self.dataset.time_fitness

        self.closeness_len = int(closeness_len)
        self.period_len = int(period_len)
        self.trend_len = int(trend_len)

        assert type(self.closeness_len) is int and self.closeness_len >= 0
        assert type(self.period_len) is int and self.period_len >= 0
        assert type(self.trend_len) is int and self.trend_len >= 0

        if type(data_range) is str and data_range.lower() == 'all':
            data_range = [0, len(self.dataset.node_traffic)]
        elif type(data_range) is float:
            data_range = [0, int(data_range * len(self.dataset.node_traffic))]
        else:
            data_range = [int(data_range[0] * self.daily_slots), int(data_range[1] * self.daily_slots)]

        num_time_slots = data_range[1] - data_range[0]

        # traffic feature: keep only nodes whose average traffic exceeds one record per day
        self.traffic_data_index = np.where(np.mean(self.dataset.node_traffic, axis=0) * self.daily_slots > 1)[0]

        self.traffic_data = self.dataset.node_traffic[data_range[0]:data_range[1],
                                                      self.traffic_data_index].astype(np.float32)

        # external feature
        external_feature = []
        # weather
        if len(self.dataset.external_feature_weather) > 0:
            external_feature.append(self.dataset.external_feature_weather[data_range[0]:data_range[1]])
        # weekday feature
        weekday_feature = [[1 if workday_parser(parse(self.dataset.time_range[1])
                                                + datetime.timedelta(hours=e * self.dataset.time_fitness / 60)) else 0]
                           for e in range(data_range[0], num_time_slots + data_range[0])]
        # hour feature
        hour_feature = [[(parse(self.dataset.time_range[1])
                          + datetime.timedelta(hours=e * self.dataset.time_fitness / 60)).hour / 24.0]
                        for e in range(data_range[0], num_time_slots + data_range[0])]

        external_feature.append(weekday_feature)
        external_feature.append(hour_feature)
        external_feature = np.concatenate(external_feature, axis=-1).astype(np.float32)

        self.station_number = self.traffic_data.shape[1]
        self.external_dim = external_feature.shape[1]

        if test_ratio > 1 or test_ratio < 0:
            raise ValueError('test_ratio should be a float between 0 and 1.')
        self.train_test_ratio = [1 - test_ratio, test_ratio]

        self.train_data, self.test_data = SplitData.split_data(self.traffic_data, self.train_test_ratio)
        self.train_ef, self.test_ef = SplitData.split_data(external_feature, self.train_test_ratio)

        # normalize the traffic data
        if normalize:
            self.normalizer = Normalizer(self.train_data)
            self.train_data = self.normalizer.min_max_normal(self.train_data)
            self.test_data = self.normalizer.min_max_normal(self.test_data)

        if train_data_length.lower() != 'all':
            train_day_length = int(train_data_length)
            self.train_data = self.train_data[-int(train_day_length * self.daily_slots):]
            self.train_ef = self.train_ef[-int(train_day_length * self.daily_slots):]

        # expand the test data so that the first test sample has enough history
        expand_start_index = len(self.train_data) - \
                             max(int(self.daily_slots * self.period_len),
                                 int(self.daily_slots * 7 * self.trend_len),
                                 self.closeness_len)

        self.test_data = np.vstack([self.train_data[expand_start_index:], self.test_data])
        self.test_ef = np.vstack([self.train_ef[expand_start_index:], self.test_ef])

        # init move sample obj
        self.st_move_sample = ST_MoveSample(closeness_len=self.closeness_len,
                                            period_len=self.period_len,
                                            trend_len=self.trend_len,
                                            target_length=1,
                                            daily_slots=self.daily_slots)

        self.train_closeness, \
        self.train_period, \
        self.train_trend, \
        self.train_y = self.st_move_sample.move_sample(self.train_data)

        self.test_closeness, \
        self.test_period, \
        self.test_trend, \
        self.test_y = self.st_move_sample.move_sample(self.test_data)

        self.train_sequence_len = max((len(self.train_closeness), len(self.train_period), len(self.train_trend)))
        self.test_sequence_len = max((len(self.test_closeness), len(self.test_period), len(self.test_trend)))

        # align external features with the sampled sequences
        self.train_ef = self.train_ef[-self.train_sequence_len - target_length: -target_length]
        self.test_ef = self.test_ef[-self.test_sequence_len - target_length: -target_length]

        if with_tpe:
            # time position embedding
            self.closeness_tpe = np.array(range(1, self.closeness_len + 1), dtype=np.float32)
            self.period_tpe = np.array(range(1 * int(self.daily_slots),
                                             self.period_len * int(self.daily_slots) + 1,
                                             int(self.daily_slots)), dtype=np.float32)
            self.trend_tpe = np.array(range(1 * int(self.daily_slots) * 7,
                                            self.trend_len * int(self.daily_slots) * 7 + 1,
                                            int(self.daily_slots) * 7), dtype=np.float32)

            self.train_closeness_tpe = np.tile(np.reshape(self.closeness_tpe, [1, 1, -1, 1]),
                                               [len(self.train_closeness), len(self.traffic_data_index), 1, 1])
            self.train_period_tpe = np.tile(np.reshape(self.period_tpe, [1, 1, -1, 1]),
                                            [len(self.train_period), len(self.traffic_data_index), 1, 1])
            self.train_trend_tpe = np.tile(np.reshape(self.trend_tpe, [1, 1, -1, 1]),
                                           [len(self.train_trend), len(self.traffic_data_index), 1, 1])

            self.test_closeness_tpe = np.tile(np.reshape(self.closeness_tpe, [1, 1, -1, 1]),
                                              [len(self.test_closeness), len(self.traffic_data_index), 1, 1])
            self.test_period_tpe = np.tile(np.reshape(self.period_tpe, [1, 1, -1, 1]),
                                           [len(self.test_period), len(self.traffic_data_index), 1, 1])
            self.test_trend_tpe = np.tile(np.reshape(self.trend_tpe, [1, 1, -1, 1]),
                                          [len(self.test_trend), len(self.traffic_data_index), 1, 1])

            self.tpe_dim = self.train_closeness_tpe.shape[-1]

            # concat temporal feature with time position embedding
            self.train_closeness = np.concatenate((self.train_closeness, self.train_closeness_tpe,), axis=-1)
            self.train_period = np.concatenate((self.train_period, self.train_period_tpe,), axis=-1)
            self.train_trend = np.concatenate((self.train_trend, self.train_trend_tpe,), axis=-1)

            self.test_closeness = np.concatenate((self.test_closeness, self.test_closeness_tpe,), axis=-1)
            self.test_period = np.concatenate((self.test_period, self.test_period_tpe,), axis=-1)
            self.test_trend = np.concatenate((self.test_trend, self.test_trend_tpe,), axis=-1)
        else:
            self.tpe_dim = None

        if with_lm:
            self.AM = []
            self.LM = []
            self.threshold_distance = threshold_distance
            self.threshold_correlation = threshold_correlation
            self.threshold_interaction = threshold_interaction

            for graph_name in graph.split('-'):
                AM, LM = self.build_graph(graph_name)
                if AM is not None:
                    self.AM.append(AM)
                if LM is not None:
                    self.LM.append(LM)

            self.LM = np.array(self.LM, dtype=np.float32)
    def build_graph(self, graph_name):
        AM, LM = None, None

        if graph_name.lower() == 'distance':
            lat_lng_list = np.array([[float(e1) for e1 in e[2:4]] for e in self.dataset.node_station_info])
            AM = GraphBuilder.distance_adjacent(lat_lng_list[self.traffic_data_index],
                                                threshold=float(self.threshold_distance))
            LM = GraphBuilder.adjacent_to_laplacian(AM)

        if graph_name.lower() == 'interaction':
            monthly_interaction = self.dataset.node_monthly_interaction[:, self.traffic_data_index, :][
                                  :, :, self.traffic_data_index]

            monthly_interaction, _ = SplitData.split_data(monthly_interaction, self.train_test_ratio)

            annually_interaction = np.sum(monthly_interaction[-12:], axis=0)
            annually_interaction = annually_interaction + annually_interaction.transpose()

            AM = GraphBuilder.interaction_adjacent(annually_interaction,
                                                   threshold=float(self.threshold_interaction))
            LM = GraphBuilder.adjacent_to_laplacian(AM)

        if graph_name.lower() == 'correlation':
            AM = GraphBuilder.correlation_adjacent(self.train_data[-30 * int(self.daily_slots):],
                                                   threshold=float(self.threshold_correlation))
            LM = GraphBuilder.adjacent_to_laplacian(AM)

        if graph_name.lower() == 'neighbor':
            LM = GraphBuilder.adjacent_to_laplacian(
                self.dataset.data.get('contribute_data').get('graph_neighbors'))

        if graph_name.lower() == 'line':
            LM = GraphBuilder.adjacent_to_laplacian(
                self.dataset.data.get('contribute_data').get('graph_lines'))

        if graph_name.lower() == 'transfer':
            LM = GraphBuilder.adjacent_to_laplacian(
                self.dataset.data.get('contribute_data').get('graph_transfer'))

        return AM, LM
    def st_map(self, zoom=11, style='mapbox://styles/rmetfc/ck1manozn0edb1dpmvtzle2cp', build_order=None):

        if self.dataset.node_station_info is None or len(self.dataset.node_station_info) == 0:
            raise ValueError('No station information found in dataset')

        import plotly
        from plotly.graph_objs import Scattermapbox, Layout

        mapboxAccessToken = "pk.eyJ1Ijoicm1ldGZjIiwiYSI6ImNrMW02YmwxbjAxN24zam9kNGVtMm5raWIifQ.FXKqZCxsFK-dGLLNdeRJHw"
        # os.environ['MAPBOX_API_KEY'] = mapboxAccessToken

        lat_lng_name_list = [e[2:] for e in self.dataset.node_station_info]
        build_order = build_order or list(range(len(self.dataset.node_station_info)))

        color = ['rgb(255, 0, 0)' for _ in build_order]

        lat = np.array([float(e[2]) for e in self.dataset.node_station_info])[self.traffic_data_index]
        lng = np.array([float(e[3]) for e in self.dataset.node_station_info])[self.traffic_data_index]
        text = [str(e) for e in range(len(build_order))]

        file_name = self.dataset.dataset + '-' + self.dataset.city + '.html'

        bikeStations = [Scattermapbox(
            lon=lng,
            lat=lat,
            text=text,
            mode='markers',
            marker=dict(
                size=6,
                # color=['rgb(%s, %s, %s)' % (255,
                #                             195 - e * 195 / max(build_order),
                #                             195 - e * 195 / max(build_order)) for e in build_order],
                color=color,
                opacity=1,
            ))]

        layout = Layout(
            title='Bike Station Location & The latest built stations with deeper color',
            autosize=True,
            hovermode='closest',
            showlegend=False,
            mapbox=dict(
                accesstoken=mapboxAccessToken,
                bearing=0,
                center=dict(
                    lat=np.median(lat),
                    lon=np.median(lng)
                ),
                pitch=0,
                zoom=zoom,
                style=style
            ),
        )

        fig = dict(data=bikeStations, layout=layout)

        plotly.offline.plot(fig, filename=file_name)
    def make_concat(self, node='all', is_train=True):
        """A function that concatenates all closeness, period and trend history data for use as model input.

        Args:
            node (int or ``'all'``): To specify the index of a certain node. If set to ``'all'``, return
                the concatenation result of all nodes. If set to an integer, it will be the index of the
                selected node. Default: ``'all'``
            is_train (bool): If set to ``True``, ``train_closeness``, ``train_period``, and ``train_trend``
                will be concatenated. If set to ``False``, ``test_closeness``, ``test_period``, and
                ``test_trend`` will be concatenated. Default: ``True``

        Returns:
            np.ndarray: An ndarray with shape [time_slot_num, ``station_number``,
            ``closeness_len`` + ``period_len`` + ``trend_len``, 1], where time_slot_num is the temporal
            length of the train set data if ``is_train`` is ``True``, or of the test set data if
            ``is_train`` is ``False``. On the history dimension, data are arranged as
            ``earlier closeness -> later closeness -> earlier period -> later period ->
            earlier trend -> later trend``.
        """
        if is_train:
            length = len(self.train_y)
            closeness = self.train_closeness
            period = self.train_period
            trend = self.train_trend
        else:
            length = len(self.test_y)
            closeness = self.test_closeness
            period = self.test_period
            trend = self.test_trend

        if node == 'all':
            node = list(range(self.station_number))
        else:
            node = [node]

        history = np.zeros([length, len(node), self.closeness_len + self.period_len + self.trend_len])

        for i in range(len(node)):
            for c in range(self.closeness_len):
                history[:, i, c] = closeness[:, node[i], c, -1]
            for p in range(self.period_len):
                history[:, i, self.closeness_len + p] = period[:, node[i], p, -1]
            for t in range(self.trend_len):
                history[:, i, self.closeness_len + self.period_len + t] = trend[:, node[i], t, -1]

        history = np.expand_dims(history, 3)

        return history
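# ----------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the module): driving NodeTrafficLoader
# end to end. The dataset name 'Bike' and city 'NYC' are assumptions for
# illustration; attribute shapes follow the class docstring above.
#
#   from UCTB.dataset import NodeTrafficLoader
#
#   data_loader = NodeTrafficLoader(dataset='Bike', city='NYC',
#                                   closeness_len=6, period_len=7, trend_len=4,
#                                   graph='Correlation-Distance', with_lm=True)
#   # [train_time_slot_num, station_number, closeness_len, 1]
#   print(data_loader.train_closeness.shape)
#   # One Laplacian matrix per graph name in `graph`, here two
#   print(data_loader.LM.shape)
#   # Flattened history for non-graph models:
#   # [time_slot_num, station_number, closeness_len + period_len + trend_len, 1]
#   history = data_loader.make_concat(node='all', is_train=True)
# ----------------------------------------------------------------------------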
class TransferDataLoader(object):

    def __init__(self, sd_params, td_params, model_params, td_data_length=None):

        if td_data_length:
            td_params.update({'train_data_length': td_data_length})

        self.sd_loader = NodeTrafficLoader(**sd_params, **model_params)
        self.td_loader = NodeTrafficLoader(**td_params, **model_params)

        td_params.update({'train_data_length': '180'})
        self.fake_td_loader = NodeTrafficLoader(**td_params, **model_params)
    def traffic_sim(self):
        assert self.sd_loader.daily_slots == self.td_loader.daily_slots

        similar_record = []

        # slide a window of the target-domain length over the source-domain series, one day at a time
        for i in range(0, self.sd_loader.train_data.shape[0] - self.td_loader.train_data.shape[0],
                       int(self.sd_loader.daily_slots)):

            sim = cosine_similarity(self.td_loader.train_data.transpose(),
                                    self.sd_loader.train_data[i:i + self.td_loader.train_data.shape[0]].transpose())

            max_sim, max_index = np.max(sim, axis=1), np.argmax(sim, axis=1)

            if len(similar_record) == 0:
                similar_record = [[max_sim[e], max_index[e], i, i + self.td_loader.train_data.shape[0]]
                                  for e in range(len(max_sim))]
            else:
                for index in range(len(similar_record)):
                    if similar_record[index][0] < max_sim[index]:
                        similar_record[index] = [max_sim[index], max_index[index],
                                                 i, i + self.td_loader.train_data.shape[0]]

        return similar_record
    def traffic_sim_fake(self):
        assert self.sd_loader.daily_slots == self.fake_td_loader.daily_slots

        similar_record = []

        for i in range(0, self.sd_loader.train_data.shape[0] - self.fake_td_loader.train_data.shape[0],
                       int(self.sd_loader.daily_slots)):

            sim = cosine_similarity(self.fake_td_loader.train_data.transpose(),
                                    self.sd_loader.train_data[
                                        i:i + self.fake_td_loader.train_data.shape[0]].transpose())

            max_sim, max_index = np.max(sim, axis=1), np.argmax(sim, axis=1)

            if len(similar_record) == 0:
                similar_record = [[max_sim[e], max_index[e], i, i + self.fake_td_loader.train_data.shape[0]]
                                  for e in range(len(max_sim))]
            else:
                for index in range(len(similar_record)):
                    if similar_record[index][0] < max_sim[index]:
                        # window end uses the fake target loader's length, consistent with the rest of
                        # this method (the original used self.td_loader here, a copy-paste slip)
                        similar_record[index] = [max_sim[index], max_index[index],
                                                 i, i + self.fake_td_loader.train_data.shape[0]]

        return similar_record
    def checkin_sim(self):
        td_checkin = np.array([e[0] for e in self.td_loader.dataset.data['ExternalFeature']['CheckInFeature']]
                              )[self.td_loader.traffic_data_index]
        sd_checkin = np.array([e[0] for e in self.sd_loader.dataset.data['ExternalFeature']['CheckInFeature']]
                              )[self.sd_loader.traffic_data_index]

        # row-wise max normalization of the check-in vectors
        td_checkin = td_checkin / (np.max(td_checkin, axis=1, keepdims=True) + 0.0001)
        sd_checkin = sd_checkin / (np.max(sd_checkin, axis=1, keepdims=True) + 0.0001)

        # cs = cosine_similarity(td_checkin, sd_checkin)
        # similar_record = [[e[np.argmax(e)], np.argmax(e), ] for e in cs]

        similar_record = []
        for td_index in range(len(td_checkin)):
            tmp_sim_record = []
            for sd_index in range(len(sd_checkin)):
                r, p = pearsonr(td_checkin[td_index], sd_checkin[sd_index])
                tmp_sim_record.append([r, sd_index,
                                       len(self.sd_loader.train_y) - len(self.td_loader.train_y),
                                       len(self.sd_loader.train_y)])
            similar_record.append(max(tmp_sim_record, key=lambda x: x[0]))

        return similar_record
    def checkin_sim_sd(self):
        sd_checkin = np.array([e[0] for e in self.sd_loader.dataset.data['ExternalFeature']['CheckInFeature']]
                              )[self.sd_loader.traffic_data_index]

        sd_checkin = sd_checkin / (np.max(sd_checkin, axis=1, keepdims=True) + 0.0001)

        # zero out self-similarity on the diagonal before taking the argmax
        cs = cosine_similarity(sd_checkin, sd_checkin) - np.eye(sd_checkin.shape[0])

        return np.array([np.argmax(e) for e in cs], np.int32)
    def poi_sim(self):
        td_checkin = np.array([e[1] for e in self.td_loader.dataset.data['ExternalFeature']['CheckInFeature']]
                              )[self.td_loader.traffic_data_index]
        sd_checkin = np.array([e[1] for e in self.sd_loader.dataset.data['ExternalFeature']['CheckInFeature']]
                              )[self.sd_loader.traffic_data_index]

        return [[e[np.argmax(e)], np.argmax(e), ] for e in cosine_similarity(td_checkin, sd_checkin)]
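# ----------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the module): constructing a
# TransferDataLoader from a data-rich source city to a data-scarce target city.
# The dataset/city pairs below are assumptions for illustration; any kwargs
# accepted by NodeTrafficLoader can go into the three parameter dicts, and
# train_data_length values are strings, matching the loader's parsing above.
#
#   sd_params = {'dataset': 'Bike', 'city': 'NYC'}       # source domain
#   td_params = {'dataset': 'Bike', 'city': 'Chicago'}   # target domain
#   model_params = {'closeness_len': 6, 'period_len': 7, 'trend_len': 4}
#
#   transfer_loader = TransferDataLoader(sd_params, td_params, model_params,
#                                        td_data_length='30')
#   # Each record: [max_similarity, source_node_index, window_start, window_end]
#   records = transfer_loader.traffic_sim()
# ----------------------------------------------------------------------------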