diff --git a/AUTHORS b/AUTHORS index 8003cab..eb97b49 100644 --- a/AUTHORS +++ b/AUTHORS @@ -1 +1,2 @@ Oscar Celma (ocelma __at__ gmail __dot__ com), http://ocelma.net +Ibrahim Abou Elseoud (Ibrahim__dot__Elseoud__at__ gmail __dot__ com), for updating SVD model part diff --git a/CHANGELOG b/CHANGELOG index f15b79f..ad6bba0 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -28,3 +28,8 @@ See: https://github.com/ocelma/python-recsys/commits/master 2011-10-08 * Added the whole project at github + +2017-06-08 + * Added updating the SVD model incrementally by folding-in + * Added a capability to split the dataset for train,test,foldin to facilitate testing the Fold-in implementation + * Added update to github diff --git a/README.rst b/README.rst index 27e9d66..d09f9a5 100644 --- a/README.rst +++ b/README.rst @@ -4,6 +4,20 @@ python-recsys A python library for implementing a recommender system. +Incremental SVD update for python-recsys +======================================== +- python-recsys now supports incrementally adding new users or items instead of building the model from scratch for these new users or items via the folding-in technique which was mentioned in Sarwar et al.'s `paper`_ (Titled: Incremental Singular Value Decomposition Algorithms for Highly Scalable Recommender Systems), this latest commit is simply an implementation to it for python-recsys. + +.. _`paper`: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.3.7894&rep=rep1&type=pdf + +- A `Demonstration video is available`_ for this latest commit in form of a demo site built using the MEAN stack which uses the updated python-recsys as backend for the recommender which folds-in the website's user in to the SVD model and gets recommendations instantaneously instead of building the model from scratch. + +.. _`Demonstration video is available`: https://youtu.be/tIvQxBfa2d4 + +- There is also an accompanying `bachelor thesis paper`_ (For those interested) which outlines the background, architecture and discusses the "Folding-in" approach. + +.. _`bachelor thesis paper`: https://drive.google.com/file/d/0BylQe2cRVWE_RmZoUTJYSGZNaXM/view + Installation ============ @@ -57,8 +71,8 @@ Example from recsys.algorithm.factorize import SVD svd = SVD() - svd.load_data(filename='./data/movielens/ratings.dat', - sep='::', + svd.load_data(filename='./data/movielens/ratings.dat', + sep='::', format={'col':0, 'row':1, 'value':2, 'ids': int}) 2. Compute Singular Value Decomposition (SVD), M=U Sigma V^t: @@ -66,11 +80,11 @@ Example :: k = 100 - svd.compute(k=k, - min_values=10, - pre_normalize=None, - mean_center=True, - post_normalize=True, + svd.compute(k=k, + min_values=10, + pre_normalize=None, + mean_center=True, + post_normalize=True, savefile='/tmp/movielens') 3. Get similarity between two movies: @@ -111,10 +125,10 @@ Example USERID = 1 svd.predict(ITEMID, USERID, MIN_RATING, MAX_RATING) - # Predicted value 5.0 + # Predicted value 5.0 svd.get_matrix().value(ITEMID, USERID) - # Real value 5.0 + # Real value 5.0 6. Recommend (non-rated) movies to a user: @@ -152,7 +166,129 @@ Example (4801, 5.4947999354188548), (1131, 5.4941438045650068), (2339, 5.4916048051511659)] - + +Example for incremental update +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +1. 
Load Movielens dataset and prepare it for training and testing:
+
+::
+
+    import recsys.algorithm
+    recsys.algorithm.VERBOSE = True
+
+    from recsys.algorithm.factorize import SVD
+    from recsys.datamodel.data import Data
+
+    filename = '(your movielens file path here)'
+
+    # In the MovieLens dataset the user id is in field 0, so users are mapped to rows here.
+    # The format {'col':0, 'row':1, 'value':2, 'ids': int} used above would also work; the
+    # order is changed to emphasize the is_row parameter of the function below.
+    format = {'col':1, 'row':0, 'value':2, 'ids': int}
+
+    data = Data()
+    data.load(filename, sep='::', format=format)
+    # Splits the dataset by row or by column (depending on is_row=True or False), so that
+    # there is no overlap (of users, for example) between the train and foldin datasets
+    train, test, foldin = data.split_train_test_foldin(base=60, percentage_base_user=80, shuffle_data=True, is_row=True)  # users are in the rows, so is_row=True
+
+    # Returns: a tuple for train, test, foldin
+    # Prints: (If VERBOSE=True)
+    total number of tuples: 1000209
+    percentage of data for training: 48.0 % with 479594 tuples
+    percentage of data for testing: 20.0 % with 200016 tuples  # (100 - percentage_base_user)% of each user's tuples (ratings)
+    percentage of data for foldin: 32.0 % with 320599 tuples
+    _____________
+    percentage of users for foldin: 40.0 % with 2416 users  # (100 - base)% of the users are held out for fold-in
+    percentage of users for training: 60.0 % with 3624 users  # base% of the users are used for training
+
+2. Compute Singular Value Decomposition (SVD), M=U Sigma V^t:
+
+::
+
+    svd = SVD()
+    svd.set_data(train)
+    svd.compute(k=100,
+                min_values=1,
+                pre_normalize=None,
+                mean_center=False,
+                post_normalize=True)
+
+    # Prints:
+    Creating matrix (479594 tuples)
+    Matrix density is: 3.7007%
+    Updating matrix: squish to at least 1 values
+    Computing svd k=100, min_values=1, pre_normalize=None, mean_center=False, post_normalize=True
+
+3. Fold in the new users or items (update the model instead of recomputing it from scratch):
+
+::
+
+    svd.load_updateDataBatch_foldin(data=foldin, is_row=True)
+
+    # Prints: (If VERBOSE=True)
+    before updating, M= (3624, 3576)
+    done updating, M= (6040, 3576)  # folds in all the new users (those not previously in the model)
+
+4. Recommend (non-rated) movies to a NEW user:
+
+::
+
+    user_id = foldin[0][1]  # a user id that is in the foldin dataset but NOT in the train dataset
+    svd.recommend(user_id, is_row=True, only_unknowns=True)  # users are rows; return only the unrated (unknown) items
+
+    # Returns:
+    [(1307, 3.6290483094468913),
+    (1394, 3.5741565545425957),
+    (1259, 3.5303836262378048),
+    (1968, 3.4565426585553927),
+    (2791, 3.3470277643217203),
+    (1079, 3.268283171487782),
+    (1198, 3.2381080336246675),
+    (593, 3.204915630088236),
+    (1270, 3.1859618303393233),
+    (2918, 3.1548530640630252)]
+
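+- Conceptually, load_updateDataBatch_foldin applies the fold-in projection from Sarwar et al.: each new rating vector is projected into the existing latent space and appended to U (for a new user) or to V (for a new item), while Sigma and the other factor matrix stay fixed. Below is a minimal NumPy sketch of the idea; the array names and values are made up for illustration only, and the actual implementation works on divisi2 sparse matrices and also handles mean-centering shifts:
+
+::
+
+    import numpy as np
+
+    # toy factors of an existing model M ~= U * S * V^T with k=2 latent factors
+    U = np.array([[0.8, 0.1],
+                  [0.2, 0.9]])                     # 2 users already in the model
+    S = np.diag([5.0, 3.0])                        # singular values
+    V = np.array([[0.7, 0.2],
+                  [0.1, 0.8],
+                  [0.6, 0.5]])                     # 3 items
+
+    r_new = np.array([4.0, 0.0, 5.0])              # ratings of a brand-new user over the 3 items
+
+    # fold-in: u_new = r_new * V * S^-1 (project the new ratings into the latent space)
+    u_new = r_new.dot(V).dot(np.linalg.inv(S))
+
+    U = np.vstack([U, u_new])                      # append the new user; S and V stay unchanged
+
+    # predicted ratings for the new user are the last row of U * S * V^T
+    predictions = U.dot(S).dot(V.T)[-1]
+
+- Because folding in is just one k-dimensional projection per new row or column, it is far cheaper than recomputing the SVD, at the usual cost of not re-optimizing the existing factors.
+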
+5. Recommend (non-rated) movies to a NEW user, validating first that the user is not in the base model (prior to folding in):
+
+::
+
+    # Run this BEFORE steps 3 and 4 (i.e. prior to calling svd.load_updateDataBatch_foldin)
+
+    user_id = foldin[0][1]  # a user id that is in the foldin dataset but NOT in the train dataset
+
+    # Try block to validate that the user id is new and not in the base model
+    try:
+        print "Getting recommendation for user_id which was not in original model training set"
+        print "recommendations:", svd.recommend(user_id)
+    except Exception:
+        print "New user not in base model, folding in the foldin dataset (updating the model, NOT recomputing from scratch)"
+        svd.load_updateDataBatch_foldin(data=foldin, format=format, is_row=True, truncate=True, post_normalize=True)
+        print "recommendations:", svd.recommend(user_id, is_row=True, only_unknowns=True)  # users are rows; return only the unrated (unknown) items
+
+    # Prints:
+    Getting recommendation for user_id which was not in original model training set
+    recommendations: New user not in base model, folding in the foldin dataset (updating the model, NOT recomputing from scratch)
+    before updating, M= (3624, 3576)
+    done updating, M= (6040, 3576)
+    recommendations: [(1307, 3.6290483094468913), (1394, 3.5741565545425957), (1259, 3.5303836262378048), (1968, 3.4565426585553927), (2791, 3.3470277643217203), (1079, 3.268283171487782), (1198, 3.2381080336246675), (593, 3.204915630088236), (1270, 3.1859618303393233), (2918, 3.1548530640630252)]
+
+6. Load a previously saved SVD model, fold in NEW users from a file, and then instantly get recommendations:
+
+::
+
+    format = {'col':1, 'row':0, 'value':2, 'ids': int}
+
+    svd = SVD()
+    # load the base SVD model
+    svd.load_model('SVDModel')
+
+    # load the new users' movie rating data from file and fold them into the model (loads the data and folds it in)
+    svd.load_updateDataBatch_foldin(filename='newUsers.dat', sep='::', format=format, is_row=True)
+
+    # get recommendations (new_userID is the id of one of the users just folded in from newUsers.dat)
+    print "recommendations:", svd.recommend(new_userID, is_row=True, only_unknowns=True)
+
+
+- All the existing python-recsys functionality remains compatible with this incremental-update commit. Folding in also works after loading a saved model, whether for a single new user, a batch of new users, or new items.
+
+- Please note that pre-existing users cannot be folded in; only users (or items) that are not already in the SVD model can be.
 
 Documentation
 ~~~~~~~~~~~~~
@@ -168,10 +304,8 @@ To create the HTML documentation files from doc/source do:
     cd doc
     make html
-HTML files are created here: 
+HTML files are created here:
 ::
     doc/build/html/index.html
-
-
diff --git a/recsys/algorithm/baseclass.py b/recsys/algorithm/baseclass.py
index 89656a5..8310a7b 100644
--- a/recsys/algorithm/baseclass.py
+++ b/recsys/algorithm/baseclass.py
@@ -36,6 +36,15 @@ def __init__(self):
         self._matrix_similarity = None #self-similarity matrix (only for the input Matrix rows)
         self._matrix_and_data_aligned = False #both Matrix and Data contain the same info?
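+        # State added for the incremental (fold-in) update; these attributes are consumed
+        # by the load_updateDataTuple_foldin / load_updateDataBatch_foldin methods that
+        # this commit adds to SVD in factorize.py (single tuple, batch dictionary, and the
+        # matrix for brand-new opposite-dimension ids that must be folded in first).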
+ #new for update + self._updateData=Data() + self._singleUpdateMatrix=SparseMatrix() + #new for batch + self._batchDict={} + #new for foldin additional + self._singleAdditionalFoldin=SparseMatrix() + + def __len__(self): return len(self.get_data()) @@ -97,6 +106,8 @@ def load_data(self, filename, force=True, sep='\t', format={'value':0, 'row':1, self._matrix_similarity = None self._data.load(filename, force, sep, format, pickle) + + def save_data(self, filename, pickle=False): """ diff --git a/recsys/algorithm/factorize.py b/recsys/algorithm/factorize.py index 00a9e61..75e893f 100644 --- a/recsys/algorithm/factorize.py +++ b/recsys/algorithm/factorize.py @@ -9,6 +9,7 @@ import os import sys import zipfile + try: import divisi2 except: @@ -25,16 +26,24 @@ from divisi2 import DenseVector from divisi2 import DenseMatrix from divisi2.ordered_set import OrderedSet - + from recsys.algorithm.baseclass import Algorithm from recsys.algorithm.matrix import SimilarityMatrix from recsys.algorithm import VERBOSE +from numpy.linalg import inv #for update +import numpy as np +from divisi2.sparse import SparseMatrix as divisiSparseMatrix +from divisi2.sparse import SparseVector as divisiSparseVector +from divisi2.dense import DenseVector as divisiDenseVector + +from recsys.datamodel.data import Data + TMPDIR = '/tmp' class SVD(Algorithm): """ - Inherits from base class Algorithm. + Inherits from base class Algorithm. It computes SVD (Singular Value Decomposition) on a matrix *M* It also provides recommendations and predictions using the reconstructed matrix *M'* @@ -66,6 +75,10 @@ def __init__(self, filename=None): self._file_row_ids = None self._file_col_ids = None + #Update feature + self._foldinZeroes={} + self.inv_S=None #since it doesn't get updated so redundent to calculate each time + def __repr__(self): try: s = '\n'.join(('M\':' + str(self._reconstruct_matrix()), \ @@ -117,7 +130,7 @@ def load_model(self, filename): idx = [ int(idx.strip()) for idx in zip.read('.row_ids').split('\n') if idx] except: idx = [ idx.strip() for idx in zip.read('.row_ids').split('\n') if idx] - #self._U = DenseMatrix(vectors) + #self._U = DenseMatrix(vectors) self._U = DenseMatrix(vectors, OrderedSet(idx), None) try: self._V = loads(zip.read('.V')) @@ -133,7 +146,7 @@ def load_model(self, filename): idx = [ int(idx.strip()) for idx in zip.read('.col_ids').split('\n') if idx] except: idx = [ idx.strip() for idx in zip.read('.col_ids').split('\n') if idx] - #self._V = DenseMatrix(vectors) + #self._V = DenseMatrix(vectors) self._V = DenseMatrix(vectors, OrderedSet(idx), None) self._S = loads(zip.read('.S')) @@ -141,7 +154,7 @@ def load_model(self, filename): # Shifts for Mean Centerer Matrix self._shifts = None if '.shifts.row' in zip.namelist(): - self._shifts = [loads(zip.read('.shifts.row')), + self._shifts = [loads(zip.read('.shifts.row')), loads(zip.read('.shifts.col')), loads(zip.read('.shifts.total')) ] @@ -191,7 +204,7 @@ def save_model(self, filename, options={}): # Store Options in the ZIP file fp.write(filename=filename + '.config', arcname='README') os.remove(filename + '.config') - + # Store matrices in the ZIP file for extension in ['.U', '.S', '.V']: fp.write(filename=filename + extension, arcname=extension) @@ -224,6 +237,7 @@ def _reconstruct_matrix(self, shifts=None, force=True): self._matrix_reconstructed = divisi2.reconstruct(self._U, self._S, self._V) return self._matrix_reconstructed + def compute(self, k=100, min_values=None, pre_normalize=None, mean_center=False, post_normalize=True, 
savefile=None): """ Computes SVD on matrix *M*, :math:`M = U \Sigma V^T` @@ -241,10 +255,10 @@ def compute(self, k=100, min_values=None, pre_normalize=None, mean_center=False, :param savefile: path to save the SVD factorization (U, Sigma and V matrices) :type savefile: string """ - super(SVD, self).compute(min_values) + super(SVD, self).compute(min_values) #creates matrix and does squish to not have empty values if VERBOSE: - sys.stdout.write('Computing svd k=%s, min_values=%s, pre_normalize=%s, mean_center=%s, post_normalize=%s\n' + sys.stdout.write('Computing svd k=%s, min_values=%s, pre_normalize=%s, mean_center=%s, post_normalize=%s\n' % (k, min_values, pre_normalize, mean_center, post_normalize)) if not min_values: sys.stdout.write('[WARNING] min_values is set to None, meaning that some funky recommendations might appear!\n') @@ -257,17 +271,18 @@ def compute(self, k=100, min_values=None, pre_normalize=None, mean_center=False, if mean_center: if VERBOSE: sys.stdout.write("[WARNING] mean_center is True. svd.similar(...) might return nan's. If so, then do svd.compute(..., mean_center=False)\n") - matrix, row_shift, col_shift, total_shift = matrix.mean_center() + matrix, row_shift, col_shift, total_shift = matrix.mean_center() self._shifts = (row_shift, col_shift, total_shift) + # Pre-normalize input matrix? if pre_normalize: """ - Divisi2 divides each entry by the geometric mean of its row norm and its column norm. + Divisi2 divides each entry by the geometric mean of its row norm and its column norm. The rows and columns don't actually become unit vectors, but they all become closer to unit vectors. """ if pre_normalize == 'tfidf': - matrix = matrix.normalize_tfidf() #TODO By default, treats the matrix as terms-by-documents; + matrix = matrix.normalize_tfidf() #TODO By default, treats the matrix as terms-by-documents; # pass cols_are_terms=True if the matrix is instead documents-by-terms. 
elif pre_normalize == 'rows': matrix = matrix.normalize_rows() @@ -289,7 +304,7 @@ def compute(self, k=100, min_values=None, pre_normalize=None, mean_center=False, options = {'k': k, 'min_values': min_values, 'pre_normalize': pre_normalize, 'mean_center': mean_center, 'post_normalize': post_normalize} self.save_model(savefile, options) - def _get_row_reconstructed(self, i, zeros=None): + def _get_row_reconstructed(self, i, zeros=None): #if foldin that means it is known what the user rated and zeros contains the rated items if zeros: return self._matrix_reconstructed.row_named(i)[zeros] return self._matrix_reconstructed.row_named(i) @@ -299,6 +314,40 @@ def _get_col_reconstructed(self, j, zeros=None): return self._matrix_reconstructed.col_named(j)[zeros] return self._matrix_reconstructed.col_named(j) + def _get_row_unrated(self,i,rated): # use for foldin since that means users new rated items are known so no need to squish or need normal matrix + sparse_matrix = self._matrix_reconstructed.row_named(i).to_sparse() + # values: np array with the predicted ratings or ratings + # named_rows: normal array with movie names + values, named_cols = sparse_matrix.named_lists() #values contains a np array with predicted ratings , while named_cols contains list of labels of columns + removal_indicies = [] #array of indicies for removal + + for item in rated: + index_remove = named_cols.index(item) + del named_cols[index_remove] #since its a normal list can remove like this + removal_indicies.append(index_remove) + + values = np.delete(values, removal_indicies) #since it's a numpy array so must remove like this + + return divisiSparseVector.from_named_lists(values, named_cols).to_dense() + + + + def _get_col_unrated(self, j,rated): # use for foldin since that means users new rated items are known so no need to squish or need normal matrix + sparse_matrix=self._matrix_reconstructed.col_named(j).to_sparse() + # values: np array with the predicted ratings or ratings + # named_rows: normal array with movie names + values, named_rows= sparse_matrix.named_lists() + removal_indicies=[] + + for item in rated: + index_remove = named_rows.index(item) + del named_rows[index_remove] + removal_indicies.append(index_remove) + + values=np.delete(values, removal_indicies) + + return divisiSparseVector.from_named_lists(values,named_rows).to_dense() + def predict(self, i, j, MIN_VALUE=None, MAX_VALUE=None): """ Predicts the value of :math:`M_{i,j}`, using reconstructed matrix :math:`M^\prime = U \Sigma_k V^T` @@ -340,18 +389,363 @@ def recommend(self, i, n=10, only_unknowns=False, is_row=True): self.compute() #will use default values! item = None zeros = [] - if only_unknowns and not self._matrix.get(): + seeDict=False + if only_unknowns and not self._matrix.get() and len(self._foldinZeroes)==0: raise ValueError("Matrix is empty! If you loaded an SVD model you can't use only_unknowns=True, unless svd.create_matrix() is called") + if not self._matrix.get(): + seeDict=True if is_row: if only_unknowns: - zeros = self._matrix.get().row_named(i).zero_entries() - item = self._get_row_reconstructed(i, zeros) + if seeDict: + zeros=self._foldinZeroes[i] #zeros in this instance contains the rated items + if len(zeros)==0: + raise ValueError("Matrix is empty! 
If you loaded an SVD model you can't use only_unknowns=True, unless svd.create_matrix() is called or youve just folded them in") + else: + item = self._get_row_unrated(i, zeros) #removing the rated items from utility row for recommendations + else: + zeros = self._matrix.get().row_named(i).zero_entries() + item = self._get_row_reconstructed(i, zeros) + else: + item = self._get_row_reconstructed(i, zeros) else: if only_unknowns: - zeros = self._matrix.get().col_named(i).zero_entries() - item = self._get_col_reconstructed(i, zeros) + if seeDict: + zeros=self._foldinZeroes[i] #zeros in this instance contains the rated items + if len(zeros)==0: + raise ValueError("Matrix is empty! If you loaded an SVD model you can't use only_unknowns=True, unless svd.create_matrix() is called or you just folded them in") + else: + item = self._get_col_unrated(i, zeros) #removing the rated items from utility columns for recommendations + else: + zeros = self._matrix.get().col_named(i).zero_entries() + item = self._get_col_reconstructed(i, zeros) + else: + item = self._get_row_reconstructed(i, zeros) + return item.top_items(n) + def _calc_mean_center(self, matrix, is_row=True): #created this to use the loaded shifts and calculate the row or column shift + row_shift, col_shift, total_shift = self._shifts + + total_mean = total_shift # use the global shift one + if is_row: + row_means = matrix.row_op(np.mean) - total_mean # calculate row shift + col_means = col_shift # use already given col shifts + else: + row_means = row_shift # use already given row shifts + col_means = matrix.col_op(np.mean) - total_mean # calculate col shifts + + row_lengths = matrix.row_op(len) + col_lengths = matrix.col_op(len) + + shifted = matrix.copy() + for row, col in shifted.keys(): + shifted[row, col] -= ( + (row_means[row] * row_lengths[row] + + col_means[col] * col_lengths[col] + ) / (row_lengths[row] + col_lengths[col]) + ) + total_mean + + return (shifted, row_means, col_means, total_mean) + # return shifted + + def load_updateDataTuple_foldin(self, filename,force=True, sep='\t', format={'value':0, 'row':1, 'col':2}, pickle=False,is_row=True,truncate=True,post_normalize=False): + """ + Folds-in a SINGLE user OR item. First loads a dataset file that contains a SINGLE tuple (a dataset for a single user OR item , has to be either same row or same column depending on is_row aka tuple) + + For params: filename,force,sep,format,pickle then see params definition in *datamodel.Data.load()* + + :param is_row: are you trying to foldin a row or a column ? yes->foldin row , no->foldin column + :type is_row: boolean + :param truncate: sometimes new users rate new items not in the original SVD matrix so would you like new items to be truncated or folded in ? default is foldin + :type truncate: boolean + :param post_normalize: Normalize every row of :math:`U \Sigma` to be a unit vector. Thus, row similarity (using cosine distance) returns :math:`[-1.0 .. 
1.0]` + :type post_normalize: Boolean + + """ + if force: + self._updateData = Data() + + self._updateData.load(filename, force, sep, format, pickle) + + if VERBOSE: + print "reading the new tuple" + if(is_row): + nDimensionLabels=self._V.all_labels()[0] #get labels from V matrix to complete the sparse matrix + print type(nDimensionLabels) + print type(nDimensionLabels[0]) + print len(nDimensionLabels) + self._singleUpdateMatrix.create(self._updateData.get(), col_labels=nDimensionLabels, foldin=True,truncate=truncate) + self._foldinZeroes[self._singleUpdateMatrix.get_rows()[0]] = self._singleUpdateMatrix.get_cols() + + + else: + nDimensionLabels = self._U.all_labels() #get labels from U matrix to complete the sparse matrix + print nDimensionLabels + self._singleUpdateMatrix.create(self._updateData.get(), row_labels=nDimensionLabels, foldin=True,truncate=truncate) + self._foldinZeroes[self._singleUpdateMatrix.get_cols()[0]] = self._singleUpdateMatrix.get_rows() + + if not truncate: + additionalElements=self._singleUpdateMatrix.get_additional_elements() + #If it's trying to foldin a new user who has rated a new item which was not used before, then foldin the item first then foldin that user + print "dimension",len(nDimensionLabels) + print "additional elements:",additionalElements + print "length",len(additionalElements) + if len(additionalElements) !=0: + for item in additionalElements: + if (is_row): #if I am folding in a row then , the additionals added that shouldn't be are the columns to be folded in to the rows + self._singleAdditionalFoldin.create([(0,nDimensionLabels[0],item)], row_labels=self._U.all_labels()[0]) + else: + self._singleAdditionalFoldin.create([(0,item,nDimensionLabels[0])], col_labels=self._V.all_labels()[0]) + self._update(update_matrix=self._singleAdditionalFoldin,is_row=not is_row) + + # #update the data matrix + if VERBOSE: + print "updating the sparse matrix" + if self._matrix.get(): #if matrix not there due to load ignore it + self._matrix.update(self._singleUpdateMatrix) # updating the data matrix for the zeroes , also for saving the data matrix if needed + + # Mean centering + if self._shifts: #if not None then it means mean_center was equal true + row_shift, col_shift, total_shift=self._shifts + + + meanedMatrix, rowShift, colShift, totalShift=self._calc_mean_center(self._singleUpdateMatrix.get(),is_row=is_row) + + self._singleUpdateMatrix.set(meanedMatrix) + + if is_row: + values, named_rows = row_shift.to_sparse().named_lists() #values numpy array, named_rows normal array + valuesFold, named_rowsFold = rowShift.to_sparse().named_lists() + + else: + values, named_rows = col_shift.to_sparse().named_lists() # values numpy array, named_rows normal array + valuesFold, named_rowsFold = colShift.to_sparse().named_lists() + + + values=np.concatenate((values, valuesFold)) + named_rows.extend(named_rowsFold) + + if is_row: + row_shift=divisiSparseVector.from_named_lists(values, named_rows).to_dense() + else: + col_shift=divisiSparseVector.from_named_lists(values, named_rows).to_dense() + + self._shifts=(row_shift, col_shift, total_shift) + + + self._update(is_row=is_row,post_normalize=post_normalize) + + def _construct_batch_dictionary(self,data,is_row=True): + """ + + :param data: Data() + :param is_row: Boolean + :return: constructs a dictionary with the row or col as the keys (depending on which is being added) with values as the tuples + in self._batchDict + """ + + key_idx=1 #key index default is the row + if not is_row: + key_idx=2 + + #collecting the 
significant col or row tuples at one place to fold them in at once + + for item in data: #data is a list of tuples so item is 1 tuple + try: + self._batchDict[item[key_idx]].append(item) + except KeyError: + self._batchDict[item[key_idx]] = [] + self._batchDict[item[key_idx]].append(item) + + #batch loaded , now need to fold them in one by one + print "Batch loaded successfully" + + + def load_updateDataBatch_foldin(self, filename=None, data=None, force=True, sep='\t', format={'value': 0, 'row': 1, 'col': 2}, + pickle=False, is_row=True,truncate=True,post_normalize=False): + """ + Folds in the batch users or items, first Loads a dataset file that contains Multiple tuples (users or items) or uses the preloaded data from the datamodel/data.py object then folds them in with their ratings + + :param data: Contains the dataset that was loaded using the Data() class + :type data: Data() + + For params: filename,force,sep,format,pickle then see params definition in *datamodel.Data.load()* + + :param is_row: are you trying to foldin a row or a column ? yes->foldin row , no->foldin column + :type is_row: boolean + :param truncate: sometimes new users rate new items not in the original SVD matrix so would you like new items to be truncated or folded in ? default is foldin + :type truncate: boolean + :param post_normalize: Normalize every row of :math:`U \Sigma` to be a unit vector. Thus, row similarity (using cosine distance) returns :math:`[-1.0 .. 1.0]` + :type post_normalize: Boolean + """ + + if force: + self._updateData = Data() + if filename: #not null + self._updateData.load(filename, force, sep, format, pickle) #load array of tuples + else: + if data: + self._updateData =data + else: + raise ValueError('No data or filename set!') + print "Reading the new batch" + + self._construct_batch_dictionary(self._updateData.get(),is_row) + + print "Folding in batch entries" + nDimensionLabels=None + if (is_row): + nDimensionLabels = self._V.all_labels()[0] # get labels from V matrix to complete the sparse matrix + else: + nDimensionLabels = self._U.all_labels()[0] # get labels from U matrix to complete the sparse matrix + length_of_dict=len(self._batchDict) + i=0 + meanDenseVector=[] + isbatch=True + for key_idx in self._batchDict: #data in batchDict in form {key:[(tuple)]} + i += 1 + if VERBOSE: + if i % 100 == 0: + sys.stdout.write('.') + if i % 1000 == 0: + sys.stdout.write('|') + if i % 10000 == 0: + sys.stdout.write(' (%d K user)\n' % int(i / 1000)) + + if (is_row): + self._singleUpdateMatrix.create(self._batchDict[key_idx], col_labels=nDimensionLabels,foldin=True,truncate=truncate) + + else: + self._singleUpdateMatrix.create(self._batchDict[key_idx], row_labels=nDimensionLabels,foldin=True,truncate=truncate) + + # If it's trying to foldin a new user who has rated a new item which was not used before, then foldin the item first then foldin that user + if not truncate: + additionalElements = self._singleUpdateMatrix.get_additional_elements() + + if len(additionalElements) != 0: + for item in additionalElements: + if (is_row): # if I am folding in a row then , the additionals added that shouldn't be are the columns to be folded in to the rows + self._singleAdditionalFoldin.create([(0, nDimensionLabels[0], item)], + row_labels=self._U.all_labels()[0]) + else: + self._singleAdditionalFoldin.create([(0, item, nDimensionLabels[0])], + col_labels=self._V.all_labels()[0]) + + self._update(update_matrix=self._singleAdditionalFoldin, is_row=not is_row) + + if self._shifts: # if not None then it means 
mean_center was equal true + row_shift, col_shift, total_shift = self._shifts + + + meanedMatrix, rowShift, colShift, totalShift = self._calc_mean_center(self._singleUpdateMatrix.get(),is_row=is_row) + + self._singleUpdateMatrix.set(meanedMatrix) + # row shift cause it's row for the time being + if is_row: + meanDenseVector.append(rowShift) + + else: + meanDenseVector.append(colShift) + + + if self._matrix.get(): #if matrix not there due to load ignore it + self._matrix.update( + self._singleUpdateMatrix,is_batch=isbatch) # updating the data matrix for the zeroes , also for saving the data matrix if needed + + self._update(is_row=is_row,is_batch=isbatch) #Do foldin on the singleUpdateMatrix tuple + if VERBOSE: + sys.stdout.write('\n') + # UPDATING MEAN CENTER PART + if self._shifts: + sys.stdout.write("updating shifts") + if is_row: + values, named_rows = row_shift.to_sparse().named_lists() # values numpy array, named_rows normal array + else: + values, named_rows = col_shift.to_sparse().named_lists() # values numpy array, named_rows normal array + for vector in meanDenseVector: + valuesFold, named_rowsFold = vector.to_sparse().named_lists() # rowShift contains new calculated row shift + values = np.concatenate((values, valuesFold)) + named_rows.extend(named_rowsFold) + if is_row: + row_shift = divisiSparseVector.from_named_lists(values, named_rows).to_dense() + else: + col_shift = divisiSparseVector.from_named_lists(values, named_rows).to_dense() + + self._shifts = (row_shift, col_shift, total_shift) + + self.update_sparse_matrix_data(is_batch=True,squish=False,post_normalize=post_normalize) + + + def update_sparse_matrix_data(self,squishFactor=10,is_batch=False,squish=True,post_normalize=False): + #update the data matrix + if is_batch: + if self._matrix.get(): + if VERBOSE: + print "updating sparse index" + self._matrix.index_sparseMatrix() + if VERBOSE: + print "before updating, M=", self._matrix_reconstructed.shape + # Sim. 
matrix = U \Sigma^2 U^T + self._reconstruct_similarity(post_normalize=post_normalize, force=True) + # M' = U S V^t + self._reconstruct_matrix(shifts=self._shifts, force=True) + if VERBOSE: + print "done updating, M=", self._matrix_reconstructed.shape + if squish: + if self._matrix.get(): #if loaded model there is no matrix + if VERBOSE: + print "commiting the sparse data matrix by removing empty rows and columns divisi created" + self._matrix.squish(squishFactor) # updating the data matrix for the zeroes ,#NOTE: Intensive so do at end + + + def _update(self,update_matrix=None,is_row=True,is_batch=False,post_normalize=False): + #The function which does the actual folding-in process + if self.inv_S is None: + self.inv_S=np.zeros((self._S.shape[0], self._S.shape[0])) + for i in range(self._S.shape[0]): + self.inv_S[i, i] = self._S[i]**-1 # creating diagonal matrix and inverting using special property of diagonal matrix + + #if new is row -> V*S^-1 + if is_row: + prodM=self._V.dot(self.inv_S) + # if VERBOSE: + # print "dimension of VxS^-1=", prodM.shape + else: #if new is col -> U*S^-1 + prodM = self._U.dot(self.inv_S) + # if VERBOSE: + # print "dimension of UxS^-1=", prodM.shape + + if update_matrix: + updateTupleMatrix=update_matrix.get() + else: + updateTupleMatrix = self._singleUpdateMatrix.get() + + if not is_row: + updateTupleMatrix=updateTupleMatrix.transpose() #transpose + + res=updateTupleMatrix.dot(prodM) + + if is_row: + #new value can now be concatinated with U + + self._U=self._U.concatenate(res) + + else: + #new value can now be concatinated with V + + self._V = self._V.concatenate(res) + + if not is_batch: #will reconstruct all at end with batch using another function + if VERBOSE: + print "before updating, M=",self._matrix_reconstructed.shape + # Sim. matrix = U \Sigma^2 U^T + self._reconstruct_similarity(post_normalize=post_normalize, force=True) + # M' = U S V^t + self._reconstruct_matrix(shifts=self._shifts, force=True) + if VERBOSE: + print "done updating, M=",self._matrix_reconstructed.shape + + def centroid(self, ids, is_row=True): points = [] for id in ids: @@ -397,7 +791,7 @@ def kmeans(self, ids, k=5, components=3, are_rows=True): i = 0 clusters = dict() for cluster in labels: - if not clusters.has_key(cluster): + if not clusters.has_key(cluster): clusters[cluster] = dict() clusters[cluster]['centroid'] = centroids[cluster] clusters[cluster]['points'] = [] @@ -467,7 +861,7 @@ def similar_neighbours(self, i, j, Sk=10): _Sk += 1 current += 1 _Sk -= 1 - if _Sk == 0: + if _Sk == 0: break # We have enough elements to use return similars[:Sk] @@ -529,7 +923,7 @@ def predict(self, i, j, Sk=10, weighted=True, MIN_VALUE=None, MAX_VALUE=None): # SVDNeighbourhoodKoren class __SVDNeighbourhoodKoren(SVDNeighbourhood): """ - Inherits from SVDNeighbourhood class. + Inherits from SVDNeighbourhood class. Neighbourhood model, using Singular Value Decomposition. Based on 'Factorization Meets the Neighborhood: a Multifaceted @@ -614,7 +1008,7 @@ def predict(self, i, j, Sk=None, MIN_VALUE=None, MAX_VALUE=None): Predicts the value of *M(i,j)* It is based on 'Factorization Meets the Neighborhood: a Multifaceted - Collaborative Filtering Model' (Yehuda Koren). + Collaborative Filtering Model' (Yehuda Koren). 
Equation 3 (section 2.2): :math:`\hat{r}_{ui} = b_{ui} + \\frac{\sum_{j \in S^k(i;u)} s_{ij} (r_{uj} - b_{uj})}{\sum_{j \in S^k(i;u)} s_{ij}}`, where @@ -638,8 +1032,8 @@ def predict(self, i, j, Sk=None, MIN_VALUE=None, MAX_VALUE=None): # bui = µ + bu + bi # The parameters bu and bi indicate the observed deviations of user # u and item i, respectively, from the average - # - # S^k(i; u): + # + # S^k(i; u): # Using the similarity measure, we identify the k items rated # by u, which are most similar to i. # @@ -659,7 +1053,7 @@ def predict(self, i, j, Sk=None, MIN_VALUE=None, MAX_VALUE=None): bui = bu + bi #if self._Mu: #TODO uncomment? # bui += self._Mu - + sim_ratings = [] sum_similarity = 0.0 for similar, sij in similars[1:]: @@ -678,10 +1072,9 @@ def predict(self, i, j, Sk=None, MIN_VALUE=None, MAX_VALUE=None): Sumj_Sk = sum(sim_ratings)/sum_similarity rui = bui + Sumj_Sk predicted_value = rui - + if MIN_VALUE: predicted_value = max(predicted_value, MIN_VALUE) if MAX_VALUE: predicted_value = min(predicted_value, MAX_VALUE) return float(predicted_value) - diff --git a/recsys/algorithm/matrix.py b/recsys/algorithm/matrix.py index 9a8fa53..253a07f 100644 --- a/recsys/algorithm/matrix.py +++ b/recsys/algorithm/matrix.py @@ -73,12 +73,66 @@ def get_col_len(self): class SparseMatrix(Matrix): def __init__(self): super(SparseMatrix, self).__init__() + self._values=None + self._rows=None + self._cols=None + self._additional_elements=[] #if no additional then len will equal 0 - def create(self, data): - values = map(itemgetter(0), data) - rows = map(itemgetter(1), data) - cols = map(itemgetter(2), data) - self._matrix = divisiSparseMatrix.from_named_lists(values, rows, cols) + + def get_rows(self): #can use to get rated items and remove from recommendation + return self._rows + + def get_cols(self): #can use to get rated items and remove from recommendation + return self._cols + + def get_additional_elements(self): # can use to get additional items to either fold or truncate + return self._additional_elements + +#row_labels specifies the row labels the complete matrix should have incase the inputted file doesn't include all indicies and it was saved in previous matrix (for update) +#same explination for col_labels but for columns +#matrix should have, in case it is larger than the largest index. 
+ def create(self, data,row_labels=None, col_labels=None, foldin=False,truncate=False): + #is_row is what I'm originally folding in + self._values = map(itemgetter(0), data) + self._rows = map(itemgetter(1), data) + self._cols = map(itemgetter(2), data) + + if foldin: #new to make sure not folding in user and item at same time + #idea: create matrix normally but keep track of the columns (items) or rows to be folded in before doing update + if col_labels: #if col_labels defined then I'm folding in a row + self._additional_elements = [x for x in self._cols if x not in col_labels] + else: #else I am folding in a column + self._additional_elements = [x for x in self._rows if x not in row_labels] + if truncate: + for item in self._additional_elements: + if col_labels: + index_remove = self._cols.index(item) + else: + index_remove = self._rows.index(item) + del self._values[index_remove] + del self._rows[index_remove] + del self._cols[index_remove] + + + self._matrix = divisiSparseMatrix.from_named_lists(self._values, self._rows, self._cols,row_labels, col_labels) + + + + + def update(self, matrix,is_batch=False): #isbatch is for creating the final sparse matrix ,since you will want to collect all then construct final matrix at end +#To update the stored data matrix with the new values and create a new divisi spare matrix with it to retain the zeroes + self._values.extend(matrix._values) + self._rows.extend(matrix._rows) + self._cols.extend(matrix._cols) + + if not is_batch: + self._matrix = divisiSparseMatrix.from_named_lists(self._values, self._rows, self._cols) + + def squish(self,squishFactor): #remove additional empty fields created by divisiSparseMatrix + self._matrix=self._matrix.squish(squishFactor) + + def index_sparseMatrix(self): #create the divisi2 sparse matrix from already existing values + self._matrix = divisiSparseMatrix.from_named_lists(self._values, self._rows, self._cols) def empty(self): return not self._matrix or not self._matrix.values() @@ -95,4 +149,3 @@ def empty(self): if self._matrix: nrows, ncols = self._matrix.shape return not self._matrix or not (nrows and ncols) - diff --git a/recsys/datamodel/data.py b/recsys/datamodel/data.py index 21de618..6919229 100644 --- a/recsys/datamodel/data.py +++ b/recsys/datamodel/data.py @@ -17,6 +17,7 @@ def __init__(self): #:type data: list #""" self._data = list([]) + self._tupleDict = {} def __repr__(self): s = '%d rows.' 
% len(self.get()) @@ -52,6 +53,13 @@ def get(self): :returns: a list of tuples """ return self._data + def get_tuple_dict(self): + """ + :returns: a dictionary of users or items and corresponding ratings + """ + if not self._tupleDict: + raise ValueError('Tuple dictionary hasn\'t been created yet, please run split_train_test_foldin first then try again') + return self._tupleDict def add_tuple(self, tuple): """ @@ -94,6 +102,166 @@ def split_train_test(self, percent=80, shuffle_data=True): return train, test + def split_train_test_foldin(self,base=60,percentage_base_user=80, shuffle_data=True,is_row=True,force=True,data_report_path=None,id=None,ignore_rating_count=0): + """ + Splits the data in three datasets: train, test, and foldin + + :param base: % of training set to be used (Foldin set size = 100-base) for base SVD model (not folded) + :type base: int + :param percentage_base_user: % of user ratings per user (or item ratings per item depending on which is row and column) to be used as base for training or foldin (testing will be percentage of ratings from 100-percentage_base_user per user or item ) + :type percentage_base_user: int + + :param shuffle_data: shuffle dataset? + :type shuffle_data: Boolean + + :param is_row: are you trying to foldin a row or a column ? yes-> row , no-> column + :type is_row: Boolean + :param force: clear the values in data + :type force: Boolean + + + The following parameters are used for when generating a report of the dataset distribution: + :param data_report_path: path to create report in + :type data_report_path: String + :param id: id number to be given to the report + :type id: String + :param ignore_rating_count: The threshold number of ratings to be removed from the data. + :type ignore_rating_count: int + + :returns: a tuple for train, test, foldin + """ + if force: + self._construct_dictionary(is_row=is_row,force=True) + elif len(self._tupleDict)==0: + self._construct_dictionary(is_row=is_row) + self._remove_ratings_count_from_dictionary(ignore_rating_count) + dictKeys=self._tupleDict.keys() #users + numberOfKeys= len(dictKeys) #number of users + + train_list =[] + test_list=[] + foldin_list=[] + + if shuffle_data: + shuffle(dictKeys) + train_list_keys=dictKeys[:int(round(numberOfKeys*base/100.0))] + if base==100: + foldin_list_keys=[] + else: + foldin_list_keys=dictKeys[-int(round(numberOfKeys*(100-base)/100.0)):] + + for key in train_list_keys: + tupleList=self._tupleDict[key] + lengthTupleList=len(tupleList) + if shuffle_data: + shuffle(tupleList) + + train_list.extend(tupleList[:int(round(lengthTupleList*percentage_base_user/100.0))]) + if int(round(lengthTupleList*(100-percentage_base_user)/100.0)) !=0: #if test=0 then can't take that percentage so skip taking it's tuple for test + test_list.extend(tupleList[-int(round(lengthTupleList*(100-percentage_base_user)/100.0)):]) + + for key in foldin_list_keys: + tupleList=self._tupleDict[key] + lengthTupleList=len(tupleList) + if shuffle_data: + shuffle(tupleList) + + foldin_list.extend(tupleList[:int(round(lengthTupleList*percentage_base_user/100.0))]) + if int(round(lengthTupleList*(100-percentage_base_user)/100.0)) !=0: #if test=0 then can't take that percentage so skip taking it's tuple for test + test_list.extend(tupleList[-int(round(lengthTupleList*(100-percentage_base_user)/100.0)):]) + + + + length = len(self._data) + if VERBOSE: + print "total number of tuples:",length + print "percentage of data for training:",round((len(train_list)*1.0/length)*100),"%","with",len(train_list),"tuples" + 
print "percentage of data for testing:",round((len(test_list)*1.0/length)*100),"%","with",len(test_list),"tuples" + print "percentage of data for foldin:",round((len(foldin_list)*1.0/length)*100),"%","with",len(foldin_list),"tuples" + print "_____________" + print "percentage of users for foldin:",round((len(foldin_list_keys)*1.0/numberOfKeys*1.0)*100),"%","with",len(foldin_list_keys),"users" + print "percentage of users for training:",round((len(train_list_keys)*1.0/numberOfKeys*1.0)*100),"%","with",len(train_list_keys),"users" + + if data_report_path: + myFile = open(data_report_path+"/data_distribution_report.txt", 'a+') + + myFile.write("DataID:"+ str(id)) + myFile.write("total number of tuples:"+ str(length)) + myFile.write("\n") + myFile.write( "percentage of data for training:"+ str(round((len(train_list) * 1.0 / length) * 100))+ "%"+ "with"+str(len(train_list))+"tuples") + myFile.write("\n") + myFile.write( "percentage of data for testing:"+ str(round((len(test_list) * 1.0 / length) * 100))+ "%"+ "with"+ str(len(test_list))+ "tuples") + myFile.write("\n") + myFile.write("percentage of data for foldin:"+ str(round((len(foldin_list) * 1.0 / length) * 100))+ "%"+ "with"+ str(len(foldin_list))+ "tuples") + myFile.write("\n") + myFile.write("_____________") + myFile.write("\n") + myFile.write("percentage of users for foldin:"+ str(round((len(foldin_list_keys) * 1.0 / numberOfKeys * 1.0) * 100))+ "%"+ "with"+ str(len(foldin_list_keys))+ "users") + myFile.write("\n") + myFile.write("percentage of users for training:"+ str(round((len(train_list_keys) * 1.0 / numberOfKeys * 1.0) * 100))+ "%"+"with"+ str(len(train_list_keys))+ "users") + myFile.write("\n") + myFile.write("________________________________________________________________") + myFile.write("\n") + + myFile.close() + + + train = Data() + train.set(train_list) + test = Data() + test.set(test_list) + foldin=Data() + foldin.set(foldin_list) + + return train, test, foldin + + def _remove_ratings_count_from_dictionary(self,count_threshold_to_remove): + ''' + :param count_threshold_to_remove: The threshold number of ratings to be removed from the data. + :type count_threshold_to_remove: int + :return: void, it changes the data itself in the class. 
+ ''' + if count_threshold_to_remove==0: + return + removed=0 + dictKeys=self._tupleDict.keys() + for key in dictKeys: + if len(self._tupleDict[key])<=count_threshold_to_remove: + del self._tupleDict[key] + removed+=1 + + print "users removed less than or equal threshold count=",removed,"users" + return + + def _construct_dictionary(self, is_row=True,force=True): + ''' + + :param data: Data() + :param is_row: Boolean + :return: constructs a dictionary with the row or col as the keys (depending on which is being added) with values as the tuples + in self._batchDict + ''' + # self._values = map(itemgetter(0), data) + # self._rows = map(itemgetter(1), data) + # self._cols = map(itemgetter(2), data) + key_idx = 1 # key index default is the row + if not is_row: + key_idx = 2 + if force: #construct new dictionary + self._tupleDict={} + # collecting the significant col or row tuples at one place to fold them in at once + + for item in self._data: # data is a list of tuples so item is 1 tuple + try: + self._tupleDict[item[key_idx]].append(item) + except KeyError: + self._tupleDict[item[key_idx]] = [] + self._tupleDict[item[key_idx]].append(item) + + # batch loaded , now need to fold them in one by one + if VERBOSE: + print "Dictionary created successfully" + def load(self, path, force=True, sep='\t', format=None, pickle=False): """ Loads data from a file @@ -104,10 +272,10 @@ def load(self, path, force=True, sep='\t', format=None, pickle=False): :type force: Boolean :param sep: Separator among the fields of the file content :type sep: string - :param format: Format of the file content. + :param format: Format of the file content. Default format is 'value': 0 (first field), then 'row': 1, and 'col': 2. - E.g: format={'row':0, 'col':1, 'value':2}. The row is in position 0, - then there is the column value, and finally the rating. + E.g: format={'row':0, 'col':1, 'value':2}. The row is in position 0, + then there is the column value, and finally the rating. So, it resembles to a matrix in plain format :type format: dict() :param pickle: is input file in pickle format? @@ -120,8 +288,8 @@ def load(self, path, force=True, sep='\t', format=None, pickle=False): if pickle: self._load_pickle(path) else: - i = 0 - for line in codecs.open(path, 'r', 'utf8'): + i = 0 + for line in codecs.open(path, 'r', 'ISO-8859-1'): #was utf8 changed it to 'ISO-8859-1' data = line.strip('\r\n').split(sep) value = None if not data: @@ -140,7 +308,7 @@ def load(self, path, force=True, sep='\t', format=None, pickle=False): value = data[format['value']] except KeyError, ValueError: value = 1 - try: + try: row_id = data[format['row']] except KeyError: row_id = data[1]