diff --git a/.gitignore b/.gitignore
index 29924d2dca81c2d6698023cf8c853ff98e401531..2324cd311cfa5428b4137487787d349ec81edc93 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,5 @@
-.ipynb_checkpoints/*
-MultiplicativeAlgorithm/.ipynb_checkpoints/*
+*/.ipynb_checkpoints/
+*/__pycache__/
 MultiplicativeAlgorithm/D_trips.txt
 MultiplicativeAlgorithm/D_speeds.txt
 MultiplicativeAlgorithm/*.png
diff --git a/MultiplicativeAlgorithm/Extreme_Events.ipynb b/MultiplicativeAlgorithm/ExtremeEvents.ipynb
similarity index 99%
rename from MultiplicativeAlgorithm/Extreme_Events.ipynb
rename to MultiplicativeAlgorithm/ExtremeEvents.ipynb
index f362fa74ac6c81b30b3e8b71e2397638eb75866a..2320491bb519d053556b72de360906f638351246 100644
--- a/MultiplicativeAlgorithm/Extreme_Events.ipynb
+++ b/MultiplicativeAlgorithm/ExtremeEvents.ipynb
@@ -163,7 +163,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.5"
+   "version": "3.6.6"
   }
  },
 "nbformat": 4,
diff --git a/MultiplicativeAlgorithm/Phase2.ipynb b/MultiplicativeAlgorithm/Phase2.ipynb
index c542a3f4b786f3ad9e7a45954eda6bfb4a8977fc..29d6bf1daf7916af95759c6848f231c48a8c6887 100644
--- a/MultiplicativeAlgorithm/Phase2.ipynb
+++ b/MultiplicativeAlgorithm/Phase2.ipynb
@@ -2518,7 +2518,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.5"
+   "version": "3.6.6"
   }
  },
 "nbformat": 4,
diff --git a/MultiplicativeAlgorithm/Visualizations.py b/MultiplicativeAlgorithm/Visualizations.py
index cb86e9853444886df5955f9f40f50c38babd3194..f3896b4b2fbf8ef2daca643bdcdd98a10366c096 100644
--- a/MultiplicativeAlgorithm/Visualizations.py
+++ b/MultiplicativeAlgorithm/Visualizations.py
@@ -315,3 +315,14 @@ def spikeyness_vs_error():
     plt.savefig('Spikeyness_vs_Error')
     return None
 
+def spikeyness2(W):
+    def spikeyness_column(column_number):
+        trend = W[:52*24*7, column_number].reshape(52, 24*7)
+        if np.mean(W[:52*24*7, column_number]) == 0:
+            return np.nan
+        else:
+            return np.mean(np.nanstd(trend, axis=0))/np.mean(W[:, column_number])
+
+    spikey_mean = np.mean([spikeyness_column(sig_number) for sig_number in range(W.shape[1])])
+    spikey_std = np.std([spikeyness_column(sig_number) for sig_number in range(W.shape[1])])
+    return spikey_mean, spikey_std
\ No newline at end of file
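A minimal usage sketch for `spikeyness2`: it reshapes the first 52 weeks of each column into a (52, 168) array, takes the mean weekly standard deviation, and normalizes by the column mean. The import path, matrix shape, and random data below are illustrative assumptions, not from the repo:

```python
import numpy as np
from Visualizations import spikeyness2  # assumes MultiplicativeAlgorithm/ is the working directory

# Illustrative input only: one year of hourly values (8760 rows) for 50 signatures.
W = np.random.rand(8760, 50)
spikey_mean, spikey_std = spikeyness2(W)
print(spikey_mean, spikey_std)
```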
diff --git a/MultiplicativeAlgorithm/__init__.py b/MultiplicativeAlgorithm/__init__.py
index e5fdd929a31eb888ffcec7c2c3c79dcf32a5af2a..b3cc5fba4151e5e1d1e34f62ec1859d23bdfce7b 100644
--- a/MultiplicativeAlgorithm/__init__.py
+++ b/MultiplicativeAlgorithm/__init__.py
@@ -22,24 +22,4 @@ formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(messag
 fh.setFormatter(formatter)
 logger.addHandler(fh)
-logger.addHandler(sh)
-
-
-
-'''Select path name for files depending on whether you are working on your PC or cluster'''
-# path_for_data is an optional variable that defaults to 'path'. This is included to facilate a seperate path
-# address for large data files
-
-#path = './Data_Files/' # Use this if working on PC
-#path_for_data = '../../' # Use this if working on PC and if data files are in a different folder
-
-#path = './scratch/' # Use this if working on cluster
-#path_for_data = './scratch/' # Use this if working on cluster and if data files are in a different folder
-
-#filenames = config.generate_filenames(path, path_for_data)
-
-#logger.info('Filenames have been loaded')
-#[logger.debug(i) for i in filenames]
-
-#logger.info('Constants have been loaded')
-#[logger.debug(i) for i in dir(globals()['config'])];
\ No newline at end of file
+logger.addHandler(sh)
\ No newline at end of file
diff --git a/MultiplicativeAlgorithm/cSNMF.ipynb b/MultiplicativeAlgorithm/cSNMF.ipynb
index 08950a7fcba3db8375509515440951833d5b8bc2..a21e801378929dd3891b09145d99ac488e19f594 100644
--- a/MultiplicativeAlgorithm/cSNMF.ipynb
+++ b/MultiplicativeAlgorithm/cSNMF.ipynb
@@ -4,12 +4,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "# Run cSNMF on Trips data using multiplicative update rules\n",
-    "\n",
-    "**Constraint:** L1 norm of columns of W should be 1\n",
-    "\n",
-    "Get a copy of the data matrices in your local machine from the following links:\n",
-    " - https://uofi.box.com/s/yo60oe084d68obohgraek4weuaqtpgsp"
+    "# Run cSNMF on Trips data using multiplicative update rules"
   ]
  },
 {
@@ -18,13 +13,13 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "from __init__ import *\n",
     "import numpy as np\n",
-    "import pandas as pd\n",
-    "import config\n",
-    "import cSNMF\n",
     "import matplotlib.pyplot as plt\n",
-    "%matplotlib inline"
+    "%matplotlib inline\n",
+    "\n",
+    "from __init__ import *  # initializes logger\n",
+    "import config  # contains all the global variables\n",
+    "import cSNMF  # contains update rules and factorization functions"
   ]
  },
 {
@@ -33,17 +28,19 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "## Read Full-Link data and prep for running NMF.\n",
-    "D = np.loadtxt('D_trips.txt')\n",
-    "logger.info('Full_link data has been read')\n",
+    "## Read Full-Link data and prep for running cSNMF.\n",
+    "if config.TRIPS:\n",
+    "    D = np.loadtxt('D_trips.txt')\n",
+    "    logger.info('Full_link data for trips has been read')\n",
+    "else:\n",
+    "    D = np.loadtxt('D_traveltimes.txt')\n",
+    "    logger.info('Full_link data for traveltimes has been read')\n",
     "\n",
-    "if config.SEEDED == 1:\n",
-    "    seed_W = 0; seed_H = 1\n",
-    "elif config.SEEDED == 0:\n",
-    "    seed_W = None; seed_H = None\n",
+    "# Seeded W and H will always return the same result. Turn off SEEDED in config.py for different results with each run.\n",
+    "if config.SEEDED:\n",
+    "    seed_W, seed_H = 0, 1\n",
     "else:\n",
-    "    logger.critical('Seed value invalid. Needs to be 0 or 1. Check config.py!')\n",
-    "    quit()"
+    "    seed_W, seed_H = None, None"
   ]
  },
 {
@@ -52,10 +49,11 @@
   "metadata": {},
   "outputs": [],
   "source": [
+    "# Factorizes D and imposes constraints.\n",
+    "# Returns numpy arrays W and H and a pandas dataframe, results.\n",
     "W, H, results = cSNMF.factorize(D,\n",
-    "                                beta = 5000,\n",
+    "                                beta = config.BETA,\n",
     "                                rank = config.RANK,\n",
-    "                                max_iter = 600,\n",
     "                                seed_W = seed_W,\n",
     "                                seed_H = seed_H,\n",
     "                                debug = True,\n",
@@ -68,6 +66,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
+    "# Displays runtime statistics as a dataframe\n",
     "results"
   ]
  },
@@ -77,8 +76,8 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "#np.savetxt('W_(seed_W = 10,seed_H = 21).txt', W)\n",
-    "#np.savetxt('H_(seed_W = 10,seed_H = 21).txt', H)"
+    "np.savetxt('W_trips.txt', W)\n",
+    "np.savetxt('H_trips.txt', H)"
   ]
  }
 ],
@@ -98,7 +97,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.3"
+   "version": "3.6.6"
   }
  },
 "nbformat": 4,
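Because the notebook calls `factorize` with `debug = True`, `results` comes back with one row per iteration (columns `error`, `sparsity`, `diff_W`, `diff_H`, `W_minmax`, and the `H` quartiles, per `cSNMF.py`). A quick, assumed way to eyeball convergence in the notebook:

```python
# Plot relative error and sparsity per iteration (uses the matplotlib
# inline backend already set up in the notebook).
results[['error', 'sparsity']].plot(subplots=True)
```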
diff --git a/MultiplicativeAlgorithm/cSNMF.py b/MultiplicativeAlgorithm/cSNMF.py
index d56b90a4fc48d69d22f0e7f6a9cf09d0e07f4240..e02494ee72fd6b3c5066bb8faf4d73f4709bc9b2 100644
--- a/MultiplicativeAlgorithm/cSNMF.py
+++ b/MultiplicativeAlgorithm/cSNMF.py
@@ -1,4 +1,4 @@
-''' This contains functions for cSNMF.'''
+''' This contains functions called by cSNMF.ipynb for constrained Sparse Nonnegative Matrix Factorization.'''
 
 import numpy as np
 import pandas as pd
@@ -11,119 +11,100 @@ logger = logging.getLogger(__name__)
 
 ## Useful matrix operations:
-def m(A, B): return np.multiply(A,B)
 def div(A, B): return np.array([[A[i,j]/B[i,j] if A[i,j]>0 else 0 for j in range(A.shape[1])] for i in range(A.shape[0])])
-def d(A): return np.diag(np.diag(A)) # d(A) replaces all off-diagonal entries of A with 0.
-def N(A): return m(A, NONZEROS) # NONZEROS is a global variable.
-
-
+def d(A): return np.diag(np.diag(A))  # d(A) replaces all off-diagonal entries of A with 0.
+def N(A): return A*NONZEROS  # NONZEROS is a global variable.
 
+# Calculates sparsity of H as in Low_Rank_Manhattan_Traffic.pdf
 def sparsity_metric(H):
     def sparsity_column(column):
         n = len(column)
-        if sum(column) == 0:
-            ratio = 1
-            logger.info('Column of zeros in H')
-        else:
-            ratio = np.linalg.norm(column, 1)/np.linalg.norm(column, 2)
+        assert sum(column) != 0, 'Column of zeros in H. Reduce rank'
+        ratio = np.linalg.norm(column, 1)/np.linalg.norm(column, 2)
         return (math.sqrt(n) - ratio)/(math.sqrt(n) - 1)
 
     return np.mean([sparsity_column(column) for column in H.T])
 
 def calculate_error(D, W, H):
-    norm_D = np.linalg.norm(D)
-    return np.linalg.norm(D - N(W@H))/norm_D*100
-
-def spikeyness(W):
-    def spikeyness_column(column_number):
-        trend = W[:52*24*7, column_number].reshape(52,24*7)
-        if np.mean(W[:52*24*7, column_number]) == 0:
-            return np.nan
-        else:
-            return np.mean(np.nanstd(trend, axis=0))/np.mean(W[:, column_number])
-
-    spikey_mean = np.mean([spikeyness_column(sig_number) for sig_number in range(W.shape[1])])
-    spikey_std = np.std([spikeyness(sig_number) for sig_number in range(W.shape[1])])
-    return spikey_mean, spikey_std
-
-def zeros_in_H(H):
-    # Percentage of zero entries in H
-    # Use this on H_axed
-    return sum(sum(H==0))/H.size
+    return np.linalg.norm(D - N(W@H))/NORM_D*100
 
+# Constraint imposed on the columns of W
 def impose_L1_constraint(W, H):
     W2 = W/sum(W)
     H2 = np.array([H[i]*sum(W)[i] for i in range(len(H))])
     return W2, H2
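A quick sanity check of the sparsity metric defined above: by construction it is 1 for a one-hot column and 0 for a constant column. A minimal standalone sketch with illustrative values:

```python
import math
import numpy as np

def sparsity_column(column):
    # (sqrt(n) - ||x||_1/||x||_2) / (sqrt(n) - 1): 1 = maximally sparse, 0 = uniform.
    n = len(column)
    ratio = np.linalg.norm(column, 1) / np.linalg.norm(column, 2)
    return (math.sqrt(n) - ratio) / (math.sqrt(n) - 1)

print(sparsity_column(np.array([0., 0., 5., 0.])))  # 1.0 (one-hot column)
print(sparsity_column(np.array([2., 2., 2., 2.])))  # 0.0 (constant column)
```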
-
-def axe_H(H, tail_cutoff = 0.4):
+# Zeroes the smallest entries in each column of H whose cumulative sum stays below tail_cutoff times the column total.
+def axe_H(H, tail_cutoff = config.AXING_CUTOFF):
     H2 = H.copy()
     for i, column in enumerate(H.T):
-        sorted_column = sorted(column)
+        sorted_column = sorted(column)
         cumulative_sum = it.accumulate(sorted_column)
-        kill_count = sum(list(cumulative_sum) < tail_cutoff*sum(column))
-        H2.T[i] = np.array([entry if entry >= sorted_column[kill_count] else 0 for entry in column])
+        kill_count = sum(list(cumulative_sum) < tail_cutoff*sum(column))
+        H2.T[i] = np.array([entry if entry >= sorted_column[kill_count] else 0 for entry in column])
     return H2
 
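A small worked example of `axe_H` with `tail_cutoff = 0.4` (illustrative numbers; assumes it is run from `MultiplicativeAlgorithm/` so that `cSNMF` imports):

```python
import numpy as np
from cSNMF import axe_H

# One column of H summing to 1.0 (sorted here for readability).
H = np.array([[0.05, 0.10, 0.15, 0.30, 0.40]]).T
# Sorted cumulative sums: 0.05, 0.15, 0.30, 0.60, 1.00 -- the first three
# stay below 0.4 * 1.0, so those three entries get zeroed.
print(axe_H(H, tail_cutoff = 0.4).ravel())  # [0.  0.  0.  0.3 0.4]
```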
+'''Sort columns of W and rows of H according to the following rules --
+    - Give each link a total weight of 1.
+    - Distribute that weight across all of its signatures in proportion to the corresponding entry in H.
+    - For each signature, calculate popularity to be the sum of weights across all links.
+    - Sort columns of W in decreasing order of popularity.
+    - Sort rows of H by the same permutation in order to preserve W@H'''
+# Sorting is done so we have a rough intuition for which signatures get used the most.
 def sort_WH(W, H):
-    H2 = axe_H(H)
-    weights = H2/sum(H2)
+    weights = H/sum(H)
     signature_popularities = sum(weights.T)
-    signature_popularities = sorted(enumerate(signature_popularities), key = lambda x : x[1], reverse = True)
+    signature_popularities = sorted(enumerate(signature_popularities),\
+                                    key = lambda x : x[1], reverse = True)
 
     W2 = W.copy()
-    H3 = H2.copy()
+    H2 = H.copy()
     for i in range(len(signature_popularities)):
         W2[:, i] = W[:, signature_popularities[i][0]]
-        H3[i] = H2[signature_popularities[i][0]]
-    try:
-        assert ( np.linalg.norm(W@H2 - W2@H3) / np.linalg.norm(W@H2) ) < 0.01
-    except AssertionError:
-        logger.critical('Sorting is not working correctly')
-        return W2, H3
-
-    return W2, H3
+        H2[i] = H[signature_popularities[i][0]]
+
+    assert ( np.linalg.norm(W@H - W2@H2) / np.linalg.norm(W@H) ) < 0.01,\
+        'Sorting is not working correctly'
+    return W2, H2
 
 def factorize(data_array,
-              init_W = None,
-              rank = config.RANK,
-              beta = None,
-              threshold = 0.5,
-              max_iter = 600,
-              seed_W = None,
-              seed_H = None,
-              log = logger,
-              debug = False,
-              axing = True,
-              update_W = True):
+              rank = 50,
+              beta = None,
+              threshold = 0.5,  # algorithm stops if change in W and H is less than threshold %
+              max_iter = 600,
+              seed_W = None,
+              seed_H = None,
+              log = logger,
+              debug = False,    # if debug, then return runtime statistics
+              axing = True,     # if axing, then set small entries in H to zero
+              init_W = None):   # optionally specify W as a numpy array and keep it unchanged in every iteration
 
     log.info('Rank= %s, Threshold= %s', rank, threshold)
 
     D = np.nan_to_num(data_array)
     eta = D.max()
     if beta is None:
-        beta = len(D.T)//2
-
-    global NONZEROS
-    NONZEROS = ~np.isnan(data_array)
+        beta = len(D.T)/2
 
-    JNN = np.ones((rank, rank))
+    global NONZEROS, NORM_D
+    NONZEROS = ~ np.isnan(data_array)
+    NORM_D = np.linalg.norm(D)
 
+    JNN = np.ones((rank, rank))
 
     def update_W(W, H):
         Term_1 = D @ H.T
         Term_2 = N(W @ H) @ H.T + eta*W
-        W = div(m(W, Term_1), Term_2)
+        W = div(W * Term_1, Term_2)
         return W
 
-
     def update_H(W, H):
         Term_3 = W.T @ D
         Term_4 = W.T @ N(W @ H) + beta*(JNN @ H)
-        H = div(m(H, Term_3), Term_4)
+        H = div(H * Term_3, Term_4)
         return H
 
@@ -134,32 +115,30 @@ def factorize(data_array,
 
     logger.info('Initializing W and H...')
-
     W_shape = (D.shape[0], rank)
     H_shape = (rank, D.shape[1])
 
     if init_W is None:
+        update_W = True
         np.random.seed(seed_W)
         W = np.random.uniform(low = 0.01, high = 1., size = W_shape)
     else:
+        update_W = False
         W = init_W
 
     np.random.seed(seed_H)
     H = np.random.uniform(low = 0.01, high = 1., size = H_shape)
 
     log.info('W, H chosen')
-    iterations = 0
+
+    iterations = 0
     column_names = ['error', 'sparsity', 'diff_W', 'diff_H', 'W_minmax',
                     'H_0th', 'H_25th', 'H_50th', 'H_75th', 'H_100th']  # For results DataFrame
     results = pd.DataFrame(columns = column_names)
-
-
-    diff_W = 100
-    diff_H = 100
-
-    global norm_D
-    norm_D = np.linalg.norm(N(D))
+    diff_W = 100
+    diff_H = 100
 
+    # Run until the stopping condition is met
     while abs(diff_W) + abs(diff_H) > threshold or iterations < 100:
         if iterations > max_iter:
             break
@@ -194,16 +173,14 @@
         if debug is True:
             H_0th, H_25th, H_50th, H_75th, H_100th = quartiles(H)
             W_minmax = (W.min(), W.max())
-
             column_values = [error, sparsity, diff_W, diff_H, W_minmax,\
                              H_0th, H_25th, H_50th, H_75th, H_100th]
-
             results = results.append(dict(zip(column_names, column_values)), ignore_index = True)
-
-    if axing:
-        W, H = impose_L1_constraint(W, H)
-        W, H = sort_WH(W, H)
+
+    W, H = impose_L1_constraint(W, H)
+    if axing: H = axe_H(H)
+    W, H = sort_WH(W, H)
 
     error = calculate_error(D, W, H)
     sparsity = sparsity_metric(H)
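For reference, the multiplicative updates implemented by `update_W` and `update_H` above, transcribed from the code: `N(·)` zeroes entries where `D` is missing, `J` is the all-ones rank-by-rank matrix (`JNN`), `η` is the maximum entry of `D`, `β` is the sparsity penalty, `∘` is elementwise multiplication, and `⊘` is the zero-safe elementwise division implemented by `div`:

```latex
W \leftarrow W \circ \bigl(D H^{\top}\bigr) \oslash \bigl(N(WH)\,H^{\top} + \eta\,W\bigr),
\qquad
H \leftarrow H \circ \bigl(W^{\top} D\bigr) \oslash \bigl(W^{\top} N(WH) + \beta\,J H\bigr)
```

Since each row of `J H` holds the column sums of `H`, the `β J H` term acts as an L1 penalty on the columns of `H`, which is what drives the sparsity measured by `sparsity_metric`.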
diff --git a/MultiplicativeAlgorithm/config.py b/MultiplicativeAlgorithm/config.py
index f11cb2ed3b36bae3c14ed216fb86a1e8b039778d..66fb064d4800769597b5b99626060d1f8cdd0a83 100644
--- a/MultiplicativeAlgorithm/config.py
+++ b/MultiplicativeAlgorithm/config.py
@@ -1,53 +1,21 @@
-''' * This file defines addresses to all data files, input files and output files.
- * It also lists all the Global variables used in Non-Negative Matrix Factorization.
+''' * This file lists all the Global variables used in Non-Negative Matrix Factorization.
  * In case any of the Factorization paramaters need to be changed, they should be changed in this file instead of locally within each program.'''
 
+# Global variables
+RANK = 50           # Rank for Matrix Factorization in Phase1.py
+TRIPS = True        # False reads the traveltimes dataset, True reads the trips dataset.
+                    # Currently False doesn't work due to bad hyperparameters.
+SEEDED = True       # Determines whether cSNMF.py uses seeded random matrices for W,H or not.
+BETA = 5000         # Higher value penalizes lack of sparsity in H (see Low_Rank_Manhattan_Traffic.pdf).
+                    # If unsure, then set BETA = None.
+AXING_CUTOFF = 0.4  # Set between 0 and 1. Entries at the bottom of each column of H whose
+                    # cumulative sum stays below this fraction of the column total are set to 0.
 
-def generate_filenames(path, path_for_data=None):
-    if path_for_data is None:
-        path_for_data = path
-    filenames = {# Raw data files
-                 'links': path + 'links.csv',
-                 'nodes': path + 'nodes.csv',
-                 'raw_data': path_for_data + 'travel_times_2011.csv',
-                 'data_coo_form': path_for_data + 'data_coo_form.txt',
-                 'data_trips': path_for_data + 'data_trips.csv',
-                 'data_traveltimes': path_for_data + 'data_travel_times.csv',
-                 'data_trips_transpose': path_for_data + 'data_trips_transpose.csv',
-
-                 # Inputs and Outputs of Read_data.py
-                 'full_link_ids': path + 'full_link_ids.txt',
-                 'empty_link_ids': path + 'empty_link_ids.txt',
-                 'full_link_trips': path + 'full_link_trips.json',
-                 'full_link_traveltimes': path + 'full_link_travel_times.json',
-                 'full_link_speeds': path + 'full_link_speeds.json',
-
-                 # Inputs and Outputs of Phase1.py
-                 'random': path + 'Random_numbers.txt',
-                 'W_trips': path + 'W_trips.txt',
-                 'W_speeds': path + 'W_speeds.csv',
-                 'W_trips_seed': path + 'Seeded0,1/W_trips.csv',
-                 'W_speeds_seed': path + 'Seeded0,1/W_speeds.csv',
-                 'H_trips': path + 'H_trips.txt',
-                 'H_speeds': path + 'H_speeds.csv',
-                 'H_trips_seed': path + 'Seeded0,1/H_trips.csv',
-                 'H_speeds_seed': path + 'Seeded0,1/H_speeds.csv',
-                 'H_trips_axed': path + 'H_trips_axed.csv',
-                 'H_speeds_axed': path + 'H_speeds_axed.csv',
-                 'H_trips_axed_seed': path + 'Seeded0,1/H_trips_axed.csv',
-                 'W_speeds_axed_seed': path + 'Seeded0,1/H_speeds_axed.csv'}
-    return filenames
+# Fixed global parameters for the 2011 traffic dataset
+HOURS_IN_YEAR = 8760   # 24*365
+TOTAL_LINKS = 260855   # Total Links aka Road-segments in NYC.
+FULL_LINKS = 2302      # Links with less than 720 hours' worth of missing data in a year.
+EMPTY_LINKS = 234892   # Links with less than 720 hours' worth of data in a year.
+MID_LINKS = 23661      # TOTAL_LINKS - FULL_LINKS - EMPTY_LINKS
 
-
-'''Global variables'''
-
-RANK = 50 # Rank for Matrix Factorization in Phase1.py
-TRIPS = 1 # Boolean variable: 0 reads travel_speeds data, 1 reads number_of_trips data.
-HOURS_IN_YEAR = 8760 # 24*365
-SEEDED = 1 # Boolean variable: 1 or 0 means Phase1.py is seeded or not respectively.
-
-TOTAL_LINKS = 260855 # Total Links aka Road-segments in NYC.
-FULL_LINKS = 2302 # Links with less than 720 hours worth of missing data in a year.
-EMPTY_LINKS = 234892 # Links with less that 720 hours worth of data in a year.
-MID_LINKS = 23661 # TOTAL_LINKS - FULL_LINKS - EMPTY_LINKS
 assert MID_LINKS == TOTAL_LINKS - FULL_LINKS - EMPTY_LINKS
\ No newline at end of file
diff --git a/ReadData/ReadData.ipynb b/ReadData/ReadData.ipynb
index d3f1f59cd62efe9ccc5bed315775c0fce920aebf..f4987f6a58ef7899e216e9153a8b204407676b1f 100644
--- a/ReadData/ReadData.ipynb
+++ b/ReadData/ReadData.ipynb
@@ -115,7 +115,7 @@
   "source": [
    "## Read from full_link_ids.txt\n",
    "\n",
-   "with open('../Multiplicative Algorithm/full_link_ids.txt', 'r') as readfile:\n",
+   "with open('../MultiplicativeAlgorithm/full_link_ids.txt', 'r') as readfile:\n",
    "    full_links = [int(line.strip()) for line in readfile]\n",
    "    print(len(full_links))"
  ]