Commit 117f016f authored by vkarve2

Add comments to cSNMF.py and cSNMF.ipynb

parent 5321959e
.ipynb_checkpoints/*
MultiplicativeAlgorithm/.ipynb_checkpoints/*
*/.ipynb_checkpoints/
*/__pycache__/
MultiplicativeAlgorithm/D_trips.txt
MultiplicativeAlgorithm/D_speeds.txt
MultiplicativeAlgorithm/*.png
......
@@ -163,7 +163,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
"version": "3.6.6"
}
},
"nbformat": 4,
......
@@ -2518,7 +2518,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
"version": "3.6.6"
}
},
"nbformat": 4,
......
@@ -315,3 +315,14 @@ def spikeyness_vs_error():
plt.savefig('Spikeyness_vs_Error')
return None
def spikeyness2(W):
    def spikeyness_column(column_number):
        trend = W[:52*24*7, column_number].reshape(52, 24*7)
        if np.mean(W[:52*24*7, column_number]) == 0:
            return np.nan
        else:
            return np.mean(np.nanstd(trend, axis=0))/np.mean(W[:, column_number])
    # nanmean/nanstd skip the NaN returned for all-zero columns
    spikey_mean = np.nanmean([spikeyness_column(sig_number) for sig_number in range(W.shape[1])])
    spikey_std = np.nanstd([spikeyness_column(sig_number) for sig_number in range(W.shape[1])])
    return spikey_mean, spikey_std
\ No newline at end of file
@@ -22,24 +22,4 @@ formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fh.setFormatter(formatter)
logger.addHandler(fh)
logger.addHandler(sh)
'''Select path name for files depending on whether you are working on your PC or cluster'''
# path_for_data is an optional variable that defaults to 'path'. This is included to facilitate a separate path
# address for large data files
#path = './Data_Files/' # Use this if working on PC
#path_for_data = '../../' # Use this if working on PC and if data files are in a different folder
#path = './scratch/' # Use this if working on cluster
#path_for_data = './scratch/' # Use this if working on cluster and if data files are in a different folder
#filenames = config.generate_filenames(path, path_for_data)
#logger.info('Filenames have been loaded')
#[logger.debug(i) for i in filenames]
#logger.info('Constants have been loaded')
#[logger.debug(i) for i in dir(globals()['config'])];
\ No newline at end of file
logger.addHandler(sh)
\ No newline at end of file
%% Cell type:markdown id: tags:
# Run cSNMF on Trips data using multiplicative update rules
**Constraint:** each column of W must have unit L1 norm.
Get a copy of the data matrices on your local machine from the following link:
- https://uofi.box.com/s/yo60oe084d68obohgraek4weuaqtpgsp
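The constraint amounts to a rescaling that leaves the product W@H unchanged. A minimal sketch with made-up matrices (illustration only; the repository's `impose_L1_constraint` in cSNMF.py does the equivalent):

``` python
import numpy as np

# Rescale columns of W to unit L1 norm and absorb the scale into the rows of H,
# so the product W @ H is unchanged.
W = np.random.rand(6, 3)         # nonnegative, as in NMF
H = np.random.rand(3, 4)
scale = W.sum(axis=0)            # L1 norms of the columns of W
W_unit = W / scale               # each column of W_unit now sums to 1
H_scaled = scale[:, None] * H    # compensate in the rows of H
assert np.allclose(W @ H, W_unit @ H_scaled)
```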
%% Cell type:code id: tags:
``` python
from __init__ import *          # initializes logger
import numpy as np
import pandas as pd
import config                   # contains all the global variables
import cSNMF                    # contains update rules and factorization functions
import matplotlib.pyplot as plt
%matplotlib inline
```
%% Cell type:code id: tags:
``` python
## Read full-link data and prep for running cSNMF.
if config.TRIPS:
    D = np.loadtxt('D_trips.txt')
    logger.info('Full_link data for trips has been read')
else:
    D = np.loadtxt('D_traveltimes.txt')
    logger.info('Full_link data for traveltimes has been read')

# Seeded W and H always return the same result. Turn off seeding for different results on each run.
if config.SEEDED:
    seed_W, seed_H = 0, 1
else:
    seed_W, seed_H = None, None
```
%% Cell type:code id: tags:
``` python
# Factorize D and impose constraints.
# Returns numpy arrays W and H, and a pandas DataFrame of runtime statistics.
W, H, results = cSNMF.factorize(D,
                                beta = config.BETA,
                                rank = config.RANK,
                                max_iter = 600,
                                seed_W = seed_W,
                                seed_H = seed_H,
                                debug = True,
                                axing = True)
```
%% Cell type:code id: tags:
``` python
# Display runtime statistics as a DataFrame
results
```
%% Cell type:code id: tags:
``` python
#np.savetxt('W_(seed_W = 10,seed_H = 21).txt', W)
#np.savetxt('H_(seed_W = 10,seed_H = 21).txt', H)
np.savetxt('W_trips.txt', W)
np.savetxt('H_trips.txt', H)
```
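As an optional sanity check (not part of the original notebook), the factors can be reloaded and the L1 constraint inspected:

``` python
W_check = np.loadtxt('W_trips.txt')
H_check = np.loadtxt('H_trips.txt')
print(W_check.shape, H_check.shape)
# After impose_L1_constraint, each column of W should sum to (approximately) 1.
print(np.allclose(W_check.sum(axis=0), 1.0))
```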
......
'''Functions called by cSNMF.ipynb for constrained Sparse Nonnegative Matrix Factorization.'''
import numpy as np
import pandas as pd
@@ -11,119 +11,100 @@ logger = logging.getLogger(__name__)
## Useful matrix operations:
def m(A, B): return np.multiply(A, B)   # elementwise product
def div(A, B): return np.array([[A[i,j]/B[i,j] if A[i,j] > 0 else 0
                                 for j in range(A.shape[1])]
                                for i in range(A.shape[0])])   # elementwise division; 0 wherever A is 0
def d(A): return np.diag(np.diag(A))    # d(A) replaces all off-diagonal entries of A with 0.
def N(A): return A*NONZEROS             # NONZEROS is a global variable (mask of observed entries).
# Calculates sparsity of H as in Low_Rank_Manhattan_Traffic.pdf
def sparsity_metric(H):
    def sparsity_column(column):
        n = len(column)
        assert sum(column) != 0, 'Column of zeros in H. Reduce rank'
        ratio = np.linalg.norm(column, 1)/np.linalg.norm(column, 2)
        return (math.sqrt(n) - ratio)/(math.sqrt(n) - 1)
    return np.mean([sparsity_column(column) for column in H.T])
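# Worked example (illustrative, not in the original file): this is the
# Hoyer-style sparsity, equal to 1 for a maximally sparse column and 0 for a
# flat one. For a single-column H of length 4:
#     sparsity_metric(np.array([[0.], [0.], [3.], [0.]]))  ->  1.0
#     sparsity_metric(np.ones((4, 1)))                     ->  0.0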
def calculate_error(D, W, H):
    # Relative reconstruction error (%) over the observed entries of D.
    return np.linalg.norm(D - N(W@H))/NORM_D*100

def spikeyness(W):
    def spikeyness_column(column_number):
        trend = W[:52*24*7, column_number].reshape(52, 24*7)
        if np.mean(W[:52*24*7, column_number]) == 0:
            return np.nan
        else:
            return np.mean(np.nanstd(trend, axis=0))/np.mean(W[:, column_number])
    # nanmean/nanstd skip the NaN returned for all-zero columns
    spikey_mean = np.nanmean([spikeyness_column(sig_number) for sig_number in range(W.shape[1])])
    spikey_std = np.nanstd([spikeyness_column(sig_number) for sig_number in range(W.shape[1])])
    return spikey_mean, spikey_std
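# The reshape above treats the first 52*24*7 hours of each signature as 52
# weeks of 168 hourly values; a signature is "spikey" when the same
# hour-of-week varies strongly across weeks relative to the signature's mean.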
def zeros_in_H(H):
    # Fraction of zero entries in H; use this on H_axed.
    return np.sum(H == 0)/H.size
# Constraint imposed on the columns of W
def impose_L1_constraint(W, H):
    W2 = W/sum(W)
    H2 = np.array([H[i]*sum(W)[i] for i in range(len(H))])
    return W2, H2
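# Sanity note: with S = diag(column sums of W), this computes W2 = W S^-1 and
# H2 = S H, so W2 @ H2 == W @ H and the factorization is unchanged.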
def axe_H(H, tail_cutoff = config.AXING_CUTOFF):
    # Sets the bottom tail_cutoff fraction of entries in each column of H to 0.
    H2 = H.copy()
    for i, column in enumerate(H.T):
        sorted_column = sorted(column)
        cumulative_sum = it.accumulate(sorted_column)
        kill_count = sum(np.array(list(cumulative_sum)) < tail_cutoff*sum(column))
        H2.T[i] = np.array([entry if entry >= sorted_column[kill_count] else 0 for entry in column])
    return H2
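# Worked example (hypothetical numbers, tail_cutoff = 0.4): a column [1, 2, 3, 4]
# has sorted cumulative sums [1, 3, 6, 10]; the entries whose cumulative sum
# stays below 0.4*10 = 4 are axed, so the column becomes [0, 0, 3, 4].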
'''Sort columns of W and rows of H according to the following rules --
   - Give each link a total weight of 1.
   - Distribute that weight across all of its signatures in proportion to the corresponding entries in H.
   - For each signature, compute its popularity as the sum of its weights across all links.
   - Sort the columns of W in decreasing order of popularity.
   - Sort the rows of H by the same permutation in order to preserve W@H.'''
# Sorting is done so we have a rough intuition for which signatures get used the most.
def sort_WH(W, H):
    weights = H/sum(H)
    signature_popularities = sum(weights.T)
    signature_popularities = sorted(enumerate(signature_popularities),
                                    key = lambda x: x[1], reverse = True)
    W2 = W.copy()
    H2 = H.copy()
    for i in range(len(signature_popularities)):
        W2[:, i] = W[:, signature_popularities[i][0]]
        H2[i] = H[signature_popularities[i][0]]
    assert (np.linalg.norm(W@H - W2@H2) / np.linalg.norm(W@H)) < 0.01, \
        'Sorting is not working correctly'
    return W2, H2
def factorize(data_array,
              rank = 50,
              beta = None,
              threshold = 0.5,   # algorithm stops if the change in W and H is less than threshold %
              max_iter = 600,
              seed_W = None,
              seed_H = None,
              log = logger,
              debug = False,     # if debug, record runtime statistics in the results DataFrame
              axing = True,      # if axing, set small entries in H to zero
              init_W = None):    # optionally pass W as a numpy array and keep it fixed in every iteration
    log.info('Rank= %s, Threshold= %s', rank, threshold)
    D = np.nan_to_num(data_array)
    eta = D.max()
    if beta is None:
        beta = len(D.T)/2
    global NONZEROS, NORM_D
    NONZEROS = ~np.isnan(data_array)
    NORM_D = np.linalg.norm(D)
    JNN = np.ones((rank, rank))
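    # NONZEROS masks the observed entries of the data: the error and the updates
    # are computed only over cells that were actually measured, so missing hours
    # do not pull the factors toward zero. NORM_D is cached once for the
    # relative-error computation in calculate_error().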
    def update_W(W, H):
        Term_1 = D @ H.T
        Term_2 = N(W @ H) @ H.T + eta*W
        W = div(W * Term_1, Term_2)
        return W

    def update_H(W, H):
        Term_3 = W.T @ D
        Term_4 = W.T @ N(W @ H) + beta*(JNN @ H)
        H = div(H * Term_3, Term_4)
        return H
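    # The two closures implement multiplicative updates for masked, regularized NMF:
    #     W <- W * (D H^T) / (N(W H) H^T + eta*W)
    #     H <- H * (W^T D) / (W^T N(W H) + beta*(J H))
    # where J is the all-ones rank x rank matrix. The eta term discourages large
    # entries in W; the beta term penalizes non-sparse columns of H. Nonnegativity
    # is preserved because every factor in the updates is nonnegative.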
@@ -134,32 +115,30 @@ def factorize(data_array,
    logger.info('Initializing W and H...')
    W_shape = (D.shape[0], rank)
    H_shape = (rank, D.shape[1])
    if init_W is None:
        update_W = True     # note: this flag shadows the update_W closure above
        np.random.seed(seed_W)
        W = np.random.uniform(low = 0.01, high = 1., size = W_shape)
    else:
        update_W = False
        W = init_W
    np.random.seed(seed_H)
    H = np.random.uniform(low = 0.01, high = 1., size = H_shape)
    log.info('W, H chosen')

    iterations = 0
    column_names = ['error', 'sparsity', 'diff_W', 'diff_H', 'W_minmax',
                    'H_0th', 'H_25th', 'H_50th', 'H_75th', 'H_100th']   # for the results DataFrame
    results = pd.DataFrame(columns = column_names)
    diff_W = 100
    diff_H = 100
    # Run till the stopping condition is met
    while abs(diff_W) + abs(diff_H) > threshold or iterations < 100:
        if iterations > max_iter:
            break
@@ -194,16 +173,14 @@
        if debug is True:
            H_0th, H_25th, H_50th, H_75th, H_100th = quartiles(H)
            W_minmax = (W.min(), W.max())
            column_values = [error, sparsity, diff_W, diff_H, W_minmax,
                             H_0th, H_25th, H_50th, H_75th, H_100th]
            results = results.append(dict(zip(column_names, column_values)), ignore_index = True)

    W, H = impose_L1_constraint(W, H)
    if axing: H = axe_H(H)
    W, H = sort_WH(W, H)
    error = calculate_error(D, W, H)
    sparsity = sparsity_metric(H)
......
''' * This file lists all the global variables used in Non-Negative Matrix Factorization.
    * If any of the factorization parameters need to be changed, change them in this file rather than locally within each program.'''

# Global variables
RANK = 50            # Rank for matrix factorization in Phase1.py
TRIPS = True         # True reads the trips dataset; False reads the travel_times dataset.
                     # Currently False doesn't work due to bad hyperparameters.
SEEDED = True        # Determines whether cSNMF.py uses seeded random matrices for W and H.
BETA = 5000          # A higher value penalizes lack of sparsity in H (see Low_Rank_Manhattan_Traffic.pdf).
                     # If unsure, set BETA = None.
AXING_CUTOFF = 0.4   # Between 0 and 1. The bottom AXING_CUTOFF fraction of entries in each column of H is set to 0.
def generate_filenames(path, path_for_data=None):
    if path_for_data is None:
        path_for_data = path
    filenames = {# Raw data files
                 'links': path + 'links.csv',
                 'nodes': path + 'nodes.csv',
                 'raw_data': path_for_data + 'travel_times_2011.csv',
                 'data_coo_form': path_for_data + 'data_coo_form.txt',
                 'data_trips': path_for_data + 'data_trips.csv',
                 'data_traveltimes': path_for_data + 'data_travel_times.csv',
                 'data_trips_transpose': path_for_data + 'data_trips_transpose.csv',
                 # Inputs and outputs of Read_data.py
                 'full_link_ids': path + 'full_link_ids.txt',
                 'empty_link_ids': path + 'empty_link_ids.txt',
                 'full_link_trips': path + 'full_link_trips.json',
                 'full_link_traveltimes': path + 'full_link_travel_times.json',
                 'full_link_speeds': path + 'full_link_speeds.json',
                 # Inputs and outputs of Phase1.py
                 'random': path + 'Random_numbers.txt',
                 'W_trips': path + 'W_trips.txt',
                 'W_speeds': path + 'W_speeds.csv',
                 'W_trips_seed': path + 'Seeded0,1/W_trips.csv',
                 'W_speeds_seed': path + 'Seeded0,1/W_speeds.csv',
                 'H_trips': path + 'H_trips.txt',
                 'H_speeds': path + 'H_speeds.csv',
                 'H_trips_seed': path + 'Seeded0,1/H_trips.csv',
                 'H_speeds_seed': path + 'Seeded0,1/H_speeds.csv',
                 'H_trips_axed': path + 'H_trips_axed.csv',
                 'H_speeds_axed': path + 'H_speeds_axed.csv',
                 'H_trips_axed_seed': path + 'Seeded0,1/H_trips_axed.csv',
                 'W_speeds_axed_seed': path + 'Seeded0,1/H_speeds_axed.csv'}
    return filenames
# Fixed global parameters for the 2011 traffic dataset
HOURS_IN_YEAR = 8760    # 24*365
TOTAL_LINKS = 260855    # Total links (road segments) in NYC.
FULL_LINKS = 2302       # Links with less than 720 hours' worth of missing data in the year.
EMPTY_LINKS = 234892    # Links with less than 720 hours' worth of data in the year.
MID_LINKS = 23661       # TOTAL_LINKS - FULL_LINKS - EMPTY_LINKS
assert MID_LINKS == TOTAL_LINKS - FULL_LINKS - EMPTY_LINKS
\ No newline at end of file
%% Cell type:markdown id: tags:
# This notebook reads Taxisim datafiles generated by Dan Work and Brian Donovan, cleans them up, and preps them for NMF.
## Warning: Do not run this notebook unless absolutely necessary. It takes a long time to run!
This notebook has been included mostly for the sake of completeness, not for actual execution.
%% Cell type:code id: tags:
``` python
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
```
%% Cell type:code id: tags:
``` python
## Create a dictionary of links with entries:
##     (begin_node_id, end_node_id): link_id
with open('../DataFiles/links.csv') as linkfile:
    print(linkfile.readline())   # print the header row
    link_dict = {}
    for counter, line in enumerate(linkfile):
        link_id, begin_node, end_node = line[:-1].split(',')[0:3]
        link_dict[(begin_node, end_node)] = link_id
link_dict[('0', '0')] = '0'
```
%% Cell type:code id: tags:
``` python
print('This message is a fail-safe. Comment out this line only if you know what you are doing.'); print(failsafe)

## Create a file called `D_2011.csv` whose rows are:
##     L, T, traveltime, trips
with open('../MultiplicativeAlgorithm/D_2011.csv', 'w') as writefile:
    writefile.write('L,T,traveltime,trips\n')
    with open('../DataFiles/travel_times_2011.csv') as rawfile:
        print(rawfile.readline())   # print the header row
        start_time = dt.datetime.strptime('2011-01-01 00:00:00', '%Y-%m-%d %X')
        for counter, line in enumerate(rawfile):
            line = line[:-1]
            begin_node_id, end_node_id, datetime, traveltime, trips = line.split(',')
            L = link_dict[(begin_node_id, end_node_id)]
            datetime = dt.datetime.strptime(datetime, '%Y-%m-%d %X')
            T = str(int((datetime - start_time).total_seconds()/3600))   # hour of the year
            writefile.write(','.join([L, T, traveltime, trips]) + '\n')
            if counter%1000000 == 0:
                print(counter)
            if counter > 20:   # testing guard: remove this break to process the full file
                break
```
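A quick check of the hour-index arithmetic used above (made-up timestamp, not from the dataset):

``` python
import datetime as dt

start = dt.datetime.strptime('2011-01-01 00:00:00', '%Y-%m-%d %X')
t = dt.datetime.strptime('2011-01-02 13:00:00', '%Y-%m-%d %X')
# 1 full day + 13 hours into the year -> hour index 37
assert int((t - start).total_seconds()/3600) == 37
```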
%% Cell type:code id: tags:
``` python
## Count the number of data points recorded throughout the year for each link.
## This helps with separating full_links.
print('This message is a fail-safe. Comment out this line only if you know what you are doing.'); print(failsafe)
link_data_count = {}
with open('../MultiplicativeAlgorithm/D_2011.csv', 'r') as readfile:
    readfile.readline()   # skip the header row
    for counter, line in enumerate(readfile):
        L, T, traveltimes, trips = line[:-1].split(',')
        link_data_count[L] = link_data_count.get(L, 0) + 1   # count every occurrence, including the first
```
%% Cell type:code id: tags:
``` python
## Write to full_link_ids.txt
print('This message is a fail-safe. Comment out this line only if you know what you are doing.'); print(failsafe)
with open('../MultiplicativeAlgorithm/full_link_ids.txt', 'w') as writefile:
    full_links = sorted([int(link) for link in link_data_count if link_data_count[link] >= 8760-721])
    full_links = full_links[1:]   # drop the first id (presumably link 0, the ('0', '0') sentinel)
    print(len(full_links))
    full_links = map(str, full_links)
    writefile.write('\n'.join(full_links))
```
%% Cell type:code id: tags:
``` python
## Read from full_link_ids.txt
with open('../MultiplicativeAlgorithm/full_link_ids.txt', 'r') as readfile:
    full_links = [int(line.strip()) for line in readfile]
print(len(full_links))
```
%% Cell type:code id: tags:
``` python
## Separate traffic data for full_links from the big data file.
print('This message is a fail-safe. Comment out this line only if you know what you are doing.'); print(failsafe)
full_links_data = []
progress = 8761
full_links_set = set(full_links)   # set for O(1) membership tests; parse L to int to match the stored ids
with open('../MultiplicativeAlgorithm/D_2011.csv', 'r') as readfile:
    header = readfile.readline()
    for counter, line in enumerate(readfile):
        line = line[:-1]
        L, T, traveltimes, trips = line.split(',')
        if int(L) in full_links_set:
            full_links_data.append(line)
        if int(T) < progress:
            progress = int(T)
            print(progress)
with open('../MultiplicativeAlgorithm/D_2011_full_links.csv', 'w') as writefile:
    writefile.write(header)
    writefile.write('\n'.join(full_links_data))
```
%% Cell type:code id: tags:
``` python
## Write data to D_trips.txt and D_traveltimes.txt
print('This message is a fail-safe. Comment out this line only if you know what you are doing.'); print(failsafe)

D_trips = np.zeros((8760, 2302))
D_traveltimes = np.zeros((8760, 2302))
full_links = sorted([int(link) for link in link_data_count if link_data_count[link] >= 8760-721])
full_links = full_links[1:]
progress = 8761
with open('../MultiplicativeAlgorithm/D_2011_full_links.csv', 'r') as readfile:
    header = readfile.readline()
    for line in readfile:
        line = line[:-1]
        L, T, traveltimes, trips = line.split(',')
        L, T, traveltimes, trips = full_links.index(int(L)), int(T), float(traveltimes), int(trips)
        D_trips[T, L] += trips
        D_traveltimes[T, L] += traveltimes
        if T < progress:
            progress = T
            print(progress)

D_trips, D_traveltimes = D_trips.astype('float'), D_traveltimes.astype('float')
D_trips[D_trips == 0] = np.nan
D_traveltimes[D_traveltimes == 0] = np.nan
np.savetxt('../MultiplicativeAlgorithm/D_trips.txt', D_trips)
np.savetxt('../MultiplicativeAlgorithm/D_traveltimes.txt', D_traveltimes)
```
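A small sketch (assuming the files were written as above) for reloading the matrices; zeros were converted to NaN so downstream code can mask missing (hour, link) cells instead of treating them as zero traffic:

``` python
D_trips = np.loadtxt('../MultiplicativeAlgorithm/D_trips.txt')
print(D_trips.shape)               # expected: (8760, 2302)
print(np.isnan(D_trips).mean())    # fraction of missing (hour, link) cells
```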
......