Skip to content
Snippets Groups Projects
Commit 313bd076 authored by vkarve2's avatar vkarve2
Browse files

Added sort_WH function. Sorts W and H according to signature popularity.

parent a6e36d57
No related branches found
No related tags found
No related merge requests found
......@@ -4,7 +4,7 @@ import numpy as np
import pandas as pd
import logging
import itertools as it
import config
logger = logging.getLogger(__name__)
......@@ -46,37 +46,31 @@ def axe_H(H, tail_cutoff = 0.25):
return H2
'''TODO: Change this function.
def sort_WH(W0, H0):
# Sort H and W according to decreasing order of signature popularity
# which is calculated using H obtained from axe_H().
H = axe_H(pd.DataFrame(H0)).values()
column_entries = [0 for i in range(len(H.T))] # list of length 2302
for i in range(len(H)): # i varies from 0 to 49
for j in range(len(H.T)): # j varies from 0 to 2301
if H[i,j] != 0:
column_entries[j] += 1 # Counts number of nonzero entries
# in each column of H
# Calculate popularity
popularity = []
for i in range(len(H)):
popularity.append(0)
for j in range(len(H.T)):
if H[i,j] != 0:
popularity[i] += 1./column_entries[j]
import numpy as np
popularity, H, WT = zip(*[(pop,row_H,col_W) for (pop,row_H,col_W)
in sorted(zip(popularity,H0,W0.T),
key=lambda pair: pair[0], reverse=True)])
W = np.array(WT).T
H = np.array(H)
return W, H'''
def factorize(data_array, rank, beta = None, threshold = 0.20, max_iter = 400, seed_W = None, seed_H = None, log=logger):
def sort_WH(W, H):
    """Reorder the signatures in W and H by decreasing popularity.

    H is first passed through axe_H(). Each column of the result is then
    normalised to sum to one, and the popularity of a signature is the sum
    of its row of normalised weights. The columns of W and the rows of the
    axed H are permuted so that the most popular signature comes first.

    Assumes W and axe_H(H) are 2-D NumPy arrays with one column of W per
    row of H -- TODO confirm against callers.

    NOTE(review): the second return value is the axe_H()-processed matrix
    in sorted order, not the H that was passed in -- confirm that callers
    expect the axed values.

    Args:
        W: matrix whose columns are the signatures.
        H: matrix whose rows are the per-signature weights.

    Returns:
        (W_sorted, H_sorted): newly allocated arrays holding the permuted
        columns of W and the permuted rows of axe_H(H).
    """
    H2 = np.asarray(axe_H(H))

    # Normalise every column to unit sum. A column that is entirely zero
    # keeps zero weights instead of becoming NaN (0/0), because NaN
    # popularities would make the ordering below meaningless.
    column_totals = H2.sum(axis=0)
    weights = np.divide(
        H2,
        column_totals,
        out=np.zeros_like(H2, dtype=float),
        where=column_totals != 0,
    )

    signature_popularities = weights.sum(axis=1)

    # Stable ascending sort, then reversed: this yields descending
    # popularity, and among equal popularities the signature with the
    # larger original index comes first (same tie order as sorting
    # ascending with sorted() and then calling reverse()).
    order = np.argsort(signature_popularities, kind="stable")[::-1]

    # Fancy indexing builds new arrays, so W is left unmodified.
    return W[:, order], H2[order]
def factorize(data_array,
rank = config.RANK,
beta = None,
threshold = 0.5,
max_iter = 600,
seed_W = None,
seed_H = None,
log = logger,
debug = False):
log.info('Rank= %s, Threshold= %s', rank, threshold)
......@@ -122,13 +116,16 @@ def factorize(data_array, rank, beta = None, threshold = 0.20, max_iter = 400, s
iterations = 0
results = pd.DataFrame()
column_names = ['error', 'sparsity', 'diff_W', 'diff_H', 'W_minmax',
'H_0th', 'H_25th', 'H_50th', 'H_75th', 'H_100th'] # For results DataFrame
diff_W = 100
diff_H = 100
global norm_D
norm_D = np.linalg.norm(D)
while abs(diff_W) + abs(diff_H) > threshold or iterations < 200:
while abs(diff_W) + abs(diff_H) > threshold or iterations < 100:
if iterations > max_iter:
break
......@@ -152,21 +149,23 @@ def factorize(data_array, rank, beta = None, threshold = 0.20, max_iter = 400, s
error = calculate_error(D, W, H)
sparsity = sparsity_metric(H)
log.info('Iteration= %s, Error= %s, Sparsity= %s', iterations, error, sparsity)
iterations += 1
H_0th, H_25th, H_50th, H_75th, H_100th = quartiles(H)
W_minmax = (W.min(), W.max())
if debug is True:
H_0th, H_25th, H_50th, H_75th, H_100th = quartiles(H)
W_minmax = (W.min(), W.max())
column_names = ['error', 'sparsity', 'diff_W', 'diff_H', 'W_minmax',\
'H_0th', 'H_25th', 'H_50th', 'H_75th', 'H_100th']
column_values = [error, sparsity, diff_W, diff_H, W_minmax,\
H_0th, H_25th, H_50th, H_75th, H_100th]
column_values = [error, sparsity, diff_W, diff_H, W_minmax,\
H_0th, H_25th, H_50th, H_75th, H_100th]
results = results.append(dict(zip(column_names, column_values)), ignore_index = True)
results = results.append(dict(zip(column_names, column_values)), ignore_index = True)
W, H = impose_L1_constraint(W, H)
W, H = sort_WH(W, H)
error = calculate_error(D, W, H)
sparsity = sparsity_metric(H)
log.info('Error= %s, Sparsity= %s', error, sparsity)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment