Added the sort_WH function, which sorts W and H in decreasing order of signature popularity, and call it in factorize() right after impose_L1_constraint().

313bd076 · vkarve2 · a6e36d57 · 313bd076
Commit 313bd076 authored 7 years ago by vkarve2
--- a/Multiplicative Algorithm/cSNMF.py
+++ b/Multiplicative Algorithm/cSNMF.py
@@ -4,7 +4,7 @@ import numpy as np
 import pandas as pd
 import logging
 import itertools as it
-
+import config

 logger = logging.getLogger(__name__)

@@ -46,37 +46,31 @@ def axe_H(H, tail_cutoff = 0.25):
    return H2


-
-'''TODO: Change this function.
-def sort_WH(W0, H0):
-    # Sort H and W according to decreasing order of signature popularity
-    # which is calculated using H obtained from axe_H().
-    H = axe_H(pd.DataFrame(H0)).values()
-    column_entries = [0 for i in range(len(H.T))] # list of length 2302
-    for i in range(len(H)): # i varies from 0 to 49
-        for j in range(len(H.T)): # j varies from 0 to 2301
-            if H[i,j] != 0:
-                column_entries[j] += 1 # Counts number of nonzero entries
-                                       # in each column of H
-
-    # Calculate popularity
-    popularity = []
-    for i in range(len(H)):
-        popularity.append(0)
-        for j in range(len(H.T)):
-            if H[i,j] != 0:
-                popularity[i] += 1./column_entries[j]
-    import numpy as np
-    popularity, H, WT = zip(*[(pop,row_H,col_W) for (pop,row_H,col_W)
-                              in sorted(zip(popularity,H0,W0.T),
-                                        key=lambda pair: pair[0], reverse=True)])
-    W = np.array(WT).T
-    H = np.array(H)
-    return W, H'''
-
-
-
-def factorize(data_array, rank, beta = None, threshold = 0.20, max_iter = 400, seed_W = None, seed_H = None, log=logger):    
+def sort_WH(W, H):  # Reorder W and the axed H so the most popular signature comes first; returns (W2, H3).
+    H2 = axe_H(H)  # popularity is computed on axe_H(H); H2 (not the original H) is what gets reordered and returned
+    weights = H2/sum(H2)  # builtin sum adds up the items yielded by iterating H2; presumably H2 is a 2-D ndarray so this is the column sums -- TODO confirm. NOTE(review): an all-zero column would divide by zero
+    signature_popularities = sum(weights.T)  # presumably one total per row of H2 (its weights summed across columns) -- TODO confirm
+    signature_popularities = sorted(enumerate(signature_popularities), key = lambda x : x[1])  # (original index, popularity) pairs, ascending by popularity
+    signature_popularities.reverse()  # now descending: most popular first
+    signature_popularities  # NOTE(review): bare expression statement, has no effect
+    W2 = W.copy()  # copies, so the reordering below does not write into W or H2
+    H3 = H2.copy()
+    for i in range(len(signature_popularities)):
+        W2[:, i] = W[:, signature_popularities[i][0]]  # position i of W2 (second axis) takes W's entry for the i-th ranked signature
+        H3[i] = H2[signature_popularities[i][0]]  # same reordering applied to H2; presumably H3[i] is row i of an ndarray -- TODO confirm
+    return W2, H3
+
+
+
+def factorize(data_array,
+              rank = config.RANK,
+              beta = None,
+              threshold = 0.5,
+              max_iter = 600,
+              seed_W = None,
+              seed_H = None,
+              log = logger,
+              debug = False):    

    log.info('Rank= %s, Threshold= %s', rank, threshold)

@@ -122,13 +116,16 @@ def factorize(data_array, rank, beta = None, threshold = 0.20, max_iter = 400, s

    iterations = 0
    results = pd.DataFrame()
+    column_names = ['error', 'sparsity', 'diff_W', 'diff_H', 'W_minmax',
+                    'H_0th', 'H_25th', 'H_50th', 'H_75th', 'H_100th'] # For results DataFrame 
+
    diff_W = 100
    diff_H = 100
    
    global norm_D
    norm_D = np.linalg.norm(D)
    
-    while abs(diff_W) + abs(diff_H) > threshold or iterations < 200:
+    while abs(diff_W) + abs(diff_H) > threshold or iterations < 100:
        if iterations > max_iter:
            break
            
@@ -152,21 +149,23 @@ def factorize(data_array, rank, beta = None, threshold = 0.20, max_iter = 400, s
        error = calculate_error(D, W, H)
        sparsity = sparsity_metric(H)
        
+        
        log.info('Iteration= %s, Error= %s, Sparsity= %s', iterations, error, sparsity)
        iterations += 1
        
-        H_0th, H_25th, H_50th, H_75th, H_100th = quartiles(H)
-        W_minmax = (W.min(), W.max())
+        if debug is True:
+            H_0th, H_25th, H_50th, H_75th, H_100th = quartiles(H)
+            W_minmax = (W.min(), W.max())
        
-        column_names = ['error', 'sparsity', 'diff_W', 'diff_H', 'W_minmax',\
-                        'H_0th', 'H_25th', 'H_50th', 'H_75th', 'H_100th']
-        column_values = [error, sparsity, diff_W, diff_H, W_minmax,\
-                         H_0th, H_25th, H_50th, H_75th, H_100th]
+            column_values = [error, sparsity, diff_W, diff_H, W_minmax,\
+                             H_0th, H_25th, H_50th, H_75th, H_100th]

-        results = results.append(dict(zip(column_names, column_values)), ignore_index = True)
+            results = results.append(dict(zip(column_names, column_values)), ignore_index = True)
            
    
    W, H = impose_L1_constraint(W, H)
+    W, H = sort_WH(W, H)
+    
    error = calculate_error(D, W, H)
    sparsity = sparsity_metric(H)
    log.info('Error= %s, Sparsity= %s', error, sparsity)