Skip to content
Snippets Groups Projects
Commit b26dfba9 authored by Vaibhav Karve's avatar Vaibhav Karve
Browse files

add EntriesD.ipynb and LinkSizes.ipynb

parent a01a7143
No related branches found
No related tags found
No related merge requests found
%% Cell type:markdown id: tags:
# This notebook reads Taxisim datafiles generated by Dan Work and Brian Donovan, cleans them up, and preps them for NMF.
## Warning: Do not run this notebook unless absolutely necessary. It takes a long time to run!
This notebook has been included here mostly for the sake of completeness, and is not meant to be executed.
%% Cell type:code id: tags:
``` python
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
```
%% Cell type:code id: tags:
``` python
## Create a dictionary of links with entries as:
##     (begin_node_id, end_node_id): link_id
## All ids are kept as strings, exactly as they appear in links.csv.
with open('../DataFiles/links.csv') as linkfile:
    # The first line is the column header; echo it for reference.
    print(linkfile.readline())
    link_dict = {}
    for line in linkfile:
        # Only the first three columns are needed. Strip just the newline so
        # that a final line without one does not lose a real character.
        link_id, begin_node, end_node = line.rstrip('\n').split(',')[:3]
        link_dict[(begin_node, end_node)] = link_id
# Extra entry: the node pair ('0', '0') maps to link '0'.
link_dict[('0', '0')] = '0'
```
%% Output
link_id,begin_node_id,end_node_id,begin_angle,end_angle,street_length,osm_name,osm_class,osm_way_id,startX,startY,endX,endY,osm_changeset,birth_timestamp,death_timestamp
%% Cell type:code id: tags:
``` python
print('This message is a fail-safe. Comment out this line only if you know what you are doing.'); print(failsafe)
## Create a file called `D_2011.csv` that has as a row:
##     L, T, traveltime, trips
## L is the link id looked up in link_dict; T is the whole number of hours
## elapsed since 2011-01-01 00:00:00.
# NOTE(review): `failsafe` is not defined in the cells shown, so the line above
# appears to stop the cell with a NameError on purpose -- confirm.

# Explicit %H:%M:%S instead of %X: %X follows the current locale, while the
# timestamps in the data use a fixed 24-hour layout.
TIME_FORMAT = '%Y-%m-%d %H:%M:%S'
start_time = dt.datetime.strptime('2011-01-01 00:00:00', TIME_FORMAT)

# Open the input first: if it is missing, the existing D_2011.csv must not be
# truncated to a header-only file before the error is raised.
with open('../DataFiles/travel_times_2011.csv') as rawfile, \
        open('../MultiplicativeAlgorithm/D_2011.csv', 'w') as writefile:
    writefile.write('L,T,traveltime,trips\n')
    # Echo the input header line.
    print(rawfile.readline())
    for counter, line in enumerate(rawfile):
        # Strip only the newline; slicing off the last character would eat a
        # digit of `trips` on a final line that has no newline.
        fields = line.rstrip('\n').split(',')
        begin_node_id, end_node_id, timestamp, traveltime, trips = fields
        L = link_dict[(begin_node_id, end_node_id)]
        elapsed = dt.datetime.strptime(timestamp, TIME_FORMAT) - start_time
        T = str(int(elapsed.total_seconds()/3600))
        writefile.write(','.join([L, T, traveltime, trips]) + '\n')
        if counter%1000000 == 0:
            print(counter)
        # NOTE(review): this stops after the first rows of input, which looks
        # like a debugging leftover; the indentation of the original cell was
        # lost, so its intended nesting should be confirmed before removal.
        if counter > 20:
            break
```
%% Output
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
<ipython-input-4-ddb8c86be344> in <module>
6 with open('../MultiplicativeAlgorithm/D_2011.csv', 'w') as writefile:
7 writefile.write('L,T,traveltime,trips\n')
----> 8 with open('../DataFiles/travel_times_2011.csv') as rawfile:
9 print(rawfile.readline())
10 start_time = dt.datetime.strptime('2011-01-01 00:00:00', '%Y-%m-%d %X')
FileNotFoundError: [Errno 2] No such file or directory: '../DataFiles/travel_times_2011.csv'
%% Cell type:code id: tags:
``` python
## Tally the rows of D_2011.csv per link over the whole year.
# The tallies are used afterwards to pick out the full_links.
print('This message is a fail-safe. Comment out this line only if you know what you are doing.'); print(failsafe)
link_data_count = {}
with open('../MultiplicativeAlgorithm/D_2011.csv', 'r') as readfile:
    # Skip the header line.
    readfile.readline()
    for row in readfile:
        # Unpack all four columns so that a malformed row still raises.
        link, _hour, _traveltime, _trips = row[:-1].split(',')
        if link in link_data_count:
            link_data_count[link] += 1
        else:
            # The first sighting is stored as 0, so each value ends up being
            # (number of rows for the link) - 1. The 8760-721 threshold used
            # when selecting full links is applied to this value.
            link_data_count[link] = 0
```
%% Cell type:code id: tags:
``` python
## Write to full_link_ids.txt
## One link id per line, in increasing numeric order, no trailing newline.
print('This message is a fail-safe. Comment out this line only if you know what you are doing.'); print(failsafe)
with open('../MultiplicativeAlgorithm/full_link_ids.txt', 'w') as writefile:
    # Keep the links whose tally in link_data_count reaches 8760-721.
    full_links = sorted(
        int(link)
        for link, count in link_data_count.items()
        if count >= 8760-721
    )
    # Drop the smallest id.
    # NOTE(review): presumably the dummy link 0 added to link_dict -- confirm.
    full_links = full_links[1:]
    print(len(full_links))
    # Convert to text only for writing, so that full_links stays a reusable
    # list of ints instead of being rebound to an exhausted map iterator.
    writefile.write('\n'.join(map(str, full_links)))
```
%% Cell type:code id: tags:
``` python
## Read from full_link_ids.txt
## Produces full_links as a list of ints, one per non-blank line of the file.
# Only the 'MultiplicativeAlgorithm' path is opened; the variant spelled with
# a space ('Multiplicative Algorithm') was a stale line and has been dropped.
with open('../MultiplicativeAlgorithm/full_link_ids.txt', 'r') as readfile:
    # Skip blank lines (e.g. a trailing newline) instead of failing in int('').
    full_links = [int(line.strip()) for line in readfile if line.strip()]
print(len(full_links))
```
%% Cell type:code id: tags:
``` python
## Separate traffic data for full_links from the big data file.
## Rows of D_2011.csv whose link id is in full_links are copied, unchanged,
## into D_2011_full_links.csv under the same header.
print('This message is a fail-safe. Comment out this line only if you know what you are doing.'); print(failsafe)
# The link id read from the file is text, whereas full_links holds ints when
# it comes from the reading cell; compare as strings. A set also makes each
# membership test constant-time instead of a scan of the whole list.
full_link_ids = {str(link) for link in full_links}
full_links_data = []
progress = 8761
with open('../MultiplicativeAlgorithm/D_2011.csv', 'r') as readfile:
    header = readfile.readline()
    for line in readfile:
        line = line.rstrip('\n')
        L, T, traveltimes, trips = line.split(',')
        if L in full_link_ids:
            full_links_data.append(line)
        # Progress report: print each time a smaller hour index is seen.
        if int(T) < progress:
            progress = int(T)
            print(progress)
with open('../MultiplicativeAlgorithm/D_2011_full_links.csv', 'w') as writefile:
    writefile.write(header)
    # Terminate every row, including the last, so that a reader which strips
    # one trailing character per line does not cut into the final value.
    writefile.writelines(row + '\n' for row in full_links_data)
```
%% Cell type:code id: tags:
``` python
## Write data to D_trips.txt and D_traveltimes.txt
## Both matrices have one row per hour (8760) and one column per full link;
## entry [T, L] accumulates the values of all rows for hour T and link L.
print('This message is a fail-safe. Comment out this line only if you know what you are doing.'); print(failsafe)
full_links = sorted(
    int(link) for link in link_data_count if link_data_count[link] >= 8760-721
)
# Drop the smallest id.
full_links = full_links[1:]
# Column lookup built once, replacing a linear list.index() call per row.
column_of = {link: column for column, link in enumerate(full_links)}
# Column count follows the link list (2302 in the recorded run).
D_trips = np.zeros((8760, len(full_links)))
D_traveltimes = np.zeros((8760, len(full_links)))
progress = 8761
with open('../MultiplicativeAlgorithm/D_2011_full_links.csv', 'r') as readfile:
    header = readfile.readline()
    for line in readfile:
        # Strip only the newline: the last line of the file may not have one,
        # and slicing off a character would then drop a digit of `trips`.
        L, T, traveltimes, trips = line.rstrip('\n').split(',')
        L, T, traveltimes, trips = column_of[int(L)], int(T), float(traveltimes), int(trips)
        D_trips[T, L] += trips
        D_traveltimes[T, L] += traveltimes
        # Progress report: print each time a smaller hour index is seen.
        if T < progress:
            progress = T
            print(progress)
# Entries still equal to zero are stored as NaN. np.zeros already yields
# float arrays, so no dtype conversion is needed before assigning NaN.
D_trips[D_trips == 0] = np.nan
D_traveltimes[D_traveltimes == 0] = np.nan
np.savetxt('../MultiplicativeAlgorithm/D_trips.txt', D_trips)
np.savetxt('../MultiplicativeAlgorithm/D_traveltimes.txt', D_traveltimes)
```
......
%% Cell type:markdown id: tags:
# Entries in $D$
%% Cell type:markdown id: tags:
In this notebook, we look at how many numerical entries there are in the $D$ matrix.
%% Cell type:code id: tags:
``` python
import numpy as np
```
%% Cell type:code id: tags:
``` python
ls ../DataFiles/
```
%% Output
full_link_ids.txt links.csv README.md
%% Cell type:code id: tags:
``` python
D = np.loadtxt('../MultiplicativeAlgorithm/D_trips.txt')
D.shape
```
%% Output
(8760, 2302)
%% Cell type:code id: tags:
``` python
# Percentage of entries of D that are numbers (i.e. not NaN).
# np.isnan works element-wise on the 2-D array, so no flattened copy of D is
# needed just to count.
non_nan = np.count_nonzero(~np.isnan(D))
total = D.size
non_nan/total*100
```
%% Output
97.25287024584539
%% Cell type:code id: tags:
``` python
total
```
%% Output
20165520
%% Cell type:code id: tags:
``` python
D2 = D.flatten()
```
%% Cell type:code id: tags:
``` python
x = D2[98]
```
%% Cell type:code id: tags:
``` python
np.isnan(x)
```
%% Output
True
%% Cell type:code id: tags:
``` python
np.nansum(D2)
```
%% Output
3548011030.0
%% Cell type:markdown id: tags:
# Link sizes
%% Cell type:markdown id: tags:
## This notebook computes the average sizes of all links in our dataset.
%% Cell type:markdown id: tags:
We read *links.csv*. This file contains data for all links (including links with missing entries).
%% Cell type:code id: tags:
``` python
import csv
import statistics
```
%% Cell type:code id: tags:
``` python
# Parse links.csv: the first row becomes `header`, every later row goes
# into `data` (each row is a list of strings).
with open('../DataFiles/links.csv', 'r') as csvfile:
    header, *data = csv.reader(csvfile)
# Show the column names together with their positions.
dict(enumerate(header))
```
%% Output
{0: 'link_id',
1: 'begin_node_id',
2: 'end_node_id',
3: 'begin_angle',
4: 'end_angle',
5: 'street_length',
6: 'osm_name',
7: 'osm_class',
8: 'osm_way_id',
9: 'startX',
10: 'startY',
11: 'endX',
12: 'endY',
13: 'osm_changeset',
14: 'birth_timestamp',
15: 'death_timestamp'}
%% Cell type:markdown id: tags:
---
%% Cell type:markdown id: tags:
We are interested in entry #5, i.e. `street_length`.
%% Cell type:code id: tags:
``` python
# Summary statistics of street_length (column 5) over every row of links.csv.
street_lengths = [float(row[5]) for row in data]  # str -> float
for label, summarise in (
    ('Mean =', statistics.mean),
    ('Median =', statistics.median),
    ('Mode =', statistics.mode),
    ('Std.Dev =', statistics.stdev),
    ('Minimum =', min),
    ('Maximum =', max),
    ('Total =', len),  # number of links, not a sum of lengths
):
    print(label, summarise(street_lengths))
```
%% Output
Mean = 132.98258543251998
Median = 95.569
Mode = 79.259
Std.Dev = 107.84111322501701
Minimum = 2.806
Maximum = 3937.115
Total = 260855
%% Cell type:markdown id: tags:
---
%% Cell type:markdown id: tags:
We now read data for only the 2302 links we have chosen for our analysis.
%% Cell type:code id: tags:
``` python
# Load the chosen link ids, one per line, as whitespace-stripped strings.
with open('../DataFiles/full_link_ids.txt', 'r') as txtfile:
    link_ids = list(map(str.strip, txtfile))
```
%% Cell type:code id: tags:
``` python
# Keep only the rows whose link_id (column 0) is one of the chosen links.
# A set makes each membership test constant-time; scanning the id list for
# every row of links.csv is needlessly slow.
chosen_ids = set(link_ids)
full_links = [row for row in data if row[0] in chosen_ids]
```
%% Cell type:code id: tags:
``` python
# Summary statistics of street_length (column 5) over the rows in full_links.
street_lengths = [float(row[5]) for row in full_links]  # str -> float
for label, summarise in (
    ('Mean =', statistics.mean),
    ('Median =', statistics.median),
    ('Mode =', statistics.mode),
    ('Std.Dev =', statistics.stdev),
    ('Minimum =', min),
    ('Maximum =', max),
    ('Total =', len),  # number of links, not a sum of lengths
):
    print(label, summarise(street_lengths))
```
%% Output
Mean = 106.40565682015638
Median = 79.981
Mode = 78.523
Std.Dev = 133.08834141783507
Minimum = 40.005
Maximum = 2676.248
Total = 2302
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment