Skip to content

Commit ad1f092

Browse files
authored
feat: Support analysis over multiple service days (#9)
* feat: Support analysis over multiple service days
* Update notebooks to support analysis over multiple service days
* fix: Use aliased module name
* Add more thorough documentation of output tables
1 parent 3dfe833 commit ad1f092

File tree

10 files changed: 386 additions (+386) and 563 deletions (-563).

lib/transit_data/glides_report/departure.ex

Lines changed: 10 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -6,27 +6,26 @@ defmodule TransitData.GlidesReport.Departure do
66

77
alias TransitData.GlidesReport.Spec.Common
88
alias TransitData.GlidesReport.Terminal
9-
alias TransitData.GlidesReport.Util
109

1110
@type t :: %__MODULE__{
1211
trip: Common.trip_id(),
1312
terminal: Terminal.id(),
1413
timestamp: Common.timestamp(),
15-
# Hour part of the timestamp (in Eastern TZ)
16-
hour: 0..23,
17-
# Minute part of the timestamp
18-
minute: 0..59
14+
# `timestamp` as a DateTime in Eastern time, truncated to minutes
15+
local_dt: DateTime.t()
1916
}
2017

21-
@type minute :: 0..59
22-
23-
@enforce_keys [:trip, :terminal, :timestamp, :hour, :minute]
18+
@enforce_keys [:trip, :terminal, :timestamp, :local_dt]
2419
defstruct @enforce_keys
2520

2621
@spec new(Common.trip_id(), Terminal.id(), Common.timestamp()) :: t()
2722
def new(trip, {:terminal, _} = terminal, timestamp) do
28-
hour = Util.unix_timestamp_to_local_hour(timestamp)
29-
minute = Util.unix_timestamp_to_local_minute(timestamp)
30-
%__MODULE__{trip: trip, terminal: terminal, timestamp: timestamp, hour: hour, minute: minute}
23+
local_dt =
24+
timestamp
25+
|> DateTime.from_unix!()
26+
|> DateTime.shift_zone!("America/New_York")
27+
|> then(&%{&1 | second: 0, microsecond: {0, 0}})
28+
29+
%__MODULE__{trip: trip, terminal: terminal, timestamp: timestamp, local_dt: local_dt}
3130
end
3231
end

lib/transit_data/glides_report/loader.ex

Lines changed: 96 additions & 49 deletions
Original file line number | Diff line number | Diff line change
@@ -10,17 +10,12 @@ defmodule TransitData.GlidesReport.Loader do
1010

1111
@doc """
1212
Loads data into ETS tables, and returns counts of files found locally vs downloaded.
13-
14-
`start_dt` and `end_dt` are NaiveDateTimes assumed to be in UTC.
1513
"""
16-
@spec load_data(
17-
NaiveDateTime.t(),
18-
NaiveDateTime.t(),
19-
String.t(),
20-
pos_integer,
21-
pos_integer | :all
22-
) :: %{local: non_neg_integer, downloaded: non_neg_integer}
23-
def load_data(start_dt, end_dt, env_suffix, sample_rate, sample_count) do
14+
@spec load_data(Date.t(), Date.t(), String.t(), pos_integer, pos_integer | :all) :: %{
15+
local: non_neg_integer,
16+
downloaded: non_neg_integer
17+
}
18+
def load_data(start_date, end_date, env_suffix, sample_rate, sample_count) do
2419
dir = local_dir(env_suffix)
2520

2621
IO.puts(
@@ -35,17 +30,24 @@ defmodule TransitData.GlidesReport.Loader do
3530

3631
s3_bucket = "mbta-gtfs-s3#{env_suffix}"
3732

38-
start_dt_utc = DateTime.from_naive!(start_dt, "Etc/UTC")
39-
end_dt_utc = DateTime.from_naive!(end_dt, "Etc/UTC")
33+
start_dt =
34+
DateTime.new!(start_date, ~T[04:00:00], "America/New_York")
35+
|> DateTime.shift_zone!("Etc/UTC")
4036

41-
total_minutes = DateTime.diff(end_dt_utc, start_dt_utc, :minute)
42-
total_increments = div(total_minutes, sample_rate)
37+
end_dt =
38+
end_date
39+
# The service day ends on the following calendar date.
40+
|> Date.shift(day: 1)
41+
|> DateTime.new!(~T[03:59:59], "America/New_York")
42+
|> DateTime.shift_zone!("Etc/UTC")
43+
44+
total_minutes = DateTime.diff(end_dt, start_dt, :minute)
4345

4446
# Prefixes used to list S3 objects timestamped within the same minute.
4547
minute_prefixes =
46-
Enum.map(0..total_increments, fn increment ->
47-
start_dt_utc
48-
|> DateTime.add(increment * sample_rate, :minute)
48+
Enum.map(0..total_minutes//sample_rate, fn increment ->
49+
start_dt
50+
|> DateTime.add(increment, :minute)
4951
|> Calendar.strftime("%Y/%m/%d/%Y-%m-%dT%H:%M")
5052
end)
5153

@@ -68,27 +70,19 @@ defmodule TransitData.GlidesReport.Loader do
6870
end
6971

7072
# Loads data into a table.
71-
# Returns the number of files that were found locally,
72-
# the number that were newly downloaded,
73-
# and a list of minutes for which not enough data could be found.
73+
# Returns the number of files that were found locally and the number
74+
# that were newly downloaded:
75+
# %{local: integer, downloaded: integer}
7476
defp populate_table(table_name, path_prefixes, s3_bucket, sample_count) do
7577
IO.puts("Loading #{table_name}...")
7678

79+
prefix_count = length(path_prefixes)
80+
7781
{total, insufficients} =
7882
path_prefixes
79-
|> Stream.with_index(fn prefix, i ->
80-
IO.write([
81-
IO.ANSI.clear_line(),
82-
"\r",
83-
moons_of_progress()[rem(i, 8)],
84-
" Loading data for ",
85-
prefix_to_local_minute(prefix)
86-
])
87-
88-
prefix
89-
end)
83+
|> Stream.with_index(&update_progress(&1, &2, prefix_count))
9084
|> Task.async_stream(
91-
&load_minute(&1, s3_bucket, table_name, sample_count),
85+
fn prefix -> load_minute(prefix, s3_bucket, table_name, sample_count) end,
9286
ordered: false,
9387
timeout: 60_000
9488
)
@@ -101,7 +95,7 @@ defmodule TransitData.GlidesReport.Loader do
10195

10296
insufficients =
10397
if is_integer(sample_count) and counts.local + counts.downloaded < sample_count,
104-
do: [prefix_to_local_minute(counts.prefix) | insufficients],
98+
do: [prefix_to_local_dt(counts.prefix) | insufficients],
10599
else: insufficients
106100

107101
{total, insufficients}
@@ -110,20 +104,8 @@ defmodule TransitData.GlidesReport.Loader do
110104
IO.puts("#{IO.ANSI.clear_line()}\r🌝 Done")
111105

112106
unless Enum.empty?(insufficients) do
113-
time_ranges =
114-
insufficients
115-
|> Enum.sort()
116-
|> Enum.split_while(&(&1 < "04"))
117-
|> then(fn {after_midnight_service_day, service_day} ->
118-
service_day ++ after_midnight_service_day
119-
end)
120-
|> Stream.map(&Time.from_iso8601!(&1 <> ":00"))
121-
# Chunk the individual times into ranges of consecutive times for better human readability.
122-
|> Stream.chunk_while(nil, &chunk_time_ranges/2, &{:cont, hh_mm_range(&1), nil})
123-
|> Stream.reject(&is_nil/1)
124-
|> Enum.join(", ")
125-
126-
IO.puts("#{table_name}: Insufficient data available for minute(s): #{time_ranges}")
107+
time_ranges = datetimes_to_time_ranges(insufficients)
108+
IO.puts("#{table_name}: Insufficient data available for minute(s):\n#{time_ranges}")
127109
end
128110

129111
IO.puts("")
@@ -139,11 +121,60 @@ defmodule TransitData.GlidesReport.Loader do
139121
"""x
140122
end
141123

124+
defp update_progress(prefix, i, total) do
125+
pct =
126+
(100 * i / total)
127+
|> trunc()
128+
|> Integer.to_string()
129+
|> String.pad_leading(3)
130+
131+
IO.write([
132+
IO.ANSI.clear_line(),
133+
"\r",
134+
moons_of_progress()[rem(i, 8)],
135+
" Loading data for ",
136+
Calendar.strftime(prefix_to_local_dt(prefix), "%x %H:%M"),
137+
" ",
138+
pct,
139+
"%"
140+
])
141+
142+
prefix
143+
end
144+
142145
# 🌝
143146
defp moons_of_progress do
144147
%{0 => "🌕", 1 => "🌖", 2 => "🌗", 3 => "🌘", 4 => "🌑", 5 => "🌒", 6 => "🌓", 7 => "🌔"}
145148
end
146149

150+
@doc ~S'''
151+
Returns a human-readable string describing a list of minute-granularity
152+
local-timezone DateTimes as comma-separated time ranges.
153+
154+
iex> [~U[2025-01-01T18:00:00Z], ~U[2025-01-01T18:01:00Z], ~U[2025-01-02T08:03:00Z], ~U[2025-01-02T12:00:00Z]]
155+
...> |> Enum.map(&DateTime.shift_zone!(&1, "America/New_York"))
156+
...> |> datetimes_to_time_ranges()
157+
"""
158+
• 2025-01-01: 13:00-13:01, 03:03
159+
• 2025-01-02: 07:00\
160+
"""
161+
'''
162+
def datetimes_to_time_ranges(datetimes) do
163+
datetimes
164+
|> Enum.sort(DateTime)
165+
|> Stream.chunk_by(&service_day/1)
166+
|> Stream.map(fn dts ->
167+
time_ranges =
168+
dts
169+
|> Stream.map(fn %DateTime{time_zone: "America/New_York"} = dt -> DateTime.to_time(dt) end)
170+
|> Stream.chunk_while(nil, &chunk_time_ranges/2, &{:cont, hh_mm_range(&1), nil})
171+
|> Enum.join(", ")
172+
173+
"• #{service_day(hd(dts))}: #{time_ranges}"
174+
end)
175+
|> Enum.join("\n")
176+
end
177+
147178
defp chunk_time_ranges(time, nil) do
148179
# This is the first time in the list.
149180
{:cont, {time, time}}
@@ -182,11 +213,10 @@ defmodule TransitData.GlidesReport.Loader do
182213

183214
defp hh_mm(time), do: Calendar.strftime(time, "%H:%M")
184215

185-
defp prefix_to_local_minute(prefix) do
216+
defp prefix_to_local_dt(prefix) do
186217
prefix
187218
|> prefix_to_dt()
188219
|> DateTime.shift_zone!("America/New_York")
189-
|> Calendar.strftime("%H:%M")
190220
end
191221

192222
defp prefix_to_dt(prefix) do
@@ -199,6 +229,23 @@ defmodule TransitData.GlidesReport.Loader do
199229
dt
200230
end
201231

232+
def service_day(%{time_zone: "America/New_York"} = dt) do
233+
# Service day starts at 4am.
234+
# Times before that on a calendar date are part of the previous
235+
# calendar date's service day.
236+
if dt.hour >= 4 do
237+
DateTime.to_date(dt)
238+
else
239+
dt |> DateTime.to_date() |> Date.add(-1)
240+
end
241+
end
242+
243+
def service_day(%DateTime{} = dt) do
244+
dt
245+
|> DateTime.shift_zone!("America/New_York")
246+
|> service_day()
247+
end
248+
202249
# Loads data for a specific minute of the service day, either by reading existing local files,
203250
# by downloading and reading new files from S3, or a mix of both.
204251
#

lib/transit_data/glides_report/settings/load.ex

Lines changed: 9 additions & 37 deletions
Original file line number | Diff line number | Diff line change
@@ -5,32 +5,20 @@ defmodule TransitData.GlidesReport.Settings.Load do
55

66
@type t :: %__MODULE__{
77
env_suffix: String.t(),
8-
start_dt: DateTime.t(),
9-
end_dt: DateTime.t(),
10-
sample_rate: integer,
11-
sample_count: integer | :all
8+
start_date: Date.t(),
9+
end_date: Date.t(),
10+
sample_rate: pos_integer,
11+
sample_count: pos_integer | :all
1212
}
1313

14-
defstruct [
15-
:env_suffix,
16-
:start_dt,
17-
:end_dt,
18-
:sample_rate,
19-
:sample_count
20-
]
14+
defstruct [:env_suffix, :start_date, :end_date, :sample_rate, :sample_count]
2115

22-
@spec new(String.t(), Date.t(), integer, integer | nil) :: t()
23-
def new(env, date, sample_rate, samples_per_minute) do
24-
{start_dt, end_dt} = date_to_start_end_dt(date)
25-
new(env, start_dt, end_dt, sample_rate, samples_per_minute)
26-
end
27-
28-
@spec new(String.t(), DateTime.t(), DateTime.t(), integer, integer | nil) :: t()
29-
def new(env, start_dt, end_dt, sample_rate, samples_per_minute) do
16+
@spec new(String.t(), Date.t(), Date.t(), integer, integer | nil) :: t()
17+
def new(env, start_date, end_date, sample_rate, samples_per_minute) do
3018
%__MODULE__{
3119
env_suffix: env,
32-
start_dt: start_dt,
33-
end_dt: end_dt,
20+
start_date: start_date,
21+
end_date: end_date,
3422
sample_rate: sample_rate |> trunc(),
3523
sample_count:
3624
case samples_per_minute do
@@ -39,20 +27,4 @@ defmodule TransitData.GlidesReport.Settings.Load do
3927
end
4028
}
4129
end
42-
43-
defp date_to_start_end_dt(date) do
44-
# We assume they want a full service day on the given date.
45-
start_dt =
46-
date
47-
|> DateTime.new!(~T[04:00:00], "America/New_York")
48-
|> DateTime.shift_zone!("Etc/UTC")
49-
50-
end_dt =
51-
date
52-
|> Date.add(1)
53-
|> DateTime.new!(~T[03:59:59], "America/New_York")
54-
|> DateTime.shift_zone!("Etc/UTC")
55-
56-
{start_dt, end_dt}
57-
end
5830
end

lib/transit_data/glides_report/util.ex

Lines changed: 5 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -100,22 +100,6 @@ defmodule TransitData.GlidesReport.Util do
100100
"#{p}%"
101101
end
102102

103-
@spec unix_timestamp_to_local_hour(integer) :: 0..23
104-
def unix_timestamp_to_local_hour(timestamp) do
105-
unix_timestamp_to_local_datetime(timestamp).hour
106-
end
107-
108-
@spec unix_timestamp_to_local_hour(integer) :: 0..59
109-
def unix_timestamp_to_local_minute(timestamp) do
110-
unix_timestamp_to_local_datetime(timestamp).minute
111-
end
112-
113-
defp unix_timestamp_to_local_datetime(timestamp) do
114-
timestamp
115-
|> DateTime.from_unix!()
116-
|> DateTime.shift_zone!("America/New_York")
117-
end
118-
119103
@doc """
120104
Returns /absolute/path/to/transit_data_reports/dataset.
121105
"""
@@ -154,8 +138,8 @@ defmodule TransitData.GlidesReport.Util do
154138
def build_csv_name(table_name, loader_settings, filter_settings) do
155139
%{
156140
env_suffix: env_suffix,
157-
start_dt: start_dt,
158-
end_dt: end_dt,
141+
start_date: start_date,
142+
end_date: end_date,
159143
sample_rate: sample_rate,
160144
sample_count: sample_count
161145
} = loader_settings
@@ -168,12 +152,9 @@ defmodule TransitData.GlidesReport.Util do
168152
env = if env_suffix == "", do: "prod", else: String.slice(env_suffix, 1..-1//1)
169153

170154
dt_range =
171-
[start_dt, end_dt]
172-
|> Enum.map(&DateTime.shift_zone!(&1, "America/New_York"))
173-
|> Enum.map_join(
174-
"-",
175-
&(&1 |> DateTime.shift_zone!("America/New_York") |> Calendar.strftime("%xT%H:%M"))
176-
)
155+
if Date.compare(start_date, end_date) == :eq,
156+
do: "#{start_date}",
157+
else: "#{start_date} to #{end_date}"
177158

178159
terminals_filter =
179160
Enum.find_value(Terminal.labeled_terminal_groups(), fn {_id, label, terminals_in_group} ->

mix.exs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -38,7 +38,7 @@ defmodule TransitData.MixProject do
3838
{:ex_aws_s3, "~> 2.5"},
3939
{:hackney, "~> 1.20"},
4040
{:jaxon, "~> 2.0"},
41-
{:kino, "~> 0.12.0"},
41+
{:kino, "~> 0.14.2"},
4242
{:stream_gzip, "~> 0.4.2"},
4343
{:sweet_xml, "~> 0.7.4"},
4444
# TEST-ENV DEPS

0 commit comments

Comments
 (0)