11# %%
2-
2+ # first cell computes the stopping fraction given the assumed utility function, estimates, etc
33import pandas as pd
44import numpy as np
55import os
@@ -329,21 +329,40 @@ def utility_value(r):
329329 return VALUE_BASE ** r
330330
331331
332- def error_sigma ( f ):
333- """Error half‑width at progress f of uniform noise """
334- return max ( 2 - 2 * f , 0.3 ) # ±2.5 at start → ±0.5 by halfway
def inverse_utility_value(u):
    """Transform a utility back into a rating (inverse of utility_value)."""
    log_base = np.log(VALUE_BASE)
    return np.log(u) / log_base
335335
336336
# Not sure which to go with
def utility_value2(r):
    """Alternative utility form: the rating raised to the VALUE_BASE power."""
    exponent = VALUE_BASE
    return r ** exponent
340340
341341
def inverse_utility_value2(u):
    """Recover a rating from a utility (inverse of utility_value2)."""
    root = 1 / VALUE_BASE
    return u ** root
344+
345+
# Calibration notes for the noise model (assumed to have power-law form):
# - had 4/207 books I'd say were "average" at 1/3 (or 1/2 way thru?) and 5 at end;
#   z=2.07 and mean of 2.5 implies at half way sd=(5-2.5)/z = 1.21, or 1.45 if mean=2
# - at f=0.95 I'd say 0.25 is correct noise; my own ratings aren't even that good
# - at f=0, 2.25 is correct noise; I generally have a bit of info after deciding
#   to pick a book
# - and need to adjust for general range restrictions around means?
# Solving 1.21 = 2 * ((1 - 1/3) ** k) + 0.25 for the exponent k:
#   np.log((1.21 - 0.25) / 2) / np.log(0.667) = 1.8
#   np.log((1.21 - 0.25) / 2) / np.log(0.5)   = 1.06
def error_sigma(f):
    """Error half-width at progress f of uniform noise."""
    floor = 0.25
    decay = (1 - f) ** 1.8
    return 2 * decay + floor
    # return max(1 - 2 * f, 0.3)  # earlier alternative: linear decay with a 0.3 floor
359+
360+
def error_sigma2(f):
    """Alternative noise schedule: half-width shrinks as 1 - sqrt(f)."""
    sqrt_progress = f ** 0.5
    return 1 - sqrt_progress
344363
345364
346- def simulate_estimates (true_ratings : np .ndarray ) -> np .ndarray :
365+ def simulate_estimates (true_ratings : np .ndarray , error_fn = error_sigma , rho = 0.9 ) -> np .ndarray :
347366 """
348367 Simulate noisy rating estimates for each book × each f in F_GRID using an AR(1) process.
349368 The noise variance decreases as reading progress increases.
@@ -356,8 +375,8 @@ def simulate_estimates(true_ratings: np.ndarray) -> np.ndarray:
356375 """
357376 n_books = true_ratings .shape [0 ]
358377 est = np .zeros ((n_books , len (F_GRID )))
359- sigmas = np .array ([error_sigma (f ) for f in F_GRID ])
360- rho = 0.9 # autocorrelation coefficient
378+ sigmas = np .array ([error_fn (f ) for f in F_GRID ])
379+ # autocorrelation coefficient
361380
362381 # Generate all random numbers upfront for each book
363382 # This ensures each book has its own independent sequence
@@ -367,7 +386,7 @@ def simulate_estimates(true_ratings: np.ndarray) -> np.ndarray:
367386 e_prev = np .random .uniform (- sigmas [0 ], sigmas [0 ], size = n_books )
368387 est [:, 0 ] = np .clip (true_ratings + e_prev , 1 , 5 )
369388
370- # Subsequent errors with autocorrelation
 370389+     # Subsequent errors with autocorrelation
371390 for j in range (1 , len (F_GRID )):
372391 # Scale previous error by rho
373392 scale_factor = sigmas [j ] / sigmas [j - 1 ] if sigmas [j - 1 ] > 0 else 0
@@ -405,7 +424,8 @@ def _check_error_graph():
405424
406425# ---------------- Core optimiser ----------------
407426def optimise_schedule_greedy (
408- true_ratings : np .ndarray , hourly_opportunity = None
427+ true_ratings : np .ndarray ,
428+ hourly_opportunity = utility_value (2 ) / (READ_TIME_HOURS + SEARCH_COST_HOURS ),
409429) -> Dict [str , np .ndarray ]:
410430 """
411431 Using greedy approach at each time step find the drop fraction of estimated books that maximize
@@ -502,7 +522,10 @@ def optimise_schedule_greedy(
502522 if total_u > best_u :
503523 best_u = total_u
504524 best_drop = d
505- best_rating_cut = est_now [active_mask ][sort_idx [k_drop - 1 ]]
525+ if active_mask .sum () >= k_drop :
526+ best_rating_cut = est_now [active_mask ][sort_idx [k_drop - 1 ]]
527+ else :
528+ best_rating_cut = 5 # dropping more books than have left
506529 # util of now dropped books plus util of replacing them with hourly opportunity
507530 best_drop_u = (
508531 h_util_drop [~ keep_mask ].sum () * book_time + dropped_books_utils [idx_f - 1 ]
@@ -702,18 +725,19 @@ def plot_simulation_paths(
702725 df = pd .read_csv (DATA_PATH )
703726 df ["Bookshelf" ] = df ["Bookshelf" ].str .strip ().str .replace ("/" , "," ).str .replace ("&" , "and" )
704727
705- shelves = df ["Bookshelf" ].unique ()
728+ shelf_counts = df ["Bookshelf" ].value_counts ()
729+ shelves = shelf_counts [shelf_counts > 10 ].index .tolist ()
706730 out = {}
707731
708732 # Find indices closest to 10%, 30%, and 50% of reading
709733 target_fractions = [0.1 , 0.3 , 0.5 ]
710734 milestone_indices = [np .abs (F_GRID - target ).argmin () for target in target_fractions ]
711-
735+ rating_col = "Usefulness /5 to Me"
712736 for shelf in shelves :
713737 sub = df [df ["Bookshelf" ] == shelf ]
714738 if sub .empty :
715739 continue
716- out [shelf ] = simulate_category (sub , "Usefulness /5 to Me" )
740+ out [shelf ] = simulate_category (sub , rating_col )
717741
718742 print (f"\n { '=' * 80 } " )
719743 print (f"Optimising schedule for: { shelf } " )
@@ -748,7 +772,14 @@ def plot_simulation_paths(
748772 for target , idx in zip (target_fractions , milestone_indices ):
749773 print (f"{ F_GRID [idx ]:>12.2f} { out [shelf ]['cumulative_drop' ][idx ]* 100 :>20.2f} " )
750774 print (f"Final cumulative drop: { out [shelf ]['cumulative_drop' ][- 1 ]* 100 :.1f} %" )
751- print (f"Final utility: { out [shelf ]['true_avg_utils' ][optimal_idx , - 1 ]:.2f} " )
775+ best_u = out [shelf ]["true_avg_utils" ][optimal_idx , - 1 ]
776+ best_r = inverse_utility_value (best_u )
777+ current_u = utility_value (df [df ["Bookshelf" ] == shelf ][rating_col ]).mean ()
778+ current_r = inverse_utility_value (
779+ current_u
780+ ) # convex fn so must be calculated in the same way
781+ print (f"Final utility: { best_u :.2f} , current: { current_u :.2f} " )
782+ print (f"Final Rating: { best_r :.2f} , current: { current_r :.2f} " )
752783# %%
753784# Dynamic where check all options: D^F: 100B here
754785F_GRID = np .concatenate (
0 commit comments