# ---------------- Parameters ----------------
READ_TIME_HOURS = 3.5  # full reading time
SEARCH_COST_HOURS = 0.25  # discovery cost of getting a new book
PARTIAL_RATING_PENALTY = 0.05  # rating loss when abandoning, assumed utility loss linear
VALUE_BASE = 1.75  # utility = VALUE_BASE ** rating
2424# Quit levels
# I quit some books since mid or bored, not because I hated them
QUIT_AT_FRACTION = 0.15  # but this would vary a lot?
4848
49-
# Static: O(D*F)
F_GRID = np.concatenate(
    [
        np.arange(0.01, 0.4, 0.02),  # more precise in first half
        np.arange(0.4, 1.01, 0.1),  # less precise in second half, f=1 as temp hack for graphs
    ]
)
5756D_GRID = np .concatenate (
@@ -315,21 +314,54 @@ def optimise_schedule_greedy(
315314 return {"cur_drop" : best_cum_drop , "cutoffs" : best_cutoffs , "true_avg_utils" : true_avg_utils }
316315
317316
def avg_hours_reading(cur_drop):
    """Expected hours spent per book (reading + search) under an instant-drop schedule.

    Args:
        cur_drop: per-step instant drop probabilities aligned with ``F_GRID``.
            Either 1-D (one schedule) or 2-D with one simulation run per row,
            in which case the survival curve is averaged across runs.

    Returns:
        Expected hours for one book, including ``SEARCH_COST_HOURS``.
    """
    if cur_drop.ndim == 1:
        # fraction of books still being read after each step
        have = np.concatenate([[1], np.cumprod(1 - cur_drop)])
    else:
        # 2d, each row is a different run. Take average number remaining at each step
        have = np.concatenate([[1], np.mean(np.cumprod(1 - cur_drop, axis=1), axis=0)])
    # fraction of books dropped exactly at each step (decrease in survivors)
    n_dropped_at_step = list(-np.diff(have))
    assert np.isclose(F_GRID[-1], 1), "last fraction read is 1"
    assert len(n_dropped_at_step) == len(
        F_GRID
    ), f"n_dropped_at_step {len(n_dropped_at_step)}, F_GRID {len(F_GRID)}"
    # last entry absorbs the remaining mass: books read to completion
    n_dropped_at_step[-1] = 1 - np.sum(n_dropped_at_step[:-1])
    o = (
        np.sum([d * f * READ_TIME_HOURS for d, f in zip(n_dropped_at_step, F_GRID)])
        + SEARCH_COST_HOURS
    )
    assert o <= READ_TIME_HOURS + SEARCH_COST_HOURS, o
    return o
340+
def quit_u_h(df_cat: pd.DataFrame, rating_col: str) -> tuple:
    """Total hours and utility of books I never finished reading originally for category(s).

    If quitting at F=0.01 granted the full base utility, quitting many books
    quickly would look artificially good; instead utility is pro-rated:
    the hourly rate of a quit-level book read in full * hours actually read.

    Args:
        df_cat: finished books of the category; ``Bookshelf`` column drives counts.
        rating_col: rating column name; selects usefulness vs enjoyment quit level.

    Returns:
        (quit_u, quit_h): expected total utility and hours from quit books.
    """
    category_quit_counts = dict(df_cat["Bookshelf"].value_counts())
    # expected number of quits implied by the started:finished ratio per shelf
    expected_num_quit = sum(
        (STARTED_TO_FINISHED_RATIO[k] - 1) * v for k, v in category_quit_counts.items()
    )
    if rating_col == "Usefulness /5 to Me":
        quit_u_if_read = utility_value(QUIT_USEFULNESS)
    else:
        quit_u_if_read = utility_value(QUIT_ENJOYMENT)
    quit_hourly_rate = quit_u_if_read / (READ_TIME_HOURS + SEARCH_COST_HOURS)
    quit_h = expected_num_quit * (QUIT_AT_FRACTION * READ_TIME_HOURS + SEARCH_COST_HOURS)
    quit_u = quit_hourly_rate * quit_h
    return quit_u, quit_h
330359
331360
332- def current_hourly_u (df_cat : pd .DataFrame , rating_col : str ) -> float :
361+ def current_hourly_u (df_cat : pd .DataFrame , rating_col : str , cur_drop = None ) -> float :
362+ """Utility per hour of current reading habits, including books I never finished
363+ Can't include cur_drop since then utility depents on which specific books I drop
364+ """
333365 true_ratings_original = df_cat [rating_col ].values # Original ratings for the category, finished
334366 assert np .all (
335367 true_ratings_original >= 1
@@ -339,13 +371,22 @@ def current_hourly_u(df_cat: pd.DataFrame, rating_col: str) -> float:
339371 ), f"some books are rated above 5 { true_ratings_original } "
340372
341373 finished_u = np .sum (utility_value (true_ratings_original ))
342- finished_h = len (true_ratings_original ) * (READ_TIME_HOURS + SEARCH_COST_HOURS )
374+ if cur_drop is None :
375+ finished_h = len (true_ratings_original ) * (READ_TIME_HOURS + SEARCH_COST_HOURS )
376+ else :
377+ assert (
378+ False
379+ ), "Can't include cur_drop since then utility depents on which specific books I drop"
380+ finished_h = len (true_ratings_original ) * avg_hours_reading (cur_drop )
343381
344382 quit_u , quit_h = quit_u_h (df_cat , rating_col )
345383 hourly_u = (finished_u + quit_u ) / (finished_h + quit_h )
346384 return hourly_u
347385
348386
387+ # %%
388+
389+
349390# -------------- Wrapper per category --------------
350391def simulate_category (df_cat : pd .DataFrame , rating_col : str ) -> Dict [str , np .ndarray ]:
351392 """df_cat: dataframe of books FINISHED in category
@@ -367,7 +408,7 @@ def simulate_category(df_cat: pd.DataFrame, rating_col: str) -> Dict[str, np.nda
367408 all_true_utils = []
368409
369410 for i in range (N_SIM ):
370- if True : # j == 0 and i == 0:
411+ if False : # j == 0 and i == 0:
371412 # on first run, use original ratings to match emperical utility from real number of books dropped
372413 # this prevents error correction?
373414 bootstrapped_ratings = true_ratings_original
@@ -383,20 +424,15 @@ def simulate_category(df_cat: pd.DataFrame, rating_col: str) -> Dict[str, np.nda
383424 all_drop_paths .append (res ["cur_drop" ])
384425 all_cutoffs .append (res ["cutoffs" ])
385426 all_true_utils .append (res ["true_avg_utils" ])
386-
387- # keeps growing since as drop more books less time is spent but dont' account for that
388- new_baseline_u = np .percentile ([i [- 1 ] for i in all_true_utils ], 30 )
389- new_baseline_r = inverse_utility_value (new_baseline_u )
390- d = df_cat .copy ()
391- d [rating_col ] = new_baseline_r
392- hourly_avg_u = current_hourly_u (d , rating_col )
393- # hourly_avg_u = (new_baseline_u * len(all_true_utils) + quit_u) / (
394- # (READ_TIME_HOURS + SEARCH_COST_HOURS) + quit_h
395- # )
396-
397- print (hourly_avg_u , new_baseline_u , quit_u , quit_h , READ_TIME_HOURS + SEARCH_COST_HOURS )
398427 cur_drop_acc /= N_SIM
399428 cutoff_acc /= N_SIM
429+
430+ # slight bias in that we're saying we can do better than 20th of previous last run, (var from sampling)
431+ # but since we're held by quits that prevents from spiralying
432+ new_baseline_u = np .percentile ([i [- 1 ] for i in all_true_utils ], 20 )
433+ hourly_avg_u = (len (true_ratings_original ) * new_baseline_u + quit_u ) / (
434+ len (true_ratings_original ) * (READ_TIME_HOURS + SEARCH_COST_HOURS ) + quit_h
435+ )
400436 print ("end" , baseline_hourly_u , hourly_avg_u , np .mean (cur_drop_acc ), np .mean (cutoff_acc ))
401437 baseline_hourly_u = hourly_avg_u
402438
@@ -555,13 +591,76 @@ def plot_simulation_paths(
555591 plt .show ()
556592
557593
# ---------------- Result Printing Functions ----------------
def print_drop_schedule_table(
    shelf_name: str,
    f_grid: np.ndarray,
    avg_instant_drops: np.ndarray,
    avg_cumulative_drop: np.ndarray,
    milestone_indices: List[int],
):
    """Print the average drop schedule for a shelf, where each step was greedily optimised.

    Rows are printed at every 10th grid point plus the milestone indices.
    """
    print(f"\nAverage Optimal Drop Schedule for: {shelf_name}")
    print(f"{'Fraction Read':>12} {'Avg Instant Drop %':>20} {'Avg Cumulative Drop %':>25}")
    print("-" * 60)
    for i, (f_val, avg_drop_val, avg_cum_drop_val) in enumerate(
        zip(f_grid, avg_instant_drops, avg_cumulative_drop)
    ):
        if i in milestone_indices or i % 10 == 0:  # print every 10th point plus milestones
            print(f"{f_val:>12.2f} {avg_drop_val * 100:>20.2f} {avg_cum_drop_val * 100:>25.2f}")
616+
def print_all_shelves_summary(
    shelves: List[str],
    out_results: Dict,
    f_grid: np.ndarray,
    milestone_indices: List[int],
    target_fractions: List[float],
    df_all_books: pd.DataFrame,
    rating_col_name: str,
):
    """Print a summary of simulation results across all shelves.

    For each shelf: cumulative drop at each milestone fraction, then median/mean
    simulated final utilities alongside the empirical current utility, all also
    converted back to ratings (convex fn, so must be computed on the same scale).
    """
    print("\n\n" + "=" * 30 + " FINAL SUMMARY " + "=" * 30)
    for shelf in shelves:
        if shelf not in out_results:
            continue
        shelf_data = out_results[shelf]
        print(f"\nSummary for: {shelf}")
        print(f"{'Fraction Read':>12} {'Cumulative Drop %':>20}")
        print("-" * 50)
        for _target, idx in zip(target_fractions, milestone_indices):
            print(f"{f_grid[idx]:>12.2f} {shelf_data['cumulative_drop'][idx]*100:>20.2f}")
        print(f"Final cumulative drop: {shelf_data['cumulative_drop'][-1]*100:.1f}%")

        # Compute the median-path fallback lazily: dict.get(key, default) would
        # evaluate the argsort even when median_idx is already stored.
        if "median_idx" in shelf_data:
            median_idx = shelf_data["median_idx"]
        else:
            final_utils = shelf_data["true_avg_utils"][:, -1]
            median_idx = np.argsort(final_utils)[len(final_utils) // 2]

        median_u = shelf_data["true_avg_utils"][median_idx, -1]
        median_r = inverse_utility_value(median_u)
        mean_u = shelf_data["true_avg_utils"][:, -1].mean()
        mean_r = inverse_utility_value(mean_u)

        current_shelf_df = df_all_books[df_all_books["Bookshelf"] == shelf]
        current_u = utility_value(current_shelf_df[rating_col_name]).mean()
        current_r = inverse_utility_value(current_u)

        print(f"Median Final Utility (simulated): {median_u:.2f} (Rating: {median_r:.2f})")
        print(f"Mean Final Utility (simulated): {mean_u:.2f} (Rating: {mean_r:.2f})")
        print(f"Current Avg Utility (empirical): {current_u:.2f} (Rating: {current_r:.2f})")
657+
558658# ---------------- Main ----------------
559659if __name__ == "__main__" :
560- # Then run the main simulation
660+ rating_col = "Usefulness /5 to Me"
561661 DATA_PATH = Path ("data/Books Read and their effects - Play Export.csv" )
562662 if not DATA_PATH .exists ():
563663 print ("CSV not found – replace DATA_PATH with your local file path." )
564- exit ()
565664 df = pd .read_csv (DATA_PATH )
566665 df ["Bookshelf" ] = df ["Bookshelf" ].str .strip ().str .replace ("/" , "," ).str .replace ("&" , "and" )
567666
@@ -572,65 +671,42 @@ def plot_simulation_paths(
572671 # Find indices closest to 10%, 30%, and 50% of reading
573672 target_fractions = [0.1 , 0.3 , 0.5 ]
574673 milestone_indices = [np .abs (F_GRID - target ).argmin () for target in target_fractions ]
575- rating_col = "Usefulness /5 to Me"
674+
576675 for shelf in shelves :
577676 sub = df [df ["Bookshelf" ] == shelf ]
578677 if sub .empty :
579678 continue
580679 out [shelf ] = simulate_category (sub , rating_col )
581680
582- print (f"\n { '=' * 80 } " )
583- print (f"Optimising schedule for: { shelf } " )
584- print (f"{ '=' * 80 } " )
585-
586681 # Get the optimal path (path with highest final utility)
587- optimal_idx = np .argmax (out [shelf ]["true_avg_utils" ][:, - 1 ])
588- out [shelf ]["optimal_drops" ] = out [shelf ]["cur_drop_path" ][optimal_idx ]
589- # Get the median path
590- median_idx = np .argsort (out [shelf ]["true_avg_utils" ][:, - 1 ])[
591- len (out [shelf ]["true_avg_utils" ]) // 2
592- ]
593- out [shelf ]["median_drops" ] = out [shelf ]["cur_drop_path" ][median_idx ]
594- out [shelf ]["cumulative_drop" ] = 1 - np .cumprod (1 - out [shelf ]["optimal_drops" ])
595-
596- print ("\n Optimal Drop Schedule:" )
597- print (f"{ 'Fraction Read' :>12} { 'Instant Drop %' :>15} { 'Cumulative Drop %' :>20} " )
598- print ("-" * 50 )
599- for i , (f , drop , cum_drop ) in enumerate (
600- zip (F_GRID , out [shelf ]["optimal_drops" ], out [shelf ]["cumulative_drop" ])
601- ):
602- if i in milestone_indices or i % 10 == 0 : # Print every 10th point plus milestones
603- print (f"{ f :>12.2f} { drop * 100 :>15.2f} { cum_drop * 100 :>20.2f} " )
682+ shelf_results = out [shelf ]
683+ # Calculate average cummulative remaining at each fraction read
684+ shelf_results ["avg_cumulative_drop" ] = 1 - np .mean (
685+ np .cumprod (1 - shelf_results ["cur_drop_path" ], axis = 1 ), axis = 0
686+ )
687+ # dont care about the specific path that got a specific utility, but what's best drop path
688+
689+ print_drop_schedule_table (
690+ shelf_name = shelf ,
691+ f_grid = F_GRID ,
692+ avg_instant_drops = shelf_results ["cur_drop_path" ],
693+ avg_cumulative_drop = shelf_results ["avg_cumulative_drop" ],
694+ milestone_indices = milestone_indices ,
695+ )
604696
605697 # Plot simulation paths for this shelf
606698 plot_simulation_paths (
607- out [ shelf ] ["cur_drop_path" ],
699+ shelf_results ["cur_drop_path" ],
608700 F_GRID ,
609- out [ shelf ] ["true_avg_utils" ],
610- out [ shelf ] ["cutoffs_all" ],
701+ shelf_results ["true_avg_utils" ],
702+ shelf_results ["cutoffs_all" ],
611703 f"Simulation Paths - { shelf } " ,
612704 )
705+ print_all_shelves_summary (
706+ shelves , out , F_GRID , milestone_indices , target_fractions , df , rating_col
707+ )
708+
613709
614- for shelf in shelves :
615- print (f"\n { shelf } " )
616- print (f"{ 'Fraction Read' :>12} { 'Cumulative Drop %' :>20} " )
617- print ("-" * 50 )
618- for target , idx in zip (target_fractions , milestone_indices ):
619- print (f"{ F_GRID [idx ]:>12.2f} { out [shelf ]['cumulative_drop' ][idx ]* 100 :>20.2f} " )
620- print (f"Final cumulative drop: { out [shelf ]['cumulative_drop' ][- 1 ]* 100 :.1f} %" )
621- best_u = out [shelf ]["true_avg_utils" ][optimal_idx , - 1 ]
622- best_r = inverse_utility_value (best_u )
623- median_idx = np .argsort (out [shelf ]["true_avg_utils" ][:, - 1 ])[
624- len (out [shelf ]["true_avg_utils" ]) // 2
625- ]
626- median_u = out [shelf ]["true_avg_utils" ][median_idx , - 1 ]
627- median_r = inverse_utility_value (median_u )
628- current_u = utility_value (df [df ["Bookshelf" ] == shelf ][rating_col ]).mean ()
629- current_r = inverse_utility_value (
630- current_u
631- ) # convex fn so must be calculated in the same way
632- print (f"Final utility: { median_u :.2f} , current: { current_u :.2f} " )
633- print (f"Final Rating: { median_r :.2f} , current: { current_r :.2f} " )
634710# %%
635711# Dynamic where check all options: D^F: 100M here at 8**9
636712F_GRID = np .concatenate (
0 commit comments