# Source: program-researcher/state.py (MyFriendBen/program-researcher, main branch)
"""
State definitions for the Program Research Agent.

This module defines the Pydantic models (including the main ``ResearchState``
graph state) and the reducer helpers used throughout the LangGraph workflow.
"""

from datetime import date
from enum import Enum
from typing import Annotated, Any, Literal

from pydantic import BaseModel, Field


# -----------------------------------------------------------------------------
# Enums
# -----------------------------------------------------------------------------


class LinkCategory(str, Enum):
    """Categories for discovered documentation links."""

    # The ``str`` mix-in makes every member a real string equal to its value,
    # so a member can be compared against (or used as) its label text.
    # This enum is the type of ``LinkCatalogEntry.category``.
    OFFICIAL_PROGRAM = "Official Program"
    LEGISLATION = "Legislation"
    REGULATION = "Regulation"
    APPLICATION = "Application"
    RESEARCH = "Research"
    NAVIGATOR = "Navigator"


class ImpactLevel(str, Enum):
    """Impact level for data gaps or issues."""

    # ``str`` mix-in: members compare equal to their capitalised label.
    # Used as the type of ``EligibilityCriterion.impact`` (default MEDIUM).
    HIGH = "High"
    MEDIUM = "Medium"
    LOW = "Low"


class IssueSeverity(str, Enum):
    """Severity levels for QA issues."""

    # ``str`` mix-in: members compare equal to their lowercase value.
    # Used as the type of ``QAIssue.severity``.
    CRITICAL = "critical"
    MAJOR = "major"
    MINOR = "minor"


class WorkflowStatus(str, Enum):
    """Overall workflow status."""

    # ``str`` mix-in: members compare equal to their snake_case value.
    # Used as the type of ``ResearchState.status``, whose default is
    # IN_PROGRESS.
    IN_PROGRESS = "in_progress"
    COMPLETED = "completed"
    FAILED = "failed"
    MAX_ITERATIONS_REACHED = "max_iterations_reached"
    AWAITING_INPUT = "awaiting_input"


# -----------------------------------------------------------------------------
# Step 1: Link Catalog Models
# -----------------------------------------------------------------------------


class LinkCatalogEntry(BaseModel):
    """A single link discovered during research."""

    # Required fields (no default): they must be supplied for every entry.
    category: LinkCategory = Field(description="Category of the link")
    title: str = Field(description="Descriptive title for the link")
    # Declared as a plain ``str``; no URL-format validation is applied here.
    url: str = Field(description="Full URL")
    source_type: str = Field(description="Type of source (Federal Agency, State Agency, etc.)")
    found_in: str = Field(
        description="Where the link was found ('Provided' or 'Referenced in [source]')"
    )
    # Optional fields: an entry is marked accessible unless told otherwise,
    # and the summary stays ``None`` until one is provided.
    accessible: bool = Field(default=True, description="Whether the URL is accessible")
    content_summary: str | None = Field(
        default=None, description="Brief summary of the content at this URL"
    )


class LinkCatalog(BaseModel):
    """Complete catalog of links for a program."""

    # Required identification of the program and of the research run.
    program_name: str
    state_code: str
    research_date: date
    # Required count; presumably the number of user-supplied source URLs
    # -- TODO confirm against the code that builds the catalog.
    sources_provided: int
    # ``default_factory`` gives every instance its own empty list.
    links: list[LinkCatalogEntry] = Field(default_factory=list)


# -----------------------------------------------------------------------------
# Step 2: Screener Field Models
# -----------------------------------------------------------------------------


class ScreenerField(BaseModel):
    """A field available in the screener."""

    # Required descriptive fields, all stored as plain strings.
    name: str = Field(description="Field name as it appears in code")
    field_type: str = Field(description="Data type (CharField, IntegerField, etc.)")
    description: str = Field(description="Human-readable description")
    # Optional; ``None`` when no list of valid values is given.
    valid_values: list[str] | None = Field(
        default=None, description="Valid values for choice fields"
    )
    # Required name of the owning model (a string, not a class reference).
    model: str = Field(description="Which model this field belongs to")


class ScreenerFieldCatalog(BaseModel):
    """Complete catalog of available screener fields."""

    # Screener fields split into five groups. Every group defaults to its own
    # empty list, so a catalog can be created with no arguments.
    screen_fields: list[ScreenerField] = Field(default_factory=list)
    household_member_fields: list[ScreenerField] = Field(default_factory=list)
    income_fields: list[ScreenerField] = Field(default_factory=list)
    expense_fields: list[ScreenerField] = Field(default_factory=list)
    insurance_fields: list[ScreenerField] = Field(default_factory=list)
    # Helper methods are recorded as strings only.
    helper_methods: list[str] = Field(
        default_factory=list, description="Available helper methods for calculations"
    )
    # Optional date; ``None`` when it has not been set.
    last_updated: date | None = Field(default=None)


# -----------------------------------------------------------------------------
# Step 3: Eligibility Criteria Models
# -----------------------------------------------------------------------------


class EligibilityCriterion(BaseModel):
    """A single eligibility criterion extracted from documentation."""

    # Required: the requirement itself and the citation backing it.
    criterion: str = Field(description="The eligibility requirement in plain language")
    source_reference: str = Field(
        description="Citation to source (e.g., '7 CFR 247.9(a)' or 'State manual p.12')"
    )
    # Optional link to the source document.
    source_url: str | None = Field(default=None, description="URL where this was found")
    # Per the field description, ``None`` here marks a data gap: no screener
    # field can evaluate this criterion.
    screener_fields: list[str] | None = Field(
        default=None,
        description="Screener field(s) that can evaluate this. None = data gap",
    )
    # Optional evaluation rule kept as free text; nothing in this model
    # parses or executes it.
    evaluation_logic: str | None = Field(
        default=None, description="How to evaluate (e.g., 'member.age >= 60')"
    )
    notes: str = Field(default="", description="Additional notes about this criterion")
    # Defaults to MEDIUM when no impact level is supplied.
    impact: ImpactLevel = Field(
        default=ImpactLevel.MEDIUM,
        description="Impact if this criterion cannot be evaluated",
    )


class FieldMapping(BaseModel):
    """Complete mapping of program criteria to screener fields."""

    # The only required field.
    program_name: str
    # Criteria are kept in two separate lists. This model does not enforce
    # that an entry's ``screener_fields`` value matches the list it is in.
    criteria_can_evaluate: list[EligibilityCriterion] = Field(default_factory=list)
    criteria_cannot_evaluate: list[EligibilityCriterion] = Field(default_factory=list)
    # Free-text summary and recommendations; both default to empty.
    summary: str = Field(default="", description="Summary of mapping coverage")
    recommendations: list[str] = Field(
        default_factory=list, description="Recommendations for gaps"
    )


# -----------------------------------------------------------------------------
# Step 5: Test Case Models
# -----------------------------------------------------------------------------


class ScenarioStep(BaseModel):
    """A single step in a test case."""

    # Both fields are required: the form section the step applies to and the
    # list of instruction strings for that section.
    section: str = Field(description="Form section (Location, Household, Person 1, etc.)")
    instructions: list[str] = Field(description="Specific instructions for this section")


class HumanTestCase(BaseModel):
    """A human-readable test scenario."""

    # ----- Narrative part of the scenario -----
    scenario_number: int = Field(description="Scenario identifier")
    title: str = Field(description="Brief descriptive title")
    what_checking: str = Field(description="Plain language explanation of what this tests")
    # Declared as a plain ``str``: the category names in the description are
    # guidance only and are not validated by this model.
    category: str = Field(
        description="Category: happy_path, income_threshold, age_threshold, geographic, exclusion, edge_case, multi_member"
    )
    expected_eligible: bool = Field(description="Should this person/household qualify?")
    # Optional expectations; ``None`` when not specified.
    expected_amount: float | None = Field(
        default=None, description="Expected benefit amount per year"
    )
    expected_time: str | None = Field(
        default=None, description="Expected application time"
    )
    steps: list[ScenarioStep] = Field(description="Step-by-step instructions")
    what_to_look_for: list[str] = Field(
        description="What to verify on the results page"
    )
    why_matters: str = Field(description="Plain language explanation of why this test matters")

    # Data needed for JSON conversion
    # The ZIP code is a string, not an int.
    zip_code: str = Field(description="ZIP code to use")
    county: str = Field(description="County name")
    # Not cross-checked against ``len(members_data)`` by this model.
    household_size: int = Field(description="Number of household members")
    household_assets: float = Field(default=0, description="Total household assets")
    # Free-form dicts, one per member; their key schema is not defined in
    # this module.
    members_data: list[dict[str, Any]] = Field(
        description="Structured data for each household member"
    )
    # Maps a benefit key to a checkbox state; empty by default.
    current_benefits: dict[str, bool] = Field(
        default_factory=dict, description="Current benefits checkboxes"
    )
    citizenship_status: str = Field(default="citizen", description="Citizenship/legal status")


class ScenarioSuite(BaseModel):
    """Complete suite of test cases for a program."""

    # Required identification of the program and its white label.
    program_name: str
    white_label: str
    # The scenarios themselves; each instance starts with its own empty list.
    test_cases: list[HumanTestCase] = Field(default_factory=list)
    coverage_summary: str = Field(default="", description="Summary of what's covered")


# -----------------------------------------------------------------------------
# QA Models
# -----------------------------------------------------------------------------


class QAIssue(BaseModel):
    """An issue found during QA validation."""

    # Required classification of the issue.
    severity: IssueSeverity = Field(description="How serious is this issue")
    # Declared as a plain ``str``: the type names in the description are
    # guidance only and are not validated by this model.
    issue_type: str = Field(
        description="Type: missed_criterion, incorrect_mapping, wrong_threshold, missing_test, incorrect_value, schema_mismatch"
    )
    description: str = Field(description="What the issue is")
    location: str = Field(description="Where in the output the issue was found")
    # Optional citation; ``None`` when the issue has no supporting source.
    source_reference: str | None = Field(
        default=None, description="Citation supporting this issue"
    )
    # Required: every issue must come with a suggested fix.
    suggested_fix: str = Field(description="How to fix this issue")
    # New issues start out unresolved.
    resolved: bool = Field(default=False, description="Whether this issue has been resolved")


class QAValidationResult(BaseModel):
    """Result of a QA validation pass."""

    # Declared as a plain ``str``: the names in the description are not
    # enforced by this model.
    validation_type: str = Field(
        description="What was validated: research, test_cases, json"
    )
    # Restricted by ``Literal`` to exactly these three strings.
    overall_status: Literal["VALIDATED", "VALIDATED_WITH_CONCERNS", "NEEDS_REVISION"] = Field(
        description="Overall assessment"
    )
    # Issues found in this pass; empty by default.
    issues: list[QAIssue] = Field(default_factory=list)
    # Both free-text fields are required.
    summary: str = Field(description="Summary of validation")
    recommendation: str = Field(description="Proceed or revise")


# -----------------------------------------------------------------------------
# JSON Output Models (matches benefits-api test_case_schema.json)
# -----------------------------------------------------------------------------


class JSONTestCaseIncomeStream(BaseModel):
    """A single income stream for a household member."""

    # ``type`` and ``frequency`` are free-form strings here; their allowed
    # values are presumably fixed by the benefits-api schema -- TODO confirm.
    type: str
    amount: float
    frequency: str
    # Optional; ``None`` when no hours figure is given.
    hours_worked: int | None = None


class JSONTestCaseExpense(BaseModel):
    """A screen-level expense."""

    # All three fields are required. ``type`` and ``frequency`` are free-form
    # strings as far as this model is concerned.
    type: str
    amount: float
    frequency: str


class JSONTestCaseMemberInsurance(BaseModel):
    """Insurance data for a household member in JSON format."""

    # One boolean flag per insurance kind, all defaulting to False.
    # Note that ``none`` also defaults to False, and this model does not
    # enforce that it is mutually exclusive with the other flags.
    none: bool = False
    employer: bool = False
    private: bool = False
    medicaid: bool = False
    medicare: bool = False
    chp: bool = False
    va: bool = False


class JSONTestCaseMember(BaseModel):
    """A household member in JSON test case format."""

    # Required: relationship plus birth month and year as integers.
    relationship: str
    birth_month: int
    birth_year: int
    # Optional; nothing in this model computes it from the birth fields.
    age: int | None = None  # Calculated
    gender: str | None = None
    # Tri-state flags: each defaults to ``None`` rather than False.
    pregnant: bool | None = None
    student: bool | None = None
    disabled: bool | None = None
    veteran: bool | None = None
    visually_impaired: bool | None = None
    unemployed: bool | None = None
    has_income: bool | None = None
    # Not cross-checked against ``has_income`` by this model.
    income_streams: list[JSONTestCaseIncomeStream] = Field(default_factory=list)
    # Defaults to a fresh insurance object with every flag False.
    insurance: JSONTestCaseMemberInsurance = Field(default_factory=JSONTestCaseMemberInsurance)


class JSONTestCaseHousehold(BaseModel):
    """Household data in JSON test case format."""

    # Required identification and location. Note the field is spelled
    # ``zipcode`` here, while ``HumanTestCase`` uses ``zip_code``.
    white_label: str
    household_size: int
    zipcode: str
    county: str
    household_assets: float = 0
    # Both consent/age flags default to True.
    agree_to_tos: bool = True
    is_13_or_older: bool = True
    # Required; not cross-checked against ``household_size`` by this model.
    household_members: list[JSONTestCaseMember]
    expenses: list[JSONTestCaseExpense] = Field(default_factory=list)
    # Current benefits (has_* fields)
    # Every flag is optional and defaults to ``None`` rather than False.
    has_tanf: bool | None = None
    has_wic: bool | None = None
    has_snap: bool | None = None
    has_sunbucks: bool | None = None
    has_lifeline: bool | None = None
    has_acp: bool | None = None
    has_eitc: bool | None = None
    has_coeitc: bool | None = None
    has_nslp: bool | None = None
    has_ctc: bool | None = None
    has_il_eitc: bool | None = None
    has_il_ctc: bool | None = None
    has_medicaid: bool | None = None
    has_rtdlive: bool | None = None
    has_cccap: bool | None = None
    has_chp: bool | None = None
    has_ssi: bool | None = None
    has_ssdi: bool | None = None
    has_aca: bool | None = None
    has_section_8: bool | None = None
    has_ma_homebridge: bool | None = None
    has_ma_door_to_door: bool | None = None
    has_csfp: bool | None = None


class JSONTestCaseExpectedResults(BaseModel):
    """Expected results in JSON test case format."""

    # Required: the program being checked and the expected eligibility.
    program_name: str
    eligible: bool
    # Optional expected value; ``None`` when no value is specified.
    value: float | None = None


class JSONTestCase(BaseModel):
    """A complete JSON test case matching benefits-api test_case_schema.json."""

    # All three parts are required: free-text notes, the household input
    # data and the expected result for one program.
    notes: str
    household: JSONTestCaseHousehold
    expected_results: JSONTestCaseExpectedResults


# -----------------------------------------------------------------------------
# Program Configuration Model (for Django admin import)
# -----------------------------------------------------------------------------


class ProgramConfig(BaseModel):
    """
    Program configuration for Django admin import.

    Matches the format used by import_program_config_data management command.
    """

    # Each section is a loosely typed dict: only the key/value types are
    # validated here, not the specific keys the import command expects.
    white_label: dict[str, str] = Field(description="White label config: {'code': 'il'}")
    program_category: dict[str, str] = Field(
        description="Program category: {'external_name': 'il_food'}"
    )
    program: dict[str, Any] = Field(
        description="Program metadata: name, description, links, etc."
    )
    # Optional section; ``None`` when there is no warning message.
    warning_message: dict[str, Any] | None = Field(
        default=None, description="Optional warning message configuration"
    )
    # List sections default to their own empty list per instance.
    documents: list[dict[str, str]] = Field(
        default_factory=list, description="Required documents list"
    )
    navigators: list[dict[str, Any]] = Field(
        default_factory=list, description="Local navigator/contact information"
    )


# -----------------------------------------------------------------------------
# Linear Ticket Model
# -----------------------------------------------------------------------------


class LinearTicketContent(BaseModel):
    """Content for creating a Linear ticket."""

    # Required ticket text and lists (no defaults).
    title: str
    description: str
    acceptance_criteria: list[str]
    test_scenarios_summary: str
    source_documentation: list[str]
    # Optional file paths, stored as strings; ``None`` when not set.
    json_test_file_path: str | None = None
    program_config_file_path: str | None = Field(
        default=None, description="Path to program config JSON file"
    )


# -----------------------------------------------------------------------------
# Main Graph State
# -----------------------------------------------------------------------------


class ResearchState(BaseModel):
    """
    Complete state for the research workflow.

    This is the main state object that flows through the LangGraph.
    """

    # Only the four input fields are required. Every later section has a
    # default, so a state can be created from the inputs alone and filled in
    # step by step.

    # ----- Input -----
    program_name: str = Field(description="Name of the benefit program")
    state_code: str = Field(description="State code (e.g., 'il', 'co', 'nc')")
    white_label: str = Field(description="White label identifier")
    source_urls: list[str] = Field(description="Source documentation URLs provided by user")

    # ----- Step 1: Link Discovery -----
    link_catalog: LinkCatalog | None = Field(default=None)
    fetched_content_refs: dict[str, str] | None = Field(
        default=None,
        description="Map of source URL to file path where fetched content is saved",
    )

    # ----- Step 2: Screener Fields -----
    screener_fields: ScreenerFieldCatalog | None = Field(default=None)

    # ----- Step 3: Field Mapping -----
    field_mapping: FieldMapping | None = Field(default=None)

    # ----- Research QA Loop -----
    # Each QA loop keeps its latest result plus an iteration counter that
    # starts at 0.
    research_qa_result: QAValidationResult | None = Field(default=None)
    research_iteration: int = Field(default=0)

    # ----- Step 5: Test Cases -----
    test_suite: ScenarioSuite | None = Field(default=None)

    # ----- Test Case QA Loop -----
    test_case_qa_result: QAValidationResult | None = Field(default=None)
    test_case_iteration: int = Field(default=0)

    # ----- JSON Conversion -----
    json_test_cases: list[JSONTestCase] = Field(default_factory=list)

    # ----- JSON QA Loop -----
    json_qa_result: QAValidationResult | None = Field(default=None)
    json_iteration: int = Field(default=0)

    # ----- Program Configuration -----
    program_config: ProgramConfig | None = Field(default=None)

    # ----- Linear Ticket -----
    linear_ticket: LinearTicketContent | None = Field(default=None)
    linear_ticket_url: str | None = Field(default=None)
    linear_ticket_id: str | None = Field(default=None)

    # ----- Control -----
    max_iterations: int = Field(default=3, description="Max QA iterations before proceeding")
    # ``use_enum_values`` (see ``model_config`` below) only takes effect on
    # values that go through validation, and defaults are not validated unless
    # asked. Without ``validate_default`` a freshly created state would hold
    # the enum member while any explicitly supplied status would be the plain
    # string, so the attribute's type would depend on how it was populated.
    status: WorkflowStatus = Field(
        default=WorkflowStatus.IN_PROGRESS,
        validate_default=True,
    )
    error_message: str | None = Field(default=None)
    messages: list[str] = Field(
        default_factory=list, description="Log of workflow progress messages"
    )

    # ----- Output -----
    output_dir: str | None = Field(
        default=None, description="Directory where step outputs are saved"
    )

    # Store enum-typed fields of THIS model (i.e. ``status``) as their plain
    # values. The setting is per model, so enum fields inside nested models
    # are not affected by it.
    model_config = {"use_enum_values": True}


# -----------------------------------------------------------------------------
# State update helpers for LangGraph reducers
# -----------------------------------------------------------------------------


def add_message(messages: list[str], new_message: str) -> list[str]:
    """Return a new list made of ``messages`` followed by ``new_message``.

    The list passed in is left untouched; concatenation builds a fresh list
    on every call.
    """
    tail = [new_message]
    return messages + tail


def increment_counter(current: int, _: Any) -> int:
    """Return ``current`` plus one.

    The second argument is accepted only to fit the two-argument reducer
    signature; its value is never read.
    """
    bumped = current + 1
    return bumped