DataIntegrationGroup · jirhiker · Jan 16, 2026 · Jan 15, 2026 · Jan 15, 2026
diff --git a/tests/test_transfer_preserve_empty_strings.py b/tests/test_transfer_preserve_empty_strings.py
@@ -0,0 +1,217 @@
+# ===============================================================================
+# Copyright 2026 ross
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ===============================================================================
+from __future__ import annotations
+
+from types import SimpleNamespace
+import uuid
+
+from transfers.chemistry_sampleinfo import ChemistrySampleInfoTransferer
+from transfers.hydraulicsdata import HydraulicsDataTransferer
+from transfers.major_chemistry import MajorChemistryTransferer
+from transfers.minor_trace_chemistry_transfer import MinorTraceChemistryTransferer
+from transfers.radionuclides import RadionuclidesTransferer
+from transfers.surface_water_data import SurfaceWaterDataTransferer
+from transfers.weather_data import WeatherDataTransferer
+
+
+def _make_transferer(cls, **attrs):
+    """Return a ``cls`` instance created without calling ``__init__``.
+
+    The instance receives an empty ``errors`` list, an empty ``flags`` dict
+    and a ``source_table`` copied from the class (``"Test"`` when the class
+    defines none). Each keyword in ``attrs`` is then assigned as an
+    attribute, so callers can add state or override those defaults.
+    """
+    instance = cls.__new__(cls)
+    baseline = {
+        "errors": [],
+        "flags": {},
+        "source_table": getattr(cls, "source_table", "Test"),
+    }
+    # Baseline first, caller overrides second, so the overrides win.
+    for assignments in (baseline, attrs):
+        for name, value in assignments.items():
+            setattr(instance, name, value)
+    return instance
+
+
+def test_major_chemistry_preserves_empty_strings():
+    """Blank strings in a major-chemistry row come back from ``_row_dict`` as ``""``."""
+    blank_columns = ("SamplePointID", "Analyte", "Symbol", "Units", "Notes")
+    source_row = {
+        "SamplePtID": uuid.uuid4(),
+        "GlobalID": uuid.uuid4(),
+        **dict.fromkeys(blank_columns, ""),
+    }
+
+    converted = _make_transferer(MajorChemistryTransferer)._row_dict(source_row)
+
+    for column in blank_columns:
+        assert converted[column] == ""
+
+
+def test_minor_trace_chemistry_preserves_empty_strings():
+    """Blank attributes on a minor/trace row come back from ``_row_to_dict`` as ``""``."""
+    # Source attribute name -> key checked in the converted dict.
+    blank_fields = {
+        "Analyte": "analyte",
+        "Units": "units",
+        "Symbol": "symbol",
+        "AnalysisMethod": "analysis_method",
+        "Notes": "notes",
+        "AnalysesAgency": "analyses_agency",
+        "VolumeUnit": "volume_unit",
+    }
+    known_sample_pt_id = uuid.uuid4()
+    source_row = SimpleNamespace(
+        SamplePtID=known_sample_pt_id,
+        GlobalID=uuid.uuid4(),
+        **dict.fromkeys(blank_fields, ""),
+    )
+    transferer = _make_transferer(
+        MinorTraceChemistryTransferer, _sample_pt_ids={known_sample_pt_id}
+    )
+
+    converted = transferer._row_to_dict(source_row)
+
+    for output_key in blank_fields.values():
+        assert converted[output_key] == ""
+
+
+def test_radionuclides_preserves_empty_strings():
+    """Blank strings in a radionuclides row come back from ``_row_dict`` as ``""``."""
+    blank_columns = (
+        "SamplePointID",
+        "Analyte",
+        "Symbol",
+        "Units",
+        "AnalysesAgency",
+        "WCLab_ID",
+    )
+    source_row = {
+        "SamplePtID": uuid.uuid4(),
+        "GlobalID": uuid.uuid4(),
+        **dict.fromkeys(blank_columns, ""),
+    }
+    transferer = _make_transferer(RadionuclidesTransferer, _thing_id_by_sample_pt_id={})
+
+    converted = transferer._row_dict(source_row)
+
+    for column in blank_columns:
+        assert converted[column] == ""
+
+
+def test_weather_data_preserves_empty_strings():
+    """A blank ``PointID`` in a weather row comes back from ``_row_dict`` as ``""``."""
+    source_row = dict(LocationId=None, PointID="", WeatherID=None, OBJECTID=1)
+
+    converted = _make_transferer(WeatherDataTransferer)._row_dict(source_row)
+
+    assert converted["PointID"] == ""
+
+
+def test_surface_water_data_preserves_empty_strings():
+    """Blank strings in a surface-water row come back from ``_row_dict`` as ``""``."""
+    # Columns asserted below; ``PointID`` is blank in the row but not checked.
+    blank_columns = (
+        "Discharge",
+        "DischargeMethod",
+        "DischargeUnits",
+        "DischargeSource",
+        "SiteNotes",
+        "FieldMethodNotes",
+        "FormationZone",
+        "AqClass",
+        "SourceNotes",
+        "DataSource",
+    )
+    source_row = {
+        "SurfaceID": uuid.uuid4(),
+        "PointID": "",
+        "OBJECTID": 1,
+        **dict.fromkeys(blank_columns, ""),
+    }
+
+    converted = _make_transferer(SurfaceWaterDataTransferer)._row_dict(source_row)
+
+    for column in blank_columns:
+        assert converted[column] == ""
+
+
+def test_hydraulics_preserves_empty_strings():
+    """Blank strings in a hydraulics row come back from ``_row_dict`` as ``""``."""
+    blank_columns = (
+        "HydraulicUnit",
+        "HydraulicUnitType",
+        "Hydraulic Remarks",
+        "Data Source",
+    )
+    source_row = {
+        "GlobalID": uuid.uuid4(),
+        "WellID": uuid.uuid4(),
+        "PointID": "TEST",
+        **dict.fromkeys(blank_columns, ""),
+        "TestTop": 1,
+        "TestBottom": 2,
+    }
+    # The cache holds an entry for the row's PointID ("TEST").
+    transferer = _make_transferer(HydraulicsDataTransferer, _thing_id_cache={"TEST": 1})
+
+    converted = transferer._row_dict(source_row)
+
+    for column in blank_columns:
+        assert converted[column] == ""
+
+
+def test_chemistry_sampleinfo_preserves_empty_strings():
+    """Blank strings in a sample-info row come back from ``_row_dict`` as ``""``."""
+    blank_columns = (
+        "WCLab_ID",
+        "CollectionMethod",
+        "CollectedBy",
+        "AnalysesAgency",
+        "SampleType",
+        "SampleMaterialNotH2O",
+        "WaterType",
+        "StudySample",
+        "DataSource",
+        "SampleNotes",
+    )
+    source_row = {
+        "SamplePtID": uuid.uuid4(),
+        "SamplePointID": "TEST",
+        **dict.fromkeys(blank_columns, ""),
+        "LocationId": uuid.uuid4(),
+        "OBJECTID": 1,
+    }
+    # The cache holds an entry for the row's SamplePointID ("TEST").
+    transferer = _make_transferer(
+        ChemistrySampleInfoTransferer, _thing_id_cache={"TEST": 1}
+    )
+
+    converted = transferer._row_dict(source_row)
+
+    for column in blank_columns:
+        assert converted[column] == ""
diff --git a/transfers/chemistry_sampleinfo.py b/transfers/chemistry_sampleinfo.py
@@ -72,7 +72,9 @@ def _build_thing_id_cache(self):
         logger.info(f"Built Thing ID cache with {len(self._thing_id_cache)} entries")
 
     def _get_dfs(self) -> tuple[pd.DataFrame, pd.DataFrame]:
-        input_df = read_csv(self.source_table, parse_dates=["CollectionDate"])
+        input_df = read_csv(
+            self.source_table, parse_dates=["CollectionDate"], keep_default_na=False
+        )
         # Filter to only include rows where Thing exists (prevent orphan records)
         cleaned_df = self._filter_to_valid_things(input_df)
         cleaned_df = self._filter_to_valid_sample_pt_ids(cleaned_df)
@@ -242,6 +244,9 @@ def _transfer_hook(self, session: Session) -> None:
             session.expunge_all()
 
     def _row_dict(self, row: dict[str, Any]) -> dict[str, Any]:
+        def is_blank(value: Any) -> bool:
+            return isinstance(value, str) and value.strip() == ""
+
         def val(key: str) -> Optional[Any]:
             v = row.get(key)
             if pd.isna(v):
@@ -287,6 +292,8 @@ def bool_val(key: str) -> Optional[bool]:
 
         # Convert pandas Timestamp to datetime; native datetime stays unchanged.
         collection_date = val("CollectionDate")
+        if is_blank(collection_date):
+            collection_date = None
         if hasattr(collection_date, "to_pydatetime"):
             collection_date = collection_date.to_pydatetime()
 

diff --git a/transfers/hydraulicsdata.py b/transfers/hydraulicsdata.py
@@ -50,7 +50,7 @@ def _build_thing_id_cache(self) -> None:
         logger.info(f"Built Thing ID cache with {len(self._thing_id_cache)} entries")
 
     def _get_dfs(self) -> tuple[pd.DataFrame, pd.DataFrame]:
-        df = read_csv(self.source_table)
+        df = read_csv(self.source_table, keep_default_na=False)
         cleaned_df = self._filter_to_valid_things(df)
         return df, cleaned_df
 
@@ -129,6 +129,9 @@ def _transfer_hook(self, session: Session) -> None:
             session.expunge_all()
 
     def _row_dict(self, row: dict[str, Any]) -> dict[str, Any]:
+        def is_blank(value: Any) -> bool:
+            return isinstance(value, str) and value.strip() == ""
+
         def val(key: str) -> Optional[Any]:
             v = row.get(key)
             if pd.isna(v):
@@ -148,13 +151,22 @@ def as_uuid(key: str) -> Optional[uuid.UUID]:
 
         def as_int(key: str) -> Optional[int]:
             v = val(key)
-            if v is None:
+            if v is None or is_blank(v):
                 return None
             try:
                 return int(v)
             except (TypeError, ValueError):
                 return None
 
+        def as_float(key: str) -> Optional[float]:
+            v = val(key)
+            if v is None or is_blank(v):
+                return None
+            try:
+                return float(v)
+            except (TypeError, ValueError):
+                return None
+
         return {
             "GlobalID": as_uuid("GlobalID"),
             "WellID": as_uuid("WellID"),
@@ -165,17 +177,17 @@ def as_int(key: str) -> Optional[int]:
             "TestBottom": as_int("TestBottom"),
             "HydraulicUnitType": val("HydraulicUnitType"),
             "Hydraulic Remarks": val("Hydraulic Remarks"),
-            "T (ft2/d)": val("T (ft2/d)"),
-            "S (dimensionless)": val("S (dimensionless)"),
-            "Ss (ft-1)": val("Ss (ft-1)"),
-            "Sy (decimalfractn)": val("Sy (decimalfractn)"),
-            "KH (ft/d)": val("KH (ft/d)"),
-            "KV (ft/d)": val("KV (ft/d)"),
-            "HL (day-1)": val("HL (day-1)"),
-            "HD (ft2/d)": val("HD (ft2/d)"),
-            "Cs (gal/d/ft)": val("Cs (gal/d/ft)"),
-            "P (decimal fraction)": val("P (decimal fraction)"),
-            "k (darcy)": val("k (darcy)"),
+            "T (ft2/d)": as_float("T (ft2/d)"),
+            "S (dimensionless)": as_float("S (dimensionless)"),
+            "Ss (ft-1)": as_float("Ss (ft-1)"),
+            "Sy (decimalfractn)": as_float("Sy (decimalfractn)"),
+            "KH (ft/d)": as_float("KH (ft/d)"),
+            "KV (ft/d)": as_float("KV (ft/d)"),
+            "HL (day-1)": as_float("HL (day-1)"),
+            "HD (ft2/d)": as_float("HD (ft2/d)"),
+            "Cs (gal/d/ft)": as_float("Cs (gal/d/ft)"),
+            "P (decimal fraction)": as_float("P (decimal fraction)"),
+            "k (darcy)": as_float("k (darcy)"),
             "Data Source": val("Data Source"),
             "OBJECTID": as_int("OBJECTID"),
         }

diff --git a/transfers/major_chemistry.py b/transfers/major_chemistry.py
@@ -53,7 +53,9 @@ def _build_sample_pt_id_cache(self) -> None:
         )
 
     def _get_dfs(self) -> tuple[pd.DataFrame, pd.DataFrame]:
-        input_df = read_csv(self.source_table, parse_dates=["AnalysisDate"])
+        input_df = read_csv(
+            self.source_table, parse_dates=["AnalysisDate"], keep_default_na=False
+        )
         cleaned_df = self._filter_to_valid_sample_infos(input_df)
         return input_df, cleaned_df
 
@@ -131,6 +133,9 @@ def _transfer_hook(self, session: Session) -> None:
             session.expunge_all()
 
     def _row_dict(self, row: dict[str, Any]) -> Optional[dict[str, Any]]:
+        def is_blank(value: Any) -> bool:
+            return isinstance(value, str) and value.strip() == ""
+
         def val(key: str) -> Optional[Any]:
             v = row.get(key)
             if pd.isna(v):
@@ -139,7 +144,7 @@ def val(key: str) -> Optional[Any]:
 
         def float_val(key: str) -> Optional[float]:
             v = val(key)
-            if v is None:
+            if v is None or is_blank(v):
                 return None
             try:
                 return float(v)
@@ -148,14 +153,16 @@ def float_val(key: str) -> Optional[float]:
 
         def int_val(key: str) -> Optional[int]:
             v = val(key)
-            if v is None:
+            if v is None or is_blank(v):
                 return None
             try:
                 return int(v)
             except (TypeError, ValueError):
                 return None
 
         analysis_date = val("AnalysisDate")
+        if is_blank(analysis_date):
+            analysis_date = None
         if hasattr(analysis_date, "to_pydatetime"):
             analysis_date = analysis_date.to_pydatetime()
         if isinstance(analysis_date, datetime):

diff --git a/transfers/minor_trace_chemistry_transfer.py b/transfers/minor_trace_chemistry_transfer.py
@@ -65,7 +65,7 @@ def _build_sample_pt_id_cache(self):
         )
 
     def _get_dfs(self) -> tuple[pd.DataFrame, pd.DataFrame]:
-        input_df = read_csv(self.source_table)
+        input_df = read_csv(self.source_table, keep_default_na=False)
         # Filter to only include rows where ChemistrySampleInfo exists
         cleaned_df = self._filter_to_valid_sample_infos(input_df)
         return input_df, cleaned_df
@@ -223,13 +223,17 @@ def _safe_float(self, row, attr: str) -> Optional[float]:
         val = getattr(row, attr, None)
         if val is None or pd.isna(val):
             return None
+        if isinstance(val, str) and val.strip() == "":
+            return None
         return float(val)
 
     def _parse_date(self, row, attr: str) -> Optional[date]:
         """Parse a date value from the row."""
         val = getattr(row, attr, None)
         if val is None or pd.isna(val):
             return None
+        if isinstance(val, str) and val.strip() == "":
+            return None
 
         # Handle pandas Timestamp
         if hasattr(val, "date"):