Skip to content

Commit 5c12574

Browse files
committed
Add type info to serialization format, move logic to dedicated serializer class, and improve test coverage
- Serialization now includes encoded type information to prevent cross-type deserialization. - Moved serialization/deserialization logic to a dedicated `HllSerializer` class for better maintainability. - Added guard clause to detect type mismatches during deserialization. - Introduced unit test to verify type safety during deserialization. - Updated internal documentation to reflect new format and structure.
1 parent 1f00c83 commit 5c12574

File tree

8 files changed

+165
-42
lines changed

8 files changed

+165
-42
lines changed

HLL.NET.Examples/Program.cs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,5 +20,4 @@
2020
Console.WriteLine($"Error: {Math.Abs(trueUniqueCount - estimatedCount) / trueUniqueCount:P2}");
2121

2222

23-
2423
AccuracyTester.RunAllTests();

HLL.NET.Tests/HyperLogLogSerializationTests.cs

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,23 @@ public void Deserialize_UnsupportedVersion_ShouldThrow()
4747
Assert.Throws<NotSupportedException>(() =>
4848
HyperLogLog<string>.Deserialize(data, _stringHasher));
4949
}
50+
51+
[Fact]
public void Deserialize_WrongType_ShouldThrow()
{
    // Arrange: build a string-typed sketch and capture its serialized payload.
    var sourceHasher = new StringHasher();
    var targetHasher = new IntHasher();
    var precision = new HllPrecision(14);
    var source = new HyperLogLog<string>(precision, sourceHasher);
    source.Add("test");
    byte[] payload = source.Serialize();

    // Act: try to read that payload back as an int-typed sketch.
    Func<object> act = () => HyperLogLog<int>.Deserialize(payload, targetHasher);

    // Assert: the type name embedded in the payload must cause a rejection.
    Assert.Throws<InvalidOperationException>(act);
}
66+
5067
}
5168
}
5269

HLL.NET.sln

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11

22
Microsoft Visual Studio Solution File, Format Version 12.00
33
# Visual Studio Version 17
4-
VisualStudioVersion = 17.13.35919.96 d17.13
4+
VisualStudioVersion = 17.13.35919.96
55
MinimumVisualStudioVersion = 10.0.40219.1
66
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "HLL.NET", "HLL.NET\HLL.NET.csproj", "{E44905AB-CE39-4D45-82A8-938D58DDE890}"
77
EndProject
@@ -15,6 +15,11 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "HLL.NET.Examples", "HLL.NET
1515
EndProject
1616
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "examples", "examples", "{CD93B0C7-3D16-41EE-93B3-BD1E08FB9117}"
1717
EndProject
18+
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "docs", "docs", "{3699A8BF-2E93-4E00-8C78-6287149E7846}"
19+
ProjectSection(SolutionItems) = preProject
20+
serialization_format.md = serialization_format.md
21+
EndProjectSection
22+
EndProject
1823
Global
1924
GlobalSection(SolutionConfigurationPlatforms) = preSolution
2025
Debug|Any CPU = Debug|Any CPU

HLL.NET/HLL.NET.csproj

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
<PropertyGroup>
44
<TargetFramework>netstandard2.0</TargetFramework>
55
<PackageId>HLL.NET</PackageId>
6-
<Version>1.0.3</Version>
6+
<Version>1.0.4</Version>
77
<Authors>MCUnderground</Authors>
88
<Description>A simple and efficient C# implementation of HyperLogLog for approximate cardinality estimation.</Description>
99
<PackageLicenseExpression>MIT</PackageLicenseExpression>

HLL.NET/HyperLogLog.cs

Lines changed: 8 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,16 @@
33
using System.Linq;
44
using System;
55
using HLL.NET.Maths;
6+
using System.Text;
7+
using HLL.NET.Serialization;
68

79
namespace HLL.NET
810
{
911
public class HyperLogLog<T>
1012
{
13+
/// <summary>
/// Assembly-internal view of the backing register array. Returns the same array
/// instance held in <c>_registers</c>, not a copy, so writes through it change this sketch.
/// </summary>
internal HllRegister[] Registers => _registers;
14+
/// <summary>
/// Assembly-internal accessor for the precision this sketch was constructed with
/// (the value stored in <c>_precision</c>).
/// </summary>
internal HllPrecision Precision => _precision;
15+
1116
private readonly HllPrecision _precision;
1217
private readonly int _numRegisters;
1318
private readonly HllRegister[] _registers;
@@ -74,36 +79,9 @@ private double GetAlphaMM()
7479
default: return (0.7213 / (1 + 1.079 / _numRegisters)) * _numRegisters * _numRegisters;
7580
}
7681
}
82+
/// <summary>
/// Serializes this sketch to a byte array by delegating to
/// <c>HllSerializer.Serialize</c>.
/// </summary>
/// <returns>The serialized representation of this instance.</returns>
public byte[] Serialize() => HllSerializer.Serialize(this);
7783

78-
const byte FormatVersion = 1;
79-
80-
public byte[] Serialize()
81-
{
82-
var data = new byte[3 + _registers.Length]; // version + hasherId + precision + registers
83-
data[0] = FormatVersion;
84-
data[1] = (byte)_precision.Value;
85-
for (int i = 0; i < _registers.Length; i++)
86-
data[i + 2] = _registers[i].Value;
87-
88-
return data;
89-
}
90-
public static HyperLogLog<T> Deserialize(byte[] data, IHasher<T> hasher)
91-
{
92-
if (data == null || data.Length < 4)
93-
throw new ArgumentException("Invalid serialized data");
94-
95-
var version = data[0];
96-
var precision = new HllPrecision(data[1]);
97-
98-
if (version != FormatVersion)
99-
throw new NotSupportedException($"Unsupported HLL format version: {version}");
100-
101-
var hll = new HyperLogLog<T>(precision, hasher);
102-
for (int i = 0; i < hll._registers.Length; i++)
103-
hll._registers[i] = new HllRegister(data[i + 2]);
104-
105-
return hll;
106-
}
107-
84+
/// <summary>
/// Rebuilds a sketch from serialized bytes by delegating to
/// <c>HllSerializer.Deserialize</c>.
/// </summary>
/// <param name="data">Bytes to read the sketch from.</param>
/// <param name="hasher">Hasher passed through to the serializer for the rebuilt instance.</param>
/// <returns>The sketch produced by the serializer.</returns>
public static HyperLogLog<T> Deserialize(byte[] data, IHasher<T> hasher) =>
85+
HllSerializer.Deserialize<T>(data, hasher);
10886
}
10987
}
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
using HLL.NET.Hashing;
2+
using HLL.NET.Models;
3+
using System;
4+
using System.Text;
5+
6+
namespace HLL.NET.Serialization
7+
{
8+
9+
//docs/serialization_format.md
10+
/// <summary>
/// Converts <see cref="HyperLogLog{T}"/> instances to and from their binary form.
/// </summary>
/// <remarks>
/// Layout, in write order: format version (1 byte), type-name length <c>L</c> (1 byte),
/// UTF-8 bytes of <c>typeof(T).FullName</c> (<c>L</c> bytes), precision (1 byte),
/// then one byte per register.
/// </remarks>
internal static class HllSerializer
{
    private const byte FormatVersion = 2;

    /// <summary>Serializes <paramref name="hll"/> into a new byte array.</summary>
    /// <param name="hll">The sketch to serialize.</param>
    /// <returns>The serialized bytes in the layout described on this class.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="hll"/> is null.</exception>
    /// <exception cref="NotSupportedException">
    /// The UTF-8 encoded full name of <typeparamref name="T"/> does not fit the one-byte length prefix.
    /// </exception>
    public static byte[] Serialize<T>(HyperLogLog<T> hll)
    {
        if (hll == null)
            throw new ArgumentNullException(nameof(hll));

        var typeNameBytes = EncodeTypeName(typeof(T));

        // The length prefix is a single byte. Casting a larger length would silently
        // truncate it and produce a payload that cannot be read back.
        if (typeNameBytes.Length > byte.MaxValue)
            throw new NotSupportedException(
                $"Type name '{typeof(T).FullName}' is {typeNameBytes.Length} bytes in UTF-8; the format allows at most {byte.MaxValue}.");

        byte typeNameLength = (byte)typeNameBytes.Length;

        var data = new byte[1 + 1 + typeNameLength + 1 + hll.Registers.Length];
        int index = 0;

        data[index++] = FormatVersion;
        data[index++] = typeNameLength;
        Array.Copy(typeNameBytes, 0, data, index, typeNameLength);
        index += typeNameLength;

        data[index++] = (byte)hll.Precision.Value;
        WriteRegisters(hll, data, ref index);

        return data;
    }

    /// <summary>Rebuilds a sketch from bytes produced by <see cref="Serialize{T}"/>.</summary>
    /// <param name="data">The serialized bytes.</param>
    /// <param name="hasher">Hasher handed to the reconstructed sketch.</param>
    /// <returns>A new sketch whose registers are populated from <paramref name="data"/>.</returns>
    /// <exception cref="ArgumentException">
    /// <paramref name="data"/> is null, shorter than four bytes, or ends before the
    /// type name, precision byte or registers it declares.
    /// </exception>
    /// <exception cref="NotSupportedException">The version byte is not the supported format version.</exception>
    /// <exception cref="InvalidOperationException">
    /// The stored type name differs from the full name of <typeparamref name="T"/>.
    /// </exception>
    public static HyperLogLog<T> Deserialize<T>(byte[] data, IHasher<T> hasher)
    {
        if (data == null || data.Length < 4)
            throw new ArgumentException("Invalid serialized data");

        int index = 0;

        // Version is checked first so unsupported formats are reported as such
        // before any version-specific parsing is attempted.
        ValidateVersion(data[index++]);

        var typeName = ReadTypeName(data, ref index);
        EnsureCorrectType<T>(typeName);

        if (index >= data.Length)
            throw new ArgumentException("Invalid serialized data: missing precision byte.", nameof(data));

        var precision = new HllPrecision(data[index++]);
        var hll = new HyperLogLog<T>(precision, hasher);

        ReadRegisters(hll, data, ref index);

        return hll;
    }

    /// <summary>Encodes the full name of <paramref name="type"/> as UTF-8.</summary>
    private static byte[] EncodeTypeName(Type type) =>
        Encoding.UTF8.GetBytes(type.FullName);

    /// <summary>
    /// Copies every register value of <paramref name="hll"/> into <paramref name="data"/>,
    /// starting at <paramref name="index"/> and advancing it past the last byte written.
    /// </summary>
    private static void WriteRegisters<T>(HyperLogLog<T> hll, byte[] data, ref int index)
    {
        foreach (var reg in hll.Registers)
            data[index++] = reg.Value;
    }

    /// <summary>
    /// Fills every register of <paramref name="hll"/> from <paramref name="data"/>,
    /// starting at <paramref name="index"/> and advancing it past the last byte read.
    /// </summary>
    /// <exception cref="ArgumentException">
    /// Fewer bytes remain than the sketch has registers.
    /// </exception>
    private static void ReadRegisters<T>(HyperLogLog<T> hll, byte[] data, ref int index)
    {
        var registers = hll.Registers;

        // Reject truncated payloads up front rather than failing part-way through the copy.
        if (data.Length - index < registers.Length)
            throw new ArgumentException(
                $"Invalid serialized data: expected {registers.Length} register bytes but only {data.Length - index} remain.",
                nameof(data));

        for (int i = 0; i < registers.Length; i++)
            registers[i] = new HllRegister(data[index++]);
    }

    /// <summary>Throws when <paramref name="version"/> is not the supported format version.</summary>
    private static void ValidateVersion(byte version)
    {
        if (version != FormatVersion)
            throw new NotSupportedException($"Unsupported HLL format version: {version}");
    }

    /// <summary>
    /// Reads the one-byte length prefix at <paramref name="index"/> followed by that many
    /// UTF-8 bytes, advancing <paramref name="index"/> past both.
    /// </summary>
    /// <exception cref="ArgumentException">
    /// The declared length runs past the end of <paramref name="data"/>.
    /// </exception>
    private static string ReadTypeName(byte[] data, ref int index)
    {
        var length = data[index++];

        if (length > data.Length - index)
            throw new ArgumentException(
                "Invalid serialized data: type name length exceeds the available bytes.",
                nameof(data));

        var typeName = Encoding.UTF8.GetString(data, index, length);
        index += length;
        return typeName;
    }

    /// <summary>
    /// Throws when <paramref name="serializedTypeName"/> is not exactly the full name
    /// of <typeparamref name="T"/>.
    /// </summary>
    private static void EnsureCorrectType<T>(string serializedTypeName)
    {
        var expected = typeof(T).FullName;
        if (!string.Equals(serializedTypeName, expected, StringComparison.Ordinal))
            throw new InvalidOperationException($"Type mismatch. Serialized for '{serializedTypeName}', but deserializing as '{expected}'.");
    }
}
90+
}

README.md

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,15 +16,6 @@ A fast and memory-efficient implementation of the [HyperLogLog](https://en.wikip
1616
- 🧱 Configurable precision (4–16) with built-in validation
1717
- 💼 Fully compatible with .NET Standard for broad platform support
1818

19-
---
20-
21-
## 📦 Installation
22-
23-
```bash
24-
dotnet add package HLL.NET
25-
```
26-
27-
---
2819

2920

3021
### ✅ Accuracy Highlights (based on benchmark tests)
@@ -39,6 +30,15 @@ Handles duplicate values correctly, estimating only **unique** cardinality. Exam
3930
- Input: 10,000 identical values
4031
- Estimated: ~10,000 (error: <1%)
4132

33+
---
34+
35+
## 📦 Installation
36+
37+
```bash
38+
dotnet add package HLL.NET
39+
```
40+
41+
---
4242

4343
## 🧑‍💻 Usage
4444

serialization_format.md

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# HyperLogLog Serialization Format (HLL.NET)
2+
3+
This document describes the binary format used for serializing `HyperLogLog<T>` instances.
4+
5+
## Format Structure
6+
7+
The serialized data is a byte array with the following layout:
8+
9+
| Offset | Size | Description |
10+
|--------|----------|--------------------------------------------------|
11+
| 0 | 1 byte | Format version (currently `2`) |
12+
| 1 | 1 byte | Type name length (`L`) |
13+
| 2 | `L` | UTF-8 encoded full type name (`typeof(T).FullName`) |
14+
| 2+L | 1 byte | Precision value (`p`, between 4–16) |
15+
| 3+L | N bytes | HLL registers (one byte per register; length = `2^p`) |
16+
17+
### Example
18+
19+
For a `HyperLogLog<string>` with precision 14:
20+
21+
- Format version = `2`
22+
- Type name = `System.String`, which has length = `13`
23+
- Registers count = `2^14 = 16384`
24+
- Total bytes = `1 + 1 + 1 + 13 + 16384 = 16400 bytes`
25+
26+
## Versioning
27+
28+
- Current format version: `2`
29+
- Future versions must increment the first byte and adjust parsing logic accordingly.
30+
31+
## Notes
32+
33+
- If the stored type name does not exactly match `typeof(T).FullName` of the type being deserialized into, deserialization throws `InvalidOperationException`.
34+
- Type names are used to avoid logic errors when deserializing to the wrong generic type.

0 commit comments

Comments
 (0)