11"""
22CID (Content Identifier) utilities for Bitswap.
33
4- This module provides simplified CID encoding/decoding for different Bitswap
5- protocol versions.
6- Note: This is a simplified implementation for demonstration. In production,
7- use a proper CID library like py-cid or multiformats.
4+ This module provides py-cid-backed CID encoding/decoding helpers for Bitswap.
5+ Byte-returning functions are preserved for compatibility with existing callers,
6+ and object-returning variants are provided for new code paths.
87
98====================================
109IMPORTANT: Breaking Change in v1.0
2322"""
2423
2524import hashlib
26- from typing import Any
2725
2826from cid import CIDv0 , CIDv1 , V0Builder , V1Builder , from_string , make_cid
2927from cid .prefix import Prefix
4240CIDObject = CIDv0 | CIDv1
4341
4442
def _compute_multihash_sha256(data: bytes) -> bytes:
    """
    Build the SHA2-256 multihash for ``data``.

    The returned bytes are laid out as
    ``<hash-type><hash-length><hash-digest>``, where the hash type byte is
    ``int(HASH_SHA256)`` and the length byte is the digest size.
    """
    sha256_digest = hashlib.sha256(data).digest()
    # Two-byte header: hash function code followed by the digest length.
    header = bytes((int(HASH_SHA256), len(sha256_digest)))
    return header + sha256_digest
50-
51-
5243def _normalise_codec (codec : Code | str | int ) -> Code :
5344 """Normalise codec input to a Code object with validation."""
5445 if isinstance (codec , Code ):
@@ -59,41 +50,14 @@ def _normalise_codec(codec: Code | str | int) -> Code:
5950 raise ValueError (f"Unknown codec: { codec } " )
6051 return Code .from_string (codec )
6152
62- # Integer code path
53+ # Integer code path.
6354 normalised = Code (codec )
6455 # If the name is unknown, the code is not registered
6556 if normalised .name in ("<unknown>" , "" , None ):
6657 raise ValueError (f"Unknown codec code: 0x{ codec :x} " )
6758 return normalised
6859
6960
70- def _parse_varint (data : bytes , offset : int = 0 ) -> tuple [int , int ] | None :
71- """
72- Parse an unsigned varint from data starting at offset.
73-
74- Returns:
75- (value, length) on success, or None on failure.
76-
77- """
78- value = 0
79- shift = 0
80- length = 0
81-
82- # Varints for multicodec are at most 10 bytes.
83- for i in range (offset , min (len (data ), offset + 10 )):
84- byte = data [i ]
85- value |= (byte & 0x7F ) << shift
86- length += 1
87-
88- if (byte & 0x80 ) == 0 :
89- # MSB clear => last byte of varint
90- return value , length
91-
92- shift += 7
93-
94- return None
95-
96-
def compute_cid_v0_obj(data: bytes) -> CIDv0:
    """Return the CIDv0 object that a ``V0Builder`` produces for ``data``."""
    v0_builder = V0Builder()
    return v0_builder.sum(data)
@@ -124,21 +88,9 @@ def compute_cid_v1_obj(data: bytes, codec: Code | str | int = CODEC_RAW) -> CIDv
12488
12589def compute_cid_v1 (data : bytes , codec : Code | str | int = CODEC_RAW ) -> bytes :
12690 """
127- Compute a CIDv1 for data using py-cid builders.
128-
129- CIDv1 format: <version><codec-varint><multihash>
130-
131- .. note:: **Breaking Change - CIDv1 Encoding Format**
132- This function now uses varint-encoded multicodec prefixes via `add_prefix()`.
133- Previously, CIDv1 used a single-byte codec representation.
134-
135- **Compatibility:**
136- - Codecs < 128 (e.g., raw=0x55, dag-pb=0x70): Formats are **identical**
137- (backward compatible, no migration needed).
138- - Codecs >= 128: Formats **differ** (breaking change, requires migration).
91+ Compute a CIDv1 for data and return raw CID bytes.
13992
140- See :func:`detect_cid_encoding_version` and :func:`migrate_legacy_cid`
141- for migration utilities.
93+ This is the compatibility wrapper over :func:`compute_cid_v1_obj`.
14294
14395 Args:
14496 data: The data to hash
@@ -184,7 +136,7 @@ def get_cid_prefix(cid: bytes) -> bytes:
184136
185137def reconstruct_cid_from_prefix_and_data (prefix : bytes , data : bytes ) -> bytes :
186138 """
187- Reconstruct a CID from prefix and data using py-multihash v3 API .
139+ Reconstruct a CID from prefix and data using py-cid Prefix APIs .
188140
189141 Used when receiving v1.1.0+ Block messages with prefix.
190142
@@ -364,175 +316,3 @@ def parse_cid_codec(cid: bytes) -> str:
364316 return DAG_PB .name
365317
366318 return cid_obj .codec
367-
368-
369- # ============================================================================
370- # Migration and Version Encoding Detection Utilities
371- # ============================================================================
372-
373-
def detect_cid_encoding_format(cid: bytes) -> dict[str, Any]:
    """
    Detect CID encoding format and codec details.

    Args:
        cid: Raw CID bytes to inspect

    Returns:
        On failure, a two-key dict::

            {'version': None | int, 'error': str}

        where ``version`` is ``None`` for a too-short input, the first byte
        for an unrecognised version, and ``1`` when the CIDv1 codec varint
        is not terminated within 10 bytes.

        When the first byte is 0x12 (treated as CIDv0, multihash only)::

            {
                'version': 0,
                'codec_value': 0x70,
                'codec_name': 'dag-pb',
                'encoding': 'legacy',
                'needs_migration': False,
                'is_breaking': False
            }

        For CIDv1::

            {
                'version': 1,
                'codec_value': int,
                'codec_name': str,
                'codec_length': int,   # bytes used by the codec varint
                'encoding': 'varint' or 'legacy-or-varint',
                'needs_migration': False,
                'is_breaking': bool    # True when codec_value >= 128
            }

    """
    if len(cid) < 2:
        return {"version": None, "error": "CID too short"}

    version = cid[0]

    if version == 0x12:  # CIDv0 (multihash only)
        return {
            "version": 0,
            "codec_value": 0x70,  # dag-pb
            "codec_name": "dag-pb",
            "encoding": "legacy",
            "needs_migration": False,
            "is_breaking": False,
        }

    if version != 0x01:  # Not CIDv1
        return {"version": version, "error": "Unknown CID version"}

    # Parse codec value from varint (at most 10 bytes are examined).
    codec_value = 0
    codec_length = 0
    for byte in cid[1:11]:
        codec_value |= (byte & 0x7F) << (7 * codec_length)
        codec_length += 1
        if (byte & 0x80) == 0:  # Last byte
            break
    else:
        # Every examined byte had its continuation bit set, so the varint
        # never terminated; reporting a codec here would be a fabricated
        # value built from a partial encoding.
        return {"version": 1, "error": "Truncated codec varint"}

    # Imported only once a codec actually needs naming, so the validation
    # paths above do not depend on the multicodec package.
    from multicodec import Code

    # Get codec name. Deliberately best-effort: a failed lookup falls back
    # to the hex value instead of failing detection.
    try:
        codec_name = str(Code(codec_value))
    except Exception:
        codec_name = f"0x{codec_value:x}"

    # Legacy encoding used a single byte for every codec, so the two
    # formats only differ once the codec needs more than 7 bits.
    is_breaking = codec_value >= 128

    # For codecs < 128, legacy and varint are identical (both 1 byte), so a
    # one-byte codec cannot be attributed to either encoding.
    encoding = "varint" if codec_length > 1 else "legacy-or-varint"

    return {
        "version": 1,
        "codec_value": codec_value,
        "codec_name": codec_name,
        "codec_length": codec_length,
        "encoding": encoding,
        "needs_migration": False,  # Can't migrate without data
        "is_breaking": is_breaking,
    }
449-
450-
def recompute_cid_from_data(old_cid: bytes, data: bytes) -> bytes:
    """
    Recompute CID with proper varint encoding.

    Note: Original data is required because CIDs use cryptographic hashes
    (one-way functions that cannot be reversed).

    Args:
        old_cid: Existing CID (used to extract codec)
        data: Original data that was hashed

    Returns:
        New CID with proper varint-encoded codec

    Raises:
        ValueError: If old_cid is invalid, if old_cid does not match data,
            or if the recomputed CID fails verification against data

    """
    # Detect old CID format
    info = detect_cid_encoding_format(old_cid)

    error = info.get("error")
    if error:
        raise ValueError(f"Invalid CID: {error}")

    # First, ensure the provided data actually matches the original CID.
    # If this fails, the caller is not supplying the correct original data,
    # so the message names the original CID rather than the recomputed one.
    if not verify_cid(old_cid, data):
        raise ValueError("Original CID does not match provided data")

    # Recompute with proper varint encoding, reusing the codec extracted
    # from the old CID.
    new_cid = compute_cid_v1(data, codec=info["codec_value"])

    # Sanity check: new CID must also verify against the same data
    if not verify_cid(new_cid, data):
        raise ValueError("Recomputed CID does not verify with provided data")

    return new_cid
491-
492-
def analyze_cid_collection(cids: list[bytes]) -> dict[str, Any]:
    """
    Summarise the migration impact across a collection of CIDs.

    Each CID is passed to ``detect_cid_encoding_format``. Entries whose
    result carries an error, or whose inspection raises, are left out of
    every tally except ``total``.

    Returns:
        {
            'total': int,                  # len(cids), including skipped entries
            'backward_compatible': int,
            'breaking_change': int,
            'by_codec': {codec_name: count},
            'breaking_cids': [bytes]
        }

    """
    codec_counts: dict[str, int] = {}
    breaking: list[bytes] = []
    compatible_total = 0
    breaking_total = 0

    for candidate in cids:
        try:
            details = detect_cid_encoding_format(candidate)
            if not details.get("error"):
                name = details["codec_name"]
                codec_counts[name] = codec_counts.get(name, 0) + 1

                if details["is_breaking"]:
                    breaking_total += 1
                    breaking.append(candidate)
                else:
                    compatible_total += 1
        except Exception:
            # Best-effort scan: an entry that cannot be inspected is skipped.
            continue

    return {
        "total": len(cids),
        "backward_compatible": compatible_total,
        "breaking_change": breaking_total,
        "by_codec": codec_counts,
        "breaking_cids": breaking,
    }
0 commit comments