Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
81 commits
Select commit Hold shift + click to select a range
87ca2b9
feat: Adding filter_cdx and warc_by_cdx commands
malteos Aug 15, 2025
e331bee
Adding unit test for filter_cdx command, some refactoring
malteos Aug 19, 2025
4dcf3a1
Adding unit tests for warc_by_cdx and index resource record
malteos Aug 20, 2025
3496c93
Adding more unit tests
malteos Aug 22, 2025
dd1e4c6
Added unit tests for matcher
malteos Aug 22, 2025
c8dbcf0
Include subpackages
malteos Aug 22, 2025
77ae6ca
Added parallelization to filter_cdx command
malteos Aug 25, 2025
bfded06
removed test file
malteos Aug 25, 2025
eeb3fbb
minor fix
malteos Aug 25, 2025
016c586
bug fix
malteos Aug 25, 2025
44f3b09
Adding S3 writter and reader support for WARCs
malteos Aug 27, 2025
5f8d9e0
added comment on CDX format
malteos Aug 27, 2025
c53562f
Making index record optional, fixing prefix S3 handling
malteos Aug 27, 2025
b48b191
adding --parallel for warc by cdx command
malteos Aug 27, 2025
c58c883
fixed progress bar
malteos Aug 27, 2025
d1d2c76
disable progress bar
malteos Aug 27, 2025
5154e70
Added aioboto3 implementation for warcer
malteos Sep 1, 2025
d13f1a8
Small clean up
malteos Sep 10, 2025
aa69c54
updated format and feat CI
malteos Sep 17, 2025
d45a3da
fixing type hints for py38
malteos Sep 17, 2025
a81a2b4
fixed types and fail fast
malteos Sep 17, 2025
72c3201
fixed types and fail fast
malteos Sep 17, 2025
d9adf03
adding max file size to aioboto3 implementation; improving test coverage
malteos Sep 17, 2025
24b263e
S3 access to CI, more unit tests
malteos Sep 19, 2025
5840c4c
fix s3 access in action
malteos Sep 19, 2025
841fe07
disable s3 tests for py < 39
malteos Sep 19, 2025
eb0a4eb
fixed syntax
malteos Sep 19, 2025
3ddf1a4
fixed bad s3 bucket
malteos Sep 19, 2025
d8a627f
adding more tests
malteos Sep 19, 2025
155db05
more tests
malteos Sep 19, 2025
c57ac3d
Merge branch 'main' into feat/warc-by-cdx
malteos Sep 22, 2025
e670d12
fix CI
malteos Sep 23, 2025
807a39d
fix CI (2)
malteos Sep 23, 2025
5d208d6
fixing Ci for windows
malteos Sep 23, 2025
c8f984c
fixing Ci for windows
malteos Sep 23, 2025
63db23c
removed duplicated code
malteos Sep 23, 2025
e75f143
more windows test fixes
malteos Sep 23, 2025
578a8ae
more windows test fixes (2)
malteos Sep 23, 2025
4eaf366
more windows test fixes (3)
malteos Sep 23, 2025
ed63d2c
re-renable other platforms
malteos Sep 23, 2025
bc5ed79
adding s3_tmpdir fixture
malteos Sep 24, 2025
6441bf6
Adding docs to README and disable duplicated test matrix
malteos Sep 24, 2025
ada22ce
WIP: Refactor for Athena integration
malteos Sep 26, 2025
d0bbd9a
Adding log arg
malteos Sep 30, 2025
dfdefbc
WIP: unified implementation
malteos Oct 1, 2025
faca33e
WIP: unified implementation (2)
malteos Oct 1, 2025
9d03d80
WIP: unified implementation (3)
malteos Oct 1, 2025
7e4c80b
Adding keyboard interupt handling
malteos Oct 1, 2025
bb3f3a7
Adding wild card test
malteos Oct 1, 2025
ad2b326
Refactor to imap
malteos Oct 1, 2025
cca857a
Refactor to imap (2)
malteos Oct 1, 2025
0e03475
Make sure tests run with empty cache dir
malteos Oct 2, 2025
3664db1
CI tests only feature
malteos Oct 2, 2025
ced19e5
Adding args; minor fix
malteos Oct 2, 2025
d6e5a25
force consitent multiprocessing behaviour across platforms
malteos Oct 2, 2025
9b2aa35
Adding resource records with warcinfo id
malteos Oct 2, 2025
494fbc5
Adding tests for unified implementation
malteos Oct 8, 2025
a8e493e
fix type hints
malteos Oct 9, 2025
f7011bd
Adding settings via environment variables
malteos Oct 9, 2025
1d8d3ff
Re-enabled all unit tests
malteos Oct 9, 2025
cc6b35e
Adding MOCK_TIME env variable
malteos Oct 9, 2025
b0bb17f
Removed cdx fetcher from filter warc command
malteos Oct 9, 2025
6020dff
Adding float tol
malteos Oct 9, 2025
7796796
WIP athena integration
malteos Oct 13, 2025
56af9d2
Merge branch 'main' into feat/warc-by-cdx
malteos Oct 13, 2025
fac56ce
Adding Athena PoC
malteos Oct 15, 2025
e58da48
Fix type hints for py38
malteos Oct 15, 2025
1f17ecf
Fixed stats
malteos Oct 15, 2025
ed7046f
Fixed Athena check
malteos Oct 15, 2025
3e85de9
Fixed doc string
malteos Oct 27, 2025
8d1cb29
Merge branch 'main' into feat/warc-by-cdx
malteos Oct 27, 2025
95cd3f1
Fixed lint check
malteos Oct 27, 2025
166a0a9
Add athena query check
malteos Oct 27, 2025
36f9f88
disable athena unit tests
malteos Oct 27, 2025
0a577d3
Added test decorator
malteos Oct 27, 2025
6c316f7
Refactored resource to metadata records
malteos Oct 27, 2025
fa5dfaa
fixed CDX paths resolution from S3
malteos Oct 29, 2025
408e80a
Fixed WARC headers for digests and file name
malteos Oct 31, 2025
73a13c4
Merge branch 'main' into feat/warc-by-cdx
malteos Oct 31, 2025
e1cd976
Fixed test case
malteos Oct 31, 2025
a82af06
changed block digest to payload digest
malteos Nov 4, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Adding MOCK_TIME env variable
  • Loading branch information
malteos committed Oct 9, 2025
commit cc6b35e96bf7441f5117e40c6672c5ae2b782bc4
12 changes: 8 additions & 4 deletions cdx_toolkit/commoncrawl.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

from .myrequests import myrequests_get
from .timeutils import time_to_timestamp, timestamp_to_time, pad_timestamp, pad_timestamp_up, cc_index_to_time, cc_index_to_time_special
from .settings import MOCK_TIME

LOGGER = logging.getLogger(__name__)

Expand Down Expand Up @@ -70,7 +71,7 @@ def get_cc_endpoints(cc_mirror):
url = cc_mirror.rstrip('/') + '/collinfo.json'
r = myrequests_get(url)
if r.status_code != 200:
raise RuntimeError('error {} getting list of cc indices from {}'.format(r.status_code, collinfo)) # pragma: no cover
raise RuntimeError('error {} getting list of cc indices from {}'.format(r.status_code, url)) # pragma: no cover
set_collinfo_cache(cc_mirror, r.text)
col = r.json()

Expand Down Expand Up @@ -119,9 +120,12 @@ def apply_cc_defaults(params, crawl_present=False, now=None):
LOGGER.info('to but no from_ts, setting from_ts=%s', params['from_ts'])
else:
if not now:
# now is passed in by tests. if not set, use actual now.
# XXX could be changed to mock
now = time.time()
# Check for test/override time first
if MOCK_TIME:
now = float(MOCK_TIME)
else:
# now is passed in by tests. if not set, use actual now.
now = time.time()
params['from_ts'] = time_to_timestamp(now - year)
LOGGER.info('no from or to, setting default 1 year ago from_ts=%s', params['from_ts'])
else:
Expand Down
2 changes: 2 additions & 0 deletions cdx_toolkit/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,5 @@
# Minimum retry intervals, each overridable through its CDXT_* environment
# variable and parsed as a float; the second argument is the built-in default.
CC_INDEX_MIN_RETRY_INTERVAL = float(os.environ.get("CDXT_CC_INDEX_MIN_RETRY_INTERVAL", 1.0))
CC_DATA_MIN_RETRY_INTERVAL = float(os.environ.get("CDXT_CC_DATA_MIN_RETRY_INTERVAL", 0.55))
IA_MIN_RETRY_INTERVAL = float(os.environ.get("CDXT_IA_MIN_RETRY_INTERVAL", 6.0))

# Optional override for the "current time", taken from CDXT_MOCK_TIME when this
# module is imported. Kept as the raw string (None when unset); consumers
# convert it to a number themselves.
MOCK_TIME = os.environ.get("CDXT_MOCK_TIME")
8 changes: 8 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,14 @@ def cleanup_cache():
shutil.rmtree(cache_dir)


@pytest.fixture(scope="session", autouse=True)
def set_mock_time():
    """Pin ``CDXT_MOCK_TIME`` for the test session so results are reproducible.

    The default value corresponds to August 15, 2025, which makes the tests
    use CC-MAIN-2025-33, a crawl that exists in the mock data. A value that
    is already present in the environment is left untouched.
    """
    # setdefault only writes when the key is absent, so an externally
    # supplied CDXT_MOCK_TIME always wins.
    # NOTE(review): cdx_toolkit.settings reads this variable at import time;
    # confirm this fixture runs before that module is first imported.
    os.environ.setdefault('CDXT_MOCK_TIME', '1755259200')


# Cache for AWS S3 access check to avoid repeated network calls
_aws_s3_access_cache = None

Expand Down
Loading