diff --git a/requirements-dev.txt b/requirements-dev.txt index cf27e63..fcc9b3c 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,3 +1,6 @@ pytest>=7.4.0 pytest-cov>=4.1.0 pytest-mock>=3.11.0 +selenium>=4.10.0 +webdriver-manager>=4.0.0 +numpy>=1.24.0 diff --git a/requirements.txt b/requirements.txt index 4cca127..1688ed0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ requests>=2.31.0 beautifulsoup4>=4.12.0 lxml>=4.9.0 +Pillow>=10.0.0 diff --git a/tests/test_cli.py b/tests/test_cli.py index c0151a9..c7a1dc8 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,66 +1,686 @@ """Tests for CLI.""" +import json import pytest import sys -from unittest.mock import patch, Mock -from wayback_diff.cli import format_output +from unittest.mock import patch, Mock, MagicMock +from pathlib import Path +import tempfile +import os +from wayback_diff.cli import format_output, main + + +class TestFormatOutput: + """Test cases for format_output function.""" + + def _make_summary(self, total=5, added=2, removed=1, modified=2, + high=1, medium=2, low=2): + return { + 'total_changes': total, + 'added': added, + 'removed': removed, + 'modified': modified, + 'high_significance': high, + 'medium_significance': medium, + 'low_significance': low, + } + + def _make_change(self, change_type='modified', old_text='Old', new_text='New', + significance='high'): + return { + 'type': change_type, + 'old_text': old_text, + 'new_text': new_text, + 'significance': significance, + } -class TestCLI: - """Test cases for CLI.""" - def test_format_output_text(self): """Test text output formatting.""" - summary = { - 'total_changes': 5, - 'added': 2, - 'removed': 1, - 'modified': 2, - 'high_significance': 1, - 'medium_significance': 2, - 'low_significance': 2, - } - - changes = [ - { - 'type': 'modified', - 'old_text': 'Old', - 'new_text': 'New', - 'significance': 'high' - } - ] - + summary = self._make_summary() + changes = [self._make_change()] + output = format_output(changes, summary, 'text') - + assert 'WAYBACK DIFF SUMMARY' in output assert 'Total changes: 5' in output + assert 'Added: 2' in output + assert 'Removed: 1' in output + assert 'Modified: 2' in output + assert 'High: 1' in output + assert 'Medium: 2' in output + assert 'Low: 2' in output assert 'HIGH SIGNIFICANCE CHANGES' in output - + def test_format_output_json(self): """Test JSON output formatting.""" - summary = { - 'total_changes': 1, - 'added': 0, - 'removed': 0, - 'modified': 1, - 'high_significance': 1, - 'medium_significance': 0, - 'low_significance': 0, - } - - changes = [ - { - 'type': 'modified', - 'old_text': 'Old', - 'new_text': 'New', - 'significance': 'high' - } - ] - + summary = self._make_summary(total=1, added=0, removed=0, modified=1, + high=1, medium=0, low=0) + changes = [self._make_change()] + output = format_output(changes, summary, 'json') - - import json + data = json.loads(output) assert 'summary' in data assert 'changes' in data assert data['summary']['total_changes'] == 1 + + def test_format_output_unified_returns_empty(self): + """Test unified format returns empty string.""" + summary = self._make_summary() + changes = [self._make_change()] + + output = format_output(changes, summary, 'unified') + assert output == "" + + def test_format_output_text_no_changes(self): + """Test text output with no changes.""" + summary = self._make_summary(total=0, added=0, removed=0, modified=0, + high=0, medium=0, low=0) + output = format_output([], summary, 'text') + assert 'Total changes: 0' in output + assert 'HIGH SIGNIFICANCE CHANGES' not in output + assert 'MEDIUM SIGNIFICANCE CHANGES' not in output + + def test_format_output_text_medium_changes(self): + """Test text output with medium significance changes.""" + summary = self._make_summary(total=3, added=0, removed=0, modified=3, + high=0, medium=3, low=0) + changes = [self._make_change(significance='medium') for _ in range(3)] + output = format_output(changes, summary, 'text') + assert 'MEDIUM SIGNIFICANCE CHANGES' in output + + def test_format_output_text_more_than_10_medium(self): + """Test text output with more than 10 medium changes (truncation).""" + summary = self._make_summary(total=15, added=0, removed=0, modified=15, + high=0, medium=15, low=0) + changes = [self._make_change(significance='medium') for _ in range(15)] + output = format_output(changes, summary, 'text') + assert 'MEDIUM SIGNIFICANCE CHANGES' in output + assert '... and 5 more medium significance changes' in output + + def test_format_output_text_more_than_20_high(self): + """Test text output with more than 20 high changes (truncation).""" + summary = self._make_summary(total=25, added=0, removed=0, modified=25, + high=25, medium=0, low=0) + changes = [self._make_change(significance='high') for _ in range(25)] + output = format_output(changes, summary, 'text') + assert 'HIGH SIGNIFICANCE CHANGES' in output + assert '... and 5 more high significance changes' in output + + def test_format_output_text_low_changes_small_count(self): + """Test text output with a small number of low significance changes.""" + summary = self._make_summary(total=3, added=0, removed=0, modified=3, + high=0, medium=0, low=3) + changes = [self._make_change(significance='low') for _ in range(3)] + output = format_output(changes, summary, 'text') + assert 'LOW SIGNIFICANCE CHANGES' in output + assert '3 low significance changes' in output + + def test_format_output_text_low_changes_large_count(self): + """Test text output with more than 50 low significance changes (hidden).""" + summary = self._make_summary(total=55, added=0, removed=0, modified=55, + high=0, medium=0, low=55) + changes = [self._make_change(significance='low') for _ in range(55)] + output = format_output(changes, summary, 'text') + # Low changes > 50 are not shown + assert 'LOW SIGNIFICANCE CHANGES' not in output + + def test_format_output_text_added_change(self): + """Test text output with added change (no old_text).""" + summary = self._make_summary(total=1, added=1, removed=0, modified=0, + high=1, medium=0, low=0) + changes = [self._make_change(change_type='added', old_text='', new_text='New Content')] + output = format_output(changes, summary, 'text') + assert 'NEW:' in output + + def test_format_output_text_removed_change(self): + """Test text output with removed change (no new_text).""" + summary = self._make_summary(total=1, added=0, removed=1, modified=0, + high=1, medium=0, low=0) + changes = [self._make_change(change_type='removed', old_text='Old Content', new_text='')] + output = format_output(changes, summary, 'text') + assert 'OLD:' in output + + def test_format_output_json_unicode(self): + """Test JSON output with unicode characters.""" + summary = self._make_summary(total=1, added=0, removed=0, modified=1, + high=1, medium=0, low=0) + changes = [self._make_change(old_text='Texto viejo', new_text='Texto nuevo')] + output = format_output(changes, summary, 'json') + data = json.loads(output) + assert 'viejo' in data['changes'][0]['old_text'] + + +class TestMain: + """Test cases for CLI main function.""" + + @patch('wayback_diff.cli.WebFetcher') + def test_main_basic_comparison(self, mock_fetcher_cls): + """Test basic URL comparison flow.""" + mock_fetcher = Mock() + mock_fetcher_cls.return_value = mock_fetcher + mock_fetcher.fetch.side_effect = [ + (b'

Old

', 'text/html', {'status_code': 200}), + (b'

New

', 'text/html', {'status_code': 200}), + ] + mock_fetcher.is_html.return_value = True + + with patch('sys.argv', ['wayback-diff', 'https://example.com/old', 'https://example.com/new']): + with pytest.raises(SystemExit) as exc_info: + main() + # Should exit with 1 or 2 (changes detected) + assert exc_info.value.code in (1, 2) + + @patch('wayback_diff.cli.WebFetcher') + def test_main_identical_pages(self, mock_fetcher_cls): + """Test comparison of identical pages exits with 0.""" + mock_fetcher = Mock() + mock_fetcher_cls.return_value = mock_fetcher + content = b'

Same content

' + mock_fetcher.fetch.side_effect = [ + (content, 'text/html', {'status_code': 200}), + (content, 'text/html', {'status_code': 200}), + ] + mock_fetcher.is_html.return_value = True + + with patch('sys.argv', ['wayback-diff', 'https://example.com/a', 'https://example.com/b']): + with pytest.raises(SystemExit) as exc_info: + main() + assert exc_info.value.code == 0 + + @patch('wayback_diff.cli.WebFetcher') + def test_main_fetch_failure_url1(self, mock_fetcher_cls): + """Test exit code 1 when URL1 fetch fails.""" + mock_fetcher = Mock() + mock_fetcher_cls.return_value = mock_fetcher + mock_fetcher.fetch.return_value = (None, None, {'error': 'Connection failed'}) + + with patch('sys.argv', ['wayback-diff', 'https://example.com/bad', 'https://example.com/good']): + with pytest.raises(SystemExit) as exc_info: + main() + assert exc_info.value.code == 1 + + @patch('wayback_diff.cli.WebFetcher') + def test_main_fetch_failure_url2(self, mock_fetcher_cls): + """Test exit code 1 when URL2 fetch fails.""" + mock_fetcher = Mock() + mock_fetcher_cls.return_value = mock_fetcher + mock_fetcher.fetch.side_effect = [ + (b'OK', 'text/html', {'status_code': 200}), + (None, None, {'error': 'Timeout'}), + ] + mock_fetcher.is_html.return_value = True + + with patch('sys.argv', ['wayback-diff', 'https://example.com/a', 'https://example.com/bad']): + with pytest.raises(SystemExit) as exc_info: + main() + assert exc_info.value.code == 1 + + @patch('wayback_diff.cli.WebFetcher') + def test_main_fetch_failure_url1_no_error_key(self, mock_fetcher_cls): + """Test exit code 1 when URL1 fetch fails with no error key in metadata.""" + mock_fetcher = Mock() + mock_fetcher_cls.return_value = mock_fetcher + mock_fetcher.fetch.return_value = (None, None, {}) + + with patch('sys.argv', ['wayback-diff', 'https://example.com/bad', 'https://example.com/good']): + with pytest.raises(SystemExit) as exc_info: + main() + assert exc_info.value.code == 1 + + @patch('wayback_diff.cli.WebFetcher') + def test_main_json_format(self, mock_fetcher_cls): + """Test --format json flag.""" + mock_fetcher = Mock() + mock_fetcher_cls.return_value = mock_fetcher + content = b'

Same

' + mock_fetcher.fetch.side_effect = [ + (content, 'text/html', {'status_code': 200}), + (content, 'text/html', {'status_code': 200}), + ] + mock_fetcher.is_html.return_value = True + + with patch('sys.argv', ['wayback-diff', 'https://a.com', 'https://b.com', '--format', 'json']): + with pytest.raises(SystemExit) as exc_info: + main() + assert exc_info.value.code == 0 + + @patch('wayback_diff.cli.WebFetcher') + def test_main_unified_format(self, mock_fetcher_cls): + """Test --format unified flag.""" + mock_fetcher = Mock() + mock_fetcher_cls.return_value = mock_fetcher + mock_fetcher.fetch.side_effect = [ + (b'Old', 'text/html', {'status_code': 200}), + (b'New', 'text/html', {'status_code': 200}), + ] + mock_fetcher.is_html.return_value = True + + with patch('sys.argv', ['wayback-diff', 'https://a.com', 'https://b.com', '--format', 'unified']): + with pytest.raises(SystemExit) as exc_info: + main() + # unified diff with changes exits non-zero + assert exc_info.value.code in (1, 2) + + @patch('wayback_diff.cli.WebFetcher') + def test_main_output_to_file(self, mock_fetcher_cls): + """Test --output flag writes to file.""" + mock_fetcher = Mock() + mock_fetcher_cls.return_value = mock_fetcher + content = b'Same' + mock_fetcher.fetch.side_effect = [ + (content, 'text/html', {'status_code': 200}), + (content, 'text/html', {'status_code': 200}), + ] + mock_fetcher.is_html.return_value = True + + with tempfile.TemporaryDirectory() as tmpdir: + outfile = os.path.join(tmpdir, 'output.txt') + with patch('sys.argv', ['wayback-diff', 'https://a.com', 'https://b.com', '-o', outfile]): + with pytest.raises(SystemExit): + main() + assert os.path.exists(outfile) + + @patch('wayback_diff.cli.WebFetcher') + def test_main_verbose(self, mock_fetcher_cls): + """Test --verbose flag.""" + mock_fetcher = Mock() + mock_fetcher_cls.return_value = mock_fetcher + content = b'Same' + mock_fetcher.fetch.side_effect = [ + (content, 'text/html', {'status_code': 200}), + (content, 'text/html', {'status_code': 200}), + ] + mock_fetcher.is_html.return_value = True + + with patch('sys.argv', ['wayback-diff', 'https://a.com', 'https://b.com', '--verbose']): + with pytest.raises(SystemExit): + main() + + @patch('wayback_diff.cli.WebFetcher') + def test_main_non_html_warning(self, mock_fetcher_cls): + """Test warning when content is not HTML.""" + mock_fetcher = Mock() + mock_fetcher_cls.return_value = mock_fetcher + content = b'{"key": "value"}' + mock_fetcher.fetch.side_effect = [ + (content, 'application/json', {'status_code': 200}), + (content, 'application/json', {'status_code': 200}), + ] + mock_fetcher.is_html.return_value = False + + with patch('sys.argv', ['wayback-diff', 'https://a.com/api', 'https://b.com/api']): + with pytest.raises(SystemExit): + main() + + @patch('wayback_diff.cli.WaybackCleaner') + @patch('wayback_diff.cli.WebFetcher') + def test_main_wayback_url_cleaning(self, mock_fetcher_cls, mock_cleaner_cls): + """Test Wayback URL auto-cleaning.""" + mock_fetcher = Mock() + mock_fetcher_cls.return_value = mock_fetcher + content = b'Content' + mock_fetcher.fetch.side_effect = [ + (content, 'text/html', {'status_code': 200}), + (content, 'text/html', {'status_code': 200}), + ] + mock_fetcher.is_html.return_value = True + mock_cleaner_cls.is_wayback_url.side_effect = [True, False] + mock_cleaner_cls.clean_wayback_html.return_value = content + + wb_url = 'https://web.archive.org/web/20230101/https://example.com/' + with patch('sys.argv', ['wayback-diff', wb_url, 'https://example.com/', '--verbose']): + with pytest.raises(SystemExit): + main() + mock_cleaner_cls.clean_wayback_html.assert_called_once() + + @patch('wayback_diff.cli.WaybackCleaner') + @patch('wayback_diff.cli.WebFetcher') + def test_main_no_clean_wayback_flag(self, mock_fetcher_cls, mock_cleaner_cls): + """Test --no-clean-wayback flag skips cleaning.""" + mock_fetcher = Mock() + mock_fetcher_cls.return_value = mock_fetcher + content = b'Content' + mock_fetcher.fetch.side_effect = [ + (content, 'text/html', {'status_code': 200}), + (content, 'text/html', {'status_code': 200}), + ] + mock_fetcher.is_html.return_value = True + + wb_url = 'https://web.archive.org/web/20230101/https://example.com/' + with patch('sys.argv', ['wayback-diff', wb_url, 'https://example.com/', '--no-clean-wayback']): + with pytest.raises(SystemExit): + main() + mock_cleaner_cls.clean_wayback_html.assert_not_called() + + @patch('wayback_diff.cli.MarkdownReportGenerator') + @patch('wayback_diff.cli.WebFetcher') + def test_main_markdown_report(self, mock_fetcher_cls, mock_report_cls): + """Test --markdown flag generates report.""" + mock_fetcher = Mock() + mock_fetcher_cls.return_value = mock_fetcher + content = b'Same' + mock_fetcher.fetch.side_effect = [ + (content, 'text/html', {'status_code': 200}), + (content, 'text/html', {'status_code': 200}), + ] + mock_fetcher.is_html.return_value = True + + mock_gen = Mock() + mock_report_cls.return_value = mock_gen + mock_gen.generate_comparison_report.return_value = "# Report" + mock_gen.save_report.return_value = "/tmp/report.md" + + with tempfile.TemporaryDirectory() as tmpdir: + with patch('sys.argv', ['wayback-diff', 'https://a.com', 'https://b.com', + '--markdown', '--report-dir', tmpdir]): + with pytest.raises(SystemExit): + main() + mock_gen.generate_comparison_report.assert_called_once() + mock_gen.save_report.assert_called_once() + + @patch('wayback_diff.cli.VISUAL_COMPARISON_AVAILABLE', False) + @patch('wayback_diff.cli.WebFetcher') + def test_main_visual_not_available(self, mock_fetcher_cls): + """Test --visual flag when dependencies not available.""" + mock_fetcher = Mock() + mock_fetcher_cls.return_value = mock_fetcher + content = b'Content' + mock_fetcher.fetch.side_effect = [ + (content, 'text/html', {'status_code': 200}), + (content, 'text/html', {'status_code': 200}), + ] + mock_fetcher.is_html.return_value = True + + with patch('sys.argv', ['wayback-diff', 'https://a.com', 'https://b.com', '--visual']): + with pytest.raises(SystemExit) as exc_info: + main() + assert exc_info.value.code == 1 + + @patch('wayback_diff.cli.VISUAL_COMPARISON_AVAILABLE', True) + @patch('wayback_diff.cli.VisualComparison') + @patch('wayback_diff.cli.MarkdownReportGenerator') + @patch('wayback_diff.cli.WebFetcher') + def test_main_visual_comparison_success(self, mock_fetcher_cls, mock_report_cls, + mock_visual_cls): + """Test --visual flag with successful comparison.""" + mock_fetcher = Mock() + mock_fetcher_cls.return_value = mock_fetcher + content = b'Same' + mock_fetcher.fetch.side_effect = [ + (content, 'text/html', {'status_code': 200}), + (content, 'text/html', {'status_code': 200}), + ] + mock_fetcher.is_html.return_value = True + + mock_visual = Mock() + mock_visual_cls.return_value = mock_visual + mock_visual.compare_urls.return_value = { + 'chrome': { + 'difference_ratio': 0.02, + 'different_pixels': 100, + 'screenshot1': '/tmp/s1.png', + 'screenshot2': '/tmp/s2.png', + 'comparison': '/tmp/comp.png', + } + } + + mock_gen = Mock() + mock_report_cls.return_value = mock_gen + mock_gen.generate_comparison_report.return_value = "# Report" + mock_gen.save_report.return_value = "/tmp/report.md" + + with patch('sys.argv', ['wayback-diff', 'https://a.com', 'https://b.com', '--visual']): + with pytest.raises(SystemExit): + main() + + @patch('wayback_diff.cli.VISUAL_COMPARISON_AVAILABLE', True) + @patch('wayback_diff.cli.VisualComparison') + @patch('wayback_diff.cli.MarkdownReportGenerator') + @patch('wayback_diff.cli.WebFetcher') + def test_main_visual_comparison_with_error_result(self, mock_fetcher_cls, + mock_report_cls, mock_visual_cls): + """Test --visual flag when browser returns error.""" + mock_fetcher = Mock() + mock_fetcher_cls.return_value = mock_fetcher + content = b'Same' + mock_fetcher.fetch.side_effect = [ + (content, 'text/html', {'status_code': 200}), + (content, 'text/html', {'status_code': 200}), + ] + mock_fetcher.is_html.return_value = True + + mock_visual = Mock() + mock_visual_cls.return_value = mock_visual + mock_visual.compare_urls.return_value = { + 'chrome': {'error': 'No chrome found'} + } + + mock_gen = Mock() + mock_report_cls.return_value = mock_gen + mock_gen.generate_comparison_report.return_value = "# Report" + mock_gen.save_report.return_value = "/tmp/report.md" + + with patch('sys.argv', ['wayback-diff', 'https://a.com', 'https://b.com', '--visual']): + with pytest.raises(SystemExit): + main() + + @patch('wayback_diff.cli.VISUAL_COMPARISON_AVAILABLE', True) + @patch('wayback_diff.cli.VisualComparison') + @patch('wayback_diff.cli.WebFetcher') + def test_main_visual_comparison_exception(self, mock_fetcher_cls, mock_visual_cls): + """Test --visual flag when visual comparison raises exception.""" + mock_fetcher = Mock() + mock_fetcher_cls.return_value = mock_fetcher + content = b'Same' + mock_fetcher.fetch.side_effect = [ + (content, 'text/html', {'status_code': 200}), + (content, 'text/html', {'status_code': 200}), + ] + mock_fetcher.is_html.return_value = True + + mock_visual_cls.side_effect = Exception("Browser crash") + + with patch('sys.argv', ['wayback-diff', 'https://a.com', 'https://b.com', + '--visual', '--verbose']): + with pytest.raises(SystemExit): + main() + + @patch('wayback_diff.cli.VISUAL_COMPARISON_AVAILABLE', True) + @patch('wayback_diff.cli.VisualComparison') + @patch('wayback_diff.cli.WebFetcher') + def test_main_visual_import_error(self, mock_fetcher_cls, mock_visual_cls): + """Test --visual flag when ImportError occurs.""" + mock_fetcher = Mock() + mock_fetcher_cls.return_value = mock_fetcher + content = b'Same' + mock_fetcher.fetch.side_effect = [ + (content, 'text/html', {'status_code': 200}), + (content, 'text/html', {'status_code': 200}), + ] + mock_fetcher.is_html.return_value = True + + mock_visual_cls.side_effect = ImportError("No selenium") + + with patch('sys.argv', ['wayback-diff', 'https://a.com', 'https://b.com', '--visual']): + with pytest.raises(SystemExit): + main() + + @patch('wayback_diff.cli.LinkTraverser') + @patch('wayback_diff.cli.WebFetcher') + def test_main_traverse_mode(self, mock_fetcher_cls, mock_traverser_cls): + """Test --traverse flag.""" + mock_fetcher = Mock() + mock_fetcher_cls.return_value = mock_fetcher + content = b'Content' + mock_fetcher.fetch.side_effect = [ + (content, 'text/html', {'status_code': 200}), + (content, 'text/html', {'status_code': 200}), + ] + mock_fetcher.is_html.return_value = True + + mock_traverser = Mock() + mock_traverser_cls.return_value = mock_traverser + mock_traverser.traverse_and_compare.return_value = [] + mock_traverser.generate_report.return_value = "Traversal report" + + with patch('sys.argv', ['wayback-diff', 'https://a.com', 'https://b.com', '--traverse']): + with pytest.raises(SystemExit) as exc_info: + main() + assert exc_info.value.code == 0 + + @patch('wayback_diff.cli.LinkTraverser') + @patch('wayback_diff.cli.WebFetcher') + def test_main_traverse_with_high_diffs(self, mock_fetcher_cls, mock_traverser_cls): + """Test --traverse with high significance differences exits with 2.""" + mock_fetcher = Mock() + mock_fetcher_cls.return_value = mock_fetcher + content = b'Content' + mock_fetcher.fetch.side_effect = [ + (content, 'text/html', {'status_code': 200}), + (content, 'text/html', {'status_code': 200}), + ] + mock_fetcher.is_html.return_value = True + + mock_traverser = Mock() + mock_traverser_cls.return_value = mock_traverser + mock_traverser.traverse_and_compare.return_value = [ + {'status': 'compared', 'high_significance': 5} + ] + mock_traverser.generate_report.return_value = "Report" + + with patch('sys.argv', ['wayback-diff', 'https://a.com', 'https://b.com', '--traverse']): + with pytest.raises(SystemExit) as exc_info: + main() + assert exc_info.value.code == 2 + + @patch('wayback_diff.cli.LinkTraverser') + @patch('wayback_diff.cli.WebFetcher') + def test_main_traverse_compared_no_high(self, mock_fetcher_cls, mock_traverser_cls): + """Test --traverse with compared pages but no high significance exits with 1.""" + mock_fetcher = Mock() + mock_fetcher_cls.return_value = mock_fetcher + content = b'Content' + mock_fetcher.fetch.side_effect = [ + (content, 'text/html', {'status_code': 200}), + (content, 'text/html', {'status_code': 200}), + ] + mock_fetcher.is_html.return_value = True + + mock_traverser = Mock() + mock_traverser_cls.return_value = mock_traverser + mock_traverser.traverse_and_compare.return_value = [ + {'status': 'compared', 'high_significance': 0} + ] + mock_traverser.generate_report.return_value = "Report" + + with patch('sys.argv', ['wayback-diff', 'https://a.com', 'https://b.com', '--traverse']): + with pytest.raises(SystemExit) as exc_info: + main() + assert exc_info.value.code == 1 + + @patch('wayback_diff.cli.MarkdownReportGenerator') + @patch('wayback_diff.cli.LinkTraverser') + @patch('wayback_diff.cli.WebFetcher') + def test_main_traverse_with_markdown(self, mock_fetcher_cls, mock_traverser_cls, + mock_report_cls): + """Test --traverse --markdown generates report.""" + mock_fetcher = Mock() + mock_fetcher_cls.return_value = mock_fetcher + content = b'Content' + mock_fetcher.fetch.side_effect = [ + (content, 'text/html', {'status_code': 200}), + (content, 'text/html', {'status_code': 200}), + ] + mock_fetcher.is_html.return_value = True + + mock_traverser = Mock() + mock_traverser_cls.return_value = mock_traverser + mock_traverser.traverse_and_compare.return_value = [] + mock_traverser.generate_report.return_value = "Report" + + mock_gen = Mock() + mock_report_cls.return_value = mock_gen + mock_gen.generate_comparison_report.return_value = "# Report" + mock_gen.save_report.return_value = "/tmp/report.md" + + with tempfile.TemporaryDirectory() as tmpdir: + with patch('sys.argv', ['wayback-diff', 'https://a.com', 'https://b.com', + '--traverse', '--markdown', '--report-dir', tmpdir]): + with pytest.raises(SystemExit): + main() + mock_gen.save_report.assert_called_once() + + @patch('wayback_diff.cli.LinkTraverser') + @patch('wayback_diff.cli.WebFetcher') + def test_main_traverse_output_to_file(self, mock_fetcher_cls, mock_traverser_cls): + """Test --traverse -o flag writes report to file.""" + mock_fetcher = Mock() + mock_fetcher_cls.return_value = mock_fetcher + content = b'Content' + mock_fetcher.fetch.side_effect = [ + (content, 'text/html', {'status_code': 200}), + (content, 'text/html', {'status_code': 200}), + ] + mock_fetcher.is_html.return_value = True + + mock_traverser = Mock() + mock_traverser_cls.return_value = mock_traverser + mock_traverser.traverse_and_compare.return_value = [] + mock_traverser.generate_report.return_value = "Traversal report text" + + with tempfile.TemporaryDirectory() as tmpdir: + outfile = os.path.join(tmpdir, 'out.txt') + with patch('sys.argv', ['wayback-diff', 'https://a.com', 'https://b.com', + '--traverse', '-o', outfile, '--verbose']): + with pytest.raises(SystemExit): + main() + assert os.path.exists(outfile) + with open(outfile) as f: + assert 'Traversal report text' in f.read() + + @patch('wayback_diff.cli.DiffEngine') + @patch('wayback_diff.cli.WebFetcher') + def test_main_high_significance_exits_2(self, mock_fetcher_cls, mock_engine_cls): + """Test exit code 2 for high significance changes.""" + mock_fetcher = Mock() + mock_fetcher_cls.return_value = mock_fetcher + mock_fetcher.fetch.side_effect = [ + (b'Old', 'text/html', {'status_code': 200}), + (b'New', 'text/html', {'status_code': 200}), + ] + mock_fetcher.is_html.return_value = True + + mock_engine = Mock() + mock_engine_cls.return_value = mock_engine + mock_engine.extract_meaningful_changes.return_value = [ + {'type': 'modified', 'significance': 'high', 'old_text': 'Old', 'new_text': 'New'} + ] + mock_engine.get_summary.return_value = { + 'total_changes': 1, 'added': 0, 'removed': 0, 'modified': 1, + 'high_significance': 1, 'medium_significance': 0, 'low_significance': 0, + } + + with patch('sys.argv', ['wayback-diff', 'https://a.com', 'https://b.com']): + with pytest.raises(SystemExit) as exc_info: + main() + assert exc_info.value.code == 2 + + @patch('wayback_diff.cli.WebFetcher') + def test_main_browsers_auto(self, mock_fetcher_cls): + """Test --browsers auto flag parsing.""" + mock_fetcher = Mock() + mock_fetcher_cls.return_value = mock_fetcher + content = b'Same' + mock_fetcher.fetch.side_effect = [ + (content, 'text/html', {'status_code': 200}), + (content, 'text/html', {'status_code': 200}), + ] + mock_fetcher.is_html.return_value = True + + # Just verify parsing works (no --visual so browsers arg is stored but not used) + with patch('sys.argv', ['wayback-diff', 'https://a.com', 'https://b.com', + '--browsers', 'chrome', 'firefox']): + with pytest.raises(SystemExit): + main() diff --git a/tests/test_diff_engine.py b/tests/test_diff_engine.py index 1e14432..fae8552 100644 --- a/tests/test_diff_engine.py +++ b/tests/test_diff_engine.py @@ -1,92 +1,458 @@ """Tests for diff engine.""" import pytest -from wayback_diff.diff_engine import DiffEngine +from wayback_diff.diff_engine import DiffEngine, HTMLStructureParser + + +class TestHTMLStructureParser: + """Test cases for HTMLStructureParser.""" + + def test_parse_basic_html(self): + """Test parsing basic HTML structure.""" + parser = HTMLStructureParser() + parser.feed("

Title

Text

") + + assert len(parser.structure) > 0 + tags = [s['tag'] for s in parser.structure if s['type'] == 'start'] + assert 'h1' in tags + assert 'p' in tags + + def test_parse_text_content(self): + """Test extracting text content.""" + parser = HTMLStructureParser() + parser.feed("

Hello World

") + + assert 'Hello World' in parser.text_content + + def test_parse_ignores_whitespace_text(self): + """Test that whitespace-only text is ignored.""" + parser = HTMLStructureParser() + parser.feed("

Real text

") + + assert 'Real text' in parser.text_content + assert ' ' not in parser.text_content + + def test_parse_attributes(self): + """Test parsing element attributes.""" + parser = HTMLStructureParser() + parser.feed('Link') + + start_tags = [s for s in parser.structure if s['type'] == 'start' and s['tag'] == 'a'] + assert len(start_tags) == 1 + assert start_tags[0]['attrs']['href'] == 'https://example.com' + + def test_parse_depth_tracking(self): + """Test depth tracking.""" + parser = HTMLStructureParser() + parser.feed("

Deep

") + + start_tags = [s for s in parser.structure if s['type'] == 'start'] + depths = [s['depth'] for s in start_tags] + assert len(depths) >= 2 + # Deeper elements should have higher depth + assert max(depths) >= 2 + + def test_parse_important_tags(self): + """Test that important tags are captured.""" + important_tags = ['div', 'section', 'article', 'header', 'footer', + 'nav', 'main', 'aside', 'h1', 'h2', 'h3', 'h4', + 'h5', 'h6', 'p', 'a', 'img', 'script', 'style', + 'link', 'meta', 'title'] + for tag in important_tags: + parser = HTMLStructureParser() + if tag in ('img', 'link', 'meta'): + parser.feed(f'<{tag} src="x">') + else: + parser.feed(f'<{tag}>content') + start_tags = [s['tag'] for s in parser.structure if s['type'] == 'start'] + assert tag in start_tags, f"Tag {tag} should be captured" + + def test_parse_non_important_tags_ignored(self): + """Test that non-important tags are not captured in structure.""" + parser = HTMLStructureParser() + parser.feed("textboldemphasis") + + start_tags = [s['tag'] for s in parser.structure if s['type'] == 'start'] + assert 'span' not in start_tags + assert 'b' not in start_tags + assert 'em' not in start_tags + + def test_parse_end_tags(self): + """Test end tag handling.""" + parser = HTMLStructureParser() + parser.feed("

Title

") + + end_tags = [s for s in parser.structure if s['type'] == 'end'] + assert len(end_tags) >= 1 + assert end_tags[0]['tag'] == 'h1' + + def test_parse_nested_text(self): + """Test multiple text nodes.""" + parser = HTMLStructureParser() + parser.feed("
First

Second

") + + assert 'First' in parser.text_content + assert 'Second' in parser.text_content class TestDiffEngine: """Test cases for DiffEngine.""" - - def test_normalize_content(self): - """Test content normalization.""" + + def test_init_defaults(self): + """Test default initialization.""" + engine = DiffEngine() + assert engine.ignore_whitespace is True + assert engine.ignore_case is False + + def test_init_custom(self): + """Test custom initialization.""" + engine = DiffEngine(ignore_whitespace=False, ignore_case=True) + assert engine.ignore_whitespace is False + assert engine.ignore_case is True + + def test_normalize_content_whitespace(self): + """Test content normalization with whitespace.""" engine = DiffEngine(ignore_whitespace=True) - + content1 = b"
Test
" content2 = b"
Test
" - + norm1 = engine.normalize_content(content1) norm2 = engine.normalize_content(content2) - - # After normalization, whitespace should be similar + assert b'
' in norm1 assert b'Test' in norm1 - + + def test_normalize_content_no_whitespace(self): + """Test normalization with whitespace handling disabled.""" + engine = DiffEngine(ignore_whitespace=False) + content = b"
Test
" + normalized = engine.normalize_content(content) + assert b' Test ' in normalized + + def test_normalize_content_case(self): + """Test case normalization.""" + engine = DiffEngine(ignore_case=True) + content = b"
Test
" + normalized = engine.normalize_content(content) + assert b'
' in normalized + assert b'test' in normalized + + def test_normalize_content_tag_whitespace(self): + """Test whitespace between tags is normalized.""" + engine = DiffEngine(ignore_whitespace=True) + content = b"
\n

Text

" + normalized = engine.normalize_content(content) + assert b'><' in normalized or b'> <' in normalized + def test_extract_meaningful_changes(self): """Test change extraction.""" engine = DiffEngine() - + old_content = b"

Old Title

Content

" new_content = b"

New Title

Content

" - + changes = engine.extract_meaningful_changes(old_content, new_content) - + assert len(changes) > 0 - # Should detect the title change - the diff engine extracts the changed parts - # which may be just "Old" and "New" rather than full "Old Title" and "New Title" all_text = ' '.join([c.get('old_text', '') + c.get('new_text', '') for c in changes]) all_context = ' '.join([c.get('old_context', '') + c.get('new_context', '') for c in changes]) - # Check if the change or context contains the title text - assert ('Old' in all_text and 'New' in all_text) or 'Title' in all_context or any('Title' in c.get('old_context', '') or 'Title' in c.get('new_context', '') for c in changes) - + assert ('Old' in all_text and 'New' in all_text) or 'Title' in all_context + + def test_extract_meaningful_changes_identical(self): + """Test no changes for identical content.""" + engine = DiffEngine() + content = b"

Same content

" + changes = engine.extract_meaningful_changes(content, content) + assert len(changes) == 0 + + def test_extract_meaningful_changes_added(self): + """Test detecting added content.""" + engine = DiffEngine() + old = b"" + new = b"

New paragraph

" + changes = engine.extract_meaningful_changes(old, new) + assert len(changes) > 0 + types = [c['type'] for c in changes] + assert 'added' in types or 'modified' in types + + def test_extract_meaningful_changes_removed(self): + """Test detecting removed content.""" + engine = DiffEngine() + old = b"

Old paragraph

" + new = b"" + changes = engine.extract_meaningful_changes(old, new) + assert len(changes) > 0 + types = [c['type'] for c in changes] + assert 'removed' in types or 'modified' in types + + def test_extract_meaningful_changes_large_content(self): + """Test autojunk for large content (> 100000 bytes).""" + engine = DiffEngine() + old = b"" + b"

Content paragraph

\n" * 10000 + b"" + new = b"" + b"

Content paragraph

\n" * 9999 + b"

Different

" + changes = engine.extract_meaningful_changes(old, new) + # Should complete without error + assert isinstance(changes, list) + + def test_extract_meaningful_changes_has_positions(self): + """Test that changes have position information.""" + engine = DiffEngine() + old = b"Old" + new = b"New" + changes = engine.extract_meaningful_changes(old, new) + assert len(changes) > 0 + for change in changes: + assert 'old_position' in change + assert 'new_position' in change + assert 'old_context' in change + assert 'new_context' in change + + def test_extract_meaningful_changes_has_significance(self): + """Test that changes have significance levels.""" + engine = DiffEngine() + old = b"Old" + new = b"New" + changes = engine.extract_meaningful_changes(old, new) + for change in changes: + assert change['significance'] in ('high', 'medium', 'low') + def test_get_summary(self): """Test summary generation.""" engine = DiffEngine() - + old_content = b"

Old

" new_content = b"

New

Added

" - + changes = engine.extract_meaningful_changes(old_content, new_content) summary = engine.get_summary(changes) - + assert 'total_changes' in summary assert 'added' in summary assert 'removed' in summary assert 'modified' in summary + assert 'high_significance' in summary + assert 'medium_significance' in summary + assert 'low_significance' in summary assert summary['total_changes'] > 0 - - def test_assess_significance(self): - """Test significance assessment.""" + + def test_get_summary_empty(self): + """Test summary for empty changes list.""" + engine = DiffEngine() + summary = engine.get_summary([]) + assert summary['total_changes'] == 0 + assert summary['added'] == 0 + assert summary['removed'] == 0 + assert summary['modified'] == 0 + + def test_assess_significance_high_title(self): + """Test high significance for title changes.""" engine = DiffEngine() - - # High significance - title change - high_change = engine._assess_significance( + result = engine._assess_significance( b"Old", b"New" ) - assert high_change == 'high' - - # Medium significance - div change - medium_change = engine._assess_significance( + assert result == 'high' + + def test_assess_significance_high_heading(self): + """Test high significance for heading changes.""" + engine = DiffEngine() + for i in range(1, 7): + result = engine._assess_significance( + f"Old".encode(), + f"New".encode() + ) + assert result == 'high', f"h{i} should be high significance" + + def test_assess_significance_high_meta(self): + """Test high significance for meta tag changes.""" + engine = DiffEngine() + result = engine._assess_significance( + b'', + b'' + ) + assert result == 'high' + + def test_assess_significance_high_script(self): + """Test high significance for script changes.""" + engine = DiffEngine() + result = engine._assess_significance( + b'', + b'' + ) + assert result == 'high' + + def test_assess_significance_high_stylesheet(self): + """Test high significance for stylesheet changes.""" + engine = DiffEngine() + result = engine._assess_significance( + b'', + b'' + ) + assert result == 'high' + + def test_assess_significance_high_body(self): + """Test high significance for body tag changes.""" + engine = DiffEngine() + result = engine._assess_significance(b'', b'') + assert result == 'high' + + def test_assess_significance_high_main(self): + """Test high significance for main/article/section changes.""" + engine = DiffEngine() + for tag in ['main', 'article', 'section']: + result = engine._assess_significance( + f'<{tag}>old'.encode(), + f'<{tag}>new'.encode() + ) + assert result == 'high', f"{tag} should be high significance" + + def test_assess_significance_medium_class(self): + """Test medium significance for class changes.""" + engine = DiffEngine() + result = engine._assess_significance( + b'class="old-style"', + b'class="new-style"' + ) + assert result == 'medium' + + def test_assess_significance_medium_div(self): + """Test medium significance for div changes.""" + engine = DiffEngine() + result = engine._assess_significance( b"
Old
", b"
New
" ) - assert medium_change == 'medium' - - # Low significance - whitespace - low_change = engine._assess_significance( - b" ", - b" " + assert result == 'medium' + + def test_assess_significance_medium_style(self): + """Test medium significance for style attribute changes.""" + engine = DiffEngine() + result = engine._assess_significance( + b'style="color: red"', + b'style="color: blue"' + ) + assert result == 'medium' + + def test_assess_significance_medium_span(self): + """Test medium significance for span changes.""" + engine = DiffEngine() + result = engine._assess_significance( + b'old', + b'new' ) - assert low_change == 'low' - + assert result == 'medium' + + def test_assess_significance_medium_id(self): + """Test medium significance for id changes.""" + engine = DiffEngine() + result = engine._assess_significance(b'id="old"', b'id="new"') + assert result == 'medium' + + def test_assess_significance_low(self): + """Test low significance for whitespace/minor changes.""" + engine = DiffEngine() + result = engine._assess_significance(b" ", b" ") + assert result == 'low' + + def test_assess_significance_low_plain_text(self): + """Test low significance for plain text without structural markers.""" + engine = DiffEngine() + result = engine._assess_significance(b"old text", b"new text") + assert result == 'low' + def test_generate_unified_diff(self): """Test unified diff generation.""" engine = DiffEngine() - + old_content = b"Line 1\nLine 2\nLine 3" new_content = b"Line 1\nLine 2 Modified\nLine 3" - + diff = engine.generate_unified_diff(old_content, new_content, "old.txt", "new.txt") - + assert len(diff) > 0 assert any('Line 2' in line for line in diff) + + def test_generate_unified_diff_identical(self): + """Test unified diff for identical content.""" + engine = DiffEngine() + content = b"Line 1\nLine 2\nLine 3" + diff = engine.generate_unified_diff(content, content) + assert len(diff) == 0 + + def test_generate_unified_diff_custom_context(self): + """Test unified diff with custom context lines.""" + engine = DiffEngine() + old = b"A\nB\nC\nD\nE\nF\nG" + new = b"A\nB\nC\nX\nE\nF\nG" + diff = engine.generate_unified_diff(old, new, n=1) + assert len(diff) > 0 + + def test_generate_unified_diff_labels(self): + """Test unified diff labels.""" + engine = DiffEngine() + old = b"old line" + new = b"new line" + diff = engine.generate_unified_diff(old, new, old_label="file1.html", new_label="file2.html") + joined = '\n'.join(diff) + assert 'file1.html' in joined + assert 'file2.html' in joined + + def test_compare_structures(self): + """Test HTML structure comparison returns expected keys.""" + engine = DiffEngine() + old = b"

Title

Text

" + new = b"

Title

Text

New
" + + result = engine.compare_structures(old, new) + + assert 'structural_changes' in result + assert 'old_structure' in result + assert 'new_structure' in result + assert 'similarity' in result + assert 0.0 <= result['similarity'] <= 1.0 + # Different structures should have some changes + assert len(result['structural_changes']) > 0 + + def test_compare_structures_identical(self): + """Test structure comparison for identical HTML.""" + engine = DiffEngine() + html = b"

Title

" + result = engine.compare_structures(html, html) + assert result['similarity'] == 1.0 + assert len(result['structural_changes']) == 0 + + def test_compare_structures_unicode(self): + """Test structure comparison with unicode content.""" + engine = DiffEngine() + old = '

Texto en espanol

'.encode('utf-8') + new = '

Texto en ingles

'.encode('utf-8') + result = engine.compare_structures(old, new) + assert 'similarity' in result + # Same structure, different text content (text is not in structure) + assert result['similarity'] == 1.0 + + def test_compare_structures_empty(self): + """Test structure comparison with empty content.""" + engine = DiffEngine() + result = engine.compare_structures(b"", b"") + # Empty content produces no structure elements, so SequenceMatcher + # works fine on empty lists + assert 'similarity' in result + assert isinstance(result['structural_changes'], list) + + def test_compare_structures_no_important_tags(self): + """Test structure comparison with no important tags (empty structures).""" + engine = DiffEngine() + # span/b/em are not captured, so structures are empty lists + old = b"text1" + new = b"text2" + result = engine.compare_structures(old, new) + assert result['similarity'] == 1.0 + assert len(result['structural_changes']) == 0 + + def test_compare_structures_returns_dict(self): + """Test compare_structures always returns a dict.""" + engine = DiffEngine() + result = engine.compare_structures(b"

a

", b"

b

") + assert isinstance(result, dict) diff --git a/tests/test_fetcher.py b/tests/test_fetcher.py index 4210e44..e5197fa 100644 --- a/tests/test_fetcher.py +++ b/tests/test_fetcher.py @@ -1,21 +1,43 @@ """Tests for web fetcher.""" import pytest -from unittest.mock import Mock, patch +from unittest.mock import Mock, patch, PropertyMock +import requests from wayback_diff.fetcher import WebFetcher class TestWebFetcher: """Test cases for WebFetcher.""" - + + def test_init_defaults(self): + """Test default initialization.""" + fetcher = WebFetcher() + assert fetcher.timeout == 30 + assert fetcher.max_retries == 3 + + def test_init_custom(self): + """Test custom initialization.""" + fetcher = WebFetcher(timeout=10, max_retries=5) + assert fetcher.timeout == 10 + assert fetcher.max_retries == 5 + + def test_default_headers_set(self): + """Test that default headers are set on session.""" + fetcher = WebFetcher() + assert 'User-Agent' in fetcher.session.headers + def test_is_html(self): """Test HTML content type detection.""" fetcher = WebFetcher() - assert fetcher.is_html("text/html") - assert fetcher.is_html("text/html; charset=utf-8") - assert not fetcher.is_html("application/json") - assert not fetcher.is_html(None) - + assert fetcher.is_html("text/html") is True + assert fetcher.is_html("text/html; charset=utf-8") is True + assert fetcher.is_html("TEXT/HTML") is True + assert fetcher.is_html("application/json") is False + assert fetcher.is_html("application/xml") is False + assert fetcher.is_html("image/png") is False + assert fetcher.is_html(None) is False + assert fetcher.is_html("") is False + @patch('wayback_diff.fetcher.requests.Session.get') def test_fetch_success(self, mock_get): """Test successful fetch.""" @@ -25,14 +47,46 @@ def test_fetch_success(self, mock_get): mock_response.headers = {'Content-Type': 'text/html; charset=utf-8'} mock_response.encoding = 'utf-8' mock_get.return_value = mock_response - + fetcher = WebFetcher() content, content_type, metadata = fetcher.fetch("https://example.com") - + assert content == b'Test' assert content_type == 'text/html; charset=utf-8' assert metadata['status_code'] == 200 - + + @patch('wayback_diff.fetcher.requests.Session.get') + def test_fetch_success_adds_charset(self, mock_get): + """Test that charset is added when missing from content type.""" + mock_response = Mock() + mock_response.status_code = 200 + mock_response.content = b'Test' + mock_response.headers = {'Content-Type': 'text/html'} + mock_response.encoding = 'utf-8' + mock_get.return_value = mock_response + + fetcher = WebFetcher() + content, content_type, metadata = fetcher.fetch("https://example.com") + + assert 'charset=utf-8' in content_type + + @patch('wayback_diff.fetcher.requests.Session.get') + def test_fetch_binary_content(self, mock_get): + """Test fetching binary content that cannot decode as utf-8.""" + mock_response = Mock() + mock_response.status_code = 200 + # Create content that fails strict utf-8 decode + mock_response.content = bytes([0xFF, 0xFE, 0x00, 0x01, 0x80, 0x81]) + mock_response.headers = {'Content-Type': 'application/octet-stream'} + mock_response.encoding = None + mock_get.return_value = mock_response + + fetcher = WebFetcher() + content, content_type, metadata = fetcher.fetch("https://example.com/file.bin") + + assert content is not None + assert 'application/octet-stream' in content_type + @patch('wayback_diff.fetcher.requests.Session.get') def test_fetch_404(self, mock_get): """Test 404 response.""" @@ -40,17 +94,160 @@ def test_fetch_404(self, mock_get): mock_response.status_code = 404 mock_response.headers = {} mock_get.return_value = mock_response - + fetcher = WebFetcher() content, content_type, metadata = fetcher.fetch("https://example.com/notfound") - + assert content is None assert metadata['status_code'] == 404 - - def test_url_normalization(self): - """Test URL normalization.""" - fetcher = WebFetcher() - - # Test adding https:// - content, _, _ = fetcher.fetch("example.com") - # Should not raise an error (will fail in actual request, but URL is normalized) + + @patch('wayback_diff.fetcher.requests.Session.get') + def test_fetch_500(self, mock_get): + """Test 500 response.""" + mock_response = Mock() + mock_response.status_code = 500 + mock_response.headers = {'Server': 'nginx'} + mock_get.return_value = mock_response + + fetcher = WebFetcher() + content, content_type, metadata = fetcher.fetch("https://example.com/error") + + assert content is None + assert metadata['status_code'] == 500 + + @patch('wayback_diff.fetcher.requests.Session.get') + def test_fetch_timeout_retries(self, mock_get): + """Test timeout with retries.""" + mock_get.side_effect = requests.exceptions.Timeout("Connection timed out") + + fetcher = WebFetcher(timeout=1, max_retries=2) + content, content_type, metadata = fetcher.fetch("https://example.com") + + assert content is None + assert mock_get.call_count == 2 + + @patch('wayback_diff.fetcher.requests.Session.get') + def test_fetch_request_exception_retries(self, mock_get): + """Test request exception with retries.""" + mock_get.side_effect = requests.exceptions.ConnectionError("Connection refused") + + fetcher = WebFetcher(timeout=1, max_retries=3) + content, content_type, metadata = fetcher.fetch("https://example.com") + + assert content is None + assert mock_get.call_count == 3 + assert 'error' in metadata + + @patch('wayback_diff.fetcher.requests.Session.get') + def test_fetch_timeout_then_success(self, mock_get): + """Test timeout on first attempt then success.""" + mock_response = Mock() + mock_response.status_code = 200 + mock_response.content = b'OK' + mock_response.headers = {'Content-Type': 'text/html'} + mock_response.encoding = 'utf-8' + + mock_get.side_effect = [ + requests.exceptions.Timeout("timeout"), + mock_response, + ] + + fetcher = WebFetcher(timeout=1, max_retries=3) + content, content_type, metadata = fetcher.fetch("https://example.com") + + assert content == b'OK' + assert mock_get.call_count == 2 + + def test_url_auto_https_prefix(self): + """Test URL normalization adds https://.""" + fetcher = WebFetcher() + # We can't fully test since it will try to make a real request, + # but we can verify the URL validation logic via mocking. + with patch.object(fetcher.session, 'get') as mock_get: + mock_response = Mock() + mock_response.status_code = 200 + mock_response.content = b'OK' + mock_response.headers = {'Content-Type': 'text/html'} + mock_response.encoding = 'utf-8' + mock_get.return_value = mock_response + + fetcher.fetch("example.com") + # Should have added https:// prefix + call_url = mock_get.call_args[0][0] + assert call_url.startswith("https://") + + @patch('wayback_diff.fetcher.requests.Session.get') + def test_fetch_metadata_includes_headers(self, mock_get): + """Test that metadata includes response headers.""" + mock_response = Mock() + mock_response.status_code = 200 + mock_response.content = b'OK' + mock_response.headers = { + 'Content-Type': 'text/html', + 'X-Custom': 'value' + } + mock_response.encoding = 'utf-8' + mock_get.return_value = mock_response + + fetcher = WebFetcher() + _, _, metadata = fetcher.fetch("https://example.com") + + assert metadata['headers']['X-Custom'] == 'value' + + @patch('wayback_diff.fetcher.requests.Session.get') + def test_fetch_metadata_includes_encoding(self, mock_get): + """Test that metadata includes encoding.""" + mock_response = Mock() + mock_response.status_code = 200 + mock_response.content = b'OK' + mock_response.headers = {'Content-Type': 'text/html; charset=iso-8859-1'} + mock_response.encoding = 'iso-8859-1' + mock_get.return_value = mock_response + + fetcher = WebFetcher() + _, _, metadata = fetcher.fetch("https://example.com") + + assert metadata['encoding'] == 'iso-8859-1' + + @patch('wayback_diff.fetcher.requests.Session.get') + def test_fetch_no_encoding(self, mock_get): + """Test fetch when response has no encoding.""" + mock_response = Mock() + mock_response.status_code = 200 + mock_response.content = b'OK' + mock_response.headers = {'Content-Type': 'text/html'} + mock_response.encoding = None + mock_get.return_value = mock_response + + fetcher = WebFetcher() + _, _, metadata = fetcher.fetch("https://example.com") + + assert metadata['encoding'] is None + + @patch('wayback_diff.fetcher.requests.Session.get') + def test_fetch_private_ip_allowed(self, mock_get): + """Test that private IP addresses pass through (logged but allowed).""" + mock_response = Mock() + mock_response.status_code = 200 + mock_response.content = b'OK' + mock_response.headers = {'Content-Type': 'text/html'} + mock_response.encoding = 'utf-8' + mock_get.return_value = mock_response + + fetcher = WebFetcher() + content, _, _ = fetcher.fetch("https://192.168.1.1/page") + assert content is not None + + @patch('wayback_diff.fetcher.requests.Session.get') + def test_fetch_localhost_allowed(self, mock_get): + """Test that localhost passes through.""" + mock_response = Mock() + mock_response.status_code = 200 + mock_response.content = b'OK' + mock_response.headers = {'Content-Type': 'text/html'} + mock_response.encoding = 'utf-8' + mock_get.return_value = mock_response + + fetcher = WebFetcher() + content, _, _ = fetcher.fetch("https://localhost:8080/page") + assert content is not None diff --git a/tests/test_link_traverser.py b/tests/test_link_traverser.py new file mode 100644 index 0000000..4ca412b --- /dev/null +++ b/tests/test_link_traverser.py @@ -0,0 +1,657 @@ +"""Tests for link traverser.""" + +import pytest +from unittest.mock import patch, Mock, MagicMock +from wayback_diff.link_traverser import LinkTraverser + + +class TestLinkTraverserInit: + """Test LinkTraverser initialization.""" + + def test_init_basic(self): + """Test basic initialization.""" + traverser = LinkTraverser("https://example.com", "https://example.org") + assert traverser.base_url1 == "https://example.com" + assert traverser.base_url2 == "https://example.org" + assert traverser.max_depth == 2 + assert traverser.max_pages == 50 + assert traverser.same_domain_only is True + + def test_init_custom(self): + """Test custom initialization.""" + traverser = LinkTraverser( + "https://a.com", "https://b.com", + max_depth=5, max_pages=100, same_domain_only=False + ) + assert traverser.max_depth == 5 + assert traverser.max_pages == 100 + assert traverser.same_domain_only is False + + def test_init_domain_extraction(self): + """Test domain extraction from URLs.""" + traverser = LinkTraverser("https://www.example.com/path", "https://example.org/path") + assert traverser.domain1 == "example.com" + assert traverser.domain2 == "example.org" + + def test_init_empty_results(self): + """Test that results start empty.""" + traverser = LinkTraverser("https://a.com", "https://b.com") + assert traverser.results == [] + assert len(traverser.visited) == 0 + + +class TestNormalizeUrl: + """Test URL normalization.""" + + def test_normalize_absolute_url(self): + """Test normalizing absolute URL.""" + traverser = LinkTraverser("https://a.com", "https://b.com") + result = traverser._normalize_url("https://example.com/page/") + assert result == "https://example.com/page" + + def test_normalize_removes_trailing_slash(self): + """Test trailing slash removal.""" + traverser = LinkTraverser("https://a.com", "https://b.com") + result = traverser._normalize_url("https://example.com/path/") + assert not result.endswith("/") + + def test_normalize_root_keeps_slash(self): + """Test root path keeps slash.""" + traverser = LinkTraverser("https://a.com", "https://b.com") + result = traverser._normalize_url("https://example.com/") + assert result.endswith("/") + + def test_normalize_relative_url(self): + """Test normalizing relative URL with base.""" + traverser = LinkTraverser("https://a.com", "https://b.com") + result = traverser._normalize_url("/page", "https://example.com/") + assert result == "https://example.com/page" + + def test_normalize_relative_url_no_base(self): + """Test normalizing relative URL without base returns as-is.""" + traverser = LinkTraverser("https://a.com", "https://b.com") + result = traverser._normalize_url("/page") + assert result == "/page" + + def test_normalize_sorts_query_params(self): + """Test query parameter sorting.""" + traverser = LinkTraverser("https://a.com", "https://b.com") + result = traverser._normalize_url("https://example.com/page?z=1&a=2") + assert "a=2&z=1" in result + + def test_normalize_lowercases_netloc(self): + """Test netloc lowercasing.""" + traverser = LinkTraverser("https://a.com", "https://b.com") + result = traverser._normalize_url("https://EXAMPLE.COM/page") + assert "example.com" in result + + +class TestIsSameDomain: + """Test same domain checking.""" + + def test_same_domain(self): + """Test matching domain.""" + traverser = LinkTraverser("https://a.com", "https://b.com") + assert traverser._is_same_domain("https://example.com/page", "example.com") is True + + def test_different_domain(self): + """Test non-matching domain.""" + traverser = LinkTraverser("https://a.com", "https://b.com") + assert traverser._is_same_domain("https://other.com/page", "example.com") is False + + def test_empty_domain(self): + """Test empty domain (relative URL).""" + traverser = LinkTraverser("https://a.com", "https://b.com") + assert traverser._is_same_domain("/relative/path", "example.com") is True + + def test_www_prefix(self): + """Test www prefix stripping.""" + traverser = LinkTraverser("https://a.com", "https://b.com") + assert traverser._is_same_domain("https://www.example.com/page", "example.com") is True + + +class TestExtractLinks: + """Test link extraction.""" + + def test_extract_basic_links(self): + """Test extracting basic links.""" + traverser = LinkTraverser("https://example.com", "https://other.com") + html = b'Link 1Link 2' + links = traverser._extract_links(html, "https://example.com/") + assert len(links) >= 1 + + def test_extract_links_skips_anchors(self): + """Test that anchor-only links are skipped.""" + traverser = LinkTraverser("https://example.com", "https://b.com") + html = b'Anchor' + links = traverser._extract_links(html, "https://example.com/") + assert len(links) == 0 + + def test_extract_links_skips_javascript(self): + """Test that javascript: links are skipped.""" + traverser = LinkTraverser("https://example.com", "https://b.com") + html = b'JS' + links = traverser._extract_links(html, "https://example.com/") + assert len(links) == 0 + + def test_extract_links_skips_mailto(self): + """Test that mailto: links are skipped.""" + traverser = LinkTraverser("https://example.com", "https://b.com") + html = b'Email' + links = traverser._extract_links(html, "https://example.com/") + assert len(links) == 0 + + def test_extract_links_skips_tel(self): + """Test that tel: links are skipped.""" + traverser = LinkTraverser("https://example.com", "https://b.com") + html = b'Call' + links = traverser._extract_links(html, "https://example.com/") + assert len(links) == 0 + + def test_extract_links_skips_sms(self): + """Test that sms: links are skipped.""" + traverser = LinkTraverser("https://example.com", "https://b.com") + html = b'SMS' + links = traverser._extract_links(html, "https://example.com/") + assert len(links) == 0 + + def test_extract_links_deduplicates(self): + """Test that duplicate links are removed.""" + traverser = LinkTraverser("https://example.com", "https://b.com") + html = b'AB' + links = traverser._extract_links(html, "https://example.com/") + # Should be deduplicated + assert len(links) == len(set(links)) + + def test_extract_links_same_domain_only(self): + """Test same domain filtering.""" + traverser = LinkTraverser("https://example.com", "https://b.com", + same_domain_only=True) + html = b'SameOther' + links = traverser._extract_links(html, "https://example.com/") + # Should only include same-domain links + for link in links: + assert 'other.com' not in link + + def test_extract_links_handles_error(self): + """Test error handling in link extraction.""" + traverser = LinkTraverser("https://example.com", "https://b.com") + # Invalid HTML should not crash + links = traverser._extract_links(b'not html at all <<<>>>', "https://example.com/") + assert isinstance(links, list) + + def test_extract_links_empty_href(self): + """Test handling of empty href.""" + traverser = LinkTraverser("https://example.com", "https://b.com") + html = b'Empty' + links = traverser._extract_links(html, "https://example.com/") + assert isinstance(links, list) + + def test_extract_links_wayback_url_extraction(self): + """Test extracting original URLs from Wayback links.""" + traverser = LinkTraverser( + "https://web.archive.org/web/20230101/https://example.com", + "https://example.com", + same_domain_only=False + ) + html = b'Link' + links = traverser._extract_links(html, "https://web.archive.org/web/20230101/https://example.com/") + # Should extract the original URL + if links: + assert any('example.com' in link for link in links) + + def test_extract_links_skips_email_at_sign(self): + """Test that URLs with @ but not mailto are skipped.""" + traverser = LinkTraverser("https://example.com", "https://b.com") + html = b'User' + links = traverser._extract_links(html, "https://example.com/") + assert len(links) == 0 + + def test_extract_links_relative(self): + """Test extracting relative links.""" + traverser = LinkTraverser("https://example.com", "https://b.com", + same_domain_only=True) + html = b'About' + links = traverser._extract_links(html, "https://example.com/") + if links: + assert any('example.com' in link for link in links) + + +class TestGetMatchingUrl: + """Test matching URL generation.""" + + def test_basic_matching(self): + """Test basic URL matching.""" + traverser = LinkTraverser("https://old.example.com", "https://new.example.com") + result = traverser._get_matching_url("https://old.example.com/page") + assert result == "https://new.example.com/page" + + def test_matching_with_query(self): + """Test URL matching preserves query string.""" + traverser = LinkTraverser("https://old.com", "https://new.com") + result = traverser._get_matching_url("https://old.com/page?q=test") + assert "?q=test" in result + + def test_matching_wayback_url(self): + """Test URL matching with Wayback URL.""" + traverser = LinkTraverser( + "https://web.archive.org/web/20230101/https://example.com", + "https://example.com" + ) + result = traverser._get_matching_url( + "https://web.archive.org/web/20230101/https://example.com/page" + ) + assert result is not None + assert "example.com" in result + + +class TestComparePage: + """Test page comparison.""" + + @patch.object(LinkTraverser, '__init__', lambda self, *a, **kw: None) + def test_compare_page_success(self): + """Test successful page comparison.""" + traverser = LinkTraverser.__new__(LinkTraverser) + traverser.fetcher = Mock() + traverser.diff_engine = Mock() + traverser.same_domain_only = True + + traverser.fetcher.fetch.side_effect = [ + (b'Page1', 'text/html', {}), + (b'Page2', 'text/html', {}), + ] + traverser.diff_engine.extract_meaningful_changes.return_value = [] + traverser.diff_engine.get_summary.return_value = { + 'total_changes': 0, 'high_significance': 0 + } + + result = traverser.compare_page("https://a.com", "https://b.com") + assert result['status'] == 'compared' + assert 'links1' in result + assert 'links2' in result + + @patch.object(LinkTraverser, '__init__', lambda self, *a, **kw: None) + def test_compare_page_fetch_failure(self): + """Test page comparison when fetch fails.""" + traverser = LinkTraverser.__new__(LinkTraverser) + traverser.fetcher = Mock() + traverser.diff_engine = Mock() + + traverser.fetcher.fetch.side_effect = [ + (None, None, {}), + (b'OK', 'text/html', {}), + ] + + result = traverser.compare_page("https://a.com/bad", "https://b.com/good") + assert result['status'] == 'error' + + @patch.object(LinkTraverser, '__init__', lambda self, *a, **kw: None) + def test_compare_page_both_fail(self): + """Test page comparison when both fetches fail.""" + traverser = LinkTraverser.__new__(LinkTraverser) + traverser.fetcher = Mock() + traverser.diff_engine = Mock() + + traverser.fetcher.fetch.return_value = (None, None, {}) + + result = traverser.compare_page("https://a.com/bad", "https://b.com/bad") + assert result['status'] == 'error' + + @patch.object(LinkTraverser, '__init__', lambda self, *a, **kw: None) + def test_compare_page_cleans_wayback(self): + """Test that Wayback artifacts are cleaned during comparison.""" + traverser = LinkTraverser.__new__(LinkTraverser) + traverser.fetcher = Mock() + traverser.diff_engine = Mock() + traverser.same_domain_only = True + + traverser.fetcher.fetch.side_effect = [ + (b'WB content', 'text/html', {}), + (b'Clean content', 'text/html', {}), + ] + traverser.diff_engine.extract_meaningful_changes.return_value = [] + traverser.diff_engine.get_summary.return_value = { + 'total_changes': 0, 'high_significance': 0 + } + + wb_url = "https://web.archive.org/web/20230101/https://example.com" + with patch('wayback_diff.link_traverser.WaybackCleaner') as mock_cleaner: + mock_cleaner.is_wayback_url.side_effect = [True, False] + mock_cleaner.clean_wayback_html.return_value = b'Cleaned' + + result = traverser.compare_page(wb_url, "https://example.com") + mock_cleaner.clean_wayback_html.assert_called_once() + + +class TestTraverseAndCompare: + """Test traversal logic.""" + + @patch.object(LinkTraverser, 'compare_page') + def test_traverse_single_page(self, mock_compare): + """Test traversal of single page (max_depth=0).""" + traverser = LinkTraverser("https://a.com", "https://b.com", max_depth=0, max_pages=10) + + mock_compare.return_value = { + 'url1': 'https://a.com', + 'url2': 'https://b.com', + 'status': 'compared', + 'summary': {'total_changes': 0, 'high_significance': 0}, + 'changes_count': 0, + 'high_significance': 0, + 'links1': [], + 'links2': [], + } + + results = traverser.traverse_and_compare() + assert len(results) == 1 + assert results[0]['status'] == 'compared' + + @patch.object(LinkTraverser, 'compare_page') + def test_traverse_max_pages_limit(self, mock_compare): + """Test that traversal respects max_pages limit.""" + traverser = LinkTraverser("https://a.com", "https://b.com", + max_depth=5, max_pages=2) + + call_count = [0] + def mock_compare_fn(url1, url2): + call_count[0] += 1 + links = [f"https://a.com/page{i}" for i in range(10)] + return { + 'url1': url1, + 'url2': url2, + 'status': 'compared', + 'summary': {'total_changes': 0, 'high_significance': 0}, + 'changes_count': 0, + 'high_significance': 0, + 'links1': links, + 'links2': [], + } + + mock_compare.side_effect = mock_compare_fn + + results = traverser.traverse_and_compare() + assert len(results) <= 2 + + @patch.object(LinkTraverser, 'compare_page') + def test_traverse_skips_visited(self, mock_compare): + """Test that already-visited URLs are skipped.""" + traverser = LinkTraverser("https://a.com", "https://b.com", + max_depth=2, max_pages=10) + + mock_compare.return_value = { + 'url1': 'https://a.com', + 'url2': 'https://b.com', + 'status': 'compared', + 'summary': {'total_changes': 0, 'high_significance': 0}, + 'changes_count': 0, + 'high_significance': 0, + 'links1': ['https://a.com/'], # Link back to base + 'links2': [], + } + + results = traverser.traverse_and_compare() + # Should only compare once (the initial page) + assert mock_compare.call_count == 1 + + @patch.object(LinkTraverser, 'compare_page') + def test_traverse_error_results_no_links(self, mock_compare): + """Test that error results don't produce link traversal.""" + traverser = LinkTraverser("https://a.com", "https://b.com", + max_depth=2, max_pages=10) + + mock_compare.return_value = { + 'url1': 'https://a.com', + 'url2': 'https://b.com', + 'status': 'error', + 'error': 'Fetch failed', + } + + results = traverser.traverse_and_compare() + assert len(results) == 1 + assert results[0]['status'] == 'error' + + +class TestExtractLinksWaybackBranches: + """Test Wayback-specific link extraction branches.""" + + def test_extract_links_wayback_relative_path(self): + """Test extracting relative Wayback path links.""" + traverser = LinkTraverser( + "https://web.archive.org/web/20230101/https://example.com", + "https://example.com", + same_domain_only=False + ) + # A relative Wayback link that has no full URL, just a path fragment + html = b'Link' + links = traverser._extract_links( + html, "https://web.archive.org/web/20230101/https://example.com/" + ) + # Should attempt to construct URL from base + assert isinstance(links, list) + + def test_extract_links_wayback_no_base_match(self): + """Test Wayback link extraction when base URL has no extractable domain.""" + traverser = LinkTraverser( + "https://web.archive.org/web/20230101/https://example.com", + "https://example.com", + same_domain_only=False + ) + html = b'Link' + # Use a base_url that doesn't match the expected pattern + links = traverser._extract_links(html, "https://web.archive.org/weird/path") + assert isinstance(links, list) + + def test_extract_links_wayback_domain_filtering(self): + """Test Wayback domain filtering for same_domain_only.""" + traverser = LinkTraverser( + "https://web.archive.org/web/20230101/https://example.com", + "https://example.com", + same_domain_only=True + ) + html = b''' + Same + Other + ''' + links = traverser._extract_links( + html, "https://web.archive.org/web/20230101/https://example.com/" + ) + for link in links: + assert 'other.com' not in link + + def test_extract_links_non_wayback_cross_domain_filtered(self): + """Test non-Wayback cross domain links are filtered.""" + traverser = LinkTraverser( + "https://example.com", + "https://example.org", + same_domain_only=True + ) + html = b'Ext' + links = traverser._extract_links(html, "https://example.com/") + for link in links: + assert 'external.com' not in link + + +class TestTraverseAndCompareAdvanced: + """Test advanced traversal scenarios.""" + + @patch.object(LinkTraverser, 'compare_page') + @patch.object(LinkTraverser, '_get_matching_url') + def test_traverse_follows_links(self, mock_match, mock_compare): + """Test that traversal follows links from first page.""" + traverser = LinkTraverser("https://a.com", "https://b.com", + max_depth=1, max_pages=5) + + mock_match.return_value = "https://b.com/page1" + + call_count = [0] + def compare_side_effect(url1, url2): + call_count[0] += 1 + if call_count[0] == 1: + return { + 'url1': url1, + 'url2': url2, + 'status': 'compared', + 'summary': {'total_changes': 0, 'high_significance': 0}, + 'changes_count': 0, + 'high_significance': 0, + 'links1': ['https://a.com/page1'], + 'links2': [], + } + return { + 'url1': url1, + 'url2': url2, + 'status': 'compared', + 'summary': {'total_changes': 0, 'high_significance': 0}, + 'changes_count': 0, + 'high_significance': 0, + 'links1': [], + 'links2': [], + } + + mock_compare.side_effect = compare_side_effect + + results = traverser.traverse_and_compare() + assert len(results) == 2 + + @patch.object(LinkTraverser, 'compare_page') + def test_traverse_depth_limit(self, mock_compare): + """Test that traversal respects depth limit.""" + traverser = LinkTraverser("https://a.com", "https://b.com", + max_depth=0, max_pages=100) + + mock_compare.return_value = { + 'url1': 'https://a.com', + 'url2': 'https://b.com', + 'status': 'compared', + 'summary': {'total_changes': 0, 'high_significance': 0}, + 'changes_count': 0, + 'high_significance': 0, + 'links1': ['https://a.com/deep1', 'https://a.com/deep2'], + 'links2': [], + } + + results = traverser.traverse_and_compare() + # With max_depth=0, should only compare the initial page + assert len(results) == 1 + + @patch.object(LinkTraverser, 'compare_page') + def test_traverse_link_processing_error(self, mock_compare): + """Test that errors in link processing are handled gracefully.""" + traverser = LinkTraverser("https://a.com", "https://b.com", + max_depth=1, max_pages=10) + + mock_compare.return_value = { + 'url1': 'https://a.com', + 'url2': 'https://b.com', + 'status': 'compared', + 'summary': {'total_changes': 0, 'high_significance': 0}, + 'changes_count': 0, + 'high_significance': 0, + 'links1': ['not-a-valid-url', '', None], + 'links2': [], + } + + # Should not crash + results = traverser.traverse_and_compare() + assert len(results) >= 1 + + +class TestGetMatchingUrlAdvanced: + """Test advanced URL matching scenarios.""" + + def test_matching_wayback_url_no_original(self): + """Test URL matching with Wayback URL that has no extractable original.""" + traverser = LinkTraverser( + "https://web.archive.org/web/20230101/https://old.com", + "https://new.com" + ) + result = traverser._get_matching_url( + "https://web.archive.org/web/20230101/https://old.com/page?q=1" + ) + assert result is not None + assert "new.com" in result + + def test_matching_non_wayback_with_path(self): + """Test URL matching preserves path.""" + traverser = LinkTraverser("https://old.com", "https://new.com") + result = traverser._get_matching_url("https://old.com/deep/path/page.html") + assert result == "https://new.com/deep/path/page.html" + + +class TestGenerateReport: + """Test report generation.""" + + def test_generate_report_empty(self): + """Test report with no results.""" + traverser = LinkTraverser("https://a.com", "https://b.com") + report = traverser.generate_report() + assert 'LINK TRAVERSAL COMPARISON REPORT' in report + assert 'Pages compared: 0' in report + + def test_generate_report_with_results(self): + """Test report with comparison results.""" + traverser = LinkTraverser("https://a.com", "https://b.com") + traverser.results = [ + { + 'url1': 'https://a.com/', + 'url2': 'https://b.com/', + 'status': 'compared', + 'summary': { + 'total_changes': 5, + 'high_significance': 2, + 'medium_significance': 2, + 'low_significance': 1, + }, + 'changes_count': 5, + 'high_significance': 2, + } + ] + report = traverser.generate_report() + assert 'Pages compared: 1' in report + assert 'Successfully compared: 1' in report + assert 'HIGH SIGNIFICANCE DIFFERENCES' in report + assert 'https://a.com/' in report + + def test_generate_report_with_errors(self): + """Test report with error results.""" + traverser = LinkTraverser("https://a.com", "https://b.com") + traverser.results = [ + { + 'url1': 'https://a.com/bad', + 'url2': 'https://b.com/bad', + 'status': 'error', + 'error': 'Connection refused', + } + ] + report = traverser.generate_report() + assert 'Errors: 1' in report + assert 'Connection refused' in report + + def test_generate_report_mixed_results(self): + """Test report with mixed compared and error results.""" + traverser = LinkTraverser("https://a.com", "https://b.com") + traverser.results = [ + { + 'url1': 'https://a.com/', + 'url2': 'https://b.com/', + 'status': 'compared', + 'summary': { + 'total_changes': 0, + 'high_significance': 0, + 'medium_significance': 0, + 'low_significance': 0, + }, + 'changes_count': 0, + 'high_significance': 0, + }, + { + 'url1': 'https://a.com/broken', + 'url2': 'https://b.com/broken', + 'status': 'error', + 'error': 'Timeout', + } + ] + report = traverser.generate_report() + assert 'Pages compared: 2' in report + assert 'Successfully compared: 1' in report + assert 'Errors: 1' in report diff --git a/tests/test_report_generator.py b/tests/test_report_generator.py new file mode 100644 index 0000000..8ba98a6 --- /dev/null +++ b/tests/test_report_generator.py @@ -0,0 +1,444 @@ +"""Tests for markdown report generator.""" + +import os +import pytest +import tempfile +import shutil +from pathlib import Path +from unittest.mock import patch +from wayback_diff.report_generator import MarkdownReportGenerator + + +class TestMarkdownReportGeneratorInit: + """Test report generator initialization.""" + + def test_init_creates_directory(self): + """Test that output directory is created.""" + with tempfile.TemporaryDirectory() as tmpdir: + output_dir = os.path.join(tmpdir, "reports") + gen = MarkdownReportGenerator(output_dir=output_dir) + assert os.path.isdir(output_dir) + + def test_init_default_dir(self): + """Test default output directory.""" + with tempfile.TemporaryDirectory() as tmpdir: + old_cwd = os.getcwd() + try: + os.chdir(tmpdir) + gen = MarkdownReportGenerator() + assert gen.output_dir == Path("./reports") + finally: + os.chdir(old_cwd) + # Clean up if created + reports_dir = os.path.join(tmpdir, "reports") + if os.path.exists(reports_dir): + shutil.rmtree(reports_dir) + + +class TestGenerateComparisonReport: + """Test report generation.""" + + def _make_summary(self, total=5, added=2, removed=1, modified=2, + high=1, medium=2, low=2): + return { + 'total_changes': total, + 'added': added, + 'removed': removed, + 'modified': modified, + 'high_significance': high, + 'medium_significance': medium, + 'low_significance': low, + } + + def _make_change(self, change_type='modified', old_text='Old content', + new_text='New content', significance='high'): + return { + 'type': change_type, + 'old_text': old_text, + 'new_text': new_text, + 'significance': significance, + } + + def test_basic_report(self): + """Test basic report generation.""" + with tempfile.TemporaryDirectory() as tmpdir: + gen = MarkdownReportGenerator(output_dir=tmpdir) + report = gen.generate_comparison_report( + "https://old.example.com", + "https://new.example.com", + [], + self._make_summary(total=0, added=0, removed=0, modified=0, + high=0, medium=0, low=0) + ) + + assert '# Website Comparison Report' in report + assert 'https://old.example.com' in report + assert 'https://new.example.com' in report + assert '**Total Changes:** 0' in report + + def test_report_with_high_changes(self): + """Test report with high significance changes.""" + with tempfile.TemporaryDirectory() as tmpdir: + gen = MarkdownReportGenerator(output_dir=tmpdir) + changes = [self._make_change(significance='high')] + report = gen.generate_comparison_report( + "https://a.com", "https://b.com", + changes, self._make_summary() + ) + + assert '## High Significance Changes' in report + assert 'MODIFIED' in report + assert 'Old content' in report + assert 'New content' in report + + def test_report_with_medium_changes(self): + """Test report with medium significance changes.""" + with tempfile.TemporaryDirectory() as tmpdir: + gen = MarkdownReportGenerator(output_dir=tmpdir) + changes = [self._make_change(significance='medium') for _ in range(3)] + report = gen.generate_comparison_report( + "https://a.com", "https://b.com", + changes, self._make_summary(high=0, medium=3, low=0) + ) + + assert '## Medium Significance Changes' in report + assert '**Total:** 3 changes' in report + + def test_report_medium_changes_truncation(self): + """Test medium changes are truncated after 10.""" + with tempfile.TemporaryDirectory() as tmpdir: + gen = MarkdownReportGenerator(output_dir=tmpdir) + changes = [self._make_change(significance='medium') for _ in range(15)] + report = gen.generate_comparison_report( + "https://a.com", "https://b.com", + changes, self._make_summary(total=15, high=0, medium=15, low=0) + ) + + assert '... and 5 more medium significance changes' in report + + def test_report_high_changes_truncation(self): + """Test high changes are truncated after 50.""" + with tempfile.TemporaryDirectory() as tmpdir: + gen = MarkdownReportGenerator(output_dir=tmpdir) + changes = [self._make_change(significance='high') for _ in range(55)] + report = gen.generate_comparison_report( + "https://a.com", "https://b.com", + changes, self._make_summary(total=55, high=55, medium=0, low=0) + ) + + assert '... and 5 more high significance changes' in report + + def test_report_added_change(self): + """Test report with added change (no old_text).""" + with tempfile.TemporaryDirectory() as tmpdir: + gen = MarkdownReportGenerator(output_dir=tmpdir) + changes = [self._make_change(change_type='added', old_text='', new_text='New')] + report = gen.generate_comparison_report( + "https://a.com", "https://b.com", + changes, self._make_summary(total=1, added=1, removed=0, modified=0) + ) + + assert 'ADDED' in report + assert '**Added/New:**' in report + + def test_report_removed_change(self): + """Test report with removed change (no new_text).""" + with tempfile.TemporaryDirectory() as tmpdir: + gen = MarkdownReportGenerator(output_dir=tmpdir) + changes = [self._make_change(change_type='removed', old_text='Gone', new_text='')] + report = gen.generate_comparison_report( + "https://a.com", "https://b.com", + changes, self._make_summary(total=1, added=0, removed=1, modified=0) + ) + + assert 'REMOVED' in report + assert '**Removed/Changed:**' in report + + def test_report_long_text_truncation(self): + """Test that long text is truncated in report.""" + with tempfile.TemporaryDirectory() as tmpdir: + gen = MarkdownReportGenerator(output_dir=tmpdir) + long_text = 'A' * 500 + changes = [self._make_change(old_text=long_text, new_text=long_text)] + report = gen.generate_comparison_report( + "https://a.com", "https://b.com", + changes, self._make_summary() + ) + + assert '...' in report + + def test_report_recommendations_high_significance(self): + """Test recommendations when high significance changes exist.""" + with tempfile.TemporaryDirectory() as tmpdir: + gen = MarkdownReportGenerator(output_dir=tmpdir) + report = gen.generate_comparison_report( + "https://a.com", "https://b.com", + [], self._make_summary(high=5) + ) + + assert 'Action Required' in report + + def test_report_recommendations_minimal_changes(self): + """Test recommendations when changes are minimal.""" + with tempfile.TemporaryDirectory() as tmpdir: + gen = MarkdownReportGenerator(output_dir=tmpdir) + report = gen.generate_comparison_report( + "https://a.com", "https://b.com", + [], self._make_summary(total=2, high=0, medium=2, low=0) + ) + + assert 'Migration Status' in report + + def test_report_with_visual_results(self): + """Test report with visual comparison results.""" + with tempfile.TemporaryDirectory() as tmpdir: + gen = MarkdownReportGenerator(output_dir=tmpdir) + + visual_results = { + 'chrome': { + 'difference_ratio': 0.02, + 'different_pixels': 500, + 'total_pixels': 100000, + 'screenshot1': '', + 'screenshot2': '', + 'comparison': '', + } + } + + report = gen.generate_comparison_report( + "https://a.com", "https://b.com", + [], self._make_summary(total=0, high=0, medium=0, low=0), + visual_results=visual_results + ) + + assert '## Visual Comparison' in report + assert 'CHROME' in report + assert '2.00%' in report + assert 'Minimal differences' in report + + def test_report_visual_high_difference(self): + """Test report with high visual difference.""" + with tempfile.TemporaryDirectory() as tmpdir: + gen = MarkdownReportGenerator(output_dir=tmpdir) + + visual_results = { + 'firefox': { + 'difference_ratio': 0.15, + 'different_pixels': 15000, + 'total_pixels': 100000, + 'screenshot1': '', + 'screenshot2': '', + 'comparison': '', + } + } + + report = gen.generate_comparison_report( + "https://a.com", "https://b.com", + [], self._make_summary(total=0, high=0, medium=0, low=0), + visual_results=visual_results + ) + + assert 'Significant differences detected' in report + + def test_report_visual_with_error(self): + """Test report with visual comparison error.""" + with tempfile.TemporaryDirectory() as tmpdir: + gen = MarkdownReportGenerator(output_dir=tmpdir) + + visual_results = { + 'chrome': { + 'error': 'Browser not found', + } + } + + report = gen.generate_comparison_report( + "https://a.com", "https://b.com", + [], self._make_summary(total=0, high=0, medium=0, low=0), + visual_results=visual_results + ) + + assert 'Error' in report + assert 'Browser not found' in report + + def test_report_visual_recommendations(self): + """Test visual-specific recommendations.""" + with tempfile.TemporaryDirectory() as tmpdir: + gen = MarkdownReportGenerator(output_dir=tmpdir) + + visual_results = { + 'chrome': { + 'difference_ratio': 0.20, + 'different_pixels': 20000, + 'total_pixels': 100000, + 'screenshot1': '', + 'screenshot2': '', + 'comparison': '', + } + } + + report = gen.generate_comparison_report( + "https://a.com", "https://b.com", + [], self._make_summary(total=0, high=0, medium=0, low=0), + visual_results=visual_results + ) + + assert 'Visual Differences' in report + assert 'chrome' in report.lower() + + def test_report_visual_with_screenshots(self): + """Test report with actual screenshot files.""" + with tempfile.TemporaryDirectory() as tmpdir: + gen = MarkdownReportGenerator(output_dir=tmpdir) + + # Create fake screenshot files + screenshots_dir = os.path.join(tmpdir, "ss") + os.makedirs(screenshots_dir, exist_ok=True) + s1 = os.path.join(screenshots_dir, "s1.png") + s2 = os.path.join(screenshots_dir, "s2.png") + comp = os.path.join(screenshots_dir, "comp.png") + for f in [s1, s2, comp]: + with open(f, 'wb') as fh: + fh.write(b'\x89PNG\r\n') # PNG header bytes + + visual_results = { + 'chrome': { + 'difference_ratio': 0.05, + 'different_pixels': 5000, + 'total_pixels': 100000, + 'screenshot1': s1, + 'screenshot2': s2, + 'comparison': comp, + } + } + + report = gen.generate_comparison_report( + "https://a.com", "https://b.com", + [], self._make_summary(total=0, high=0, medium=0, low=0), + visual_results=visual_results + ) + + assert '![' in report # Image references + assert 'screenshots/' in report + + def test_report_with_traversal_results(self): + """Test report with traversal results.""" + with tempfile.TemporaryDirectory() as tmpdir: + gen = MarkdownReportGenerator(output_dir=tmpdir) + + traversal_results = [ + { + 'url1': 'https://a.com/', + 'url2': 'https://b.com/', + 'status': 'compared', + 'high_significance': 3, + 'changes_count': 10, + }, + { + 'url1': 'https://a.com/page2', + 'url2': 'https://b.com/page2', + 'status': 'error', + 'error': 'Timeout', + } + ] + + report = gen.generate_comparison_report( + "https://a.com", "https://b.com", + [], self._make_summary(total=0, high=0, medium=0, low=0), + traversal_results=traversal_results + ) + + assert '## Site-Wide Comparison' in report + assert '**Pages Compared:** 1' in report + assert '**Pages with Errors:** 1' in report + assert 'High Significance Differences' in report + + def test_report_traversal_long_urls(self): + """Test traversal report truncates long URLs.""" + with tempfile.TemporaryDirectory() as tmpdir: + gen = MarkdownReportGenerator(output_dir=tmpdir) + + long_url = 'https://example.com/' + 'a' * 100 + traversal_results = [ + { + 'url1': long_url, + 'url2': long_url, + 'status': 'compared', + 'high_significance': 0, + 'changes_count': 0, + } + ] + + report = gen.generate_comparison_report( + "https://a.com", "https://b.com", + [], self._make_summary(total=0, high=0, medium=0, low=0), + traversal_results=traversal_results + ) + + assert '...' in report # URL should be truncated + + def test_report_footer(self): + """Test report footer.""" + with tempfile.TemporaryDirectory() as tmpdir: + gen = MarkdownReportGenerator(output_dir=tmpdir) + report = gen.generate_comparison_report( + "https://a.com", "https://b.com", + [], self._make_summary(total=0, high=0, medium=0, low=0) + ) + + assert 'Report generated by Wayback-Diff' in report + + def test_report_timestamp(self): + """Test report contains generation timestamp.""" + with tempfile.TemporaryDirectory() as tmpdir: + gen = MarkdownReportGenerator(output_dir=tmpdir) + report = gen.generate_comparison_report( + "https://a.com", "https://b.com", + [], self._make_summary(total=0, high=0, medium=0, low=0) + ) + + assert '**Generated:**' in report + + +class TestSaveReport: + """Test report saving.""" + + def test_save_report_auto_filename(self): + """Test saving report with auto-generated filename.""" + with tempfile.TemporaryDirectory() as tmpdir: + gen = MarkdownReportGenerator(output_dir=tmpdir) + path = gen.save_report("# Test Report") + + assert os.path.exists(path) + assert path.endswith('.md') + assert 'comparison_report_' in path + + with open(path) as f: + assert f.read() == '# Test Report' + + def test_save_report_custom_filename(self): + """Test saving report with custom filename.""" + with tempfile.TemporaryDirectory() as tmpdir: + gen = MarkdownReportGenerator(output_dir=tmpdir) + path = gen.save_report("# Custom", filename="custom.md") + + assert os.path.exists(path) + assert path.endswith('custom.md') + + def test_save_report_returns_path(self): + """Test that save_report returns the file path.""" + with tempfile.TemporaryDirectory() as tmpdir: + gen = MarkdownReportGenerator(output_dir=tmpdir) + path = gen.save_report("# Report") + + assert isinstance(path, str) + assert tmpdir in path + + def test_save_report_unicode(self): + """Test saving report with unicode content.""" + with tempfile.TemporaryDirectory() as tmpdir: + gen = MarkdownReportGenerator(output_dir=tmpdir) + path = gen.save_report("# Informe con acentos y tildes") + + with open(path, encoding='utf-8') as f: + content = f.read() + assert 'acentos' in content diff --git a/tests/test_visual_comparison.py b/tests/test_visual_comparison.py new file mode 100644 index 0000000..27015f6 --- /dev/null +++ b/tests/test_visual_comparison.py @@ -0,0 +1,873 @@ +"""Tests for visual comparison module.""" + +import os +import io +import pytest +import tempfile +from unittest.mock import patch, Mock, MagicMock, PropertyMock +from pathlib import Path +from PIL import Image + +from wayback_diff.visual_comparison import ( + VisualComparison, + SELENIUM_AVAILABLE, + WEBDRIVER_MANAGER_AVAILABLE, +) + + +class TestVisualComparisonInit: + """Test VisualComparison initialization.""" + + def test_init_defaults(self): + """Test default initialization.""" + vc = VisualComparison() + assert vc.browser_name == 'chrome' + assert vc.headless is True + assert vc.viewport_width == 1920 + assert vc.viewport_height == 1080 + assert vc.wait_time == 3 + assert vc.driver is None + + def test_init_custom(self): + """Test custom initialization.""" + vc = VisualComparison( + browser='firefox', + headless=False, + viewport_width=1280, + viewport_height=720, + wait_time=5 + ) + assert vc.browser_name == 'firefox' + assert vc.headless is False + assert vc.viewport_width == 1280 + assert vc.viewport_height == 720 + assert vc.wait_time == 5 + + def test_init_all_supported_browsers(self): + """Test initialization with all supported browser names.""" + for browser in ['chrome', 'chromium', 'firefox', 'opera', 'edge']: + vc = VisualComparison(browser=browser) + assert vc.browser_name == browser + + def test_init_invalid_browser(self): + """Test initialization with invalid browser.""" + with pytest.raises(ValueError, match="Browser must be one of"): + VisualComparison(browser='safari') + + def test_init_case_insensitive_browser(self): + """Test browser name is lowercased.""" + vc = VisualComparison(browser='Chrome') + assert vc.browser_name == 'chrome' + + @patch('wayback_diff.visual_comparison.SELENIUM_AVAILABLE', False) + def test_init_no_selenium(self): + """Test initialization when selenium is not available.""" + with pytest.raises(ImportError, match="Selenium is required"): + VisualComparison() + + def test_supported_browsers_list(self): + """Test supported browsers class variable.""" + assert 'chrome' in VisualComparison.SUPPORTED_BROWSERS + assert 'firefox' in VisualComparison.SUPPORTED_BROWSERS + assert 'chromium' in VisualComparison.SUPPORTED_BROWSERS + + def test_default_viewport_constants(self): + """Test default viewport constants.""" + assert VisualComparison.DEFAULT_VIEWPORT_WIDTH == 1920 + assert VisualComparison.DEFAULT_VIEWPORT_HEIGHT == 1080 + + +class TestDetectAvailableBrowsers: + """Test browser detection.""" + + @patch('wayback_diff.visual_comparison.SELENIUM_AVAILABLE', False) + def test_detect_no_selenium(self): + """Test detection when selenium unavailable.""" + result = VisualComparison.detect_available_browsers() + assert result == [] + + @patch('wayback_diff.visual_comparison.SELENIUM_AVAILABLE', True) + @patch('wayback_diff.visual_comparison.webdriver') + def test_detect_chrome_available(self, mock_webdriver): + """Test detection when Chrome is available.""" + mock_driver = Mock() + mock_webdriver.Chrome.return_value = mock_driver + + result = VisualComparison.detect_available_browsers() + assert 'chrome' in result + + @patch('wayback_diff.visual_comparison.SELENIUM_AVAILABLE', True) + @patch('wayback_diff.visual_comparison.webdriver') + def test_detect_all_fail_returns_chrome(self, mock_webdriver): + """Test fallback to chrome when no browsers detected.""" + mock_webdriver.Chrome.side_effect = Exception("No chrome") + mock_webdriver.Firefox.side_effect = Exception("No firefox") + mock_webdriver.Edge.side_effect = Exception("No edge") + + result = VisualComparison.detect_available_browsers() + assert result == ['chrome'] + + +class TestCompareImages: + """Test image comparison.""" + + def _create_test_image(self, width=100, height=100, color=(255, 0, 0)): + """Create a test image.""" + img = Image.new('RGB', (width, height), color=color) + return img + + def _save_test_image(self, path, width=100, height=100, color=(255, 0, 0)): + """Save a test image to path.""" + img = self._create_test_image(width, height, color) + img.save(path) + return path + + def test_compare_identical_images(self): + """Test comparing identical images.""" + with tempfile.TemporaryDirectory() as tmpdir: + img_path = os.path.join(tmpdir, "img.png") + self._save_test_image(img_path, color=(100, 100, 100)) + + vc = VisualComparison() + result = vc.compare_images(img_path, img_path) + + assert float(result['difference_ratio']) == 0.0 + assert int(result['different_pixels']) == 0 + assert result['is_similar'] == True + + def test_compare_different_images(self): + """Test comparing different images.""" + with tempfile.TemporaryDirectory() as tmpdir: + img1_path = os.path.join(tmpdir, "img1.png") + img2_path = os.path.join(tmpdir, "img2.png") + self._save_test_image(img1_path, color=(255, 0, 0)) + self._save_test_image(img2_path, color=(0, 0, 255)) + + vc = VisualComparison() + result = vc.compare_images(img1_path, img2_path) + + assert result['difference_ratio'] > 0 + assert result['different_pixels'] > 0 + + def test_compare_images_different_sizes(self): + """Test comparing images of different sizes.""" + with tempfile.TemporaryDirectory() as tmpdir: + img1_path = os.path.join(tmpdir, "img1.png") + img2_path = os.path.join(tmpdir, "img2.png") + self._save_test_image(img1_path, width=100, height=100) + self._save_test_image(img2_path, width=200, height=200) + + vc = VisualComparison() + result = vc.compare_images(img1_path, img2_path) + + assert 'difference_ratio' in result + assert result['total_pixels'] > 0 + + def test_compare_images_output_path(self): + """Test that comparison image is saved.""" + with tempfile.TemporaryDirectory() as tmpdir: + img1_path = os.path.join(tmpdir, "img1.png") + img2_path = os.path.join(tmpdir, "img2.png") + output_path = os.path.join(tmpdir, "comparison.png") + self._save_test_image(img1_path, color=(255, 0, 0)) + self._save_test_image(img2_path, color=(0, 255, 0)) + + vc = VisualComparison() + result = vc.compare_images(img1_path, img2_path, output_path=output_path) + + assert os.path.exists(output_path) + assert result['comparison_image_path'] == output_path + + def test_compare_images_no_output_path(self): + """Test comparison without saving.""" + with tempfile.TemporaryDirectory() as tmpdir: + img1_path = os.path.join(tmpdir, "img1.png") + img2_path = os.path.join(tmpdir, "img2.png") + self._save_test_image(img1_path) + self._save_test_image(img2_path) + + vc = VisualComparison() + result = vc.compare_images(img1_path, img2_path) + + assert result['comparison_image_path'] is None + + def test_compare_images_rgba(self): + """Test comparing RGBA images (converted to RGB).""" + with tempfile.TemporaryDirectory() as tmpdir: + img1_path = os.path.join(tmpdir, "img1.png") + img2_path = os.path.join(tmpdir, "img2.png") + + img1 = Image.new('RGBA', (50, 50), (255, 0, 0, 128)) + img1.save(img1_path) + img2 = Image.new('RGBA', (50, 50), (0, 0, 255, 128)) + img2.save(img2_path) + + vc = VisualComparison() + result = vc.compare_images(img1_path, img2_path) + + assert 'difference_ratio' in result + + def test_compare_images_threshold(self): + """Test custom threshold parameter.""" + with tempfile.TemporaryDirectory() as tmpdir: + img1_path = os.path.join(tmpdir, "img1.png") + img2_path = os.path.join(tmpdir, "img2.png") + # Clearly different colors + self._save_test_image(img1_path, color=(0, 0, 0)) + self._save_test_image(img2_path, color=(255, 255, 255)) + + vc = VisualComparison() + # With very high threshold, even big differences are "similar" + result_high = vc.compare_images(img1_path, img2_path, threshold=1.0) + assert result_high['is_similar'] == True + + # With low threshold, should be different + result_low = vc.compare_images(img1_path, img2_path, threshold=0.1) + assert int(result_low['different_pixels']) > 0 + + +class TestCompareImagesWithoutNumpy: + """Test image comparison fallback without numpy.""" + + def _save_test_image(self, path, width=20, height=20, color=(255, 0, 0)): + """Save a small test image.""" + img = Image.new('RGB', (width, height), color=color) + img.save(path) + + def test_compare_fallback_identical(self): + """Test pixel-by-pixel fallback with identical images.""" + with tempfile.TemporaryDirectory() as tmpdir: + img_path = os.path.join(tmpdir, "img.png") + self._save_test_image(img_path, width=10, height=10, color=(100, 100, 100)) + + vc = VisualComparison() + + # Force the ImportError fallback path by making np.array raise + import numpy as np + with patch.object(np, 'array', side_effect=ImportError("no numpy")): + result = vc.compare_images(img_path, img_path) + + assert float(result['difference_ratio']) == 0.0 + assert int(result['different_pixels']) == 0 + + def test_compare_fallback_different(self): + """Test pixel-by-pixel fallback with different images.""" + with tempfile.TemporaryDirectory() as tmpdir: + img1_path = os.path.join(tmpdir, "img1.png") + img2_path = os.path.join(tmpdir, "img2.png") + self._save_test_image(img1_path, width=10, height=10, color=(255, 0, 0)) + self._save_test_image(img2_path, width=10, height=10, color=(0, 0, 255)) + + vc = VisualComparison() + + import numpy as np + with patch.object(np, 'array', side_effect=ImportError("no numpy")): + result = vc.compare_images(img1_path, img2_path) + + assert float(result['difference_ratio']) > 0 + assert int(result['different_pixels']) > 0 + + def test_compare_fallback_with_output(self): + """Test pixel-by-pixel fallback saves comparison output.""" + with tempfile.TemporaryDirectory() as tmpdir: + img1_path = os.path.join(tmpdir, "img1.png") + img2_path = os.path.join(tmpdir, "img2.png") + out_path = os.path.join(tmpdir, "comp.png") + self._save_test_image(img1_path, width=5, height=5, color=(100, 100, 100)) + self._save_test_image(img2_path, width=5, height=5, color=(110, 110, 110)) + + vc = VisualComparison() + + import numpy as np + with patch.object(np, 'array', side_effect=ImportError("no numpy")): + result = vc.compare_images(img1_path, img2_path, output_path=out_path) + + assert os.path.exists(out_path) + + +class TestCreateSideBySide: + """Test side-by-side comparison image creation.""" + + def test_create_side_by_side(self): + """Test creating side-by-side image.""" + vc = VisualComparison() + img1 = Image.new('RGB', (100, 100), color=(255, 0, 0)) + img2 = Image.new('RGB', (100, 100), color=(0, 255, 0)) + diff = Image.new('RGB', (100, 100), color=(0, 0, 255)) + + result = vc._create_side_by_side(img1, img2, diff) + + assert isinstance(result, Image.Image) + # Width should be 3 images + spacing + assert result.width == 100 * 3 + 20 + # Height should include label + assert result.height == 100 + 40 + + +class TestQuit: + """Test driver cleanup.""" + + def test_quit_with_driver(self): + """Test quitting with active driver.""" + vc = VisualComparison() + mock_driver = Mock() + vc.driver = mock_driver + + vc.quit() + + mock_driver.quit.assert_called_once() + assert vc.driver is None + + def test_quit_without_driver(self): + """Test quitting without driver does nothing.""" + vc = VisualComparison() + vc.driver = None + vc.quit() # Should not raise + + def test_quit_driver_exception(self): + """Test quitting when driver.quit raises exception.""" + vc = VisualComparison() + mock_driver = Mock() + mock_driver.quit.side_effect = Exception("Already closed") + vc.driver = mock_driver + + vc.quit() # Should not raise + assert vc.driver is None + + +class TestContextManager: + """Test context manager protocol.""" + + def test_enter(self): + """Test __enter__ returns self.""" + vc = VisualComparison() + assert vc.__enter__() is vc + + def test_exit_calls_quit(self): + """Test __exit__ calls quit.""" + vc = VisualComparison() + mock_driver = Mock() + vc.driver = mock_driver + + vc.__exit__(None, None, None) + + mock_driver.quit.assert_called_once() + assert vc.driver is None + + def test_context_manager_usage(self): + """Test using as context manager.""" + with VisualComparison() as vc: + assert isinstance(vc, VisualComparison) + assert vc.driver is None + + +class TestTakeScreenshot: + """Test screenshot taking.""" + + def test_take_screenshot_creates_driver(self): + """Test that take_screenshot creates driver if needed.""" + vc = VisualComparison() + assert vc.driver is None + + mock_driver = Mock() + mock_driver.get_screenshot_as_png.return_value = b'\x89PNG' + mock_driver.execute_script.side_effect = [ + 'complete', # document.readyState + 100, # scrollWidth + 100, # scrollHeight + 100, # innerWidth + 100, # innerHeight + None, # scrollTo + ] + + with patch.object(vc, '_create_driver', return_value=mock_driver): + # Mock full page screenshot + with patch.object(vc, '_take_full_page_screenshot', return_value=b'\x89PNG'): + result = vc.take_screenshot("https://example.com") + + assert result == b'\x89PNG' + + def test_take_screenshot_saves_to_file(self): + """Test that screenshot is saved when output_path provided.""" + with tempfile.TemporaryDirectory() as tmpdir: + vc = VisualComparison() + output_path = os.path.join(tmpdir, "screenshot.png") + + mock_driver = Mock() + vc.driver = mock_driver + mock_driver.execute_script.side_effect = [ + 'complete', # readyState + ] + + with patch.object(vc, '_take_full_page_screenshot', return_value=b'\x89PNG\r\n\x1a\n'): + result = vc.take_screenshot("https://example.com", output_path=output_path) + + assert os.path.exists(output_path) + + def test_take_screenshot_viewport_only(self): + """Test viewport-only screenshot.""" + vc = VisualComparison() + mock_driver = Mock() + vc.driver = mock_driver + mock_driver.get_screenshot_as_png.return_value = b'\x89PNG' + mock_driver.execute_script.side_effect = [ + 'complete', # readyState + ] + + result = vc.take_screenshot("https://example.com", full_page=False) + assert result == b'\x89PNG' + mock_driver.get_screenshot_as_png.assert_called_once() + + def test_take_screenshot_wayback_url(self): + """Test that Wayback banner is removed for archive URLs.""" + vc = VisualComparison() + mock_driver = Mock() + vc.driver = mock_driver + mock_driver.execute_script.side_effect = [ + 'complete', # readyState + None, # _remove_wayback_banner script + ] + + with patch.object(vc, '_remove_wayback_banner'): + with patch.object(vc, '_take_full_page_screenshot', return_value=b'\x89PNG'): + result = vc.take_screenshot( + "https://web.archive.org/web/20230101/https://example.com/" + ) + vc._remove_wayback_banner.assert_called_once() + + def test_take_screenshot_timeout_handled(self): + """Test that WebDriverWait timeout is handled gracefully.""" + vc = VisualComparison() + mock_driver = Mock() + vc.driver = mock_driver + + # Simulate timeout on readyState check + from selenium.common.exceptions import TimeoutException + mock_driver.execute_script.side_effect = [ + TimeoutException("Timeout"), + ] + + with patch.object(vc, '_take_full_page_screenshot', return_value=b'\x89PNG'): + with patch('wayback_diff.visual_comparison.WebDriverWait') as mock_wait: + mock_wait.return_value.until.side_effect = TimeoutException("Timeout") + result = vc.take_screenshot("https://example.com") + assert result == b'\x89PNG' + + +class TestCompareUrls: + """Test multi-browser URL comparison.""" + + def test_compare_urls_single_browser(self): + """Test comparison with single browser.""" + vc = VisualComparison() + + with tempfile.TemporaryDirectory() as tmpdir: + # Create mock screenshots + mock_screenshot = b'\x89PNG\r\n' + + with patch.object(vc, '_create_driver') as mock_create: + mock_driver = Mock() + mock_create.return_value = mock_driver + + with patch.object(vc, 'take_screenshot') as mock_ts: + mock_ts.return_value = mock_screenshot + + with patch.object(vc, 'compare_images') as mock_ci: + mock_ci.return_value = { + 'difference_ratio': 0.01, + 'different_pixels': 100, + 'total_pixels': 100000, + 'is_similar': True, + 'comparison_image_path': None, + } + + results = vc.compare_urls( + "https://a.com", "https://b.com", + tmpdir, browsers=['chrome'] + ) + + assert 'chrome' in results + assert results['chrome']['difference_ratio'] == 0.01 + + def test_compare_urls_auto_detect(self): + """Test comparison with auto-detected browsers.""" + vc = VisualComparison() + + with tempfile.TemporaryDirectory() as tmpdir: + with patch.object(VisualComparison, 'detect_available_browsers', + return_value=['chrome']): + with patch.object(vc, '_create_driver') as mock_create: + mock_driver = Mock() + mock_create.return_value = mock_driver + + with patch.object(vc, 'take_screenshot', return_value=b'\x89PNG'): + with patch.object(vc, 'compare_images', return_value={ + 'difference_ratio': 0.0, + 'different_pixels': 0, + 'total_pixels': 100000, + 'is_similar': True, + 'comparison_image_path': None, + }): + results = vc.compare_urls( + "https://a.com", "https://b.com", tmpdir + ) + + assert 'chrome' in results + + def test_compare_urls_browser_error(self): + """Test comparison when screenshot raises error.""" + vc = VisualComparison() + + with tempfile.TemporaryDirectory() as tmpdir: + with patch.object(vc, '_create_driver') as mock_create: + mock_driver = Mock() + mock_create.return_value = mock_driver + + with patch.object(vc, 'take_screenshot', + side_effect=Exception("Screenshot failed")): + results = vc.compare_urls( + "https://a.com", "https://b.com", + tmpdir, browsers=['chrome'] + ) + + assert 'chrome' in results + assert 'error' in results['chrome'] + + def test_compare_urls_unsupported_browser_skipped(self): + """Test that unsupported browsers are skipped.""" + vc = VisualComparison() + + with tempfile.TemporaryDirectory() as tmpdir: + results = vc.compare_urls( + "https://a.com", "https://b.com", + tmpdir, browsers=['safari'] + ) + assert 'safari' not in results + + def test_compare_urls_creates_output_dir(self): + """Test that output directory is created.""" + vc = VisualComparison() + + with tempfile.TemporaryDirectory() as tmpdir: + output_dir = os.path.join(tmpdir, "screenshots") + assert not os.path.exists(output_dir) + + with patch.object(vc, '_create_driver') as mock_create: + mock_driver = Mock() + mock_create.return_value = mock_driver + + with patch.object(vc, 'take_screenshot', return_value=b'\x89PNG'): + with patch.object(vc, 'compare_images', return_value={ + 'difference_ratio': 0.0, + 'different_pixels': 0, + 'total_pixels': 100000, + 'is_similar': True, + 'comparison_image_path': None, + }): + vc.compare_urls( + "https://a.com", "https://b.com", + output_dir, browsers=['chrome'] + ) + + assert os.path.exists(output_dir) + + +class TestRemoveWaybackBanner: + """Test Wayback banner removal.""" + + def test_remove_banner(self): + """Test that banner removal script is executed.""" + vc = VisualComparison() + mock_driver = Mock() + vc.driver = mock_driver + + with patch('wayback_diff.visual_comparison.time'): + vc._remove_wayback_banner() + + mock_driver.execute_script.assert_called_once() + script = mock_driver.execute_script.call_args[0][0] + assert 'wm-ipp' in script + + def test_remove_banner_error_handled(self): + """Test that banner removal errors are handled.""" + vc = VisualComparison() + mock_driver = Mock() + mock_driver.execute_script.side_effect = Exception("Script error") + vc.driver = mock_driver + + # Should not raise + vc._remove_wayback_banner() + + +class TestCreateDriver: + """Test WebDriver creation.""" + + @patch('wayback_diff.visual_comparison.WEBDRIVER_MANAGER_AVAILABLE', False) + @patch('wayback_diff.visual_comparison.webdriver') + def test_create_chrome_driver(self, mock_webdriver): + """Test Chrome driver creation.""" + vc = VisualComparison(browser='chrome') + mock_driver = Mock() + mock_webdriver.Chrome.return_value = mock_driver + + result = vc._create_driver() + assert result == mock_driver + + @patch('wayback_diff.visual_comparison.WEBDRIVER_MANAGER_AVAILABLE', False) + @patch('wayback_diff.visual_comparison.webdriver') + def test_create_firefox_driver(self, mock_webdriver): + """Test Firefox driver creation.""" + vc = VisualComparison(browser='firefox') + mock_driver = Mock() + mock_webdriver.Firefox.return_value = mock_driver + + result = vc._create_driver() + mock_driver.set_window_size.assert_called_once_with(1920, 1080) + + @patch('wayback_diff.visual_comparison.WEBDRIVER_MANAGER_AVAILABLE', True) + @patch('wayback_diff.visual_comparison.ChromeDriverManager') + @patch('wayback_diff.visual_comparison.webdriver') + def test_create_chrome_with_manager(self, mock_webdriver, mock_manager): + """Test Chrome driver creation with webdriver-manager.""" + vc = VisualComparison(browser='chrome') + mock_driver = Mock() + mock_webdriver.Chrome.return_value = mock_driver + mock_manager.return_value.install.return_value = '/path/to/chromedriver' + + result = vc._create_driver() + assert result == mock_driver + + @patch('wayback_diff.visual_comparison.WEBDRIVER_MANAGER_AVAILABLE', True) + @patch('wayback_diff.visual_comparison.ChromeDriverManager') + @patch('wayback_diff.visual_comparison.ChromeService') + @patch('wayback_diff.visual_comparison.webdriver') + def test_create_chrome_manager_fallback(self, mock_webdriver, mock_service, + mock_manager): + """Test Chrome driver falls back to system when manager fails.""" + vc = VisualComparison(browser='chrome') + mock_driver = Mock() + # Manager install fails, triggering fallback to system chromedriver + mock_manager.return_value.install.side_effect = Exception("Download failed") + mock_service.side_effect = Exception("Service failed") + # First Chrome() call is the fallback (no service) + mock_webdriver.Chrome.return_value = mock_driver + + result = vc._create_driver() + assert result == mock_driver + + @patch('wayback_diff.visual_comparison.WEBDRIVER_MANAGER_AVAILABLE', False) + @patch('wayback_diff.visual_comparison.webdriver') + def test_create_chromium_driver(self, mock_webdriver): + """Test Chromium driver creation.""" + vc = VisualComparison(browser='chromium') + mock_driver = Mock() + mock_webdriver.Chrome.return_value = mock_driver + + with patch('os.path.exists', return_value=False): + result = vc._create_driver() + assert result == mock_driver + + @patch('wayback_diff.visual_comparison.WEBDRIVER_MANAGER_AVAILABLE', False) + @patch('wayback_diff.visual_comparison.webdriver') + def test_create_opera_driver(self, mock_webdriver): + """Test Opera driver creation.""" + vc = VisualComparison(browser='opera') + mock_driver = Mock() + mock_webdriver.Chrome.return_value = mock_driver + + with patch('os.path.exists', return_value=False): + result = vc._create_driver() + assert result == mock_driver + + @patch('wayback_diff.visual_comparison.WEBDRIVER_MANAGER_AVAILABLE', False) + @patch('wayback_diff.visual_comparison.webdriver') + def test_create_edge_driver(self, mock_webdriver): + """Test Edge driver creation.""" + vc = VisualComparison(browser='edge') + mock_driver = Mock() + mock_webdriver.Edge.return_value = mock_driver + + result = vc._create_driver() + assert result == mock_driver + + @patch('wayback_diff.visual_comparison.WEBDRIVER_MANAGER_AVAILABLE', False) + @patch('wayback_diff.visual_comparison.webdriver') + def test_create_chrome_headless_args(self, mock_webdriver): + """Test Chrome headless arguments.""" + vc = VisualComparison(browser='chrome', headless=True) + mock_driver = Mock() + mock_webdriver.Chrome.return_value = mock_driver + + vc._create_driver() + # Verify Chrome was called (args checked via options) + mock_webdriver.Chrome.assert_called() + + @patch('wayback_diff.visual_comparison.WEBDRIVER_MANAGER_AVAILABLE', False) + @patch('wayback_diff.visual_comparison.webdriver') + def test_create_chrome_non_headless(self, mock_webdriver): + """Test Chrome non-headless mode.""" + vc = VisualComparison(browser='chrome', headless=False) + mock_driver = Mock() + mock_webdriver.Chrome.return_value = mock_driver + + vc._create_driver() + mock_webdriver.Chrome.assert_called() + + +class TestTakeFullPageScreenshot: + """Test full page screenshot functionality.""" + + def test_full_page_screenshot_basic(self): + """Test full page screenshot with mocked driver.""" + vc = VisualComparison() + mock_driver = Mock() + vc.driver = mock_driver + + # Create a small test PNG + img = Image.new('RGB', (100, 100), color=(255, 255, 255)) + img_bytes = io.BytesIO() + img.save(img_bytes, format='PNG') + png_data = img_bytes.getvalue() + + mock_driver.current_url = "https://example.com" + mock_driver.execute_script.side_effect = [ + 100, # scrollWidth + 100, # scrollHeight + 100, # innerWidth + 100, # innerHeight + None, # scrollTo + None, # scrollTo(0,0) reset + ] + mock_driver.get_screenshot_as_png.return_value = png_data + + result = vc._take_full_page_screenshot() + assert isinstance(result, bytes) + assert len(result) > 0 + + def test_full_page_screenshot_wayback_url(self): + """Test full page screenshot removes Wayback banner.""" + vc = VisualComparison() + mock_driver = Mock() + vc.driver = mock_driver + + img = Image.new('RGB', (50, 50), color=(200, 200, 200)) + img_bytes = io.BytesIO() + img.save(img_bytes, format='PNG') + png_data = img_bytes.getvalue() + + mock_driver.current_url = "https://web.archive.org/web/20230101/https://example.com" + # With _remove_wayback_banner patched out, execute_script calls are: + # scrollWidth, scrollHeight, innerWidth, innerHeight, scrollTo, wayback-style, scrollTo-reset + mock_driver.execute_script.side_effect = [ + 50, # scrollWidth + 50, # scrollHeight + 50, # innerWidth + 50, # innerHeight + None, # scrollTo + None, # wayback style injection during scroll + None, # scrollTo(0,0) reset + ] + mock_driver.get_screenshot_as_png.return_value = png_data + + with patch.object(vc, '_remove_wayback_banner'): + result = vc._take_full_page_screenshot() + vc._remove_wayback_banner.assert_called_once() + assert isinstance(result, bytes) + + def test_full_page_screenshot_multi_scroll(self): + """Test full page screenshot that requires scrolling.""" + vc = VisualComparison() + mock_driver = Mock() + vc.driver = mock_driver + + # Page is 200x200 but viewport is 100x100, needs 4 screenshots + img = Image.new('RGB', (100, 100), color=(128, 128, 128)) + img_bytes = io.BytesIO() + img.save(img_bytes, format='PNG') + png_data = img_bytes.getvalue() + + mock_driver.current_url = "https://example.com" + # Each scroll iteration calls execute_script for scrollTo + mock_driver.execute_script.side_effect = [ + 200, # scrollWidth + 200, # scrollHeight + 100, # innerWidth + 100, # innerHeight + None, # scrollTo(0,0) + None, # scrollTo(100,0) + None, # scrollTo(0,100) + None, # scrollTo(100,100) + None, # scrollTo(0,0) reset + ] + mock_driver.get_screenshot_as_png.return_value = png_data + + result = vc._take_full_page_screenshot() + assert isinstance(result, bytes) + # Should have taken 4 viewport screenshots + assert mock_driver.get_screenshot_as_png.call_count == 4 + + +class TestEdgeDriverFallback: + """Test Edge driver creation with fallback paths.""" + + @patch('wayback_diff.visual_comparison.WEBDRIVER_MANAGER_AVAILABLE', False) + @patch('wayback_diff.visual_comparison.webdriver') + def test_create_edge_fallback_to_chrome(self, mock_webdriver): + """Test Edge driver falls back to Chrome-based Edge.""" + vc = VisualComparison(browser='edge') + mock_driver = Mock() + # Edge driver fails, Chrome-based Edge succeeds + mock_webdriver.Edge.side_effect = Exception("No Edge driver") + mock_webdriver.Chrome.return_value = mock_driver + + with patch('os.path.exists', return_value=False): + result = vc._create_driver() + assert result == mock_driver + + +class TestFirefoxDriverManager: + """Test Firefox driver creation with webdriver-manager.""" + + @patch('wayback_diff.visual_comparison.WEBDRIVER_MANAGER_AVAILABLE', True) + @patch('wayback_diff.visual_comparison.GeckoDriverManager') + @patch('wayback_diff.visual_comparison.FirefoxService') + @patch('wayback_diff.visual_comparison.webdriver') + def test_create_firefox_with_manager(self, mock_webdriver, mock_service, + mock_manager): + """Test Firefox driver creation with webdriver-manager.""" + vc = VisualComparison(browser='firefox') + mock_driver = Mock() + mock_webdriver.Firefox.return_value = mock_driver + mock_manager.return_value.install.return_value = '/path/to/geckodriver' + + result = vc._create_driver() + mock_driver.set_window_size.assert_called_once() + + @patch('wayback_diff.visual_comparison.WEBDRIVER_MANAGER_AVAILABLE', True) + @patch('wayback_diff.visual_comparison.GeckoDriverManager') + @patch('wayback_diff.visual_comparison.FirefoxService') + @patch('wayback_diff.visual_comparison.webdriver') + def test_create_firefox_manager_fallback(self, mock_webdriver, mock_service, + mock_manager): + """Test Firefox driver falls back when manager fails.""" + vc = VisualComparison(browser='firefox') + mock_driver = Mock() + mock_manager.return_value.install.side_effect = Exception("Download failed") + mock_service.side_effect = Exception("Service failed") + mock_webdriver.Firefox.return_value = mock_driver + + result = vc._create_driver() + assert result == mock_driver + + +class TestModuleEntryPoint: + """Test __main__.py module entry point.""" + + def test_main_module_import(self): + """Test that __main__ module can be imported.""" + import wayback_diff.__main__ + + def test_package_version(self): + """Test package version is set.""" + from wayback_diff import __version__ + assert __version__ == "1.1.0" diff --git a/tests/test_wayback_cleaner.py b/tests/test_wayback_cleaner.py index ac68084..bc706b2 100644 --- a/tests/test_wayback_cleaner.py +++ b/tests/test_wayback_cleaner.py @@ -6,23 +6,45 @@ class TestWaybackCleaner: """Test cases for WaybackCleaner.""" - - def test_is_wayback_url(self): - """Test Wayback URL detection.""" + + def test_is_wayback_url_full(self): + """Test Wayback URL detection with full URL.""" assert WaybackCleaner.is_wayback_url("https://web.archive.org/web/20230101/https://example.com/") + assert WaybackCleaner.is_wayback_url("http://web.archive.org/web/20230101/https://example.com/") + + def test_is_wayback_url_relative(self): + """Test Wayback URL detection with relative URL.""" assert WaybackCleaner.is_wayback_url("/web/20230101/https://example.com/") + + def test_is_wayback_url_non_wayback(self): + """Test non-Wayback URLs.""" assert not WaybackCleaner.is_wayback_url("https://example.com/") - - def test_extract_timestamp(self): - """Test timestamp extraction.""" + assert not WaybackCleaner.is_wayback_url("https://archive.org/") + assert not WaybackCleaner.is_wayback_url("") + + def test_extract_timestamp_full_url(self): + """Test timestamp extraction from full URL.""" url = "https://web.archive.org/web/20230101120000/https://example.com/" timestamp = WaybackCleaner.extract_timestamp(url) assert timestamp == "20230101120000" - - url2 = "/web/20230101/https://example.com/" - timestamp2 = WaybackCleaner.extract_timestamp(url2) - assert timestamp2 == "20230101" - + + def test_extract_timestamp_short(self): + """Test timestamp extraction from short URL.""" + url = "/web/20230101/https://example.com/" + timestamp = WaybackCleaner.extract_timestamp(url) + assert timestamp == "20230101" + + def test_extract_timestamp_with_suffix(self): + """Test timestamp extraction with suffix (cs_, im_, etc.).""" + url = "https://web.archive.org/web/20230101cs_/https://example.com/style.css" + timestamp = WaybackCleaner.extract_timestamp(url) + assert timestamp == "20230101" + + def test_extract_timestamp_no_match(self): + """Test timestamp extraction when no timestamp present.""" + assert WaybackCleaner.extract_timestamp("https://example.com/") is None + assert WaybackCleaner.extract_timestamp("") is None + def test_remove_wayback_header(self): """Test header removal.""" content = b''' @@ -36,13 +58,58 @@ def test_remove_wayback_header(self): Content ''' - + cleaned = WaybackCleaner.remove_wayback_header(content) assert b'archive.org/includes/analytics.js' not in cleaned assert b'__wm.init' not in cleaned assert b'' in cleaned assert b'Content' in cleaned - + + def test_remove_wayback_header_bundle_playback(self): + """Test header removal with bundle-playback.js pattern.""" + content = b''' + + + +Page +''' + + cleaned = WaybackCleaner.remove_wayback_header(content) + assert b'bundle-playback.js' not in cleaned + assert b'Page' in cleaned + + def test_remove_wayback_header_no_end_marker(self): + """Test header removal when end marker is missing but meta tag exists after scripts.""" + content = b''' + + + +Page +''' + + cleaned = WaybackCleaner.remove_wayback_header(content) + # Fallback finds ' in cleaned + assert b'Page' in cleaned + + def test_remove_wayback_header_no_header(self): + """Test when there is no Wayback header.""" + content = b'Clean' + cleaned = WaybackCleaner.remove_wayback_header(content) + assert cleaned == content + + def test_remove_wayback_header_no_end_marker_no_meta(self): + """Test header removal when neither end marker nor meta tag exists.""" + content = b''' + + +Page +''' + + cleaned = WaybackCleaner.remove_wayback_header(content) + # Should return content unchanged since no end marker found + assert b'Page' in cleaned + def test_remove_wayback_footer(self): """Test footer removal.""" content = b'''Content @@ -51,23 +118,85 @@ def test_remove_wayback_footer(self): FILE ARCHIVED ON 23:59:13 Nov 20, 2021 AND RETRIEVED FROM THE INTERNET ARCHIVE ON 00:41:42 Dec 01, 2021. -->''' - + cleaned = WaybackCleaner.remove_wayback_footer(content) assert b'FILE ARCHIVED ON' not in cleaned assert cleaned.endswith(b'\n') - + + def test_remove_wayback_footer_inline_comment(self): + """Test footer removal with inline comment format.""" + content = b'''Content +''' + + cleaned = WaybackCleaner.remove_wayback_footer(content) + assert b'FILE ARCHIVED ON' not in cleaned + + def test_remove_wayback_footer_no_footer(self): + """Test when there is no Wayback footer.""" + content = b'Clean' + cleaned = WaybackCleaner.remove_wayback_footer(content) + # Content should still contain the original body + assert b'Clean' in cleaned + + def test_remove_wayback_footer_carriage_return(self): + """Test footer removal with \\r\\n line endings.""" + content = b'Content\r\n\r\n' + cleaned = WaybackCleaner.remove_wayback_footer(content) + assert b'FILE ARCHIVED ON' not in cleaned + + def test_remove_wayback_footer_standalone_comment(self): + """Test footer removal when comment is standalone (not right after ).""" + content = b'''Content + + +''' + + cleaned = WaybackCleaner.remove_wayback_footer(content) + cleaned_str = cleaned.decode('utf-8', errors='ignore') + assert 'FILE ARCHIVED ON' not in cleaned_str + def test_remove_wayback_urls(self): - """Test URL prefix removal.""" + """Test URL prefix removal with timestamp.""" content = b'''Link ''' - + cleaned = WaybackCleaner.remove_wayback_urls(content, "20230101") assert b'web.archive.org' not in cleaned assert b'/web/20230101' not in cleaned assert b'https://example.com/page' in cleaned assert b'https://example.com/image.png' in cleaned - + + def test_remove_wayback_urls_js_prefix(self): + """Test URL removal with js_ prefix.""" + content = b'' + cleaned = WaybackCleaner.remove_wayback_urls(content, "20230101") + assert b'/web/20230101' not in cleaned + assert b'https://example.com/app.js' in cleaned + + def test_remove_wayback_urls_no_timestamp(self): + """Test URL removal without explicit timestamp (extracts from content).""" + content = b'''Link +''' + + cleaned = WaybackCleaner.remove_wayback_urls(content, None) + assert b'web.archive.org' not in cleaned + + def test_remove_wayback_urls_no_timestamp_in_content(self): + """Test URL removal when no timestamp can be extracted.""" + content = b'Link' + cleaned = WaybackCleaner.remove_wayback_urls(content, None) + assert b'web.archive.org' not in cleaned + + def test_remove_wayback_urls_https_archive(self): + """Test URL removal with https web.archive.org.""" + content = b'Link' + cleaned = WaybackCleaner.remove_wayback_urls(content, "20230101") + assert b'web.archive.org' not in cleaned + def test_clean_wayback_html(self): """Test complete cleaning.""" content = b''' @@ -82,12 +211,69 @@ def test_clean_wayback_html(self): ''' - + cleaned = WaybackCleaner.clean_wayback_html(content, "https://web.archive.org/web/20230101/https://example.com/") - # Check that wayback artifacts are removed assert b'archive.org/includes' not in cleaned assert b'web.archive.org' not in cleaned assert b'https://example.com/' in cleaned - # The cleaner should remove the footer comment cleaned_str = cleaned.decode('utf-8', errors='ignore') assert 'FILE ARCHIVED ON' not in cleaned_str + + def test_clean_wayback_html_no_url(self): + """Test cleaning without URL (no timestamp extraction).""" + content = b''' + + + +Test + +Content +''' + + cleaned = WaybackCleaner.clean_wayback_html(content, None) + assert b'archive.org/includes' not in cleaned + + def test_clean_wayback_html_empty(self): + """Test cleaning empty content.""" + assert WaybackCleaner.clean_wayback_html(b'', None) == b'' + + def test_clean_wayback_html_none(self): + """Test cleaning None content.""" + assert WaybackCleaner.clean_wayback_html(b'', None) == b'' + + def test_clean_wayback_html_no_artifacts(self): + """Test cleaning content with no Wayback artifacts.""" + content = b'

Clean content

' + cleaned = WaybackCleaner.clean_wayback_html(content, "https://example.com") + assert b'Clean content' in cleaned + + def test_normalize_html_whitespace_self_closing(self): + """Test whitespace normalization in self-closing tags.""" + html = b'' + normalized = WaybackCleaner.normalize_html_whitespace(html) + assert b' />' not in normalized + assert b'/>' in normalized + + def test_normalize_html_whitespace_multiple_spaces(self): + """Test normalization of multiple spaces.""" + html = b'
text here
' + normalized = WaybackCleaner.normalize_html_whitespace(html) + assert b' ' not in normalized + + def test_normalize_html_whitespace_tabs(self): + """Test normalization of tabs.""" + html = b'
\ttext\there
' + normalized = WaybackCleaner.normalize_html_whitespace(html) + assert b'\t' not in normalized + + def test_normalize_html_whitespace_newlines(self): + """Test normalization of newlines with spaces.""" + html = b'
\n text
' + normalized = WaybackCleaner.normalize_html_whitespace(html) + assert b' \n ' not in normalized + + def test_extract_timestamp_jm_prefix(self): + """Test timestamp with jm_ prefix.""" + url = "https://web.archive.org/web/20230101jm_/https://example.com/" + timestamp = WaybackCleaner.extract_timestamp(url) + assert timestamp == "20230101" diff --git a/wayback_diff/diff_engine.py b/wayback_diff/diff_engine.py index 479cac6..aba69ff 100644 --- a/wayback_diff/diff_engine.py +++ b/wayback_diff/diff_engine.py @@ -252,15 +252,35 @@ def compare_structures(self, old_html: bytes, new_html: bytes) -> Dict: 'new_structure': [], 'similarity': 0.0 } - + # Compare structures old_structure = old_parser.structure new_structure = new_parser.structure - + # Calculate similarity - matcher = SequenceMatcher(None, old_structure, new_structure) + # Convert structure dicts to hashable tuples for SequenceMatcher + def _make_hashable(d): + items = [] + for k, v in sorted(d.items(), key=lambda x: str(x[0])): + if isinstance(v, dict): + v = tuple(sorted(v.items())) + items.append((k, v)) + return tuple(items) + + try: + old_hashable = [_make_hashable(d) for d in old_structure] + new_hashable = [_make_hashable(d) for d in new_structure] + matcher = SequenceMatcher(None, old_hashable, new_hashable) + except (TypeError, Exception): + # Fallback if structures cannot be compared + return { + 'structural_changes': [], + 'old_structure': old_structure, + 'new_structure': new_structure, + 'similarity': 0.0 + } similarity = matcher.ratio() - + # Find structural differences structural_changes = [] for tag, i1, i2, j1, j2 in matcher.get_opcodes(): diff --git a/wayback_diff/visual_comparison.py b/wayback_diff/visual_comparison.py index a25016b..9a87936 100644 --- a/wayback_diff/visual_comparison.py +++ b/wayback_diff/visual_comparison.py @@ -1,5 +1,7 @@ """Visual comparison module for taking screenshots and comparing them.""" +from __future__ import annotations + import os import time from typing import Optional, Tuple, List, Dict diff --git a/wayback_diff/wayback_cleaner.py b/wayback_diff/wayback_cleaner.py index 0ac1f7b..9f28664 100644 --- a/wayback_diff/wayback_cleaner.py +++ b/wayback_diff/wayback_cleaner.py @@ -89,10 +89,11 @@ def remove_wayback_header(content: bytes) -> bytes: # This is a fallback for pages where the comment might be missing next_tag = content.find(b' start_idx: - end_idx = next_tag + # Use