Skip to content

Commit ab91d24

Browse files
committed
Added TopSamples utility to store the top N objects with the highest priorities.
1 parent 47ca39e commit ab91d24

File tree

2 files changed

+63
-0
lines changed

2 files changed

+63
-0
lines changed

crawlmi/stats/top_samples.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
from crawlmi.queue import Heap
2+
3+
4+
class TopSamples(object):
    '''Stores the top `size` objects with the highest priorities.

    Useful to store the top N largest sites, for example.

    Samples are kept in a heap as `(-priority, value)` tuples.
    NOTE(review): the code relies on `Heap.pop()` removing the largest stored
    tuple, i.e. the sample with the lowest priority - confirm against
    `crawlmi.queue.Heap`.
    '''

    def __init__(self, size=5):
        '''Create an empty collection keeping at most `size` samples.'''
        self.size = size
        self._heap = Heap()

    def add_sample(self, priority, value):
        '''Record `value` with the given `priority`.

        If more than `size` samples are stored afterwards, one sample is
        popped from the heap so that at most `size` remain.
        '''
        # negate priority, so that the smallest ones are on top and first to pop
        self._heap.push((-priority, value))
        if len(self._heap) > self.size:
            self._heap.pop()

    @property
    def samples(self):
        '''List of the stored `(priority, value)` tuples.

        The tuples are returned in the reverse of the order in which the heap
        pops them. The heap is drained to read its content and refilled
        afterwards, so reading this property leaves the stored samples intact.
        '''
        objects = []
        while self._heap:
            objects.append(self._heap.pop())
        # Explicit loop rather than `map()`: `map` is lazy under Python 3, so
        # the samples would never be pushed back and the heap would stay empty.
        for obj in objects:
            self._heap.push(obj)
        return [(-p, v) for (p, v) in reversed(objects)]

    def __len__(self):
        '''Number of samples currently stored.'''
        return len(self._heap)

    def __str__(self):
        '''One sample per line, in the order given by `samples`.'''
        return '\n '.join(map(str, self.samples))

    __repr__ = __str__

crawlmi/tests/test_top_samples.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import random
2+
3+
from twisted.trial import unittest
4+
5+
from crawlmi.stats.top_samples import TopSamples
6+
7+
8+
class TopSamplesTest(unittest.TestCase):
    '''Tests for `TopSamples`.'''

    def test_size_1(self):
        # With room for a single sample, only the highest priority seen so far
        # is kept.
        ts = TopSamples(size=1)
        ts.add_sample(1, 'hello')
        self.assertEqual(len(ts), 1)
        self.assertListEqual(ts.samples, [(1, 'hello')])
        # a lower priority does not replace the stored sample
        ts.add_sample(0, 'world')
        self.assertEqual(len(ts), 1)
        self.assertListEqual(ts.samples, [(1, 'hello')])
        # a higher priority does
        ts.add_sample(2, '!')
        self.assertEqual(len(ts), 1)
        self.assertListEqual(ts.samples, [(2, '!')])

    def test_default_size(self):
        ts = TopSamples()
        # `range` instead of the Python 2-only `xrange`: the same 20 pairs are
        # built on both Python 2 and Python 3.
        objects = [(i, 2 * i) for i in range(20)]
        # insertion order must not matter
        random.shuffle(objects)
        for (p, v) in objects:
            ts.add_sample(p, v)
        self.assertEqual(len(ts), 5)
        self.assertListEqual(ts.samples, [(19, 38), (18, 36), (17, 34), (16, 32), (15, 30)])
        # test - requesting the values doesn't screw anything
        self.assertEqual(len(ts), 5)
        self.assertListEqual(ts.samples, [(19, 38), (18, 36), (17, 34), (16, 32), (15, 30)])

0 commit comments

Comments
 (0)