From ccaea17adbf028cad37f4f80b0cf66b91edf27cf Mon Sep 17 00:00:00 2001 From: Dirk McCormick Date: Fri, 31 Jan 2020 15:40:29 -0500 Subject: [PATCH 01/11] fix: simulate DONT_HAVE for older peers --- benchmarks_test.go | 85 ++++++++++++++++++--- bitswap.go | 27 ++++++- bitswap_test.go | 31 ++++---- bitswap_with_sessions_test.go | 18 ++--- internal/decision/engine.go | 15 +++- internal/messagequeue/messagequeue.go | 103 +++++++++++++++++++++++--- internal/testinstance/testinstance.go | 41 +++++----- internal/testnet/virtual.go | 41 ++++++++-- 8 files changed, 289 insertions(+), 72 deletions(-) diff --git a/benchmarks_test.go b/benchmarks_test.go index e56214d9..92906da0 100644 --- a/benchmarks_test.go +++ b/benchmarks_test.go @@ -15,11 +15,13 @@ import ( "github.com/ipfs/go-bitswap/internal/testutil" blocks "github.com/ipfs/go-block-format" + protocol "github.com/libp2p/go-libp2p-protocol" bitswap "github.com/ipfs/go-bitswap" bssession "github.com/ipfs/go-bitswap/internal/session" testinstance "github.com/ipfs/go-bitswap/internal/testinstance" tn "github.com/ipfs/go-bitswap/internal/testnet" + bsnet "github.com/ipfs/go-bitswap/network" cid "github.com/ipfs/go-cid" delay "github.com/ipfs/go-ipfs-delay" mockrouting "github.com/ipfs/go-ipfs-routing/mock" @@ -118,6 +120,74 @@ func BenchmarkFixedDelay(b *testing.B) { printResults(benchmarkLog) } +type mixedBench struct { + bench + fetcherCount int // number of nodes that fetch data + oldSeedCount int // number of seed nodes running old version of Bitswap +} + +var mixedBenches = []mixedBench{ + mixedBench{bench{"3Nodes-Overlap3-OneAtATime", 3, 10, overlap2, oneAtATime}, 1, 2}, + mixedBench{bench{"3Nodes-AllToAll-OneAtATime", 3, 10, allToAll, oneAtATime}, 1, 2}, + mixedBench{bench{"3Nodes-Overlap3-AllConcurrent", 3, 10, overlap2, fetchAllConcurrent}, 1, 2}, + // mixedBench{bench{"3Nodes-Overlap3-UnixfsFetch", 3, 100, overlap2, unixfsFileFetch}, 1, 2}, +} + +func BenchmarkFetchFromOldBitswap(b *testing.B) { + benchmarkLog = 
nil + fixedDelay := delay.Fixed(10 * time.Millisecond) + bstoreLatency := time.Duration(0) + + for _, bch := range mixedBenches { + b.Run(bch.name, func(b *testing.B) { + fetcherCount := bch.fetcherCount + oldSeedCount := bch.oldSeedCount + newSeedCount := bch.nodeCount - (fetcherCount + oldSeedCount) + + net := tn.VirtualNetwork(mockrouting.NewServer(), fixedDelay) + + // Simulate an older Bitswap node (old protocol ID) that doesn't + // send DONT_HAVE responses + oldProtocol := []protocol.ID{bsnet.ProtocolBitswapOneOne} + oldNetOpts := []bsnet.NetOpt{bsnet.SupportedProtocols(oldProtocol)} + oldBsOpts := []bitswap.Option{bitswap.SetSendDontHaves(false)} + oldNodeGenerator := testinstance.NewTestInstanceGenerator(net, oldNetOpts, oldBsOpts) + + // Regular new Bitswap node + newNodeGenerator := testinstance.NewTestInstanceGenerator(net, nil, nil) + var instances []testinstance.Instance + + // Create new nodes (fetchers + seeds) + for i := 0; i < fetcherCount+newSeedCount; i++ { + inst := newNodeGenerator.Next() + instances = append(instances, inst) + } + // Create old nodes (just seeds) + for i := 0; i < oldSeedCount; i++ { + inst := oldNodeGenerator.Next() + instances = append(instances, inst) + } + // Connect all the nodes together + testinstance.ConnectInstances(instances) + + // Generate blocks, with a smaller root block + rootBlock := testutil.GenerateBlocksOfSize(1, rootBlockSize) + blocks := testutil.GenerateBlocksOfSize(bch.blockCount, stdBlockSize) + blocks[0] = rootBlock[0] + + // Run the distribution + runDistributionMulti(b, instances[:fetcherCount], instances[fetcherCount:], blocks, bstoreLatency, bch.distFn, bch.fetchFn) + + newNodeGenerator.Close() + oldNodeGenerator.Close() + }) + } + + out, _ := json.MarshalIndent(benchmarkLog, "", " ") + _ = ioutil.WriteFile("tmp/benchmark.json", out, 0666) + printResults(benchmarkLog) +} + const datacenterSpeed = 5 * time.Millisecond const fastSpeed = 60 * time.Millisecond const mediumSpeed = 200 * 
time.Millisecond @@ -226,12 +296,12 @@ func BenchmarkDatacenterMultiLeechMultiSeed(b *testing.B) { for i := 0; i < b.N; i++ { net := tn.RateLimitedVirtualNetwork(mockrouting.NewServer(), d, rateLimitGenerator) - ig := testinstance.NewTestInstanceGenerator(net) + ig := testinstance.NewTestInstanceGenerator(net, nil, nil) defer ig.Close() instances := ig.Instances(numnodes) blocks := testutil.GenerateBlocksOfSize(numblks, blockSize) - runDistributionMulti(b, instances, 3, blocks, bstoreLatency, df, ff) + runDistributionMulti(b, instances[:3], instances[3:], blocks, bstoreLatency, df, ff) } }) @@ -244,7 +314,7 @@ func subtestDistributeAndFetch(b *testing.B, numnodes, numblks int, d delay.D, b for i := 0; i < b.N; i++ { net := tn.VirtualNetwork(mockrouting.NewServer(), d) - ig := testinstance.NewTestInstanceGenerator(net) + ig := testinstance.NewTestInstanceGenerator(net, nil, nil) instances := ig.Instances(numnodes) rootBlock := testutil.GenerateBlocksOfSize(1, rootBlockSize) @@ -252,7 +322,6 @@ func subtestDistributeAndFetch(b *testing.B, numnodes, numblks int, d delay.D, b blocks[0] = rootBlock[0] runDistribution(b, instances, blocks, bstoreLatency, df, ff) ig.Close() - // panic("done") } } @@ -260,7 +329,7 @@ func subtestDistributeAndFetchRateLimited(b *testing.B, numnodes, numblks int, d for i := 0; i < b.N; i++ { net := tn.RateLimitedVirtualNetwork(mockrouting.NewServer(), d, rateLimitGenerator) - ig := testinstance.NewTestInstanceGenerator(net) + ig := testinstance.NewTestInstanceGenerator(net, nil, nil) defer ig.Close() instances := ig.Instances(numnodes) @@ -271,12 +340,8 @@ func subtestDistributeAndFetchRateLimited(b *testing.B, numnodes, numblks int, d } } -func runDistributionMulti(b *testing.B, instances []testinstance.Instance, numFetchers int, blocks []blocks.Block, bstoreLatency time.Duration, df distFunc, ff fetchFunc) { - numnodes := len(instances) - fetchers := instances[numnodes-numFetchers:] - +func runDistributionMulti(b *testing.B, fetchers 
[]testinstance.Instance, seeds []testinstance.Instance, blocks []blocks.Block, bstoreLatency time.Duration, df distFunc, ff fetchFunc) { // Distribute blocks to seed nodes - seeds := instances[:numnodes-numFetchers] df(b, seeds, blocks) // Set the blockstore latency on seed nodes diff --git a/bitswap.go b/bitswap.go index 2bc7a189..e5e0ef14 100644 --- a/bitswap.go +++ b/bitswap.go @@ -14,9 +14,7 @@ import ( bsbpm "github.com/ipfs/go-bitswap/internal/blockpresencemanager" decision "github.com/ipfs/go-bitswap/internal/decision" bsgetter "github.com/ipfs/go-bitswap/internal/getter" - bsmsg "github.com/ipfs/go-bitswap/message" bsmq "github.com/ipfs/go-bitswap/internal/messagequeue" - bsnet "github.com/ipfs/go-bitswap/network" notifications "github.com/ipfs/go-bitswap/internal/notifications" bspm "github.com/ipfs/go-bitswap/internal/peermanager" bspqm "github.com/ipfs/go-bitswap/internal/providerquerymanager" @@ -25,6 +23,8 @@ import ( bssm "github.com/ipfs/go-bitswap/internal/sessionmanager" bsspm "github.com/ipfs/go-bitswap/internal/sessionpeermanager" bswm "github.com/ipfs/go-bitswap/internal/wantmanager" + bsmsg "github.com/ipfs/go-bitswap/message" + bsnet "github.com/ipfs/go-bitswap/network" blocks "github.com/ipfs/go-block-format" cid "github.com/ipfs/go-cid" blockstore "github.com/ipfs/go-ipfs-blockstore" @@ -84,6 +84,17 @@ func RebroadcastDelay(newRebroadcastDelay delay.D) Option { } } +// SetSendDontHaves indicates what to do when the engine receives a want-block +// for a block that is not in the blockstore. Either +// - Send a DONT_HAVE message +// - Simply don't respond +// This option is only used for testing. +func SetSendDontHaves(send bool) Option { + return func(bs *Bitswap) { + bs.engine.SetSendDontHaves(send) + } +} + // New initializes a BitSwap instance that communicates over the provided // BitSwapNetwork. This function registers the returned instance as the network // delegate. Runs until context is cancelled or bitswap.Close is called. 
@@ -111,14 +122,22 @@ func New(parent context.Context, network bsnet.BitSwapNetwork, return nil }) + var wm *bswm.WantManager + // onDontHaveTimeout is called when a want-block is sent to a peer that + // has an old version of Bitswap that doesn't support DONT_HAVE messages, + // and no response is received within a timeout. + onDontHaveTimeout := func(p peer.ID, dontHaves []cid.Cid) { + // Simulate a DONT_HAVE message arriving to the WantManager + wm.ReceiveFrom(ctx, p, nil, nil, dontHaves) + } peerQueueFactory := func(ctx context.Context, p peer.ID) bspm.PeerQueue { - return bsmq.New(ctx, p, network) + return bsmq.New(ctx, p, network, onDontHaveTimeout) } sim := bssim.New() bpm := bsbpm.New() pm := bspm.New(ctx, peerQueueFactory, network.Self()) - wm := bswm.New(ctx, pm, sim, bpm) + wm = bswm.New(ctx, pm, sim, bpm) pqm := bspqm.New(ctx, network) sessionFactory := func(ctx context.Context, id uint64, spm bssession.SessionPeerManager, diff --git a/bitswap_test.go b/bitswap_test.go index 723b25d6..0a0bcc98 100644 --- a/bitswap_test.go +++ b/bitswap_test.go @@ -10,10 +10,10 @@ import ( bitswap "github.com/ipfs/go-bitswap" decision "github.com/ipfs/go-bitswap/internal/decision" - "github.com/ipfs/go-bitswap/message" bssession "github.com/ipfs/go-bitswap/internal/session" testinstance "github.com/ipfs/go-bitswap/internal/testinstance" tn "github.com/ipfs/go-bitswap/internal/testnet" + "github.com/ipfs/go-bitswap/message" blocks "github.com/ipfs/go-block-format" cid "github.com/ipfs/go-cid" detectrace "github.com/ipfs/go-detect-race" @@ -37,7 +37,7 @@ func getVirtualNetwork() tn.Network { func TestClose(t *testing.T) { vnet := getVirtualNetwork() - ig := testinstance.NewTestInstanceGenerator(vnet) + ig := testinstance.NewTestInstanceGenerator(vnet, nil, nil) defer ig.Close() bgen := blocksutil.NewBlockGenerator() @@ -55,7 +55,7 @@ func TestProviderForKeyButNetworkCannotFind(t *testing.T) { // TODO revisit this rs := mockrouting.NewServer() net := tn.VirtualNetwork(rs, 
delay.Fixed(kNetworkDelay)) - ig := testinstance.NewTestInstanceGenerator(net) + ig := testinstance.NewTestInstanceGenerator(net, nil, nil) defer ig.Close() block := blocks.NewBlock([]byte("block")) @@ -81,7 +81,7 @@ func TestGetBlockFromPeerAfterPeerAnnounces(t *testing.T) { net := tn.VirtualNetwork(mockrouting.NewServer(), delay.Fixed(kNetworkDelay)) block := blocks.NewBlock([]byte("block")) - ig := testinstance.NewTestInstanceGenerator(net) + ig := testinstance.NewTestInstanceGenerator(net, nil, nil) defer ig.Close() peers := ig.Instances(2) @@ -111,7 +111,8 @@ func TestGetBlockFromPeerAfterPeerAnnounces(t *testing.T) { func TestDoesNotProvideWhenConfiguredNotTo(t *testing.T) { net := tn.VirtualNetwork(mockrouting.NewServer(), delay.Fixed(kNetworkDelay)) block := blocks.NewBlock([]byte("block")) - ig := testinstance.NewTestInstanceGenerator(net, bitswap.ProvideEnabled(false), bitswap.ProviderSearchDelay(50*time.Millisecond)) + bsOpts := []bitswap.Option{bitswap.ProvideEnabled(false), bitswap.ProviderSearchDelay(50 * time.Millisecond)} + ig := testinstance.NewTestInstanceGenerator(net, nil, bsOpts) defer ig.Close() hasBlock := ig.Next() @@ -148,7 +149,7 @@ func TestUnwantedBlockNotAdded(t *testing.T) { bsMessage := message.New(true) bsMessage.AddBlock(block) - ig := testinstance.NewTestInstanceGenerator(net) + ig := testinstance.NewTestInstanceGenerator(net, nil, nil) defer ig.Close() peers := ig.Instances(2) @@ -184,7 +185,7 @@ func TestPendingBlockAdded(t *testing.T) { bg := blocksutil.NewBlockGenerator() sessionBroadcastWantCapacity := 4 - ig := testinstance.NewTestInstanceGenerator(net) + ig := testinstance.NewTestInstanceGenerator(net, nil, nil) defer ig.Close() instance := ig.Instances(1)[0] @@ -282,7 +283,7 @@ func PerformDistributionTest(t *testing.T, numInstances, numBlocks int) { t.SkipNow() } net := tn.VirtualNetwork(mockrouting.NewServer(), delay.Fixed(kNetworkDelay)) - ig := testinstance.NewTestInstanceGenerator(net) + ig := 
testinstance.NewTestInstanceGenerator(net, nil, nil) defer ig.Close() bg := blocksutil.NewBlockGenerator() @@ -348,7 +349,7 @@ func TestSendToWantingPeer(t *testing.T) { } net := tn.VirtualNetwork(mockrouting.NewServer(), delay.Fixed(kNetworkDelay)) - ig := testinstance.NewTestInstanceGenerator(net) + ig := testinstance.NewTestInstanceGenerator(net, nil, nil) defer ig.Close() bg := blocksutil.NewBlockGenerator() @@ -390,7 +391,7 @@ func TestSendToWantingPeer(t *testing.T) { func TestEmptyKey(t *testing.T) { net := tn.VirtualNetwork(mockrouting.NewServer(), delay.Fixed(kNetworkDelay)) - ig := testinstance.NewTestInstanceGenerator(net) + ig := testinstance.NewTestInstanceGenerator(net, nil, nil) defer ig.Close() bs := ig.Instances(1)[0].Exchange @@ -423,7 +424,7 @@ func assertStat(t *testing.T, st *bitswap.Stat, sblks, rblks, sdata, rdata uint6 func TestBasicBitswap(t *testing.T) { net := tn.VirtualNetwork(mockrouting.NewServer(), delay.Fixed(kNetworkDelay)) - ig := testinstance.NewTestInstanceGenerator(net) + ig := testinstance.NewTestInstanceGenerator(net, nil, nil) defer ig.Close() bg := blocksutil.NewBlockGenerator() @@ -499,7 +500,7 @@ func TestBasicBitswap(t *testing.T) { func TestDoubleGet(t *testing.T) { net := tn.VirtualNetwork(mockrouting.NewServer(), delay.Fixed(kNetworkDelay)) - ig := testinstance.NewTestInstanceGenerator(net) + ig := testinstance.NewTestInstanceGenerator(net, nil, nil) defer ig.Close() bg := blocksutil.NewBlockGenerator() @@ -567,7 +568,7 @@ func TestDoubleGet(t *testing.T) { func TestWantlistCleanup(t *testing.T) { net := tn.VirtualNetwork(mockrouting.NewServer(), delay.Fixed(kNetworkDelay)) - ig := testinstance.NewTestInstanceGenerator(net) + ig := testinstance.NewTestInstanceGenerator(net, nil, nil) defer ig.Close() bg := blocksutil.NewBlockGenerator() @@ -689,7 +690,7 @@ func newReceipt(sent, recv, exchanged uint64) *decision.Receipt { func TestBitswapLedgerOneWay(t *testing.T) { net := tn.VirtualNetwork(mockrouting.NewServer(), 
delay.Fixed(kNetworkDelay)) - ig := testinstance.NewTestInstanceGenerator(net) + ig := testinstance.NewTestInstanceGenerator(net, nil, nil) defer ig.Close() bg := blocksutil.NewBlockGenerator() @@ -741,7 +742,7 @@ func TestBitswapLedgerOneWay(t *testing.T) { func TestBitswapLedgerTwoWay(t *testing.T) { net := tn.VirtualNetwork(mockrouting.NewServer(), delay.Fixed(kNetworkDelay)) - ig := testinstance.NewTestInstanceGenerator(net) + ig := testinstance.NewTestInstanceGenerator(net, nil, nil) defer ig.Close() bg := blocksutil.NewBlockGenerator() diff --git a/bitswap_with_sessions_test.go b/bitswap_with_sessions_test.go index 49e6d273..28d3a325 100644 --- a/bitswap_with_sessions_test.go +++ b/bitswap_with_sessions_test.go @@ -20,7 +20,7 @@ func TestBasicSessions(t *testing.T) { defer cancel() vnet := getVirtualNetwork() - ig := testinstance.NewTestInstanceGenerator(vnet) + ig := testinstance.NewTestInstanceGenerator(vnet, nil, nil) defer ig.Close() bgen := blocksutil.NewBlockGenerator() @@ -71,7 +71,7 @@ func TestSessionBetweenPeers(t *testing.T) { defer cancel() vnet := getVirtualNetwork() - ig := testinstance.NewTestInstanceGenerator(vnet) + ig := testinstance.NewTestInstanceGenerator(vnet, nil, nil) defer ig.Close() bgen := blocksutil.NewBlockGenerator() @@ -127,7 +127,7 @@ func TestSessionSplitFetch(t *testing.T) { defer cancel() vnet := getVirtualNetwork() - ig := testinstance.NewTestInstanceGenerator(vnet) + ig := testinstance.NewTestInstanceGenerator(vnet, nil, nil) defer ig.Close() bgen := blocksutil.NewBlockGenerator() @@ -171,7 +171,7 @@ func TestFetchNotConnected(t *testing.T) { defer cancel() vnet := getVirtualNetwork() - ig := testinstance.NewTestInstanceGenerator(vnet, bitswap.ProviderSearchDelay(10*time.Millisecond)) + ig := testinstance.NewTestInstanceGenerator(vnet, nil, []bitswap.Option{bitswap.ProviderSearchDelay(10 * time.Millisecond)}) defer ig.Close() bgen := blocksutil.NewBlockGenerator() @@ -216,7 +216,7 @@ func TestFetchAfterDisconnect(t 
*testing.T) { defer cancel() vnet := getVirtualNetwork() - ig := testinstance.NewTestInstanceGenerator(vnet, bitswap.ProviderSearchDelay(10*time.Millisecond)) + ig := testinstance.NewTestInstanceGenerator(vnet, nil, []bitswap.Option{bitswap.ProviderSearchDelay(10 * time.Millisecond)}) defer ig.Close() bgen := blocksutil.NewBlockGenerator() @@ -290,7 +290,7 @@ func TestInterestCacheOverflow(t *testing.T) { defer cancel() vnet := getVirtualNetwork() - ig := testinstance.NewTestInstanceGenerator(vnet) + ig := testinstance.NewTestInstanceGenerator(vnet, nil, nil) defer ig.Close() bgen := blocksutil.NewBlockGenerator() @@ -342,7 +342,7 @@ func TestPutAfterSessionCacheEvict(t *testing.T) { defer cancel() vnet := getVirtualNetwork() - ig := testinstance.NewTestInstanceGenerator(vnet) + ig := testinstance.NewTestInstanceGenerator(vnet, nil, nil) defer ig.Close() bgen := blocksutil.NewBlockGenerator() @@ -382,7 +382,7 @@ func TestMultipleSessions(t *testing.T) { defer cancel() vnet := getVirtualNetwork() - ig := testinstance.NewTestInstanceGenerator(vnet) + ig := testinstance.NewTestInstanceGenerator(vnet, nil, nil) defer ig.Close() bgen := blocksutil.NewBlockGenerator() @@ -425,7 +425,7 @@ func TestWantlistClearsOnCancel(t *testing.T) { defer cancel() vnet := getVirtualNetwork() - ig := testinstance.NewTestInstanceGenerator(vnet) + ig := testinstance.NewTestInstanceGenerator(vnet, nil, nil) defer ig.Close() bgen := blocksutil.NewBlockGenerator() diff --git a/internal/decision/engine.go b/internal/decision/engine.go index 2e183b06..bf51beae 100644 --- a/internal/decision/engine.go +++ b/internal/decision/engine.go @@ -161,6 +161,8 @@ type Engine struct { // bytes up to which we will replace a want-have with a want-block maxBlockSizeReplaceHasWithBlock int + sendDontHaves bool + self peer.ID } @@ -180,6 +182,7 @@ func newEngine(ctx context.Context, bs bstore.Blockstore, peerTagger PeerTagger, ticker: time.NewTicker(time.Millisecond * 100), maxBlockSizeReplaceHasWithBlock: 
maxReplaceSize, taskWorkerCount: taskWorkerCount, + sendDontHaves: true, self: self, } e.tagQueued = fmt.Sprintf(tagFormat, "queued", uuid.New().String()) @@ -193,6 +196,16 @@ func newEngine(ctx context.Context, bs bstore.Blockstore, peerTagger PeerTagger, return e } +// SetSendDontHaves indicates what to do when the engine receives a want-block +// for a block that is not in the blockstore. Either +// - Send a DONT_HAVE message +// - Simply don't respond +// Older versions of Bitswap did not respond, so this allows us to simulate +// those older versions for testing. +func (e *Engine) SetSendDontHaves(send bool) { + e.sendDontHaves = send +} + // Start up workers to handle requests from other nodes for the data on this node func (e *Engine) StartWorkers(ctx context.Context, px process.Process) { // Start up blockstore manager @@ -563,7 +576,7 @@ func (e *Engine) MessageReceived(ctx context.Context, p peer.ID, m bsmsg.BitSwap // If the block was not found if !found { // Only add the task to the queue if the requester wants a DONT_HAVE - if entry.SendDontHave { + if e.sendDontHaves && entry.SendDontHave { newWorkExists = true isWantBlock := false if entry.WantType == pb.Message_Wantlist_Block { diff --git a/internal/messagequeue/messagequeue.go b/internal/messagequeue/messagequeue.go index b8caad57..2a48f83e 100644 --- a/internal/messagequeue/messagequeue.go +++ b/internal/messagequeue/messagequeue.go @@ -33,6 +33,11 @@ const ( maxPriority = math.MaxInt32 // sendMessageDebounce is the debounce duration when calling sendMessage() sendMessageDebounce = time.Millisecond + // dontHaveTimeout is used to simulate a DONT_HAVE when communicating with + // a peer whose Bitswap client doesn't support the DONT_HAVE response. + // If the peer doesn't respond to a want-block within the timeout, the + // local node assumes that the peer doesn't have the block. 
+ dontHaveTimeout = 5 * time.Second ) // MessageNetwork is any network that can connect peers and generate a message @@ -56,11 +61,13 @@ type MessageQueue struct { done chan struct{} // Take lock whenever any of these variables are modified - wllock sync.Mutex - bcstWants recallWantlist - peerWants recallWantlist - cancels *cid.Set - priority int + wllock sync.Mutex + bcstWants recallWantlist + peerWants recallWantlist + cancels *cid.Set + priority int + dontHaveTimers map[cid.Cid]*time.Timer + onDontHaveTimeout OnDontHaveTimeout // Dont touch any of these variables outside of run loop sender bsnet.MessageSender @@ -104,13 +111,17 @@ func (r *recallWantlist) RemoveType(c cid.Cid, wtype pb.Message_Wantlist_WantTyp r.pending.RemoveType(c, wtype) } -// New creats a new MessageQueue. -func New(ctx context.Context, p peer.ID, network MessageNetwork) *MessageQueue { - return newMessageQueue(ctx, p, network, maxMessageSize, sendErrorBackoff) +// Fires when a timeout occurs waiting for a response from a peer running an +// older version of Bitswap that doesn't support DONT_HAVE messages. +type OnDontHaveTimeout func(peer.ID, []cid.Cid) + +// New creates a new MessageQueue. 
+func New(ctx context.Context, p peer.ID, network MessageNetwork, onDontHaveTimeout OnDontHaveTimeout) *MessageQueue { + return newMessageQueue(ctx, p, network, maxMessageSize, sendErrorBackoff, onDontHaveTimeout) } // This constructor is used by the tests -func newMessageQueue(ctx context.Context, p peer.ID, network MessageNetwork, maxMsgSize int, sendErrorBackoff time.Duration) *MessageQueue { +func newMessageQueue(ctx context.Context, p peer.ID, network MessageNetwork, maxMsgSize int, sendErrorBackoff time.Duration, onDontHaveTimeout OnDontHaveTimeout) *MessageQueue { mq := &MessageQueue{ ctx: ctx, p: p, @@ -124,6 +135,8 @@ func newMessageQueue(ctx context.Context, p peer.ID, network MessageNetwork, max rebroadcastInterval: defaultRebroadcastInterval, sendErrorBackoff: sendErrorBackoff, priority: maxPriority, + dontHaveTimers: make(map[cid.Cid]*time.Timer), + onDontHaveTimeout: onDontHaveTimeout, } // Apply debounce to the work ready signal (which triggers sending a message) @@ -198,6 +211,12 @@ func (mq *MessageQueue) AddCancels(cancelKs []cid.Cid) { mq.bcstWants.Remove(c) mq.peerWants.Remove(c) mq.cancels.Add(c) + + // The want has been cancelled, so stop any pending dontHaveTimers for the want + if timer, ok := mq.dontHaveTimers[c]; ok { + timer.Stop() + delete(mq.dontHaveTimers, c) + } } // Schedule a message send @@ -224,6 +243,15 @@ func (mq *MessageQueue) Startup() { // Shutdown stops the processing of messages for a message queue. func (mq *MessageQueue) Shutdown() { + mq.wllock.Lock() + defer mq.wllock.Unlock() + + // Clean up want dontHaveTimers + for c, timer := range mq.dontHaveTimers { + timer.Stop() + delete(mq.dontHaveTimers, c) + } + close(mq.done) } @@ -315,6 +343,8 @@ func (mq *MessageQueue) sendMessage() { // We were able to send successfully. 
onSent() + mq.simulateDontHaveWithTimeout(message) + // If the message was too big and only a subset of wants could be // sent, schedule sending the rest of the wants in the next // iteration of the event loop. @@ -327,6 +357,61 @@ func (mq *MessageQueue) sendMessage() { } } +// If the peer is running an older version of Bitswap that doesn't support the +// DONT_HAVE response, watch for timeouts on any want-blocks we sent the peer, +// and if there is a timeout simulate a DONT_HAVE response. +func (mq *MessageQueue) simulateDontHaveWithTimeout(msg bsmsg.BitSwapMessage) { + // If the peer supports DONT_HAVE responses, we don't need to simulate + if mq.sender.SupportsHave() { + return + } + + mq.wllock.Lock() + defer mq.wllock.Unlock() + + // Get the CID of each want-block that expects a DONT_HAVE response + wantlist := msg.Wantlist() + wants := make([]cid.Cid, 0, len(wantlist)) + for _, entry := range wantlist { + if entry.WantType == pb.Message_Wantlist_Block && entry.SendDontHave { + // Unlikely, but just in case check that the block hasn't been + // received in the interim + c := entry.Cid + if _, ok := mq.peerWants.allWants.Contains(c); ok { + // Check we don't already have a timer running for this cid + if _, existing := mq.dontHaveTimers[c]; !existing { + wants = append(wants, c) + } + } + } + } + + if len(wants) == 0 { + return + } + + // Create a timer that will fire if we don't receive a block + timer := time.AfterFunc(dontHaveTimeout, func() { + // Figure out which of the blocks that were wanted were not received + // within the timeout + pending := make([]cid.Cid, 0, len(wants)) + for _, c := range wants { + if _, ok := mq.dontHaveTimers[c]; ok { + delete(mq.dontHaveTimers, c) + pending = append(pending, c) + } + } + + // Fire the event + if len(pending) > 0 { + mq.onDontHaveTimeout(mq.p, pending) + } + }) + for _, c := range wants { + mq.dontHaveTimers[c] = timer + } +} + // func (mq *MessageQueue) logOutgoingMessage(msg bsmsg.BitSwapMessage) { // 
entries := msg.Wantlist() // for _, e := range entries { diff --git a/internal/testinstance/testinstance.go b/internal/testinstance/testinstance.go index 2068928d..b1651db1 100644 --- a/internal/testinstance/testinstance.go +++ b/internal/testinstance/testinstance.go @@ -5,8 +5,8 @@ import ( "time" bitswap "github.com/ipfs/go-bitswap" - bsnet "github.com/ipfs/go-bitswap/network" tn "github.com/ipfs/go-bitswap/internal/testnet" + bsnet "github.com/ipfs/go-bitswap/network" ds "github.com/ipfs/go-datastore" delayed "github.com/ipfs/go-datastore/delayed" ds_sync "github.com/ipfs/go-datastore/sync" @@ -19,24 +19,26 @@ import ( // NewTestInstanceGenerator generates a new InstanceGenerator for the given // testnet -func NewTestInstanceGenerator(net tn.Network, bsOptions ...bitswap.Option) InstanceGenerator { +func NewTestInstanceGenerator(net tn.Network, netOptions []bsnet.NetOpt, bsOptions []bitswap.Option) InstanceGenerator { ctx, cancel := context.WithCancel(context.Background()) return InstanceGenerator{ - net: net, - seq: 0, - ctx: ctx, // TODO take ctx as param to Next, Instances - cancel: cancel, - bsOptions: bsOptions, + net: net, + seq: 0, + ctx: ctx, // TODO take ctx as param to Next, Instances + cancel: cancel, + bsOptions: bsOptions, + netOptions: netOptions, } } // InstanceGenerator generates new test instances of bitswap+dependencies type InstanceGenerator struct { - seq int - net tn.Network - ctx context.Context - cancel context.CancelFunc - bsOptions []bitswap.Option + seq int + net tn.Network + ctx context.Context + cancel context.CancelFunc + bsOptions []bitswap.Option + netOptions []bsnet.NetOpt } // Close closes the clobal context, shutting down all test instances @@ -52,7 +54,7 @@ func (g *InstanceGenerator) Next() Instance { if err != nil { panic("FIXME") // TODO change signature } - return NewInstance(g.ctx, g.net, p, g.bsOptions...) 
+ return NewInstance(g.ctx, g.net, p, g.netOptions, g.bsOptions) } // Instances creates N test instances of bitswap + dependencies and connects @@ -63,6 +65,12 @@ func (g *InstanceGenerator) Instances(n int) []Instance { inst := g.Next() instances = append(instances, inst) } + ConnectInstances(instances) + return instances +} + +// ConnectInstances connects the given instances to each other +func ConnectInstances(instances []Instance) { for i, inst := range instances { for j := i + 1; j < len(instances); j++ { oinst := instances[j] @@ -72,7 +80,6 @@ func (g *InstanceGenerator) Instances(n int) []Instance { } } } - return instances } // Instance is a test instance of bitswap + dependencies for integration testing @@ -100,10 +107,10 @@ func (i *Instance) SetBlockstoreLatency(t time.Duration) time.Duration { // NB: It's easy make mistakes by providing the same peer ID to two different // instances. To safeguard, use the InstanceGenerator to generate instances. It's // just a much better idea. -func NewInstance(ctx context.Context, net tn.Network, p tnet.Identity, options ...bitswap.Option) Instance { +func NewInstance(ctx context.Context, net tn.Network, p tnet.Identity, netOptions []bsnet.NetOpt, bsOptions []bitswap.Option) Instance { bsdelay := delay.Fixed(0) - adapter := net.Adapter(p) + adapter := net.Adapter(p, netOptions...) dstore := ds_sync.MutexWrap(delayed.New(ds.NewMapDatastore(), bsdelay)) bstore, err := blockstore.CachedBlockstore(ctx, @@ -113,7 +120,7 @@ func NewInstance(ctx context.Context, net tn.Network, p tnet.Identity, options . panic(err.Error()) // FIXME perhaps change signature and return error. 
} - bs := bitswap.New(ctx, adapter, bstore, options...).(*bitswap.Bitswap) + bs := bitswap.New(ctx, adapter, bstore, bsOptions...).(*bitswap.Bitswap) return Instance{ Adapter: adapter, diff --git a/internal/testnet/virtual.go b/internal/testnet/virtual.go index 9a92d1c7..4cce43e8 100644 --- a/internal/testnet/virtual.go +++ b/internal/testnet/virtual.go @@ -18,6 +18,7 @@ import ( "github.com/libp2p/go-libp2p-core/connmgr" "github.com/libp2p/go-libp2p-core/peer" "github.com/libp2p/go-libp2p-core/routing" + protocol "github.com/libp2p/go-libp2p-protocol" tnet "github.com/libp2p/go-libp2p-testing/net" mocknet "github.com/libp2p/go-libp2p/p2p/net/mock" ) @@ -88,10 +89,23 @@ func (n *network) Adapter(p tnet.Identity, opts ...bsnet.NetOpt) bsnet.BitSwapNe n.mu.Lock() defer n.mu.Unlock() + s := bsnet.Settings{ + SupportedProtocols: []protocol.ID{ + bsnet.ProtocolBitswap, + bsnet.ProtocolBitswapOneOne, + bsnet.ProtocolBitswapOneZero, + bsnet.ProtocolBitswapNoVers, + }, + } + for _, opt := range opts { + opt(&s) + } + client := &networkClient{ - local: p.ID(), - network: n, - routing: n.routingserver.Client(p), + local: p.ID(), + network: n, + routing: n.routingserver.Client(p), + supportedProtocols: s.SupportedProtocols, } n.clients[p.ID()] = &receiverQueue{receiver: client} return client @@ -169,9 +183,10 @@ func (n *network) SendMessage( type networkClient struct { local peer.ID bsnet.Receiver - network *network - routing routing.Routing - stats bsnet.Stats + network *network + routing routing.Routing + stats bsnet.Stats + supportedProtocols []protocol.ID } func (nc *networkClient) Self() peer.ID { @@ -240,8 +255,20 @@ func (mp *messagePasser) Reset() error { return nil } +var oldProtos = map[protocol.ID]struct{}{ + bsnet.ProtocolBitswapNoVers: struct{}{}, + bsnet.ProtocolBitswapOneZero: struct{}{}, + bsnet.ProtocolBitswapOneOne: struct{}{}, +} + func (mp *messagePasser) SupportsHave() bool { - return true + protos := 
mp.net.network.clients[mp.target].receiver.supportedProtocols + for _, proto := range protos { + if _, ok := oldProtos[proto]; !ok { + return true + } + } + return false } func (nc *networkClient) NewMessageSender(ctx context.Context, p peer.ID) (bsnet.MessageSender, error) { From c35d90d8507f872b1259fec34fe8313a4a6f8f99 Mon Sep 17 00:00:00 2001 From: Dirk McCormick Date: Fri, 31 Jan 2020 15:48:21 -0500 Subject: [PATCH 02/11] fix: lint --- benchmarks_test.go | 2 +- internal/messagequeue/messagequeue_test.go | 26 ++++++++++++---------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/benchmarks_test.go b/benchmarks_test.go index 92906da0..9761a26c 100644 --- a/benchmarks_test.go +++ b/benchmarks_test.go @@ -15,7 +15,7 @@ import ( "github.com/ipfs/go-bitswap/internal/testutil" blocks "github.com/ipfs/go-block-format" - protocol "github.com/libp2p/go-libp2p-protocol" + protocol "github.com/libp2p/go-libp2p-core/protocol" bitswap "github.com/ipfs/go-bitswap" bssession "github.com/ipfs/go-bitswap/internal/session" diff --git a/internal/messagequeue/messagequeue_test.go b/internal/messagequeue/messagequeue_test.go index ad66c944..f884c377 100644 --- a/internal/messagequeue/messagequeue_test.go +++ b/internal/messagequeue/messagequeue_test.go @@ -6,8 +6,8 @@ import ( "testing" "time" - "github.com/ipfs/go-bitswap/message" "github.com/ipfs/go-bitswap/internal/testutil" + "github.com/ipfs/go-bitswap/message" cid "github.com/ipfs/go-cid" bsmsg "github.com/ipfs/go-bitswap/message" @@ -56,6 +56,8 @@ func (fms *fakeMessageSender) Close() error { fms.fullClosed <- struct{}{} func (fms *fakeMessageSender) Reset() error { fms.reset <- struct{}{}; return nil } func (fms *fakeMessageSender) SupportsHave() bool { return fms.supportsHave } +func mockTimeoutCb(peer.ID, []cid.Cid) {} + func collectMessages(ctx context.Context, t *testing.T, messagesSent <-chan bsmsg.BitSwapMessage, @@ -90,7 +92,7 @@ func TestStartupAndShutdown(t *testing.T) { fakeSender := 
&fakeMessageSender{nil, fullClosedChan, resetChan, messagesSent, sendErrors, true} fakenet := &fakeMessageNetwork{nil, nil, fakeSender} peerID := testutil.GeneratePeers(1)[0] - messageQueue := New(ctx, peerID, fakenet) + messageQueue := New(ctx, peerID, fakenet, mockTimeoutCb) bcstwh := testutil.GenerateCids(10) messageQueue.Startup() @@ -132,7 +134,7 @@ func TestSendingMessagesDeduped(t *testing.T) { fakeSender := &fakeMessageSender{nil, fullClosedChan, resetChan, messagesSent, sendErrors, true} fakenet := &fakeMessageNetwork{nil, nil, fakeSender} peerID := testutil.GeneratePeers(1)[0] - messageQueue := New(ctx, peerID, fakenet) + messageQueue := New(ctx, peerID, fakenet, mockTimeoutCb) wantHaves := testutil.GenerateCids(10) wantBlocks := testutil.GenerateCids(10) @@ -155,7 +157,7 @@ func TestSendingMessagesPartialDupe(t *testing.T) { fakeSender := &fakeMessageSender{nil, fullClosedChan, resetChan, messagesSent, sendErrors, true} fakenet := &fakeMessageNetwork{nil, nil, fakeSender} peerID := testutil.GeneratePeers(1)[0] - messageQueue := New(ctx, peerID, fakenet) + messageQueue := New(ctx, peerID, fakenet, mockTimeoutCb) wantHaves := testutil.GenerateCids(10) wantBlocks := testutil.GenerateCids(10) @@ -178,7 +180,7 @@ func TestSendingMessagesPriority(t *testing.T) { fakeSender := &fakeMessageSender{nil, fullClosedChan, resetChan, messagesSent, sendErrors, true} fakenet := &fakeMessageNetwork{nil, nil, fakeSender} peerID := testutil.GeneratePeers(1)[0] - messageQueue := New(ctx, peerID, fakenet) + messageQueue := New(ctx, peerID, fakenet, mockTimeoutCb) wantHaves1 := testutil.GenerateCids(5) wantHaves2 := testutil.GenerateCids(5) wantHaves := append(wantHaves1, wantHaves2...) 
@@ -247,7 +249,7 @@ func TestCancelOverridesPendingWants(t *testing.T) { fakeSender := &fakeMessageSender{nil, fullClosedChan, resetChan, messagesSent, sendErrors, true} fakenet := &fakeMessageNetwork{nil, nil, fakeSender} peerID := testutil.GeneratePeers(1)[0] - messageQueue := New(ctx, peerID, fakenet) + messageQueue := New(ctx, peerID, fakenet, mockTimeoutCb) wantHaves := testutil.GenerateCids(2) wantBlocks := testutil.GenerateCids(2) @@ -281,7 +283,7 @@ func TestWantOverridesPendingCancels(t *testing.T) { fakeSender := &fakeMessageSender{nil, fullClosedChan, resetChan, messagesSent, sendErrors, true} fakenet := &fakeMessageNetwork{nil, nil, fakeSender} peerID := testutil.GeneratePeers(1)[0] - messageQueue := New(ctx, peerID, fakenet) + messageQueue := New(ctx, peerID, fakenet, mockTimeoutCb) cancels := testutil.GenerateCids(3) messageQueue.Startup() @@ -314,7 +316,7 @@ func TestWantlistRebroadcast(t *testing.T) { fakeSender := &fakeMessageSender{nil, fullClosedChan, resetChan, messagesSent, sendErrors, true} fakenet := &fakeMessageNetwork{nil, nil, fakeSender} peerID := testutil.GeneratePeers(1)[0] - messageQueue := New(ctx, peerID, fakenet) + messageQueue := New(ctx, peerID, fakenet, mockTimeoutCb) bcstwh := testutil.GenerateCids(10) wantHaves := testutil.GenerateCids(10) wantBlocks := testutil.GenerateCids(10) @@ -415,7 +417,7 @@ func TestSendingLargeMessages(t *testing.T) { wantBlocks := testutil.GenerateCids(10) entrySize := 44 maxMsgSize := entrySize * 3 // 3 wants - messageQueue := newMessageQueue(ctx, peerID, fakenet, maxMsgSize, sendErrorBackoff) + messageQueue := newMessageQueue(ctx, peerID, fakenet, maxMsgSize, sendErrorBackoff, mockTimeoutCb) messageQueue.Startup() messageQueue.AddWants(wantBlocks, []cid.Cid{}) @@ -442,7 +444,7 @@ func TestSendToPeerThatDoesntSupportHave(t *testing.T) { fakenet := &fakeMessageNetwork{nil, nil, fakeSender} peerID := testutil.GeneratePeers(1)[0] - messageQueue := New(ctx, peerID, fakenet) + messageQueue := New(ctx, 
peerID, fakenet, mockTimeoutCb) messageQueue.Startup() // If the remote peer doesn't support HAVE / DONT_HAVE messages @@ -498,7 +500,7 @@ func TestResendAfterError(t *testing.T) { fakenet := &fakeMessageNetwork{nil, nil, fakeSender} peerID := testutil.GeneratePeers(1)[0] sendErrBackoff := 5 * time.Millisecond - messageQueue := newMessageQueue(ctx, peerID, fakenet, maxMessageSize, sendErrBackoff) + messageQueue := newMessageQueue(ctx, peerID, fakenet, maxMessageSize, sendErrBackoff, mockTimeoutCb) wantBlocks := testutil.GenerateCids(10) wantHaves := testutil.GenerateCids(10) @@ -536,7 +538,7 @@ func TestResendAfterMaxRetries(t *testing.T) { fakenet := &fakeMessageNetwork{nil, nil, fakeSender} peerID := testutil.GeneratePeers(1)[0] sendErrBackoff := 2 * time.Millisecond - messageQueue := newMessageQueue(ctx, peerID, fakenet, maxMessageSize, sendErrBackoff) + messageQueue := newMessageQueue(ctx, peerID, fakenet, maxMessageSize, sendErrBackoff, mockTimeoutCb) wantBlocks := testutil.GenerateCids(10) wantHaves := testutil.GenerateCids(10) wantBlocks2 := testutil.GenerateCids(10) From 2a57572c03bd5c55fc6ee3d55472d4ecc631279a Mon Sep 17 00:00:00 2001 From: Dirk McCormick Date: Fri, 31 Jan 2020 16:33:52 -0500 Subject: [PATCH 03/11] fix: message queue DONT_HAVE timer bug --- internal/messagequeue/messagequeue.go | 35 +++++++++++++++------- internal/messagequeue/messagequeue_test.go | 22 +++++++++++--- 2 files changed, 43 insertions(+), 14 deletions(-) diff --git a/internal/messagequeue/messagequeue.go b/internal/messagequeue/messagequeue.go index 2a48f83e..6ce21440 100644 --- a/internal/messagequeue/messagequeue.go +++ b/internal/messagequeue/messagequeue.go @@ -55,6 +55,7 @@ type MessageQueue struct { network MessageNetwork maxMessageSize int sendErrorBackoff time.Duration + dontHaveTimeout time.Duration signalWorkReady func() outgoingWork chan struct{} @@ -66,7 +67,7 @@ type MessageQueue struct { peerWants recallWantlist cancels *cid.Set priority int - dontHaveTimers 
map[cid.Cid]*time.Timer + dontHaveTimers map[cid.Cid]*dontHaveTimer onDontHaveTimeout OnDontHaveTimeout // Dont touch any of these variables outside of run loop @@ -111,17 +112,26 @@ func (r *recallWantlist) RemoveType(c cid.Cid, wtype pb.Message_Wantlist_WantTyp r.pending.RemoveType(c, wtype) } +// dontHaveTimer is used to keep track of references to a timer +// (see OnDontHaveTimeout) +type dontHaveTimer struct { + timer *time.Timer + refs int +} + // Fires when a timeout occurs waiting for a response from a peer running an // older version of Bitswap that doesn't support DONT_HAVE messages. type OnDontHaveTimeout func(peer.ID, []cid.Cid) // New creates a new MessageQueue. func New(ctx context.Context, p peer.ID, network MessageNetwork, onDontHaveTimeout OnDontHaveTimeout) *MessageQueue { - return newMessageQueue(ctx, p, network, maxMessageSize, sendErrorBackoff, onDontHaveTimeout) + return newMessageQueue(ctx, p, network, maxMessageSize, sendErrorBackoff, dontHaveTimeout, onDontHaveTimeout) } // This constructor is used by the tests -func newMessageQueue(ctx context.Context, p peer.ID, network MessageNetwork, maxMsgSize int, sendErrorBackoff time.Duration, onDontHaveTimeout OnDontHaveTimeout) *MessageQueue { +func newMessageQueue(ctx context.Context, p peer.ID, network MessageNetwork, + maxMsgSize int, sendErrorBackoff time.Duration, dontHaveTimeout time.Duration, onDontHaveTimeout OnDontHaveTimeout) *MessageQueue { + mq := &MessageQueue{ ctx: ctx, p: p, @@ -135,7 +145,8 @@ func newMessageQueue(ctx context.Context, p peer.ID, network MessageNetwork, max rebroadcastInterval: defaultRebroadcastInterval, sendErrorBackoff: sendErrorBackoff, priority: maxPriority, - dontHaveTimers: make(map[cid.Cid]*time.Timer), + dontHaveTimers: make(map[cid.Cid]*dontHaveTimer), + dontHaveTimeout: dontHaveTimeout, onDontHaveTimeout: onDontHaveTimeout, } @@ -213,8 +224,11 @@ func (mq *MessageQueue) AddCancels(cancelKs []cid.Cid) { mq.cancels.Add(c) // The want has been cancelled, so 
stop any pending dontHaveTimers for the want - if timer, ok := mq.dontHaveTimers[c]; ok { - timer.Stop() + if t, ok := mq.dontHaveTimers[c]; ok { + t.refs-- + if t.refs <= 0 { + t.timer.Stop() + } delete(mq.dontHaveTimers, c) } } @@ -247,8 +261,8 @@ func (mq *MessageQueue) Shutdown() { defer mq.wllock.Unlock() // Clean up want dontHaveTimers - for c, timer := range mq.dontHaveTimers { - timer.Stop() + for c, t := range mq.dontHaveTimers { + t.timer.Stop() delete(mq.dontHaveTimers, c) } @@ -391,7 +405,7 @@ func (mq *MessageQueue) simulateDontHaveWithTimeout(msg bsmsg.BitSwapMessage) { } // Create a timer that will fire if we don't receive a block - timer := time.AfterFunc(dontHaveTimeout, func() { + timer := time.AfterFunc(mq.dontHaveTimeout, func() { // Figure out which of the blocks that were wanted were not received // within the timeout pending := make([]cid.Cid, 0, len(wants)) @@ -407,8 +421,9 @@ func (mq *MessageQueue) simulateDontHaveWithTimeout(msg bsmsg.BitSwapMessage) { mq.onDontHaveTimeout(mq.p, pending) } }) + dt := &dontHaveTimer{timer, len(wants)} for _, c := range wants { - mq.dontHaveTimers[c] = timer + mq.dontHaveTimers[c] = dt } } diff --git a/internal/messagequeue/messagequeue_test.go b/internal/messagequeue/messagequeue_test.go index f884c377..f8c7eb51 100644 --- a/internal/messagequeue/messagequeue_test.go +++ b/internal/messagequeue/messagequeue_test.go @@ -417,7 +417,7 @@ func TestSendingLargeMessages(t *testing.T) { wantBlocks := testutil.GenerateCids(10) entrySize := 44 maxMsgSize := entrySize * 3 // 3 wants - messageQueue := newMessageQueue(ctx, peerID, fakenet, maxMsgSize, sendErrorBackoff, mockTimeoutCb) + messageQueue := newMessageQueue(ctx, peerID, fakenet, maxMsgSize, sendErrorBackoff, dontHaveTimeout, mockTimeoutCb) messageQueue.Startup() messageQueue.AddWants(wantBlocks, []cid.Cid{}) @@ -443,8 +443,14 @@ func TestSendToPeerThatDoesntSupportHave(t *testing.T) { fakeSender := &fakeMessageSender{nil, fullClosedChan, resetChan, 
messagesSent, sendErrors, false} fakenet := &fakeMessageNetwork{nil, nil, fakeSender} peerID := testutil.GeneratePeers(1)[0] + sendErrBackoff := 5 * time.Millisecond - messageQueue := New(ctx, peerID, fakenet, mockTimeoutCb) + var timedOutKs []cid.Cid + dontHaveTimeoutCb := func(p peer.ID, ks []cid.Cid) { + timedOutKs = append(timedOutKs, ks...) + } + dontHaveTimeout := time.Millisecond * 15 + messageQueue := newMessageQueue(ctx, peerID, fakenet, maxMessageSize, sendErrBackoff, dontHaveTimeout, dontHaveTimeoutCb) messageQueue.Startup() // If the remote peer doesn't support HAVE / DONT_HAVE messages @@ -488,6 +494,14 @@ func TestSendToPeerThatDoesntSupportHave(t *testing.T) { t.Fatal("should only send want-blocks") } } + + messageQueue.AddCancels(wbs[:5]) + time.Sleep(20 * time.Millisecond) + + // dontHaveTimeoutCb should be called after the timeout expires + if len(timedOutKs) != len(wbs)-5 { + t.Fatal("expected timeout callback to be called for want-blocks") + } } func TestResendAfterError(t *testing.T) { @@ -500,7 +514,7 @@ func TestResendAfterError(t *testing.T) { fakenet := &fakeMessageNetwork{nil, nil, fakeSender} peerID := testutil.GeneratePeers(1)[0] sendErrBackoff := 5 * time.Millisecond - messageQueue := newMessageQueue(ctx, peerID, fakenet, maxMessageSize, sendErrBackoff, mockTimeoutCb) + messageQueue := newMessageQueue(ctx, peerID, fakenet, maxMessageSize, sendErrBackoff, dontHaveTimeout, mockTimeoutCb) wantBlocks := testutil.GenerateCids(10) wantHaves := testutil.GenerateCids(10) @@ -538,7 +552,7 @@ func TestResendAfterMaxRetries(t *testing.T) { fakenet := &fakeMessageNetwork{nil, nil, fakeSender} peerID := testutil.GeneratePeers(1)[0] sendErrBackoff := 2 * time.Millisecond - messageQueue := newMessageQueue(ctx, peerID, fakenet, maxMessageSize, sendErrBackoff, mockTimeoutCb) + messageQueue := newMessageQueue(ctx, peerID, fakenet, maxMessageSize, sendErrBackoff, dontHaveTimeout, mockTimeoutCb) wantBlocks := testutil.GenerateCids(10) wantHaves := 
testutil.GenerateCids(10) wantBlocks2 := testutil.GenerateCids(10) From 014348b08e989b05435d200736fe453e9567eaed Mon Sep 17 00:00:00 2001 From: Dirk McCormick Date: Tue, 4 Feb 2020 16:56:03 -0500 Subject: [PATCH 04/11] feat: add message queue pinger --- internal/messagequeue/messagequeue.go | 54 ++++-- internal/messagequeue/messagequeue_test.go | 14 +- internal/messagequeue/mqpinger.go | 205 +++++++++++++++++++++ internal/testnet/virtual.go | 11 ++ network/interface.go | 13 ++ network/ipfs_impl.go | 12 ++ 6 files changed, 292 insertions(+), 17 deletions(-) create mode 100644 internal/messagequeue/mqpinger.go diff --git a/internal/messagequeue/messagequeue.go b/internal/messagequeue/messagequeue.go index 6ce21440..532b3ac4 100644 --- a/internal/messagequeue/messagequeue.go +++ b/internal/messagequeue/messagequeue.go @@ -15,6 +15,7 @@ import ( cid "github.com/ipfs/go-cid" logging "github.com/ipfs/go-log" peer "github.com/libp2p/go-libp2p-core/peer" + "github.com/libp2p/go-libp2p/p2p/protocol/ping" ) var log = logging.Logger("bitswap") @@ -33,11 +34,6 @@ const ( maxPriority = math.MaxInt32 // sendMessageDebounce is the debounce duration when calling sendMessage() sendMessageDebounce = time.Millisecond - // dontHaveTimeout is used to simulate a DONT_HAVE when communicating with - // a peer whose Bitswap client doesn't support the DONT_HAVE response. - // If the peer doesn't respond to a want-block within the timeout, the - // local node assumes that the peer doesn't have the block. - dontHaveTimeout = 5 * time.Second ) // MessageNetwork is any network that can connect peers and generate a message @@ -45,7 +41,8 @@ const ( type MessageNetwork interface { ConnectTo(context.Context, peer.ID) error NewMessageSender(context.Context, peer.ID) (bsnet.MessageSender, error) - Self() peer.ID + Latency(peer.ID) time.Duration + Ping(context.Context, peer.ID) ping.Result } // MessageQueue implements queue of want messages to send to peers. 
@@ -53,9 +50,9 @@ type MessageQueue struct { ctx context.Context p peer.ID network MessageNetwork + pinger *mqPinger maxMessageSize int sendErrorBackoff time.Duration - dontHaveTimeout time.Duration signalWorkReady func() outgoingWork chan struct{} @@ -112,10 +109,27 @@ func (r *recallWantlist) RemoveType(c cid.Cid, wtype pb.Message_Wantlist_WantTyp r.pending.RemoveType(c, wtype) } +type peerConn struct { + p peer.ID + network MessageNetwork +} + +func newPeerConnection(p peer.ID, network MessageNetwork) *peerConn { + return &peerConn{p, network} +} + +func (pc *peerConn) Ping(ctx context.Context) ping.Result { + return pc.network.Ping(ctx, pc.p) +} + +func (pc *peerConn) Latency() time.Duration { + return pc.network.Latency(pc.p) +} + // dontHaveTimer is used to keep track of references to a timer // (see OnDontHaveTimeout) type dontHaveTimer struct { - timer *time.Timer + timer Stoppable refs int } @@ -125,7 +139,7 @@ type OnDontHaveTimeout func(peer.ID, []cid.Cid) // New creates a new MessageQueue. 
func New(ctx context.Context, p peer.ID, network MessageNetwork, onDontHaveTimeout OnDontHaveTimeout) *MessageQueue { - return newMessageQueue(ctx, p, network, maxMessageSize, sendErrorBackoff, dontHaveTimeout, onDontHaveTimeout) + return newMessageQueue(ctx, p, network, maxMessageSize, dontHaveTimeout, sendErrorBackoff, onDontHaveTimeout) } // This constructor is used by the tests @@ -136,6 +150,7 @@ func newMessageQueue(ctx context.Context, p peer.ID, network MessageNetwork, ctx: ctx, p: p, network: network, + pinger: newMQPinger(ctx, newPeerConnection(p, network), dontHaveTimeout), maxMessageSize: maxMsgSize, bcstWants: newRecallWantList(), peerWants: newRecallWantList(), @@ -146,7 +161,6 @@ func newMessageQueue(ctx context.Context, p peer.ID, network MessageNetwork, sendErrorBackoff: sendErrorBackoff, priority: maxPriority, dontHaveTimers: make(map[cid.Cid]*dontHaveTimer), - dontHaveTimeout: dontHaveTimeout, onDontHaveTimeout: onDontHaveTimeout, } @@ -257,16 +271,21 @@ func (mq *MessageQueue) Startup() { // Shutdown stops the processing of messages for a message queue. 
func (mq *MessageQueue) Shutdown() { + close(mq.done) +} + +func (mq *MessageQueue) onShutdown() { mq.wllock.Lock() defer mq.wllock.Unlock() - // Clean up want dontHaveTimers + // Clean up dontHaveTimers for c, t := range mq.dontHaveTimers { t.timer.Stop() delete(mq.dontHaveTimers, c) } - close(mq.done) + // Stop the pinger + mq.pinger.Stop() } func (mq *MessageQueue) runQueue() { @@ -280,11 +299,13 @@ func (mq *MessageQueue) runQueue() { if mq.sender != nil { mq.sender.Close() } + mq.onShutdown() return case <-mq.ctx.Done(): if mq.sender != nil { _ = mq.sender.Reset() } + mq.onShutdown() return } } @@ -343,6 +364,12 @@ func (mq *MessageQueue) sendMessage() { return } + // Make sure the pinger has started + if !mq.sender.SupportsHave() { + // Note: Start is idempotent + mq.pinger.Start() + } + // Convert want lists to a Bitswap Message message, onSent := mq.extractOutgoingMessage(mq.sender.SupportsHave()) if message == nil || message.Empty() { @@ -405,7 +432,7 @@ func (mq *MessageQueue) simulateDontHaveWithTimeout(msg bsmsg.BitSwapMessage) { } // Create a timer that will fire if we don't receive a block - timer := time.AfterFunc(mq.dontHaveTimeout, func() { + timer := mq.pinger.AfterTimeout(func() { // Figure out which of the blocks that were wanted were not received // within the timeout pending := make([]cid.Cid, 0, len(wants)) @@ -520,6 +547,7 @@ func (mq *MessageQueue) extractOutgoingMessage(supportsHave bool) (bsmsg.BitSwap return msg, onSent } + func (mq *MessageQueue) initializeSender() error { if mq.sender != nil { return nil diff --git a/internal/messagequeue/messagequeue_test.go b/internal/messagequeue/messagequeue_test.go index f8c7eb51..e92eece7 100644 --- a/internal/messagequeue/messagequeue_test.go +++ b/internal/messagequeue/messagequeue_test.go @@ -3,6 +3,7 @@ package messagequeue import ( "context" "errors" + "fmt" "testing" "time" @@ -14,6 +15,7 @@ import ( pb "github.com/ipfs/go-bitswap/message/pb" bsnet "github.com/ipfs/go-bitswap/network" peer 
"github.com/libp2p/go-libp2p-core/peer" + "github.com/libp2p/go-libp2p/p2p/protocol/ping" ) type fakeMessageNetwork struct { @@ -33,7 +35,11 @@ func (fmn *fakeMessageNetwork) NewMessageSender(context.Context, peer.ID) (bsnet return nil, fmn.messageSenderError } -func (fms *fakeMessageNetwork) Self() peer.ID { return "" } +func (fms *fakeMessageNetwork) Self() peer.ID { return "" } +func (fms *fakeMessageNetwork) Latency(peer.ID) time.Duration { return 0 } +func (fms *fakeMessageNetwork) Ping(context.Context, peer.ID) ping.Result { + return ping.Result{Error: fmt.Errorf("err")} +} type fakeMessageSender struct { sendError error @@ -449,7 +455,7 @@ func TestSendToPeerThatDoesntSupportHave(t *testing.T) { dontHaveTimeoutCb := func(p peer.ID, ks []cid.Cid) { timedOutKs = append(timedOutKs, ks...) } - dontHaveTimeout := time.Millisecond * 15 + dontHaveTimeout := time.Millisecond * 1 messageQueue := newMessageQueue(ctx, peerID, fakenet, maxMessageSize, sendErrBackoff, dontHaveTimeout, dontHaveTimeoutCb) messageQueue.Startup() @@ -496,11 +502,11 @@ func TestSendToPeerThatDoesntSupportHave(t *testing.T) { } messageQueue.AddCancels(wbs[:5]) - time.Sleep(20 * time.Millisecond) + time.Sleep(300 * time.Millisecond) // dontHaveTimeoutCb should be called after the timeout expires if len(timedOutKs) != len(wbs)-5 { - t.Fatal("expected timeout callback to be called for want-blocks") + t.Fatal("expected timeout callback to be called for want-blocks", timedOutKs) } } diff --git a/internal/messagequeue/mqpinger.go b/internal/messagequeue/mqpinger.go new file mode 100644 index 00000000..9fc39481 --- /dev/null +++ b/internal/messagequeue/mqpinger.go @@ -0,0 +1,205 @@ +package messagequeue + +import ( + "context" + "sync" + "time" + + "github.com/libp2p/go-libp2p/p2p/protocol/ping" +) + +const ( + // dontHaveTimeout is used to simulate a DONT_HAVE when communicating with + // a peer whose Bitswap client doesn't support the DONT_HAVE response. 
+ // If the peer doesn't respond to a want-block within the timeout, the + // local node assumes that the peer doesn't have the block. + dontHaveTimeout = 5 * time.Second + + // maxExpectedWantProcessTime is the maximum amount of time we expect a + // peer takes to process a want and initiate sending a response to us + maxExpectedWantProcessTime = 200 * time.Millisecond + + // wantProcessLatencyMultiplier is multiplied by the average ping time to + // get an upper bound on how long we expect to wait for a peer's response + // to arrive + wantProcessLatencyMultiplier = 2 + + // maxPingErrorCount is the maximum number of times we can receive an + // error while pinging before we give up + maxPingErrorCount = 3 + + // pingInterval is the time between pings + pingInterval = 60 * time.Second +) + +type PeerConnection interface { + Ping(context.Context) ping.Result + Latency() time.Duration +} + +type Stoppable interface { + Stop() bool +} + +type mqPinger struct { + ctx context.Context + stop func() + peerConn PeerConnection + startCh chan struct{} + pendingTimers chan *pendingTimer + rtt time.Duration + defaultTimeout time.Duration + pingInterval time.Duration + latencyMultiplier int + maxExpectedWantProcessTime time.Duration + maxErrorCount int +} + +func newMQPinger(ctx context.Context, pc PeerConnection, defaultTimeout time.Duration) *mqPinger { + return newMQPingerWithParams(ctx, pc, defaultTimeout, pingInterval, + wantProcessLatencyMultiplier, maxExpectedWantProcessTime, maxPingErrorCount) +} + +func newMQPingerWithParams(ctx context.Context, pc PeerConnection, + defaultTimeout time.Duration, pingInterval time.Duration, latencyMultiplier int, + maxExpectedWantProcessTime time.Duration, maxErrorCount int) *mqPinger { + + ctx, stop := context.WithCancel(ctx) + mqp := &mqPinger{ + ctx: ctx, + stop: stop, + peerConn: pc, + startCh: make(chan struct{}, 1), + pendingTimers: make(chan *pendingTimer, 128), + defaultTimeout: defaultTimeout, + pingInterval: pingInterval, + 
latencyMultiplier: latencyMultiplier, + maxExpectedWantProcessTime: maxExpectedWantProcessTime, + maxErrorCount: maxErrorCount, + } + + go mqp.init() + + return mqp +} + +type pendingTimer struct { + fn func() + start time.Time + lk sync.Mutex + timer *time.Timer +} + +func (pt *pendingTimer) Stop() bool { + pt.lk.Lock() + defer pt.lk.Unlock() + + if pt.timer != nil { + return pt.timer.Stop() + } + return true +} + +func (mqp *mqPinger) AfterTimeout(fn func()) Stoppable { + pt := &pendingTimer{fn: fn, start: time.Now()} + + select { + case mqp.pendingTimers <- pt: + case <-mqp.ctx.Done(): + } + + return pt +} + +func (mqp *mqPinger) Stop() { + mqp.stop() +} + +func (mqp *mqPinger) Start() { + select { + case mqp.startCh <- struct{}{}: + case <-mqp.ctx.Done(): + default: + } +} + +func (mqp *mqPinger) init() { + defer mqp.shutdown() + + var doPeriodicPing bool + select { + case <-mqp.ctx.Done(): + return + case <-mqp.startCh: + doPeriodicPing = mqp.start() + } + + select { + case <-mqp.ctx.Done(): + return + default: + } + + if doPeriodicPing { + go mqp.periodicPing() + } + + mqp.run() +} + +func (mqp *mqPinger) shutdown() { + // TODO: Do I need to drain the pending timer / startCh queues? 
+} + +func (mqp *mqPinger) start() bool { + res := mqp.peerConn.Ping(mqp.ctx) + if res.Error != nil { + mqp.rtt = mqp.defaultTimeout + return false + } + + mqp.rtt = mqp.peerConn.Latency() + return true +} + +func (mqp *mqPinger) periodicPing() { + errCnt := 0 + periodicTick := time.NewTimer(mqp.pingInterval) + defer periodicTick.Stop() + + for { + select { + case <-periodicTick.C: + res := mqp.peerConn.Ping(mqp.ctx) + if res.Error != nil { + errCnt++ + if errCnt > mqp.maxErrorCount { + return + } + } + case <-mqp.ctx.Done(): + return + } + } +} + +func (mqp *mqPinger) run() { + for { + select { + case pt := <-mqp.pendingTimers: + mqp.createTimer(pt) + case <-mqp.ctx.Done(): + return + } + } +} + +func (mqp *mqPinger) createTimer(pt *pendingTimer) { + expected := mqp.maxExpectedWantProcessTime + time.Duration(mqp.latencyMultiplier)*mqp.peerConn.Latency() + after := expected - time.Since(pt.start) + + pt.lk.Lock() + defer pt.lk.Unlock() + + pt.timer = time.AfterFunc(after, pt.fn) +} diff --git a/internal/testnet/virtual.go b/internal/testnet/virtual.go index 4cce43e8..a624295e 100644 --- a/internal/testnet/virtual.go +++ b/internal/testnet/virtual.go @@ -21,6 +21,7 @@ import ( protocol "github.com/libp2p/go-libp2p-protocol" tnet "github.com/libp2p/go-libp2p-testing/net" mocknet "github.com/libp2p/go-libp2p/p2p/net/mock" + "github.com/libp2p/go-libp2p/p2p/protocol/ping" ) // VirtualNetwork generates a new testnet instance - a fake network that @@ -193,6 +194,16 @@ func (nc *networkClient) Self() peer.ID { return nc.local } +func (nc *networkClient) Ping(ctx context.Context, p peer.ID) ping.Result { + return ping.Result{RTT: nc.Latency(p)} +} + +func (nc *networkClient) Latency(p peer.ID) time.Duration { + nc.network.mu.Lock() + defer nc.network.mu.Unlock() + return nc.network.latencies[nc.local][p] +} + func (nc *networkClient) SendMessage( ctx context.Context, to peer.ID, diff --git a/network/interface.go b/network/interface.go index 704d851f..6b2878e3 100644 --- 
a/network/interface.go +++ b/network/interface.go @@ -2,6 +2,7 @@ package network import ( "context" + "time" bsmsg "github.com/ipfs/go-bitswap/message" @@ -10,6 +11,7 @@ import ( "github.com/libp2p/go-libp2p-core/connmgr" "github.com/libp2p/go-libp2p-core/peer" "github.com/libp2p/go-libp2p-core/protocol" + "github.com/libp2p/go-libp2p/p2p/protocol/ping" ) var ( @@ -26,6 +28,7 @@ var ( // BitSwapNetwork provides network connectivity for BitSwap sessions. type BitSwapNetwork interface { Self() peer.ID + // SendMessage sends a BitSwap message to a peer. SendMessage( context.Context, @@ -46,6 +49,8 @@ type BitSwapNetwork interface { Stats() Stats Routing + + Pinger } // MessageSender is an interface for sending a series of messages over the bitswap @@ -82,6 +87,14 @@ type Routing interface { Provide(context.Context, cid.Cid) error } +// Pinger is an interface to ping a peer and get the average latency of all pings +type Pinger interface { + // Ping a peer + Ping(context.Context, peer.ID) ping.Result + // Get the average latency of all pings + Latency(peer.ID) time.Duration +} + // Stats is a container for statistics about the bitswap network // the numbers inside are specific to bitswap, and not any other protocols // using the same underlying network. 
diff --git a/network/ipfs_impl.go b/network/ipfs_impl.go index 2a25b7a0..b73a2545 100644 --- a/network/ipfs_impl.go +++ b/network/ipfs_impl.go @@ -19,6 +19,7 @@ import ( peerstore "github.com/libp2p/go-libp2p-core/peerstore" "github.com/libp2p/go-libp2p-core/protocol" "github.com/libp2p/go-libp2p-core/routing" + "github.com/libp2p/go-libp2p/p2p/protocol/ping" msgio "github.com/libp2p/go-msgio" ma "github.com/multiformats/go-multiaddr" ) @@ -107,6 +108,17 @@ func (bsnet *impl) Self() peer.ID { return bsnet.host.ID() } +func (bsnet *impl) Ping(ctx context.Context, p peer.ID) ping.Result { + ctx, cancel := context.WithCancel(ctx) + defer cancel() + res := <-ping.Ping(ctx, bsnet.host, p) + return res +} + +func (bsnet *impl) Latency(p peer.ID) time.Duration { + return bsnet.host.Peerstore().LatencyEWMA(p) +} + // Indicates whether the given protocol supports HAVE / DONT_HAVE messages func (bsnet *impl) SupportsHave(proto protocol.ID) bool { switch proto { From 5388e73dcfe69ec29385565e14b63d23de4868cc Mon Sep 17 00:00:00 2001 From: Dirk McCormick Date: Wed, 5 Feb 2020 12:53:31 -0500 Subject: [PATCH 05/11] fix: a couple of bugs in the message queue pinger --- internal/messagequeue/messagequeue.go | 30 +++- internal/messagequeue/messagequeue_test.go | 76 +++++++-- internal/messagequeue/mqpinger.go | 80 ++++++--- internal/messagequeue/mqpinger_test.go | 178 +++++++++++++++++++++ 4 files changed, 320 insertions(+), 44 deletions(-) create mode 100644 internal/messagequeue/mqpinger_test.go diff --git a/internal/messagequeue/messagequeue.go b/internal/messagequeue/messagequeue.go index 532b3ac4..5f948aea 100644 --- a/internal/messagequeue/messagequeue.go +++ b/internal/messagequeue/messagequeue.go @@ -50,7 +50,7 @@ type MessageQueue struct { ctx context.Context p peer.ID network MessageNetwork - pinger *mqPinger + pinger MQPinger maxMessageSize int sendErrorBackoff time.Duration @@ -137,20 +137,34 @@ type dontHaveTimer struct { // older version of Bitswap that doesn't support 
DONT_HAVE messages. type OnDontHaveTimeout func(peer.ID, []cid.Cid) +// MQPinger pings a peer to estimate latency so it can set a reasonable +// upper bound on when to consider a request timed out +type MQPinger interface { + // Start the pinger (idempotent) + Start() + // Shutdown the pinger (Shutdown is final, pinger cannot be restarted) + Shutdown() + // AfterTimeout runs the given function after the peer has not responded + // for a time defined by the peer's latency. + // The timeout can be stopped by calling Stoppable.Stop() + AfterTimeout(func()) Stoppable +} + // New creates a new MessageQueue. func New(ctx context.Context, p peer.ID, network MessageNetwork, onDontHaveTimeout OnDontHaveTimeout) *MessageQueue { - return newMessageQueue(ctx, p, network, maxMessageSize, dontHaveTimeout, sendErrorBackoff, onDontHaveTimeout) + pinger := newMQPinger(ctx, newPeerConnection(p, network)) + return newMessageQueue(ctx, p, network, maxMessageSize, sendErrorBackoff, pinger, onDontHaveTimeout) } // This constructor is used by the tests func newMessageQueue(ctx context.Context, p peer.ID, network MessageNetwork, - maxMsgSize int, sendErrorBackoff time.Duration, dontHaveTimeout time.Duration, onDontHaveTimeout OnDontHaveTimeout) *MessageQueue { + maxMsgSize int, sendErrorBackoff time.Duration, pinger MQPinger, onDontHaveTimeout OnDontHaveTimeout) *MessageQueue { mq := &MessageQueue{ ctx: ctx, p: p, network: network, - pinger: newMQPinger(ctx, newPeerConnection(p, network), dontHaveTimeout), + pinger: pinger, maxMessageSize: maxMsgSize, bcstWants: newRecallWantList(), peerWants: newRecallWantList(), @@ -284,11 +298,13 @@ func (mq *MessageQueue) onShutdown() { delete(mq.dontHaveTimers, c) } - // Stop the pinger - mq.pinger.Stop() + // Shutdown the pinger + mq.pinger.Shutdown() } func (mq *MessageQueue) runQueue() { + defer mq.onShutdown() + for { select { case <-mq.rebroadcastTimer.C: @@ -299,13 +315,11 @@ func (mq *MessageQueue) runQueue() { if mq.sender != nil { 
mq.sender.Close() } - mq.onShutdown() return case <-mq.ctx.Done(): if mq.sender != nil { _ = mq.sender.Reset() } - mq.onShutdown() return } } diff --git a/internal/messagequeue/messagequeue_test.go b/internal/messagequeue/messagequeue_test.go index e92eece7..29079044 100644 --- a/internal/messagequeue/messagequeue_test.go +++ b/internal/messagequeue/messagequeue_test.go @@ -38,7 +38,17 @@ func (fmn *fakeMessageNetwork) NewMessageSender(context.Context, peer.ID) (bsnet func (fms *fakeMessageNetwork) Self() peer.ID { return "" } func (fms *fakeMessageNetwork) Latency(peer.ID) time.Duration { return 0 } func (fms *fakeMessageNetwork) Ping(context.Context, peer.ID) ping.Result { - return ping.Result{Error: fmt.Errorf("err")} + return ping.Result{Error: fmt.Errorf("ping error")} +} + +type fakePinger struct { + timeout time.Duration +} + +func (fp *fakePinger) Start() {} +func (fp *fakePinger) Shutdown() {} +func (fp *fakePinger) AfterTimeout(fn func()) Stoppable { + return time.AfterFunc(fp.timeout, fn) } type fakeMessageSender struct { @@ -418,12 +428,13 @@ func TestSendingLargeMessages(t *testing.T) { fullClosedChan := make(chan struct{}, 1) fakeSender := &fakeMessageSender{nil, fullClosedChan, resetChan, messagesSent, sendErrors, true} fakenet := &fakeMessageNetwork{nil, nil, fakeSender} + pinger := &fakePinger{10000 * time.Millisecond} peerID := testutil.GeneratePeers(1)[0] wantBlocks := testutil.GenerateCids(10) entrySize := 44 maxMsgSize := entrySize * 3 // 3 wants - messageQueue := newMessageQueue(ctx, peerID, fakenet, maxMsgSize, sendErrorBackoff, dontHaveTimeout, mockTimeoutCb) + messageQueue := newMessageQueue(ctx, peerID, fakenet, maxMsgSize, sendErrorBackoff, pinger, mockTimeoutCb) messageQueue.Startup() messageQueue.AddWants(wantBlocks, []cid.Cid{}) @@ -449,14 +460,8 @@ func TestSendToPeerThatDoesntSupportHave(t *testing.T) { fakeSender := &fakeMessageSender{nil, fullClosedChan, resetChan, messagesSent, sendErrors, false} fakenet := 
&fakeMessageNetwork{nil, nil, fakeSender} peerID := testutil.GeneratePeers(1)[0] - sendErrBackoff := 5 * time.Millisecond - var timedOutKs []cid.Cid - dontHaveTimeoutCb := func(p peer.ID, ks []cid.Cid) { - timedOutKs = append(timedOutKs, ks...) - } - dontHaveTimeout := time.Millisecond * 1 - messageQueue := newMessageQueue(ctx, peerID, fakenet, maxMessageSize, sendErrBackoff, dontHaveTimeout, dontHaveTimeoutCb) + messageQueue := New(ctx, peerID, fakenet, mockTimeoutCb) messageQueue.Startup() // If the remote peer doesn't support HAVE / DONT_HAVE messages @@ -500,12 +505,51 @@ func TestSendToPeerThatDoesntSupportHave(t *testing.T) { t.Fatal("should only send want-blocks") } } +} + +func TestTimeoutOnSendToPeerThatDoesntSupportHave(t *testing.T) { + ctx := context.Background() + messagesSent := make(chan bsmsg.BitSwapMessage) + sendErrors := make(chan error) + resetChan := make(chan struct{}, 1) + fullClosedChan := make(chan struct{}, 1) + fakeSender := &fakeMessageSender{nil, fullClosedChan, resetChan, messagesSent, sendErrors, false} + fakenet := &fakeMessageNetwork{nil, nil, fakeSender} + peerID := testutil.GeneratePeers(1)[0] + sendErrBackoff := 5 * time.Millisecond + + var timedOutKs []cid.Cid + dontHaveTimeoutCb := func(p peer.ID, ks []cid.Cid) { + timedOutKs = append(timedOutKs, ks...) 
+ } + // Timeout after 15ms + pinger := &fakePinger{15 * time.Millisecond} + messageQueue := newMessageQueue(ctx, peerID, fakenet, maxMessageSize, sendErrBackoff, pinger, dontHaveTimeoutCb) + messageQueue.Startup() + + // Send some want-blocks + wbs := testutil.GenerateCids(10) + messageQueue.AddWants(wbs, nil) + messages := collectMessages(ctx, t, messagesSent, 10*time.Millisecond) + + if len(messages) != 1 { + t.Fatal("wrong number of messages were sent", len(messages)) + } + wl := messages[0].Wantlist() + if len(wl) != len(wbs) { + t.Fatal("should only send want-blocks") + } + + // Cancel some of the want-blocks + cancelCount := 3 + messageQueue.AddCancels(wbs[:cancelCount]) - messageQueue.AddCancels(wbs[:5]) - time.Sleep(300 * time.Millisecond) + // Wait for a response to the remaining want-blocks + time.Sleep(10 * time.Millisecond) - // dontHaveTimeoutCb should be called after the timeout expires - if len(timedOutKs) != len(wbs)-5 { + // dontHaveTimeoutCb should be called after the timeout expires with the + // want-blocks that we were waiting for + if len(timedOutKs) != len(wbs)-cancelCount { t.Fatal("expected timeout callback to be called for want-blocks", timedOutKs) } } @@ -518,9 +562,10 @@ func TestResendAfterError(t *testing.T) { fullClosedChan := make(chan struct{}, 1) fakeSender := &fakeMessageSender{nil, fullClosedChan, resetChan, messagesSent, sendErrors, true} fakenet := &fakeMessageNetwork{nil, nil, fakeSender} + pinger := &fakePinger{10000 * time.Millisecond} peerID := testutil.GeneratePeers(1)[0] sendErrBackoff := 5 * time.Millisecond - messageQueue := newMessageQueue(ctx, peerID, fakenet, maxMessageSize, sendErrBackoff, dontHaveTimeout, mockTimeoutCb) + messageQueue := newMessageQueue(ctx, peerID, fakenet, maxMessageSize, sendErrBackoff, pinger, mockTimeoutCb) wantBlocks := testutil.GenerateCids(10) wantHaves := testutil.GenerateCids(10) @@ -556,9 +601,10 @@ func TestResendAfterMaxRetries(t *testing.T) { fullClosedChan := make(chan struct{}, 1) 
fakeSender := &fakeMessageSender{nil, fullClosedChan, resetChan, messagesSent, sendErrors, true} fakenet := &fakeMessageNetwork{nil, nil, fakeSender} + pinger := &fakePinger{10000 * time.Millisecond} peerID := testutil.GeneratePeers(1)[0] sendErrBackoff := 2 * time.Millisecond - messageQueue := newMessageQueue(ctx, peerID, fakenet, maxMessageSize, sendErrBackoff, dontHaveTimeout, mockTimeoutCb) + messageQueue := newMessageQueue(ctx, peerID, fakenet, maxMessageSize, sendErrBackoff, pinger, mockTimeoutCb) wantBlocks := testutil.GenerateCids(10) wantHaves := testutil.GenerateCids(10) wantBlocks2 := testutil.GenerateCids(10) diff --git a/internal/messagequeue/mqpinger.go b/internal/messagequeue/mqpinger.go index 9fc39481..0d4fd3d8 100644 --- a/internal/messagequeue/mqpinger.go +++ b/internal/messagequeue/mqpinger.go @@ -19,10 +19,10 @@ const ( // peer takes to process a want and initiate sending a response to us maxExpectedWantProcessTime = 200 * time.Millisecond - // wantProcessLatencyMultiplier is multiplied by the average ping time to + // latencyMultiplier is multiplied by the average ping time to // get an upper bound on how long we expect to wait for a peer's response // to arrive - wantProcessLatencyMultiplier = 2 + latencyMultiplier = 2 // maxPingErrorCount is the maximum number of times we can receive an // error while pinging before we give up @@ -32,18 +32,26 @@ const ( pingInterval = 60 * time.Second ) +// PeerConnection is a connection to a peer that can be pinged, and the +// average latency measured type PeerConnection interface { + // Ping the peer Ping(context.Context) ping.Result + // The average latency of all pings Latency() time.Duration } +// Stoppable can be stopped type Stoppable interface { + // Stop the Stoppable - the return value is false if it was already stopped. Stop() bool } +// mqPinger pings the peer to measure latency. It uses the latency to set a +// reasonable timeout for AfterTimeout(). 
type mqPinger struct { ctx context.Context - stop func() + shutdown func() peerConn PeerConnection startCh chan struct{} pendingTimers chan *pendingTimer @@ -55,19 +63,21 @@ type mqPinger struct { maxErrorCount int } -func newMQPinger(ctx context.Context, pc PeerConnection, defaultTimeout time.Duration) *mqPinger { - return newMQPingerWithParams(ctx, pc, defaultTimeout, pingInterval, - wantProcessLatencyMultiplier, maxExpectedWantProcessTime, maxPingErrorCount) +// newMQPinger creates a new mqPinger +func newMQPinger(ctx context.Context, pc PeerConnection) *mqPinger { + return newMQPingerWithParams(ctx, pc, dontHaveTimeout, pingInterval, + latencyMultiplier, maxExpectedWantProcessTime, maxPingErrorCount) } +// newMQPingerWithParams is used by the tests func newMQPingerWithParams(ctx context.Context, pc PeerConnection, defaultTimeout time.Duration, pingInterval time.Duration, latencyMultiplier int, maxExpectedWantProcessTime time.Duration, maxErrorCount int) *mqPinger { - ctx, stop := context.WithCancel(ctx) + ctx, shutdown := context.WithCancel(ctx) mqp := &mqPinger{ ctx: ctx, - stop: stop, + shutdown: shutdown, peerConn: pc, startCh: make(chan struct{}, 1), pendingTimers: make(chan *pendingTimer, 128), @@ -83,6 +93,8 @@ func newMQPingerWithParams(ctx context.Context, pc PeerConnection, return mqp } +// pendingTimer keeps information required for a timer that has been craeted +// but not yet processed type pendingTimer struct { fn func() start time.Time @@ -90,16 +102,21 @@ type pendingTimer struct { timer *time.Timer } +// Stop the timer func (pt *pendingTimer) Stop() bool { pt.lk.Lock() defer pt.lk.Unlock() + // If the timer has been started, stop it if pt.timer != nil { return pt.timer.Stop() } - return true + return false } +// AfterTimeout calls the given function after a reasonable estimate of when +// the peer should be expected to respond by. The estimate is derived from +// the ping latency plus some padding for processing time. 
func (mqp *mqPinger) AfterTimeout(fn func()) Stoppable { pt := &pendingTimer{fn: fn, start: time.Now()} @@ -111,10 +128,12 @@ func (mqp *mqPinger) AfterTimeout(fn func()) Stoppable { return pt } -func (mqp *mqPinger) Stop() { - mqp.stop() +// Shutdown the mqPinger. Any subsequent call to Start() will be ignored +func (mqp *mqPinger) Shutdown() { + mqp.shutdown() } +// Start the mqPinger func (mqp *mqPinger) Start() { select { case mqp.startCh <- struct{}{}: @@ -123,9 +142,12 @@ func (mqp *mqPinger) Start() { } } +// init the mqPinger func (mqp *mqPinger) init() { - defer mqp.shutdown() + defer mqp.onShutdown() + // Ping the peer. If there's an error don't do periodic pings, just use + // the default timeout. var doPeriodicPing bool select { case <-mqp.ctx.Done(): @@ -140,21 +162,28 @@ func (mqp *mqPinger) init() { default: } + // If doing periodic pings, start pinging if doPeriodicPing { go mqp.periodicPing() } - mqp.run() + // Process incoming timers + mqp.processTimers() } -func (mqp *mqPinger) shutdown() { +// onShutdown is called when the mqPinger shuts down +func (mqp *mqPinger) onShutdown() { // TODO: Do I need to drain the pending timer / startCh queues? } +// start pings the peer and sets the RTT. If there's an error, it derives the +// RTT from the default timeout and returns false. 
func (mqp *mqPinger) start() bool { - res := mqp.peerConn.Ping(mqp.ctx) + ctx, cancel := context.WithTimeout(mqp.ctx, mqp.defaultTimeout) + defer cancel() + res := mqp.peerConn.Ping(ctx) if res.Error != nil { - mqp.rtt = mqp.defaultTimeout + mqp.rtt = mqp.defaultTimeout / time.Duration(mqp.latencyMultiplier) return false } @@ -162,28 +191,33 @@ func (mqp *mqPinger) start() bool { return true } +// Ping the peer periodically func (mqp *mqPinger) periodicPing() { errCnt := 0 - periodicTick := time.NewTimer(mqp.pingInterval) - defer periodicTick.Stop() + timer := time.NewTimer(mqp.pingInterval) + defer timer.Stop() for { select { - case <-periodicTick.C: + case <-timer.C: + // Ping the peer res := mqp.peerConn.Ping(mqp.ctx) if res.Error != nil { errCnt++ + // If there are too many errors, stop pinging if errCnt > mqp.maxErrorCount { return } } + timer = time.NewTimer(mqp.pingInterval) case <-mqp.ctx.Done(): return } } } -func (mqp *mqPinger) run() { +// processTimers pulls timer requests of the queue and starts them +func (mqp *mqPinger) processTimers() { for { select { case pt := <-mqp.pendingTimers: @@ -194,8 +228,12 @@ func (mqp *mqPinger) run() { } } +// createTimer turns a pendingTimer into a timer func (mqp *mqPinger) createTimer(pt *pendingTimer) { - expected := mqp.maxExpectedWantProcessTime + time.Duration(mqp.latencyMultiplier)*mqp.peerConn.Latency() + // The maximum expected time for a response is + // the expected time to process the want + (latency * multiplier) + // The multiplier is to provide some padding for variable latency. 
+ expected := mqp.maxExpectedWantProcessTime + time.Duration(mqp.latencyMultiplier)*mqp.rtt after := expected - time.Since(pt.start) pt.lk.Lock() diff --git a/internal/messagequeue/mqpinger_test.go b/internal/messagequeue/mqpinger_test.go new file mode 100644 index 00000000..ac437a87 --- /dev/null +++ b/internal/messagequeue/mqpinger_test.go @@ -0,0 +1,178 @@ +package messagequeue + +import ( + "context" + "fmt" + "testing" + "time" + + "github.com/libp2p/go-libp2p/p2p/protocol/ping" +) + +type mockPeerConn struct { + err error + latency time.Duration + latencies []time.Duration +} + +func (pc *mockPeerConn) Ping(ctx context.Context) ping.Result { + timer := time.NewTimer(pc.latency) + select { + case <-timer.C: + if pc.err != nil { + return ping.Result{Error: pc.err} + } + pc.latencies = append(pc.latencies, pc.latency) + case <-ctx.Done(): + } + return ping.Result{RTT: pc.latency} +} + +func (pc *mockPeerConn) Latency() time.Duration { + sum := time.Duration(0) + if len(pc.latencies) == 0 { + return sum + } + for _, l := range pc.latencies { + sum += l + } + return sum / time.Duration(len(pc.latencies)) +} + +func TestPingerTimeout(t *testing.T) { + latency := time.Millisecond * 10 + latMultiplier := 2 + expProcessTime := 5 * time.Millisecond + expectedTimeout := expProcessTime + latency*time.Duration(latMultiplier) + + ctx := context.Background() + pc := &mockPeerConn{latency: latency} + mqp := newMQPingerWithParams(ctx, pc, dontHaveTimeout, pingInterval, + latMultiplier, expProcessTime, maxPingErrorCount) + mqp.Start() + + timedOut := false + onTimeout := func() { + timedOut = true + } + mqp.AfterTimeout(onTimeout) + + time.Sleep(expectedTimeout - 5*time.Millisecond) + if timedOut { + t.Fatal("expected timeout not to have happened yet") + } + + time.Sleep(10 * time.Millisecond) + if !timedOut { + t.Fatal("expected timeout") + } +} + +func TestPingerDoestNotCreateTimersAfterShutdown(t *testing.T) { + latency := time.Millisecond * 10 + latMultiplier := 2 + 
expProcessTime := 5 * time.Millisecond + expectedTimeout := expProcessTime + latency*time.Duration(latMultiplier) + + ctx := context.Background() + pc := &mockPeerConn{latency: latency} + mqp := newMQPingerWithParams(ctx, pc, dontHaveTimeout, pingInterval, + latMultiplier, expProcessTime, maxPingErrorCount) + + // Give the pinger some time to start + mqp.Start() + time.Sleep(time.Millisecond * 5) + + // Shut down the pinger + mqp.Shutdown() + + // Set a timer (should never fire) + timedOut := false + onTimeout := func() { + timedOut = true + } + mqp.AfterTimeout(onTimeout) + + time.Sleep(expectedTimeout) + if timedOut { + t.Fatal("expected timeout not to fire") + } +} + +func TestPingerPeriodic(t *testing.T) { + latency := time.Millisecond * 5 + // No latency padding + latMultiplier := 1 + // No processing time + expProcessTime := time.Duration(0) + // No interval between pings + pingInt := time.Duration(0) + + ctx := context.Background() + pc := &mockPeerConn{latency: latency} + mqp := newMQPingerWithParams(ctx, pc, dontHaveTimeout, pingInt, + latMultiplier, expProcessTime, maxPingErrorCount) + mqp.Start() + + // Allow time for several pings + time.Sleep(latency * 10) + if len(pc.latencies) < 8 { + t.Fatal("expected several pings", len(pc.latencies)) + } + + // Timeout should be the average of ping latency + timedOut := false + onTimeout := func() { + timedOut = true + } + mqp.AfterTimeout(onTimeout) + + time.Sleep(latency / 2) + if timedOut { + t.Fatal("expected timeout not to have happened yet") + } + + time.Sleep(latency) + if !timedOut { + t.Fatal("expected timeout") + } + + // There should be no further pings after shutdown + mqp.Shutdown() + + pingCount := len(pc.latencies) + time.Sleep(latency * 5) + if len(pc.latencies) != pingCount { + t.Fatal("expected no more pings", len(pc.latencies)) + } +} + +func TestPingerUsesDefaultTimeoutIfPingError(t *testing.T) { + latency := time.Millisecond * 1 + latMultiplier := 2 + expProcessTime := 2 * time.Millisecond + 
defaultTimeout := 10 * time.Millisecond + expectedTimeout := expProcessTime + defaultTimeout + + ctx := context.Background() + pc := &mockPeerConn{latency: latency, err: fmt.Errorf("ping error")} + mqp := newMQPingerWithParams(ctx, pc, defaultTimeout, pingInterval, + latMultiplier, expProcessTime, maxPingErrorCount) + mqp.Start() + + timedOut := false + onTimeout := func() { + timedOut = true + } + mqp.AfterTimeout(onTimeout) + + time.Sleep(expectedTimeout - 5*time.Millisecond) + if timedOut { + t.Fatal("expected timeout not to have happened yet") + } + + time.Sleep(10 * time.Millisecond) + if !timedOut { + t.Fatal("expected timeout") + } +} From 27505d2d62384dfe835039c9ee5bbda7ef78a1f8 Mon Sep 17 00:00:00 2001 From: Dirk McCormick Date: Wed, 5 Feb 2020 14:20:04 -0500 Subject: [PATCH 06/11] fix: lint --- internal/testnet/virtual.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/testnet/virtual.go b/internal/testnet/virtual.go index a624295e..1d1c7b79 100644 --- a/internal/testnet/virtual.go +++ b/internal/testnet/virtual.go @@ -17,8 +17,8 @@ import ( "github.com/libp2p/go-libp2p-core/connmgr" "github.com/libp2p/go-libp2p-core/peer" + protocol "github.com/libp2p/go-libp2p-core/protocol" "github.com/libp2p/go-libp2p-core/routing" - protocol "github.com/libp2p/go-libp2p-protocol" tnet "github.com/libp2p/go-libp2p-testing/net" mocknet "github.com/libp2p/go-libp2p/p2p/net/mock" "github.com/libp2p/go-libp2p/p2p/protocol/ping" From 574b57717c71e6e2f738a28dc4c0b849e8f7dfa3 Mon Sep 17 00:00:00 2001 From: Dirk McCormick Date: Fri, 7 Feb 2020 15:39:56 -0500 Subject: [PATCH 07/11] refactor: simplify DONT_HAVE timeout management --- internal/messagequeue/donthavetimeoutmgr.go | 270 ++++++++++++++++++ .../messagequeue/donthavetimeoutmgr_test.go | 267 +++++++++++++++++ internal/messagequeue/messagequeue.go | 117 +++----- internal/messagequeue/messagequeue_test.go | 76 ++--- internal/messagequeue/mqpinger.go | 243 ---------------- 
internal/messagequeue/mqpinger_test.go | 178 ------------ 6 files changed, 611 insertions(+), 540 deletions(-) create mode 100644 internal/messagequeue/donthavetimeoutmgr.go create mode 100644 internal/messagequeue/donthavetimeoutmgr_test.go delete mode 100644 internal/messagequeue/mqpinger.go delete mode 100644 internal/messagequeue/mqpinger_test.go diff --git a/internal/messagequeue/donthavetimeoutmgr.go b/internal/messagequeue/donthavetimeoutmgr.go new file mode 100644 index 00000000..83147557 --- /dev/null +++ b/internal/messagequeue/donthavetimeoutmgr.go @@ -0,0 +1,270 @@ +package messagequeue + +import ( + "context" + "sync" + "time" + + cid "github.com/ipfs/go-cid" + "github.com/libp2p/go-libp2p/p2p/protocol/ping" +) + +const ( + // dontHaveTimeout is used to simulate a DONT_HAVE when communicating with + // a peer whose Bitswap client doesn't support the DONT_HAVE response. + // If the peer doesn't respond to a want-block within the timeout, the + // local node assumes that the peer doesn't have the block. + dontHaveTimeout = 5 * time.Second + + // maxExpectedWantProcessTime is the maximum amount of time we expect a + // peer takes to process a want and initiate sending a response to us + maxExpectedWantProcessTime = 200 * time.Millisecond + + // latencyMultiplier is multiplied by the average ping time to + // get an upper bound on how long we expect to wait for a peer's response + // to arrive + latencyMultiplier = 2 +) + +// PeerConnection is a connection to a peer that can be pinged, and the +// average latency measured +type PeerConnection interface { + // Ping the peer + Ping(context.Context) ping.Result + // The average latency of all pings + Latency() time.Duration +} + +// dontHaveTimeoutMgr pings the peer to measure latency. 
It uses the latency to
+// set a reasonable timeout for simulating a DONT_HAVE message for peers that
+// don't support DONT_HAVE
+type dontHaveTimeoutMgr struct {
+	ctx                        context.Context
+	shutdown                   func()
+	peerConn                   PeerConnection
+	onDontHaveTimeout          func([]cid.Cid)
+	defaultTimeout             time.Duration
+	latencyMultiplier          int
+	maxExpectedWantProcessTime time.Duration
+
+	// All variables below here must be protected by the lock
+	lk                    sync.RWMutex
+	started               bool
+	pending               map[cid.Cid]time.Time
+	timeout               time.Duration
+	checkForTimeoutsTimer *time.Timer
+}
+
+// newDontHaveTimeoutMgr creates a new dontHaveTimeoutMgr
+// onDontHaveTimeout is called when pending keys expire (not cancelled before timeout)
+func newDontHaveTimeoutMgr(ctx context.Context, pc PeerConnection, onDontHaveTimeout func([]cid.Cid)) *dontHaveTimeoutMgr {
+	return newDontHaveTimeoutMgrWithParams(ctx, pc, onDontHaveTimeout, dontHaveTimeout,
+		latencyMultiplier, maxExpectedWantProcessTime)
+}
+
+// newDontHaveTimeoutMgrWithParams is used by the tests
+func newDontHaveTimeoutMgrWithParams(ctx context.Context, pc PeerConnection, onDontHaveTimeout func([]cid.Cid),
+	defaultTimeout time.Duration, latencyMultiplier int,
+	maxExpectedWantProcessTime time.Duration) *dontHaveTimeoutMgr {
+
+	ctx, shutdown := context.WithCancel(ctx)
+	mqp := &dontHaveTimeoutMgr{
+		ctx:                        ctx,
+		shutdown:                   shutdown,
+		peerConn:                   pc,
+		pending:                    make(map[cid.Cid]time.Time),
+		timeout:                    defaultTimeout,
+		defaultTimeout:             defaultTimeout,
+		latencyMultiplier:          latencyMultiplier,
+		maxExpectedWantProcessTime: maxExpectedWantProcessTime,
+		onDontHaveTimeout:          onDontHaveTimeout,
+	}
+
+	return mqp
+}
+
+// Shutdown the dontHaveTimeoutMgr. 
Any subsequent call to Start() will be ignored +func (dhtm *dontHaveTimeoutMgr) Shutdown() { + dhtm.shutdown() +} + +// onShutdown is called when the dontHaveTimeoutMgr shuts down +func (dhtm *dontHaveTimeoutMgr) onShutdown() { + dhtm.lk.Lock() + defer dhtm.lk.Unlock() + + // Clear any pending check for timeouts + if dhtm.checkForTimeoutsTimer != nil { + dhtm.checkForTimeoutsTimer.Stop() + } +} + +// onStarted is called when the dontHaveTimeoutMgr starts. +// It monitors for the context being cancelled. +func (dhtm *dontHaveTimeoutMgr) onStarted() { + <-dhtm.ctx.Done() + dhtm.onShutdown() +} + +// Start the dontHaveTimeoutMgr. This method is idempotent +func (dhtm *dontHaveTimeoutMgr) Start() { + dhtm.lk.Lock() + defer dhtm.lk.Unlock() + + // Make sure the dont have timeout manager hasn't already been started + if dhtm.started { + return + } + dhtm.started = true + + go dhtm.onStarted() + + // If we already have a measure of latency to the peer, use it to + // calculate a reasonable timeout + latency := dhtm.peerConn.Latency() + if latency.Nanoseconds() > 0 { + dhtm.timeout = dhtm.calculateTimeoutFromLatency(latency) + return + } + + // Otherwise measure latency by pinging the peer + go dhtm.measureLatency() +} + +// measureLatency measures the latency to the peer by pinging it +func (dhtm *dontHaveTimeoutMgr) measureLatency() { + // Wait up to defaultTimeout for a response to the ping + ctx, cancel := context.WithTimeout(dhtm.ctx, dhtm.defaultTimeout) + defer cancel() + + // Ping the peer + res := dhtm.peerConn.Ping(ctx) + if res.Error != nil { + // If there was an error, we'll just leave the timeout as + // defaultTimeout + return + } + + // Get the average latency to the peer + latency := dhtm.peerConn.Latency() + + dhtm.lk.Lock() + defer dhtm.lk.Unlock() + + // Calculate a reasonable timeout based on latency + dhtm.timeout = dhtm.calculateTimeoutFromLatency(latency) + + // Check if after changing the timeout there are any pending wants that are + // now over 
the timeout + go dhtm.checkForTimeouts() +} + +// checkForTimeouts checks pending wants to see if any are over the timeout +func (dhtm *dontHaveTimeoutMgr) checkForTimeouts() { + dhtm.lk.Lock() + defer dhtm.lk.Unlock() + + if len(dhtm.pending) == 0 { + return + } + + // Figure out which of the blocks that were wanted were not received + // within the timeout + pending := make([]cid.Cid, 0, len(dhtm.pending)) + var oldest time.Time + for c, started := range dhtm.pending { + if time.Since(started) > dhtm.timeout { + pending = append(pending, c) + delete(dhtm.pending, c) + } else if oldest.IsZero() || started.Before(oldest) { + oldest = started + } + } + + // Fire the timeout event for the pending wants + if len(pending) > 0 { + go dhtm.fireTimeout(pending) + } + + if len(dhtm.pending) == 0 { + return + } + + // Make sure things are still running + select { + case <-dhtm.ctx.Done(): + return + default: + } + + // If there's an existing scheduled check, stop it + if dhtm.checkForTimeoutsTimer != nil { + dhtm.checkForTimeoutsTimer.Stop() + } + + // Schedule the next check for the moment when the oldest pending want will + // timeout + until := time.Until(oldest.Add(dhtm.timeout)) + dhtm.checkForTimeoutsTimer = time.AfterFunc(until, dhtm.checkForTimeouts) +} + +// AddPending adds the given keys that will expire if not cancelled before +// the timeout +func (dhtm *dontHaveTimeoutMgr) AddPending(ks []cid.Cid) { + if len(ks) == 0 { + return + } + + start := time.Now() + + dhtm.lk.Lock() + defer dhtm.lk.Unlock() + + queueWasEmpty := len(dhtm.pending) == 0 + + // Record the start time for each key + for _, c := range ks { + if _, ok := dhtm.pending[c]; !ok { + dhtm.pending[c] = start + } + } + + // If there was already an earlier pending item in the queue, then there + // must already be a timeout check scheduled. If there is nothing in the + // queue then we should make sure to schedule a check. 
+ if queueWasEmpty { + go dhtm.checkForTimeouts() + } +} + +// CancelPending is called when we receive a response for a key +func (dhtm *dontHaveTimeoutMgr) CancelPending(ks []cid.Cid) { + dhtm.lk.Lock() + defer dhtm.lk.Unlock() + + // The want has been cancelled, so remove pending timers + for _, c := range ks { + delete(dhtm.pending, c) + } +} + +// fireTimeout fires the onDontHaveTimeout method with the timed out keys +func (dhtm *dontHaveTimeoutMgr) fireTimeout(pending []cid.Cid) { + // Make sure the timeout manager has not been shut down + select { + case <-dhtm.ctx.Done(): + return + default: + } + + // Fire the timeout + dhtm.onDontHaveTimeout(pending) +} + +// calculateTimeoutFromLatency calculates a reasonable timeout derived from latency +func (dhtm *dontHaveTimeoutMgr) calculateTimeoutFromLatency(latency time.Duration) time.Duration { + // The maximum expected time for a response is + // the expected time to process the want + (latency * multiplier) + // The multiplier is to provide some padding for variable latency. 
+ return dhtm.maxExpectedWantProcessTime + time.Duration(dhtm.latencyMultiplier)*latency +} diff --git a/internal/messagequeue/donthavetimeoutmgr_test.go b/internal/messagequeue/donthavetimeoutmgr_test.go new file mode 100644 index 00000000..0f072858 --- /dev/null +++ b/internal/messagequeue/donthavetimeoutmgr_test.go @@ -0,0 +1,267 @@ +package messagequeue + +import ( + "context" + "fmt" + "testing" + "time" + + "github.com/ipfs/go-bitswap/internal/testutil" + cid "github.com/ipfs/go-cid" + "github.com/libp2p/go-libp2p/p2p/protocol/ping" +) + +type mockPeerConn struct { + err error + latency time.Duration + latencies []time.Duration +} + +func (pc *mockPeerConn) Ping(ctx context.Context) ping.Result { + timer := time.NewTimer(pc.latency) + select { + case <-timer.C: + if pc.err != nil { + return ping.Result{Error: pc.err} + } + pc.latencies = append(pc.latencies, pc.latency) + case <-ctx.Done(): + } + return ping.Result{RTT: pc.latency} +} + +func (pc *mockPeerConn) Latency() time.Duration { + sum := time.Duration(0) + if len(pc.latencies) == 0 { + return sum + } + for _, l := range pc.latencies { + sum += l + } + return sum / time.Duration(len(pc.latencies)) +} + +func TestDontHaveTimeoutMgrTimeout(t *testing.T) { + firstks := testutil.GenerateCids(2) + secondks := append(firstks, testutil.GenerateCids(3)...) + latency := time.Millisecond * 10 + latMultiplier := 2 + expProcessTime := 5 * time.Millisecond + expectedTimeout := expProcessTime + latency*time.Duration(latMultiplier) + ctx := context.Background() + pc := &mockPeerConn{latency: latency} + + var timedOutKs []cid.Cid + onTimeout := func(tks []cid.Cid) { + timedOutKs = append(timedOutKs, tks...) 
+	}
+	dhtm := newDontHaveTimeoutMgrWithParams(ctx, pc, onTimeout,
+		dontHaveTimeout, latMultiplier, expProcessTime)
+	dhtm.Start()
+
+	// Add first set of keys
+	dhtm.AddPending(firstks)
+
+	// Wait for less than the expected timeout
+	time.Sleep(expectedTimeout - 5*time.Millisecond)
+
+	// At this stage no keys should have timed out
+	if len(timedOutKs) > 0 {
+		t.Fatal("expected timeout not to have happened yet")
+	}
+
+	// Add second set of keys
+	dhtm.AddPending(secondks)
+
+	// At this stage first set of keys should have timed out
+	time.Sleep(10 * time.Millisecond)
+	if len(timedOutKs) != len(firstks) {
+		t.Fatal("expected timeout")
+	}
+
+	// Clear the recorded timed out keys
+	timedOutKs = nil
+
+	// Sleep until the second set of keys should have timed out
+	time.Sleep(expectedTimeout)
+
+	// At this stage all keys should have timed out. The second set included
+	// the first set of keys, but they were added before the first set timed
+	// out, so only the remaining keys should have been added.
+	if len(timedOutKs) != len(secondks)-len(firstks) {
+		t.Fatal("expected second set of keys to timeout")
+	}
+}
+
+func TestDontHaveTimeoutMgrCancel(t *testing.T) {
+	ks := testutil.GenerateCids(3)
+	latency := time.Millisecond * 10
+	latMultiplier := 2
+	expProcessTime := 5 * time.Millisecond
+	expectedTimeout := expProcessTime + latency*time.Duration(latMultiplier)
+	ctx := context.Background()
+	pc := &mockPeerConn{latency: latency}
+
+	var timedOutKs []cid.Cid
+	onTimeout := func(tks []cid.Cid) {
+		timedOutKs = append(timedOutKs, tks...) 
+ } + dhtm := newDontHaveTimeoutMgrWithParams(ctx, pc, onTimeout, + dontHaveTimeout, latMultiplier, expProcessTime) + dhtm.Start() + + // Add keys + dhtm.AddPending(ks) + time.Sleep(5 * time.Millisecond) + + // Cancel keys + cancelCount := 1 + dhtm.CancelPending(ks[:cancelCount]) + + // Wait for the expected timeout + time.Sleep(expectedTimeout) + + // At this stage all non-cancelled keys should have timed out + if len(timedOutKs) != len(ks)-cancelCount { + t.Fatal("expected timeout") + } +} + +func TestDontHaveTimeoutRepeatedAddPending(t *testing.T) { + ks := testutil.GenerateCids(10) + latency := time.Millisecond * 5 + latMultiplier := 1 + expProcessTime := time.Duration(0) + ctx := context.Background() + pc := &mockPeerConn{latency: latency} + + var timedOutKs []cid.Cid + onTimeout := func(tks []cid.Cid) { + timedOutKs = append(timedOutKs, tks...) + } + dhtm := newDontHaveTimeoutMgrWithParams(ctx, pc, onTimeout, + dontHaveTimeout, latMultiplier, expProcessTime) + dhtm.Start() + + // Add keys repeatedly + for _, c := range ks { + dhtm.AddPending([]cid.Cid{c}) + } + + // Wait for the expected timeout + time.Sleep(latency + 5*time.Millisecond) + + // At this stage all keys should have timed out + if len(timedOutKs) != len(ks) { + t.Fatal("expected timeout") + } +} + +func TestDontHaveTimeoutMgrUsesDefaultTimeoutIfPingError(t *testing.T) { + ks := testutil.GenerateCids(2) + latency := time.Millisecond * 1 + latMultiplier := 2 + expProcessTime := 2 * time.Millisecond + defaultTimeout := 10 * time.Millisecond + expectedTimeout := expProcessTime + defaultTimeout + + ctx := context.Background() + var timedOutKs []cid.Cid + onTimeout := func(tks []cid.Cid) { + timedOutKs = append(timedOutKs, tks...) 
+ } + pc := &mockPeerConn{latency: latency, err: fmt.Errorf("ping error")} + dhtm := newDontHaveTimeoutMgrWithParams(ctx, pc, onTimeout, + defaultTimeout, latMultiplier, expProcessTime) + dhtm.Start() + + // Add keys + dhtm.AddPending(ks) + + // Sleep for less than the expected timeout + time.Sleep(expectedTimeout - 5*time.Millisecond) + + // At this stage no timeout should have happened yet + if len(timedOutKs) > 0 { + t.Fatal("expected timeout not to have happened yet") + } + + // Sleep until after the expected timeout + time.Sleep(10 * time.Millisecond) + + // Now the keys should have timed out + if len(timedOutKs) != len(ks) { + t.Fatal("expected timeout") + } +} + +func TestDontHaveTimeoutMgrUsesDefaultTimeoutIfLatencyLonger(t *testing.T) { + ks := testutil.GenerateCids(2) + latency := time.Millisecond * 20 + latMultiplier := 1 + expProcessTime := time.Duration(0) + defaultTimeout := 10 * time.Millisecond + + ctx := context.Background() + var timedOutKs []cid.Cid + onTimeout := func(tks []cid.Cid) { + timedOutKs = append(timedOutKs, tks...) 
+ } + pc := &mockPeerConn{latency: latency} + dhtm := newDontHaveTimeoutMgrWithParams(ctx, pc, onTimeout, + defaultTimeout, latMultiplier, expProcessTime) + dhtm.Start() + + // Add keys + dhtm.AddPending(ks) + + // Sleep for less than the default timeout + time.Sleep(defaultTimeout - 5*time.Millisecond) + + // At this stage no timeout should have happened yet + if len(timedOutKs) > 0 { + t.Fatal("expected timeout not to have happened yet") + } + + // Sleep until after the default timeout + time.Sleep(10 * time.Millisecond) + + // Now the keys should have timed out + if len(timedOutKs) != len(ks) { + t.Fatal("expected timeout") + } +} + +func TestDontHaveTimeoutNoTimeoutAfterShutdown(t *testing.T) { + ks := testutil.GenerateCids(2) + latency := time.Millisecond * 10 + latMultiplier := 1 + expProcessTime := time.Duration(0) + ctx := context.Background() + pc := &mockPeerConn{latency: latency} + + var timedOutKs []cid.Cid + onTimeout := func(tks []cid.Cid) { + timedOutKs = append(timedOutKs, tks...) 
+ } + dhtm := newDontHaveTimeoutMgrWithParams(ctx, pc, onTimeout, + dontHaveTimeout, latMultiplier, expProcessTime) + dhtm.Start() + + // Add keys + dhtm.AddPending(ks) + + // Wait less than the timeout + time.Sleep(latency - 5*time.Millisecond) + + // Shutdown the manager + dhtm.Shutdown() + + // Wait for the expected timeout + time.Sleep(10 * time.Millisecond) + + // Manager was shut down so timeout should not have fired + if len(timedOutKs) != 0 { + t.Fatal("expected no timeout after shutdown") + } +} diff --git a/internal/messagequeue/messagequeue.go b/internal/messagequeue/messagequeue.go index 5f948aea..15f8100d 100644 --- a/internal/messagequeue/messagequeue.go +++ b/internal/messagequeue/messagequeue.go @@ -50,7 +50,7 @@ type MessageQueue struct { ctx context.Context p peer.ID network MessageNetwork - pinger MQPinger + dhTimeoutMgr DontHaveTimeoutManager maxMessageSize int sendErrorBackoff time.Duration @@ -59,13 +59,11 @@ type MessageQueue struct { done chan struct{} // Take lock whenever any of these variables are modified - wllock sync.Mutex - bcstWants recallWantlist - peerWants recallWantlist - cancels *cid.Set - priority int - dontHaveTimers map[cid.Cid]*dontHaveTimer - onDontHaveTimeout OnDontHaveTimeout + wllock sync.Mutex + bcstWants recallWantlist + peerWants recallWantlist + cancels *cid.Set + priority int // Dont touch any of these variables outside of run loop sender bsnet.MessageSender @@ -126,45 +124,43 @@ func (pc *peerConn) Latency() time.Duration { return pc.network.Latency(pc.p) } -// dontHaveTimer is used to keep track of references to a timer -// (see OnDontHaveTimeout) -type dontHaveTimer struct { - timer Stoppable - refs int -} - // Fires when a timeout occurs waiting for a response from a peer running an // older version of Bitswap that doesn't support DONT_HAVE messages. 
type OnDontHaveTimeout func(peer.ID, []cid.Cid)
 
-// MQPinger pings a peer to estimate latency so it can set a reasonable
-// upper bound on when to consider a request timed out
-type MQPinger interface {
-	// Start the pinger (idempotent)
+// DontHaveTimeoutManager pings a peer to estimate latency so it can set a reasonable
+// upper bound on when to consider a DONT_HAVE request as timed out (when connected to
+// a peer that doesn't support DONT_HAVE messages)
+type DontHaveTimeoutManager interface {
+	// Start the manager (idempotent)
 	Start()
-	// Shutdown the pinger (Shutdown is final, pinger cannot be restarted)
+	// Shutdown the manager (Shutdown is final, manager cannot be restarted)
 	Shutdown()
-	// AfterTimeout runs the given function after the peer has not responded
-	// for a time defined by the peer's latency.
-	// The timeout can be stopped by calling Stoppable.Stop()
-	AfterTimeout(func()) Stoppable
+	// AddPending adds the wants as pending a response. If they are not
+	// cancelled before the timeout, the OnDontHaveTimeout method will be called.
+	AddPending([]cid.Cid)
+	// CancelPending removes the wants
+	CancelPending([]cid.Cid)
 }
 
 // New creates a new MessageQueue. 
func New(ctx context.Context, p peer.ID, network MessageNetwork, onDontHaveTimeout OnDontHaveTimeout) *MessageQueue { - pinger := newMQPinger(ctx, newPeerConnection(p, network)) - return newMessageQueue(ctx, p, network, maxMessageSize, sendErrorBackoff, pinger, onDontHaveTimeout) + onTimeout := func(ks []cid.Cid) { + onDontHaveTimeout(p, ks) + } + dhTimeoutMgr := newDontHaveTimeoutMgr(ctx, newPeerConnection(p, network), onTimeout) + return newMessageQueue(ctx, p, network, maxMessageSize, sendErrorBackoff, dhTimeoutMgr) } // This constructor is used by the tests func newMessageQueue(ctx context.Context, p peer.ID, network MessageNetwork, - maxMsgSize int, sendErrorBackoff time.Duration, pinger MQPinger, onDontHaveTimeout OnDontHaveTimeout) *MessageQueue { + maxMsgSize int, sendErrorBackoff time.Duration, dhTimeoutMgr DontHaveTimeoutManager) *MessageQueue { mq := &MessageQueue{ ctx: ctx, p: p, network: network, - pinger: pinger, + dhTimeoutMgr: dhTimeoutMgr, maxMessageSize: maxMsgSize, bcstWants: newRecallWantList(), peerWants: newRecallWantList(), @@ -174,8 +170,6 @@ func newMessageQueue(ctx context.Context, p peer.ID, network MessageNetwork, rebroadcastInterval: defaultRebroadcastInterval, sendErrorBackoff: sendErrorBackoff, priority: maxPriority, - dontHaveTimers: make(map[cid.Cid]*dontHaveTimer), - onDontHaveTimeout: onDontHaveTimeout, } // Apply debounce to the work ready signal (which triggers sending a message) @@ -243,22 +237,17 @@ func (mq *MessageQueue) AddCancels(cancelKs []cid.Cid) { return } + // Cancel any outstanding DONT_HAVE timers + mq.dhTimeoutMgr.CancelPending(cancelKs) + mq.wllock.Lock() defer mq.wllock.Unlock() + // Remove keys from broadcast and peer wants, and add to cancels for _, c := range cancelKs { mq.bcstWants.Remove(c) mq.peerWants.Remove(c) mq.cancels.Add(c) - - // The want has been cancelled, so stop any pending dontHaveTimers for the want - if t, ok := mq.dontHaveTimers[c]; ok { - t.refs-- - if t.refs <= 0 { - t.timer.Stop() - } - 
delete(mq.dontHaveTimers, c) - } } // Schedule a message send @@ -289,17 +278,8 @@ func (mq *MessageQueue) Shutdown() { } func (mq *MessageQueue) onShutdown() { - mq.wllock.Lock() - defer mq.wllock.Unlock() - - // Clean up dontHaveTimers - for c, t := range mq.dontHaveTimers { - t.timer.Stop() - delete(mq.dontHaveTimers, c) - } - - // Shutdown the pinger - mq.pinger.Shutdown() + // Shut down the DONT_HAVE timeout manager + mq.dhTimeoutMgr.Shutdown() } func (mq *MessageQueue) runQueue() { @@ -378,10 +358,10 @@ func (mq *MessageQueue) sendMessage() { return } - // Make sure the pinger has started + // Make sure the DONT_HAVE timeout manager has started if !mq.sender.SupportsHave() { // Note: Start is idempotent - mq.pinger.Start() + mq.dhTimeoutMgr.Start() } // Convert want lists to a Bitswap Message @@ -422,7 +402,6 @@ func (mq *MessageQueue) simulateDontHaveWithTimeout(msg bsmsg.BitSwapMessage) { } mq.wllock.Lock() - defer mq.wllock.Unlock() // Get the CID of each want-block that expects a DONT_HAVE response wantlist := msg.Wantlist() @@ -433,39 +412,15 @@ func (mq *MessageQueue) simulateDontHaveWithTimeout(msg bsmsg.BitSwapMessage) { // received in the interim c := entry.Cid if _, ok := mq.peerWants.allWants.Contains(c); ok { - // Check we don't already have a timer running for this cid - if _, existing := mq.dontHaveTimers[c]; !existing { - wants = append(wants, c) - } + wants = append(wants, c) } } } - if len(wants) == 0 { - return - } - - // Create a timer that will fire if we don't receive a block - timer := mq.pinger.AfterTimeout(func() { - // Figure out which of the blocks that were wanted were not received - // within the timeout - pending := make([]cid.Cid, 0, len(wants)) - for _, c := range wants { - if _, ok := mq.dontHaveTimers[c]; ok { - delete(mq.dontHaveTimers, c) - pending = append(pending, c) - } - } + mq.wllock.Unlock() - // Fire the event - if len(pending) > 0 { - mq.onDontHaveTimeout(mq.p, pending) - } - }) - dt := &dontHaveTimer{timer, 
len(wants)} - for _, c := range wants { - mq.dontHaveTimers[c] = dt - } + // Add wants to DONT_HAVE timeout manager + mq.dhTimeoutMgr.AddPending(wants) } // func (mq *MessageQueue) logOutgoingMessage(msg bsmsg.BitSwapMessage) { diff --git a/internal/messagequeue/messagequeue_test.go b/internal/messagequeue/messagequeue_test.go index 29079044..0ea93c43 100644 --- a/internal/messagequeue/messagequeue_test.go +++ b/internal/messagequeue/messagequeue_test.go @@ -41,14 +41,28 @@ func (fms *fakeMessageNetwork) Ping(context.Context, peer.ID) ping.Result { return ping.Result{Error: fmt.Errorf("ping error")} } -type fakePinger struct { - timeout time.Duration +type fakeDontHaveTimeoutMgr struct { + ks []cid.Cid } -func (fp *fakePinger) Start() {} -func (fp *fakePinger) Shutdown() {} -func (fp *fakePinger) AfterTimeout(fn func()) Stoppable { - return time.AfterFunc(fp.timeout, fn) +func (fp *fakeDontHaveTimeoutMgr) Start() {} +func (fp *fakeDontHaveTimeoutMgr) Shutdown() {} +func (fp *fakeDontHaveTimeoutMgr) AddPending(ks []cid.Cid) { + s := cid.NewSet() + for _, c := range append(fp.ks, ks...) 
{ + s.Add(c) + } + fp.ks = s.Keys() +} +func (fp *fakeDontHaveTimeoutMgr) CancelPending(ks []cid.Cid) { + s := cid.NewSet() + for _, c := range fp.ks { + s.Add(c) + } + for _, c := range ks { + s.Remove(c) + } + fp.ks = s.Keys() } type fakeMessageSender struct { @@ -428,13 +442,13 @@ func TestSendingLargeMessages(t *testing.T) { fullClosedChan := make(chan struct{}, 1) fakeSender := &fakeMessageSender{nil, fullClosedChan, resetChan, messagesSent, sendErrors, true} fakenet := &fakeMessageNetwork{nil, nil, fakeSender} - pinger := &fakePinger{10000 * time.Millisecond} + dhtm := &fakeDontHaveTimeoutMgr{} peerID := testutil.GeneratePeers(1)[0] wantBlocks := testutil.GenerateCids(10) entrySize := 44 maxMsgSize := entrySize * 3 // 3 wants - messageQueue := newMessageQueue(ctx, peerID, fakenet, maxMsgSize, sendErrorBackoff, pinger, mockTimeoutCb) + messageQueue := newMessageQueue(ctx, peerID, fakenet, maxMsgSize, sendErrorBackoff, dhtm) messageQueue.Startup() messageQueue.AddWants(wantBlocks, []cid.Cid{}) @@ -507,7 +521,7 @@ func TestSendToPeerThatDoesntSupportHave(t *testing.T) { } } -func TestTimeoutOnSendToPeerThatDoesntSupportHave(t *testing.T) { +func TestSendToPeerThatDoesntSupportHaveMonitorsTimeouts(t *testing.T) { ctx := context.Background() messagesSent := make(chan bsmsg.BitSwapMessage) sendErrors := make(chan error) @@ -516,41 +530,27 @@ func TestTimeoutOnSendToPeerThatDoesntSupportHave(t *testing.T) { fakeSender := &fakeMessageSender{nil, fullClosedChan, resetChan, messagesSent, sendErrors, false} fakenet := &fakeMessageNetwork{nil, nil, fakeSender} peerID := testutil.GeneratePeers(1)[0] - sendErrBackoff := 5 * time.Millisecond - var timedOutKs []cid.Cid - dontHaveTimeoutCb := func(p peer.ID, ks []cid.Cid) { - timedOutKs = append(timedOutKs, ks...) 
- } - // Timeout after 15ms - pinger := &fakePinger{15 * time.Millisecond} - messageQueue := newMessageQueue(ctx, peerID, fakenet, maxMessageSize, sendErrBackoff, pinger, dontHaveTimeoutCb) + dhtm := &fakeDontHaveTimeoutMgr{} + messageQueue := newMessageQueue(ctx, peerID, fakenet, maxMessageSize, sendErrorBackoff, dhtm) messageQueue.Startup() - // Send some want-blocks wbs := testutil.GenerateCids(10) messageQueue.AddWants(wbs, nil) - messages := collectMessages(ctx, t, messagesSent, 10*time.Millisecond) + collectMessages(ctx, t, messagesSent, 10*time.Millisecond) - if len(messages) != 1 { - t.Fatal("wrong number of messages were sent", len(messages)) - } - wl := messages[0].Wantlist() - if len(wl) != len(wbs) { - t.Fatal("should only send want-blocks") + // Check want-blocks are added to DontHaveTimeoutMgr + if len(dhtm.ks) != len(wbs) { + t.Fatal("want-blocks not added to DontHaveTimeoutMgr") } - // Cancel some of the want-blocks - cancelCount := 3 + cancelCount := 2 messageQueue.AddCancels(wbs[:cancelCount]) + collectMessages(ctx, t, messagesSent, 10*time.Millisecond) - // Wait for a response to the remaining want-blocks - time.Sleep(10 * time.Millisecond) - - // dontHaveTimeoutCb should be called after the timeout expires with the - // want-blocks that we were waiting for - if len(timedOutKs) != len(wbs)-cancelCount { - t.Fatal("expected timeout callback to be called for want-blocks", timedOutKs) + // Check want-blocks are removed from DontHaveTimeoutMgr + if len(dhtm.ks) != len(wbs)-cancelCount { + t.Fatal("want-blocks not removed from DontHaveTimeoutMgr") } } @@ -562,10 +562,10 @@ func TestResendAfterError(t *testing.T) { fullClosedChan := make(chan struct{}, 1) fakeSender := &fakeMessageSender{nil, fullClosedChan, resetChan, messagesSent, sendErrors, true} fakenet := &fakeMessageNetwork{nil, nil, fakeSender} - pinger := &fakePinger{10000 * time.Millisecond} + dhtm := &fakeDontHaveTimeoutMgr{} peerID := testutil.GeneratePeers(1)[0] sendErrBackoff := 5 * 
time.Millisecond - messageQueue := newMessageQueue(ctx, peerID, fakenet, maxMessageSize, sendErrBackoff, pinger, mockTimeoutCb) + messageQueue := newMessageQueue(ctx, peerID, fakenet, maxMessageSize, sendErrBackoff, dhtm) wantBlocks := testutil.GenerateCids(10) wantHaves := testutil.GenerateCids(10) @@ -601,10 +601,10 @@ func TestResendAfterMaxRetries(t *testing.T) { fullClosedChan := make(chan struct{}, 1) fakeSender := &fakeMessageSender{nil, fullClosedChan, resetChan, messagesSent, sendErrors, true} fakenet := &fakeMessageNetwork{nil, nil, fakeSender} - pinger := &fakePinger{10000 * time.Millisecond} + dhtm := &fakeDontHaveTimeoutMgr{} peerID := testutil.GeneratePeers(1)[0] sendErrBackoff := 2 * time.Millisecond - messageQueue := newMessageQueue(ctx, peerID, fakenet, maxMessageSize, sendErrBackoff, pinger, mockTimeoutCb) + messageQueue := newMessageQueue(ctx, peerID, fakenet, maxMessageSize, sendErrBackoff, dhtm) wantBlocks := testutil.GenerateCids(10) wantHaves := testutil.GenerateCids(10) wantBlocks2 := testutil.GenerateCids(10) diff --git a/internal/messagequeue/mqpinger.go b/internal/messagequeue/mqpinger.go deleted file mode 100644 index 0d4fd3d8..00000000 --- a/internal/messagequeue/mqpinger.go +++ /dev/null @@ -1,243 +0,0 @@ -package messagequeue - -import ( - "context" - "sync" - "time" - - "github.com/libp2p/go-libp2p/p2p/protocol/ping" -) - -const ( - // dontHaveTimeout is used to simulate a DONT_HAVE when communicating with - // a peer whose Bitswap client doesn't support the DONT_HAVE response. - // If the peer doesn't respond to a want-block within the timeout, the - // local node assumes that the peer doesn't have the block. 
- dontHaveTimeout = 5 * time.Second - - // maxExpectedWantProcessTime is the maximum amount of time we expect a - // peer takes to process a want and initiate sending a response to us - maxExpectedWantProcessTime = 200 * time.Millisecond - - // latencyMultiplier is multiplied by the average ping time to - // get an upper bound on how long we expect to wait for a peer's response - // to arrive - latencyMultiplier = 2 - - // maxPingErrorCount is the maximum number of times we can receive an - // error while pinging before we give up - maxPingErrorCount = 3 - - // pingInterval is the time between pings - pingInterval = 60 * time.Second -) - -// PeerConnection is a connection to a peer that can be pinged, and the -// average latency measured -type PeerConnection interface { - // Ping the peer - Ping(context.Context) ping.Result - // The average latency of all pings - Latency() time.Duration -} - -// Stoppable can be stopped -type Stoppable interface { - // Stop the Stoppable - the return value is false if it was already stopped. - Stop() bool -} - -// mqPinger pings the peer to measure latency. It uses the latency to set a -// reasonable timeout for AfterTimeout(). 
-type mqPinger struct { - ctx context.Context - shutdown func() - peerConn PeerConnection - startCh chan struct{} - pendingTimers chan *pendingTimer - rtt time.Duration - defaultTimeout time.Duration - pingInterval time.Duration - latencyMultiplier int - maxExpectedWantProcessTime time.Duration - maxErrorCount int -} - -// newMQPinger creates a new mqPinger -func newMQPinger(ctx context.Context, pc PeerConnection) *mqPinger { - return newMQPingerWithParams(ctx, pc, dontHaveTimeout, pingInterval, - latencyMultiplier, maxExpectedWantProcessTime, maxPingErrorCount) -} - -// newMQPingerWithParams is used by the tests -func newMQPingerWithParams(ctx context.Context, pc PeerConnection, - defaultTimeout time.Duration, pingInterval time.Duration, latencyMultiplier int, - maxExpectedWantProcessTime time.Duration, maxErrorCount int) *mqPinger { - - ctx, shutdown := context.WithCancel(ctx) - mqp := &mqPinger{ - ctx: ctx, - shutdown: shutdown, - peerConn: pc, - startCh: make(chan struct{}, 1), - pendingTimers: make(chan *pendingTimer, 128), - defaultTimeout: defaultTimeout, - pingInterval: pingInterval, - latencyMultiplier: latencyMultiplier, - maxExpectedWantProcessTime: maxExpectedWantProcessTime, - maxErrorCount: maxErrorCount, - } - - go mqp.init() - - return mqp -} - -// pendingTimer keeps information required for a timer that has been craeted -// but not yet processed -type pendingTimer struct { - fn func() - start time.Time - lk sync.Mutex - timer *time.Timer -} - -// Stop the timer -func (pt *pendingTimer) Stop() bool { - pt.lk.Lock() - defer pt.lk.Unlock() - - // If the timer has been started, stop it - if pt.timer != nil { - return pt.timer.Stop() - } - return false -} - -// AfterTimeout calls the given function after a reasonable estimate of when -// the peer should be expected to respond by. The estimate is derived from -// the ping latency plus some padding for processing time. 
-func (mqp *mqPinger) AfterTimeout(fn func()) Stoppable { - pt := &pendingTimer{fn: fn, start: time.Now()} - - select { - case mqp.pendingTimers <- pt: - case <-mqp.ctx.Done(): - } - - return pt -} - -// Shutdown the mqPinger. Any subsequent call to Start() will be ignored -func (mqp *mqPinger) Shutdown() { - mqp.shutdown() -} - -// Start the mqPinger -func (mqp *mqPinger) Start() { - select { - case mqp.startCh <- struct{}{}: - case <-mqp.ctx.Done(): - default: - } -} - -// init the mqPinger -func (mqp *mqPinger) init() { - defer mqp.onShutdown() - - // Ping the peer. If there's an error don't do periodic pings, just use - // the default timeout. - var doPeriodicPing bool - select { - case <-mqp.ctx.Done(): - return - case <-mqp.startCh: - doPeriodicPing = mqp.start() - } - - select { - case <-mqp.ctx.Done(): - return - default: - } - - // If doing periodic pings, start pinging - if doPeriodicPing { - go mqp.periodicPing() - } - - // Process incoming timers - mqp.processTimers() -} - -// onShutdown is called when the mqPinger shuts down -func (mqp *mqPinger) onShutdown() { - // TODO: Do I need to drain the pending timer / startCh queues? -} - -// start pings the peer and sets the RTT. If there's an error, it derives the -// RTT from the default timeout and returns false. 
-func (mqp *mqPinger) start() bool { - ctx, cancel := context.WithTimeout(mqp.ctx, mqp.defaultTimeout) - defer cancel() - res := mqp.peerConn.Ping(ctx) - if res.Error != nil { - mqp.rtt = mqp.defaultTimeout / time.Duration(mqp.latencyMultiplier) - return false - } - - mqp.rtt = mqp.peerConn.Latency() - return true -} - -// Ping the peer periodically -func (mqp *mqPinger) periodicPing() { - errCnt := 0 - timer := time.NewTimer(mqp.pingInterval) - defer timer.Stop() - - for { - select { - case <-timer.C: - // Ping the peer - res := mqp.peerConn.Ping(mqp.ctx) - if res.Error != nil { - errCnt++ - // If there are too many errors, stop pinging - if errCnt > mqp.maxErrorCount { - return - } - } - timer = time.NewTimer(mqp.pingInterval) - case <-mqp.ctx.Done(): - return - } - } -} - -// processTimers pulls timer requests of the queue and starts them -func (mqp *mqPinger) processTimers() { - for { - select { - case pt := <-mqp.pendingTimers: - mqp.createTimer(pt) - case <-mqp.ctx.Done(): - return - } - } -} - -// createTimer turns a pendingTimer into a timer -func (mqp *mqPinger) createTimer(pt *pendingTimer) { - // The maximum expected time for a response is - // the expected time to process the want + (latency * multiplier) - // The multiplier is to provide some padding for variable latency. 
- expected := mqp.maxExpectedWantProcessTime + time.Duration(mqp.latencyMultiplier)*mqp.rtt - after := expected - time.Since(pt.start) - - pt.lk.Lock() - defer pt.lk.Unlock() - - pt.timer = time.AfterFunc(after, pt.fn) -} diff --git a/internal/messagequeue/mqpinger_test.go b/internal/messagequeue/mqpinger_test.go deleted file mode 100644 index ac437a87..00000000 --- a/internal/messagequeue/mqpinger_test.go +++ /dev/null @@ -1,178 +0,0 @@ -package messagequeue - -import ( - "context" - "fmt" - "testing" - "time" - - "github.com/libp2p/go-libp2p/p2p/protocol/ping" -) - -type mockPeerConn struct { - err error - latency time.Duration - latencies []time.Duration -} - -func (pc *mockPeerConn) Ping(ctx context.Context) ping.Result { - timer := time.NewTimer(pc.latency) - select { - case <-timer.C: - if pc.err != nil { - return ping.Result{Error: pc.err} - } - pc.latencies = append(pc.latencies, pc.latency) - case <-ctx.Done(): - } - return ping.Result{RTT: pc.latency} -} - -func (pc *mockPeerConn) Latency() time.Duration { - sum := time.Duration(0) - if len(pc.latencies) == 0 { - return sum - } - for _, l := range pc.latencies { - sum += l - } - return sum / time.Duration(len(pc.latencies)) -} - -func TestPingerTimeout(t *testing.T) { - latency := time.Millisecond * 10 - latMultiplier := 2 - expProcessTime := 5 * time.Millisecond - expectedTimeout := expProcessTime + latency*time.Duration(latMultiplier) - - ctx := context.Background() - pc := &mockPeerConn{latency: latency} - mqp := newMQPingerWithParams(ctx, pc, dontHaveTimeout, pingInterval, - latMultiplier, expProcessTime, maxPingErrorCount) - mqp.Start() - - timedOut := false - onTimeout := func() { - timedOut = true - } - mqp.AfterTimeout(onTimeout) - - time.Sleep(expectedTimeout - 5*time.Millisecond) - if timedOut { - t.Fatal("expected timeout not to have happened yet") - } - - time.Sleep(10 * time.Millisecond) - if !timedOut { - t.Fatal("expected timeout") - } -} - -func 
TestPingerDoestNotCreateTimersAfterShutdown(t *testing.T) { - latency := time.Millisecond * 10 - latMultiplier := 2 - expProcessTime := 5 * time.Millisecond - expectedTimeout := expProcessTime + latency*time.Duration(latMultiplier) - - ctx := context.Background() - pc := &mockPeerConn{latency: latency} - mqp := newMQPingerWithParams(ctx, pc, dontHaveTimeout, pingInterval, - latMultiplier, expProcessTime, maxPingErrorCount) - - // Give the pinger some time to start - mqp.Start() - time.Sleep(time.Millisecond * 5) - - // Shut down the pinger - mqp.Shutdown() - - // Set a timer (should never fire) - timedOut := false - onTimeout := func() { - timedOut = true - } - mqp.AfterTimeout(onTimeout) - - time.Sleep(expectedTimeout) - if timedOut { - t.Fatal("expected timeout not to fire") - } -} - -func TestPingerPeriodic(t *testing.T) { - latency := time.Millisecond * 5 - // No latency padding - latMultiplier := 1 - // No processing time - expProcessTime := time.Duration(0) - // No interval between pings - pingInt := time.Duration(0) - - ctx := context.Background() - pc := &mockPeerConn{latency: latency} - mqp := newMQPingerWithParams(ctx, pc, dontHaveTimeout, pingInt, - latMultiplier, expProcessTime, maxPingErrorCount) - mqp.Start() - - // Allow time for several pings - time.Sleep(latency * 10) - if len(pc.latencies) < 8 { - t.Fatal("expected several pings", len(pc.latencies)) - } - - // Timeout should be the average of ping latency - timedOut := false - onTimeout := func() { - timedOut = true - } - mqp.AfterTimeout(onTimeout) - - time.Sleep(latency / 2) - if timedOut { - t.Fatal("expected timeout not to have happened yet") - } - - time.Sleep(latency) - if !timedOut { - t.Fatal("expected timeout") - } - - // There should be no further pings after shutdown - mqp.Shutdown() - - pingCount := len(pc.latencies) - time.Sleep(latency * 5) - if len(pc.latencies) != pingCount { - t.Fatal("expected no more pings", len(pc.latencies)) - } -} - -func 
TestPingerUsesDefaultTimeoutIfPingError(t *testing.T) { - latency := time.Millisecond * 1 - latMultiplier := 2 - expProcessTime := 2 * time.Millisecond - defaultTimeout := 10 * time.Millisecond - expectedTimeout := expProcessTime + defaultTimeout - - ctx := context.Background() - pc := &mockPeerConn{latency: latency, err: fmt.Errorf("ping error")} - mqp := newMQPingerWithParams(ctx, pc, defaultTimeout, pingInterval, - latMultiplier, expProcessTime, maxPingErrorCount) - mqp.Start() - - timedOut := false - onTimeout := func() { - timedOut = true - } - mqp.AfterTimeout(onTimeout) - - time.Sleep(expectedTimeout - 5*time.Millisecond) - if timedOut { - t.Fatal("expected timeout not to have happened yet") - } - - time.Sleep(10 * time.Millisecond) - if !timedOut { - t.Fatal("expected timeout") - } -} From c56d552714ecc4b531cc7780baa71c99df5bb4eb Mon Sep 17 00:00:00 2001 From: Dirk McCormick Date: Tue, 11 Feb 2020 10:16:04 -0500 Subject: [PATCH 08/11] refactor: optimize DONT_HAVE timeout manager --- internal/messagequeue/donthavetimeoutmgr.go | 87 ++++++++++++------- .../messagequeue/donthavetimeoutmgr_test.go | 4 +- 2 files changed, 60 insertions(+), 31 deletions(-) diff --git a/internal/messagequeue/donthavetimeoutmgr.go b/internal/messagequeue/donthavetimeoutmgr.go index 83147557..b1f54fb0 100644 --- a/internal/messagequeue/donthavetimeoutmgr.go +++ b/internal/messagequeue/donthavetimeoutmgr.go @@ -51,6 +51,7 @@ type dontHaveTimeoutMgr struct { lk sync.RWMutex started bool pending map[cid.Cid]time.Time + wantQueue []cid.Cid timeout time.Duration checkForTimeoutsTimer *time.Timer } @@ -148,15 +149,14 @@ func (dhtm *dontHaveTimeoutMgr) measureLatency() { // Get the average latency to the peer latency := dhtm.peerConn.Latency() - dhtm.lk.Lock() - defer dhtm.lk.Unlock() - // Calculate a reasonable timeout based on latency + dhtm.lk.Lock() dhtm.timeout = dhtm.calculateTimeoutFromLatency(latency) + dhtm.lk.Unlock() // Check if after changing the timeout there are any pending 
wants that are // now over the timeout - go dhtm.checkForTimeouts() + dhtm.checkForTimeouts() } // checkForTimeouts checks pending wants to see if any are over the timeout @@ -164,48 +164,72 @@ func (dhtm *dontHaveTimeoutMgr) checkForTimeouts() { dhtm.lk.Lock() defer dhtm.lk.Unlock() - if len(dhtm.pending) == 0 { + if len(dhtm.wantQueue) == 0 { return } // Figure out which of the blocks that were wanted were not received // within the timeout - pending := make([]cid.Cid, 0, len(dhtm.pending)) - var oldest time.Time - for c, started := range dhtm.pending { - if time.Since(started) > dhtm.timeout { - pending = append(pending, c) - delete(dhtm.pending, c) - } else if oldest.IsZero() || started.Before(oldest) { - oldest = started + more := true + expired := make([]cid.Cid, 0, len(dhtm.pending)) + removeCount := 0 + for i := 0; i < len(dhtm.wantQueue) && more; i++ { + c := dhtm.wantQueue[i] + + // If the want has been cancelled, its cid will have been removed from + // the pending map + if started, ok := dhtm.pending[c]; ok { + // If the want has expired + if time.Since(started) > dhtm.timeout { + // Add the want to the expired list + expired = append(expired, c) + // Remove the want from the pending wants + delete(dhtm.pending, c) + // Remove expired wants from the want queue + removeCount++ + } else { + // The queue is in order from earliest to latest, so if we + // didn't find an expired entry we can stop iterating + more = false + } + } else { + // Remove cancelled wants from the want queue + removeCount++ } } - // Fire the timeout event for the pending wants - if len(pending) > 0 { - go dhtm.fireTimeout(pending) + // Remove cancelled or expired wants from want queue + dhtm.wantQueue = dhtm.wantQueue[removeCount:] + + // Fire the timeout event for the expired wants + if len(expired) > 0 { + go dhtm.fireTimeout(expired) } - if len(dhtm.pending) == 0 { + if len(dhtm.wantQueue) == 0 { return } - // Make sure things are still running - select { - case <-dhtm.ctx.Done(): 
+ // Make sure the timeout manager is still running + if dhtm.ctx.Err() != nil { return - default: } - // If there's an existing scheduled check, stop it - if dhtm.checkForTimeoutsTimer != nil { - dhtm.checkForTimeoutsTimer.Stop() + // Note: we cleared cancelled wants from the front of the queue above + oldest, ok := dhtm.pending[dhtm.wantQueue[0]] + if !ok { + panic("cancelled want found in DONT_HAVE timeout manager want queue after purge") } // Schedule the next check for the moment when the oldest pending want will // timeout until := time.Until(oldest.Add(dhtm.timeout)) - dhtm.checkForTimeoutsTimer = time.AfterFunc(until, dhtm.checkForTimeouts) + if dhtm.checkForTimeoutsTimer == nil { + dhtm.checkForTimeoutsTimer = time.AfterFunc(until, dhtm.checkForTimeouts) + } else { + dhtm.checkForTimeoutsTimer.Stop() + dhtm.checkForTimeoutsTimer.Reset(until) + } } // AddPending adds the given keys that will expire if not cancelled before @@ -218,7 +242,6 @@ func (dhtm *dontHaveTimeoutMgr) AddPending(ks []cid.Cid) { start := time.Now() dhtm.lk.Lock() - defer dhtm.lk.Unlock() queueWasEmpty := len(dhtm.pending) == 0 @@ -226,14 +249,17 @@ func (dhtm *dontHaveTimeoutMgr) AddPending(ks []cid.Cid) { for _, c := range ks { if _, ok := dhtm.pending[c]; !ok { dhtm.pending[c] = start + dhtm.wantQueue = append(dhtm.wantQueue, c) } } + dhtm.lk.Unlock() + // If there was already an earlier pending item in the queue, then there // must already be a timeout check scheduled. If there is nothing in the // queue then we should make sure to schedule a check. 
if queueWasEmpty { - go dhtm.checkForTimeouts() + dhtm.checkForTimeouts() } } @@ -245,16 +271,17 @@ func (dhtm *dontHaveTimeoutMgr) CancelPending(ks []cid.Cid) { // The want has been cancelled, so remove pending timers for _, c := range ks { delete(dhtm.pending, c) + // Note: when we check the want queue for timeouts, we will check to + // see if there is a value in the pending map, and if not we ignore + // the queued item } } // fireTimeout fires the onDontHaveTimeout method with the timed out keys func (dhtm *dontHaveTimeoutMgr) fireTimeout(pending []cid.Cid) { // Make sure the timeout manager has not been shut down - select { - case <-dhtm.ctx.Done(): + if dhtm.ctx.Err() != nil { return - default: } // Fire the timeout diff --git a/internal/messagequeue/donthavetimeoutmgr_test.go b/internal/messagequeue/donthavetimeoutmgr_test.go index 0f072858..16bff889 100644 --- a/internal/messagequeue/donthavetimeoutmgr_test.go +++ b/internal/messagequeue/donthavetimeoutmgr_test.go @@ -73,8 +73,10 @@ func TestDontHaveTimeoutMgrTimeout(t *testing.T) { // Add second set of keys dhtm.AddPending(secondks) - // At this stage first set of keys should have timed out + // Wait until after the expected timeout time.Sleep(10 * time.Millisecond) + + // At this stage first set of keys should have timed out if len(timedOutKs) != len(firstks) { t.Fatal("expected timeout") } From 4a1a0733db987d428c1c7de41c1ff56824ba63e4 Mon Sep 17 00:00:00 2001 From: Dirk McCormick Date: Wed, 12 Feb 2020 12:03:01 -0500 Subject: [PATCH 09/11] fix: handle want-cancel-want case in DONT_HAVE timeout manager --- benchmarks_test.go | 2 +- internal/messagequeue/donthavetimeoutmgr.go | 131 +++++++++--------- .../messagequeue/donthavetimeoutmgr_test.go | 80 +++++------ 3 files changed, 110 insertions(+), 103 deletions(-) diff --git a/benchmarks_test.go b/benchmarks_test.go index 9761a26c..71e04629 100644 --- a/benchmarks_test.go +++ b/benchmarks_test.go @@ -130,7 +130,7 @@ var mixedBenches = []mixedBench{ 
mixedBench{bench{"3Nodes-Overlap3-OneAtATime", 3, 10, overlap2, oneAtATime}, 1, 2}, mixedBench{bench{"3Nodes-AllToAll-OneAtATime", 3, 10, allToAll, oneAtATime}, 1, 2}, mixedBench{bench{"3Nodes-Overlap3-AllConcurrent", 3, 10, overlap2, fetchAllConcurrent}, 1, 2}, - // mixedBench{bench{"3Nodes-Overlap3-UnixfsFetch", 3, 100, overlap2, unixfsFileFetch}, 1, 2}, + mixedBench{bench{"3Nodes-Overlap3-UnixfsFetch", 3, 100, overlap2, unixfsFileFetch}, 1, 2}, } func BenchmarkFetchFromOldBitswap(b *testing.B) { diff --git a/internal/messagequeue/donthavetimeoutmgr.go b/internal/messagequeue/donthavetimeoutmgr.go index b1f54fb0..9905076d 100644 --- a/internal/messagequeue/donthavetimeoutmgr.go +++ b/internal/messagequeue/donthavetimeoutmgr.go @@ -35,6 +35,14 @@ type PeerConnection interface { Latency() time.Duration } +// pendingWant keeps track of a want that has been sent and we're waiting +// for a response or for a timeout to expire +type pendingWant struct { + c cid.Cid + active bool + sent time.Time +} + // dontHaveTimeoutMgr pings the peer to measure latency. 
It uses the latency to // set a reasonable timeout for simulating a DONT_HAVE message for peers that // don't support DONT_HAVE @@ -47,12 +55,17 @@ type dontHaveTimeoutMgr struct { latencyMultiplier int maxExpectedWantProcessTime time.Duration - // All variables below here must be proctected by the lock - lk sync.RWMutex - started bool - pending map[cid.Cid]time.Time - wantQueue []cid.Cid - timeout time.Duration + // All variables below here must be protected by the lock + lk sync.RWMutex + // has the timeout manager started + started bool + // wants that are active (waiting for a response or timeout) + activeWants map[cid.Cid]*pendingWant + // queue of wants, from oldest to newest + wantQueue []*pendingWant + // time to wait for a response (depends on latency) + timeout time.Duration + // timer used to wait until want at front of queue expires checkForTimeoutsTimer *time.Timer } @@ -73,7 +86,7 @@ func newDontHaveTimeoutMgrWithParams(ctx context.Context, pc PeerConnection, onD ctx: ctx, shutdown: shutdown, peerConn: pc, - pending: make(map[cid.Cid]time.Time), + activeWants: make(map[cid.Cid]*pendingWant), timeout: defaultTimeout, defaultTimeout: defaultTimeout, latencyMultiplier: latencyMultiplier, @@ -100,9 +113,9 @@ func (dhtm *dontHaveTimeoutMgr) onShutdown() { } } -// onStarted is called when the dontHaveTimeoutMgr starts. +// closeAfterContext is called when the dontHaveTimeoutMgr starts. // It monitors for the context being cancelled. 
-func (dhtm *dontHaveTimeoutMgr) onStarted() { +func (dhtm *dontHaveTimeoutMgr) closeAfterContext() { <-dhtm.ctx.Done() dhtm.onShutdown() } @@ -118,7 +131,7 @@ func (dhtm *dontHaveTimeoutMgr) Start() { } dhtm.started = true - go dhtm.onStarted() + go dhtm.closeAfterContext() // If we already have a measure of latency to the peer, use it to // calculate a reasonable timeout @@ -149,57 +162,47 @@ func (dhtm *dontHaveTimeoutMgr) measureLatency() { // Get the average latency to the peer latency := dhtm.peerConn.Latency() - // Calculate a reasonable timeout based on latency dhtm.lk.Lock() + defer dhtm.lk.Unlock() + + // Calculate a reasonable timeout based on latency dhtm.timeout = dhtm.calculateTimeoutFromLatency(latency) - dhtm.lk.Unlock() // Check if after changing the timeout there are any pending wants that are // now over the timeout dhtm.checkForTimeouts() } -// checkForTimeouts checks pending wants to see if any are over the timeout +// checkForTimeouts checks pending wants to see if any are over the timeout. +// Note: this function should only be called within the lock. 
func (dhtm *dontHaveTimeoutMgr) checkForTimeouts() { - dhtm.lk.Lock() - defer dhtm.lk.Unlock() - if len(dhtm.wantQueue) == 0 { return } // Figure out which of the blocks that were wanted were not received // within the timeout - more := true - expired := make([]cid.Cid, 0, len(dhtm.pending)) - removeCount := 0 - for i := 0; i < len(dhtm.wantQueue) && more; i++ { - c := dhtm.wantQueue[i] - - // If the want has been cancelled, its cid will have been removed from - // the pending map - if started, ok := dhtm.pending[c]; ok { - // If the want has expired - if time.Since(started) > dhtm.timeout { - // Add the want to the expired list - expired = append(expired, c) - // Remove the want from the pending wants - delete(dhtm.pending, c) - // Remove expired wants from the want queue - removeCount++ - } else { - // The queue is in order from earliest to latest, so if we - // didn't find an expired entry we can stop iterating - more = false + expired := make([]cid.Cid, 0, len(dhtm.activeWants)) + for i := 0; i < len(dhtm.wantQueue); i++ { + pw := dhtm.wantQueue[0] + + // If the want is still active + if pw.active { + // The queue is in order from earliest to latest, so if we + // didn't find an expired entry we can stop iterating + if time.Since(pw.sent) < dhtm.timeout { + break } - } else { - // Remove cancelled wants from the want queue - removeCount++ + + // Add the want to the expired list + expired = append(expired, pw.c) + // Remove the want from the activeWants map + delete(dhtm.activeWants, pw.c) } - } - // Remove cancelled or expired wants from want queue - dhtm.wantQueue = dhtm.wantQueue[removeCount:] + // Remove expired or cancelled wants from the want queue + dhtm.wantQueue = dhtm.wantQueue[1:] + } // Fire the timeout event for the expired wants if len(expired) > 0 { @@ -215,17 +218,17 @@ func (dhtm *dontHaveTimeoutMgr) checkForTimeouts() { return } - // Note: we cleared cancelled wants from the front of the queue above - oldest, ok := 
dhtm.pending[dhtm.wantQueue[0]] - if !ok { - panic("cancelled want found in DONT_HAVE timeout manager want queue after purge") - } - // Schedule the next check for the moment when the oldest pending want will // timeout - until := time.Until(oldest.Add(dhtm.timeout)) + oldestStart := dhtm.wantQueue[0].sent + until := time.Until(oldestStart.Add(dhtm.timeout)) if dhtm.checkForTimeoutsTimer == nil { - dhtm.checkForTimeoutsTimer = time.AfterFunc(until, dhtm.checkForTimeouts) + dhtm.checkForTimeoutsTimer = time.AfterFunc(until, func() { + dhtm.lk.Lock() + defer dhtm.lk.Unlock() + + dhtm.checkForTimeouts() + }) } else { dhtm.checkForTimeoutsTimer.Stop() dhtm.checkForTimeoutsTimer.Reset(until) @@ -242,19 +245,23 @@ func (dhtm *dontHaveTimeoutMgr) AddPending(ks []cid.Cid) { start := time.Now() dhtm.lk.Lock() + defer dhtm.lk.Unlock() - queueWasEmpty := len(dhtm.pending) == 0 + queueWasEmpty := len(dhtm.activeWants) == 0 // Record the start time for each key for _, c := range ks { - if _, ok := dhtm.pending[c]; !ok { - dhtm.pending[c] = start - dhtm.wantQueue = append(dhtm.wantQueue, c) + if _, ok := dhtm.activeWants[c]; !ok { + pw := pendingWant{ + c: c, + sent: start, + active: true, + } + dhtm.activeWants[c] = &pw + dhtm.wantQueue = append(dhtm.wantQueue, &pw) } } - dhtm.lk.Unlock() - // If there was already an earlier pending item in the queue, then there // must already be a timeout check scheduled. If there is nothing in the // queue then we should make sure to schedule a check. 
@@ -268,12 +275,12 @@ func (dhtm *dontHaveTimeoutMgr) CancelPending(ks []cid.Cid) { dhtm.lk.Lock() defer dhtm.lk.Unlock() - // The want has been cancelled, so remove pending timers + // Mark the wants as cancelled for _, c := range ks { - delete(dhtm.pending, c) - // Note: when we check the want queue for timeouts, we will check to - // see if there is a value in the pending map, and if not we ignore - // the queued item + if pw, ok := dhtm.activeWants[c]; ok { + pw.active = false + delete(dhtm.activeWants, c) + } } } diff --git a/internal/messagequeue/donthavetimeoutmgr_test.go b/internal/messagequeue/donthavetimeoutmgr_test.go index 16bff889..1352df69 100644 --- a/internal/messagequeue/donthavetimeoutmgr_test.go +++ b/internal/messagequeue/donthavetimeoutmgr_test.go @@ -3,6 +3,7 @@ package messagequeue import ( "context" "fmt" + "sync" "testing" "time" @@ -41,6 +42,17 @@ func (pc *mockPeerConn) Latency() time.Duration { return sum / time.Duration(len(pc.latencies)) } +type timeoutRecorder struct { + timedOutKs []cid.Cid + lk sync.Mutex +} + +func (tr *timeoutRecorder) onTimeout(tks []cid.Cid) { + tr.lk.Lock() + defer tr.lk.Unlock() + tr.timedOutKs = append(tr.timedOutKs, tks...) +} + func TestDontHaveTimeoutMgrTimeout(t *testing.T) { firstks := testutil.GenerateCids(2) secondks := append(firstks, testutil.GenerateCids(3)...) @@ -50,12 +62,9 @@ func TestDontHaveTimeoutMgrTimeout(t *testing.T) { expectedTimeout := expProcessTime + latency*time.Duration(latMultiplier) ctx := context.Background() pc := &mockPeerConn{latency: latency} + tr := timeoutRecorder{} - var timedOutKs []cid.Cid - onTimeout := func(tks []cid.Cid) { - timedOutKs = append(timedOutKs, tks...) 
- } - dhtm := newDontHaveTimeoutMgrWithParams(ctx, pc, onTimeout, + dhtm := newDontHaveTimeoutMgrWithParams(ctx, pc, tr.onTimeout, dontHaveTimeout, latMultiplier, expProcessTime) dhtm.Start() @@ -66,7 +75,7 @@ func TestDontHaveTimeoutMgrTimeout(t *testing.T) { time.Sleep(expectedTimeout - 5*time.Millisecond) // At this stage no keys should have timed out - if len(timedOutKs) > 0 { + if len(tr.timedOutKs) > 0 { t.Fatal("expected timeout not to have happened yet") } @@ -77,12 +86,12 @@ func TestDontHaveTimeoutMgrTimeout(t *testing.T) { time.Sleep(10 * time.Millisecond) // At this stage first set of keys should have timed out - if len(timedOutKs) != len(firstks) { + if len(tr.timedOutKs) != len(firstks) { t.Fatal("expected timeout") } // Clear the recorded timed out keys - timedOutKs = nil + tr.timedOutKs = nil // Sleep until the second set of keys should have timed out time.Sleep(expectedTimeout) @@ -90,7 +99,7 @@ func TestDontHaveTimeoutMgrTimeout(t *testing.T) { // At this stage all keys should have timed out. The second set included // the first set of keys, but they were added before the first set timed // out, so only the remaining keys should have beed added. - if len(timedOutKs) != len(secondks)-len(firstks) { + if len(tr.timedOutKs) != len(secondks)-len(firstks) { t.Fatal("expected second set of keys to timeout") } } @@ -98,17 +107,14 @@ func TestDontHaveTimeoutMgrTimeout(t *testing.T) { func TestDontHaveTimeoutMgrCancel(t *testing.T) { ks := testutil.GenerateCids(3) latency := time.Millisecond * 10 - latMultiplier := 2 - expProcessTime := 5 * time.Millisecond - expectedTimeout := expProcessTime + latency*time.Duration(latMultiplier) + latMultiplier := 1 + expProcessTime := time.Duration(0) + expectedTimeout := latency ctx := context.Background() pc := &mockPeerConn{latency: latency} + tr := timeoutRecorder{} - var timedOutKs []cid.Cid - onTimeout := func(tks []cid.Cid) { - timedOutKs = append(timedOutKs, tks...) 
- } - dhtm := newDontHaveTimeoutMgrWithParams(ctx, pc, onTimeout, + dhtm := newDontHaveTimeoutMgrWithParams(ctx, pc, tr.onTimeout, dontHaveTimeout, latMultiplier, expProcessTime) dhtm.Start() @@ -124,7 +130,7 @@ func TestDontHaveTimeoutMgrCancel(t *testing.T) { time.Sleep(expectedTimeout) // At this stage all non-cancelled keys should have timed out - if len(timedOutKs) != len(ks)-cancelCount { + if len(tr.timedOutKs) != len(ks)-cancelCount { t.Fatal("expected timeout") } } @@ -136,12 +142,9 @@ func TestDontHaveTimeoutRepeatedAddPending(t *testing.T) { expProcessTime := time.Duration(0) ctx := context.Background() pc := &mockPeerConn{latency: latency} + tr := timeoutRecorder{} - var timedOutKs []cid.Cid - onTimeout := func(tks []cid.Cid) { - timedOutKs = append(timedOutKs, tks...) - } - dhtm := newDontHaveTimeoutMgrWithParams(ctx, pc, onTimeout, + dhtm := newDontHaveTimeoutMgrWithParams(ctx, pc, tr.onTimeout, dontHaveTimeout, latMultiplier, expProcessTime) dhtm.Start() @@ -154,7 +157,7 @@ func TestDontHaveTimeoutRepeatedAddPending(t *testing.T) { time.Sleep(latency + 5*time.Millisecond) // At this stage all keys should have timed out - if len(timedOutKs) != len(ks) { + if len(tr.timedOutKs) != len(ks) { t.Fatal("expected timeout") } } @@ -166,14 +169,11 @@ func TestDontHaveTimeoutMgrUsesDefaultTimeoutIfPingError(t *testing.T) { expProcessTime := 2 * time.Millisecond defaultTimeout := 10 * time.Millisecond expectedTimeout := expProcessTime + defaultTimeout - + tr := timeoutRecorder{} ctx := context.Background() - var timedOutKs []cid.Cid - onTimeout := func(tks []cid.Cid) { - timedOutKs = append(timedOutKs, tks...) 
- } pc := &mockPeerConn{latency: latency, err: fmt.Errorf("ping error")} - dhtm := newDontHaveTimeoutMgrWithParams(ctx, pc, onTimeout, + + dhtm := newDontHaveTimeoutMgrWithParams(ctx, pc, tr.onTimeout, defaultTimeout, latMultiplier, expProcessTime) dhtm.Start() @@ -184,7 +184,7 @@ func TestDontHaveTimeoutMgrUsesDefaultTimeoutIfPingError(t *testing.T) { time.Sleep(expectedTimeout - 5*time.Millisecond) // At this stage no timeout should have happened yet - if len(timedOutKs) > 0 { + if len(tr.timedOutKs) > 0 { t.Fatal("expected timeout not to have happened yet") } @@ -192,7 +192,7 @@ func TestDontHaveTimeoutMgrUsesDefaultTimeoutIfPingError(t *testing.T) { time.Sleep(10 * time.Millisecond) // Now the keys should have timed out - if len(timedOutKs) != len(ks) { + if len(tr.timedOutKs) != len(ks) { t.Fatal("expected timeout") } } @@ -203,14 +203,11 @@ func TestDontHaveTimeoutMgrUsesDefaultTimeoutIfLatencyLonger(t *testing.T) { latMultiplier := 1 expProcessTime := time.Duration(0) defaultTimeout := 10 * time.Millisecond - + tr := timeoutRecorder{} ctx := context.Background() - var timedOutKs []cid.Cid - onTimeout := func(tks []cid.Cid) { - timedOutKs = append(timedOutKs, tks...) 
- } pc := &mockPeerConn{latency: latency} - dhtm := newDontHaveTimeoutMgrWithParams(ctx, pc, onTimeout, + + dhtm := newDontHaveTimeoutMgrWithParams(ctx, pc, tr.onTimeout, defaultTimeout, latMultiplier, expProcessTime) dhtm.Start() @@ -221,7 +218,7 @@ func TestDontHaveTimeoutMgrUsesDefaultTimeoutIfLatencyLonger(t *testing.T) { time.Sleep(defaultTimeout - 5*time.Millisecond) // At this stage no timeout should have happened yet - if len(timedOutKs) > 0 { + if len(tr.timedOutKs) > 0 { t.Fatal("expected timeout not to have happened yet") } @@ -229,7 +226,7 @@ func TestDontHaveTimeoutMgrUsesDefaultTimeoutIfLatencyLonger(t *testing.T) { time.Sleep(10 * time.Millisecond) // Now the keys should have timed out - if len(timedOutKs) != len(ks) { + if len(tr.timedOutKs) != len(ks) { t.Fatal("expected timeout") } } @@ -242,8 +239,11 @@ func TestDontHaveTimeoutNoTimeoutAfterShutdown(t *testing.T) { ctx := context.Background() pc := &mockPeerConn{latency: latency} + var lk sync.Mutex var timedOutKs []cid.Cid onTimeout := func(tks []cid.Cid) { + lk.Lock() + defer lk.Unlock() timedOutKs = append(timedOutKs, tks...) 
} dhtm := newDontHaveTimeoutMgrWithParams(ctx, pc, onTimeout, From 8d1d5ff7cecb6308d7e65d8177af83135e7fdded Mon Sep 17 00:00:00 2001 From: Dirk McCormick Date: Wed, 12 Feb 2020 13:15:55 -0500 Subject: [PATCH 10/11] test: add case to dont have timeout manager for add-cancel-add --- .../messagequeue/donthavetimeoutmgr_test.go | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/internal/messagequeue/donthavetimeoutmgr_test.go b/internal/messagequeue/donthavetimeoutmgr_test.go index 1352df69..78e622a7 100644 --- a/internal/messagequeue/donthavetimeoutmgr_test.go +++ b/internal/messagequeue/donthavetimeoutmgr_test.go @@ -135,6 +135,51 @@ func TestDontHaveTimeoutMgrCancel(t *testing.T) { } } +func TestDontHaveTimeoutWantCancelWant(t *testing.T) { + ks := testutil.GenerateCids(3) + latency := time.Millisecond * 20 + latMultiplier := 1 + expProcessTime := time.Duration(0) + expectedTimeout := latency + ctx := context.Background() + pc := &mockPeerConn{latency: latency} + tr := timeoutRecorder{} + + dhtm := newDontHaveTimeoutMgrWithParams(ctx, pc, tr.onTimeout, + dontHaveTimeout, latMultiplier, expProcessTime) + dhtm.Start() + + // Add keys + dhtm.AddPending(ks) + + // Wait for a short time + time.Sleep(expectedTimeout - 10*time.Millisecond) + + // Cancel two keys + dhtm.CancelPending(ks[:2]) + + time.Sleep(5 * time.Millisecond) + + // Add back one cancelled key + dhtm.AddPending(ks[:1]) + + // Wait till after initial timeout + time.Sleep(10 * time.Millisecond) + + // At this stage only the key that was never cancelled should have timed out + if len(tr.timedOutKs) != 1 { + t.Fatal("expected one key to timeout") + } + + // Wait till after added back key should time out + time.Sleep(latency) + + // At this stage the key that was added back should also have timed out + if len(tr.timedOutKs) != 2 { + t.Fatal("expected added back key to timeout") + } +} + func TestDontHaveTimeoutRepeatedAddPending(t *testing.T) { ks := testutil.GenerateCids(10) latency := 
time.Millisecond * 5 From 1a16a42beeb081131059b31a8e5ef8b652edb862 Mon Sep 17 00:00:00 2001 From: Dirk McCormick Date: Wed, 12 Feb 2020 16:12:19 -0500 Subject: [PATCH 11/11] fix: bug with iterating over DONT_HAVE timeout manager queue --- internal/messagequeue/donthavetimeoutmgr.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/messagequeue/donthavetimeoutmgr.go b/internal/messagequeue/donthavetimeoutmgr.go index 9905076d..ee7941b6 100644 --- a/internal/messagequeue/donthavetimeoutmgr.go +++ b/internal/messagequeue/donthavetimeoutmgr.go @@ -183,7 +183,7 @@ func (dhtm *dontHaveTimeoutMgr) checkForTimeouts() { // Figure out which of the blocks that were wanted were not received // within the timeout expired := make([]cid.Cid, 0, len(dhtm.activeWants)) - for i := 0; i < len(dhtm.wantQueue); i++ { + for len(dhtm.wantQueue) > 0 { pw := dhtm.wantQueue[0] // If the want is still active