From fa11a677e0520c2f65f549c84114c4d1604a4cc1 Mon Sep 17 00:00:00 2001 From: Mikael Lixenstrand Date: Mon, 8 Sep 2014 23:32:52 +0100 Subject: [PATCH 01/47] Improve AAE fullsync by estimating number of keys This commit extends AAE fullsync to sample the AAE trees to estimate the number of keys in a given partition. This estimate is used to properly size the bloom filter, as well as enable a new percentage-based direct send threshold (eg. direct send up to 10% of differing keys). To support AAE-based key estimation, this commit changes the fullsync logic to update all AAE trees before proceeding to the exchange phase. This change is necessary because all trees must be updated and sampled to calculate a correct estimate. This commit also delays the creation of the bloom filter until it is needed. Fullsyncs that manage to send all differences directly will therefore avoid creating a bloom filter. Authored-by: Mikael Lixenstrand Authored-by: rsltrifork Rebased-by: Joseph Blomstedt --- src/riak_repl_aae_source.erl | 251 ++++++++++++++++++----------------- 1 file changed, 130 insertions(+), 121 deletions(-) diff --git a/src/riak_repl_aae_source.erl b/src/riak_repl_aae_source.erl index 99fc5173..c36d3fc9 100644 --- a/src/riak_repl_aae_source.erl +++ b/src/riak_repl_aae_source.erl @@ -42,6 +42,7 @@ timeout :: pos_integer(), wire_ver :: atom(), diff_batch_size = 1000 :: non_neg_integer(), + estimated_nr_keys :: non_neg_integer(), local_lock = false :: boolean(), owner :: pid(), proto :: term(), @@ -56,6 +57,9 @@ %% filter, but simply sent to the remote site directly. -define(GET_OBJECT_LIMIT, 1000). +%% Diff percentage needed to use bloom filter +-define(DIFF_PERCENTAGE, 5). + %%%=================================================================== %%% API %%%=================================================================== @@ -80,9 +84,6 @@ init([Cluster, Client, Transport, Socket, Partition, OwnerPid, Proto]) -> lager:info("AAE fullsync source worker started for partition ~p", [Partition]), - NumKeys = 1000000, - {ok, Bloom} = ebloom:new(NumKeys, 0.01, random:uniform(1000)), - State = #state{cluster=Cluster, client=Client, transport=Transport, @@ -91,8 +92,7 @@ init([Cluster, Client, Transport, Socket, Partition, OwnerPid, Proto]) -> built=0, owner=OwnerPid, wire_ver=w1, - proto=Proto, - bloom=Bloom }, + proto=Proto}, {ok, prepare_exchange, State}. handle_event(_Event, StateName, State) -> @@ -225,27 +225,35 @@ update_trees(start_exchange, State=#state{indexns=IndexN, owner=Owner}) when Ind {next_state, update_trees, State}; update_trees(start_exchange, State=#state{tree_pid=TreePid, index=Partition, - indexns=[IndexN|_IndexNs]}) -> - lager:info("Start exchange for partition,IndexN ~p,~p", [Partition, IndexN]), - update_request(TreePid, {Partition, undefined}, IndexN), - case send_synchronous_msg(?MSG_UPDATE_TREE, IndexN, State) of - ok -> - update_trees({tree_built, Partition, IndexN}, State); - not_responsible -> - update_trees({not_responsible, Partition, IndexN}, State) - end; + indexns=IndexNs}) -> + lager:info("Start update for partition,IndexN ~p,~p", [Partition, IndexNs]), + lists:foreach(fun(IndexN) -> + update_request(TreePid, {Partition, undefined}, IndexN), + case send_synchronous_msg(?MSG_UPDATE_TREE, IndexN, State) of + ok -> + gen_fsm:send_event(self(), {tree_built, Partition, IndexN}); + not_responsible -> + gen_fsm:send_event(self(), {not_responsible, Partition, IndexN}, State) + end + end, IndexNs), + {next_state, update_trees, State}; -update_trees({not_responsible, Partition, IndexN}, State=#state{owner=Owner}) -> +update_trees({not_responsible, Partition, IndexN}, State = #state{owner=Owner}) -> lager:debug("VNode ~p does not cover preflist ~p", [Partition, IndexN]), gen_server:cast(Owner, not_responsible), {stop, normal, State}; -update_trees({tree_built, _, _}, State) -> +update_trees({tree_built, _, _}, State = #state{indexns=IndexNs}) -> Built = State#state.built + 1, + NeededBuilts = length(IndexNs) * 2, %% All local and remote case Built of - 2 -> + NeededBuilts -> + %% Trees build now we can estimate how many keys + {ok, EstimatedNrKeys} = riak_kv_index_hashtree:estimate_keys(State#state.tree_pid), + lager:info("EstimatedNrKeys ~p for partition ~p", [EstimatedNrKeys, State#state.index]), + lager:debug("Moving to key exchange state"), gen_fsm:send_event(self(), start_key_exchange), - {next_state, key_exchange, State}; + {next_state, key_exchange, State#state{built=Built, estimated_nr_keys = EstimatedNrKeys}}; _ -> {next_state, update_trees, State#state{built=Built}} end. @@ -343,38 +351,41 @@ key_exchange(start_key_exchange, State=#state{cluster=Cluster, %% accumulates a list of one element that is the count of %% keys that differed. We can't prime the accumulator. It %% always starts as the empty list. KeyDiffs is a list of hashtree::keydiff() - Bloom = State#state.bloom, - Count0 = State#state.diff_cnt, + EstimatedNrKeys = State#state.estimated_nr_keys, Limit = app_helper:get_env(riak_repl, fullsync_direct, ?GET_OBJECT_LIMIT), - AccFun = fun(KeyDiffs, Acc0) -> - Count = case Acc0 of [C] when is_integer(C) -> C; _ -> 0 end, - FoldFun = - case (Count0+Count) > Limit of - %% Gather diff keys into a bloom filter - true -> fun(KeyDiff, AccIn) -> - accumulate_diff(KeyDiff, Bloom, AccIn, State) - end; - - %% Replicate diffs directly - false -> fun(KeyDiff, AccIn) -> - replicate_diff(KeyDiff, AccIn, State) - end - end, - - lists:foldl(FoldFun, Acc0, KeyDiffs) - end, + PercentLimit = app_helper:get_env(riak_repl, diff_percentage, ?DIFF_PERCENTAGE), + + UsedLimit = max(Limit, EstimatedNrKeys * PercentLimit div 100), + + AccFun = + fun(KeyDiffs, {CurrentDiffCount, Bloom} = Acc) -> + FoldFun = + case (Bloom =/= undefined) orelse (CurrentDiffCount > UsedLimit) of + true -> %% Gather diff keys into a bloom filter + fun(KeyDiff, AccIn) -> + accumulate_diff(KeyDiff, maybe_create_bloom(AccIn, EstimatedNrKeys), State) + end; + false -> %% Replicate diffs directly + fun(KeyDiff, AccIn) -> + replicate_diff(KeyDiff, AccIn, State) + end + end, + lists:foldl(FoldFun, Acc, KeyDiffs) + end, %% TODO: Add stats for AAE lager:debug("Starting compare for partition ~p", [Partition]), spawn_link(fun() -> StageStart=os:timestamp(), - case riak_kv_index_hashtree:compare(IndexN, Remote, AccFun, TreePid) of - [N] when is_integer(N) -> Count=N; - _ -> Count=0 - end, + {DiffCount, Bloom} = + riak_kv_index_hashtree:compare(IndexN, + Remote, + AccFun, + {State#state.diff_cnt, State#state.bloom}, + TreePid), lager:info("Full-sync with site ~p; fullsync difference generator for ~p/~p complete (~p differences, completed in ~p secs)", - [State#state.cluster, Partition, IndexN, Count, riak_repl_util:elapsed_secs(StageStart)]), - gen_fsm:send_event(SourcePid, {'$aae_src', done, Bloom, Count}) + [State#state.cluster, Partition, IndexN, DiffCount, riak_repl_util:elapsed_secs(StageStart)]), + gen_fsm:send_event(SourcePid, {'$aae_src', done, Bloom, DiffCount}) end), %% wait for differences from bloom_folder or to be done @@ -404,25 +415,26 @@ compute_differences({'$aae_src', worker_pid, WorkerPid}, WorkerPid ! {'$aae_src', ready, self()}, {next_state, compute_differences, State}; -compute_differences({'$aae_src', done, Bloom, Count}, State=#state{ indexns=IndexNs, bloom=Bloom, diff_cnt=Count0 }) +compute_differences({'$aae_src', done, Bloom, DiffCount}, State=#state{ indexns=IndexNs}) when length(IndexNs) =< 1 -> %% we just finished diffing the *last* IndexN, so we go to the vnode fold / bloom state - State2 = State#state{ diff_cnt=Count0+Count }, + State2 = State#state{ diff_cnt=DiffCount, + bloom = Bloom}, %% if we have anything in our bloom filter, start sending them now. %% this will start a worker process, which will tell us it's done with %% diffs_done once all differences are sent. - _ = finish_sending_differences(Bloom, State2), + _ = finish_sending_differences(State2), %% wait for differences from bloom_folder or to be done {next_state, send_diffs, State2}; -compute_differences({'$aae_src', done, _, Count}, State=#state{ indexns=[_|IndexNs], diff_cnt=Count0 }) -> +compute_differences({'$aae_src', done, Bloom, DiffCount}, State=#state{ indexns=[_|IndexNs]}) -> %% re-start for next indexN - gen_fsm:send_event(self(), start_exchange), - {next_state, update_trees, State#state{built=0, indexns=IndexNs, diff_cnt=Count0+Count }}. + gen_fsm:send_event(self(), start_key_exchange), + {next_state, key_exchange, State#state{indexns=IndexNs, diff_cnt=DiffCount, bloom = Bloom}}. %% state send_diffs is where we wait for diff_obj messages from the bloom folder %% and send them to the sink for each diff_obj. We eventually finish upon receipt @@ -436,22 +448,27 @@ send_diffs({diff_obj, RObj}, _From, State) -> %% Note: recv'd from an async send event send_diffs(diff_done, State) -> gen_fsm:send_event(self(), start_exchange), - {next_state, update_trees, State#state{built=0, indexns=[]}}. + {next_state, update_trees, State#state{indexns=[]}}. %%%=================================================================== %%% Internal functions %%%=================================================================== - -finish_sending_differences(Bloom, #state{index=Partition}) -> - case ebloom:elements(Bloom) == 0 of - true -> - lager:info("No differences, skipping bloom fold for partition ~p", [Partition]), +finish_sending_differences(#state{bloom = undefined,index=Partition,diff_cnt=DiffCnt,estimated_nr_keys=EstimatedNrKeys}) -> + lager:info("No Bloom folding over ~p/~p differences for partition ~p with EstimatedNrKeys ~p", + [0, DiffCnt, Partition, EstimatedNrKeys]), + gen_fsm:send_event(self(), diff_done); + +finish_sending_differences(#state{bloom = Bloom,index=Partition,diff_cnt=DiffCnt,estimated_nr_keys=EstimatedNrKeys}) -> + case ebloom:elements(Bloom) of + Count = 0 -> + lager:info("No Bloom folding over ~p/~p differences for partition ~p with EstimatedNrKeys ~p", + [Count, DiffCnt, Partition, EstimatedNrKeys]), gen_fsm:send_event(self(), diff_done); - false -> + Count-> {ok, Ring} = riak_core_ring_manager:get_my_ring(), OwnerNode = riak_core_ring:index_owner(Ring, Partition), - Count = ebloom:elements(Bloom), - lager:info("Bloom folding over ~p differences for partition ~p", [Count, Partition]), + lager:info("Bloom folding over ~p/~p differences for partition ~p with EstimatedNrKeys ~p", + [Count, DiffCnt, Partition, EstimatedNrKeys]), Self = self(), Worker = fun() -> FoldRef = make_ref(), @@ -484,6 +501,12 @@ finish_sending_differences(Bloom, #state{index=Partition}) -> spawn_link(Worker) %% this isn't the Pid we need because it's just the vnode:fold end. +maybe_create_bloom({DiffCount, undefined}, EstimatedNrKeys) -> + {ok, Bloom} = ebloom:new(max(10000, EstimatedNrKeys), 0.01, random:uniform(1000)), + {DiffCount, Bloom}; +maybe_create_bloom({DiffCount, Bloom}, _SegmentInfo) -> + {DiffCount, Bloom}. + bloom_fold({B, K}, V, {MPid, Bloom}) -> case ebloom:contains(Bloom, <>) of true -> @@ -497,72 +520,57 @@ bloom_fold({B, K}, V, {MPid, Bloom}) -> %% @private %% Returns accumulator as a list of one element that is the count of %% keys that differed. Initial value of Acc is always []. -replicate_diff(KeyDiff, Acc, State=#state{index=Partition}) -> - NumObjects = - case KeyDiff of - {remote_missing, Bin} -> - %% send object and related objects to remote - {Bucket,Key} = binary_to_term(Bin), - lager:debug("Keydiff: remote partition ~p remote missing: ~p:~p", - [Partition, Bucket, Key]), - send_missing(Bucket, Key, State); - {different, Bin} -> - %% send object and related objects to remote - {Bucket,Key} = binary_to_term(Bin), - lager:debug("Keydiff: remote partition ~p different: ~p:~p", - [Partition, Bucket, Key]), - send_missing(Bucket, Key, State); - {missing, Bin} -> - %% remote has a key we don't have. Ignore it. - {Bucket,Key} = binary_to_term(Bin), - lager:debug("Keydiff: remote partition ~p local missing: ~p:~p (ignored)", - [Partition, Bucket, Key]), - 0; - Other -> - lager:warning("Unexpected error keydiff: ~p (ignored)", [Other]), - 0 - end, - - case Acc of - [] -> - [1]; - [Count] -> - %% accrue number of differences sent from this segment - [Count+NumObjects]; - _Other -> +replicate_diff(KeyDiff, {DiffCount, Bloom} = Acc, State=#state{index=Partition}) -> + case KeyDiff of + {remote_missing, Bin} -> + %% send object and related objects to remote + {Bucket,Key} = binary_to_term(Bin), + lager:debug("Keydiff: remote partition ~p remote missing: ~p:~p", + [Partition, Bucket, Key]), + {DiffCount + send_missing(Bucket, Key, State), Bloom}; + {different, Bin} -> + %% send object and related objects to remote + {Bucket,Key} = binary_to_term(Bin), + lager:debug("Keydiff: remote partition ~p different: ~p:~p", + [Partition, Bucket, Key]), + {DiffCount + send_missing(Bucket, Key, State), Bloom}; + {missing, Bin} -> + %% remote has a key we don't have. Ignore it. + {Bucket,Key} = binary_to_term(Bin), + lager:debug("Keydiff: remote partition ~p local missing: ~p:~p (ignored)", + [Partition, Bucket, Key]), + Acc; + Other -> + lager:warning("Unexpected error keydiff: ~p (ignored)", [Other]), Acc end. -accumulate_diff(KeyDiff, Bloom, [], State) -> - accumulate_diff(KeyDiff, Bloom, [0], State); -accumulate_diff(KeyDiff, Bloom, [Count], #state{index=Partition}) -> - NumObjects = - case KeyDiff of - {remote_missing, Bin} -> - %% send object and related objects to remote - {Bucket,Key} = binary_to_term(Bin), - lager:debug("Keydiff: remote partition ~p remote missing: ~p:~p", - [Partition, Bucket, Key]), - ebloom:insert(Bloom, <>), - 1; - {different, Bin} -> - %% send object and related objects to remote - {Bucket,Key} = binary_to_term(Bin), - lager:debug("Keydiff: remote partition ~p different: ~p:~p", - [Partition, Bucket, Key]), - ebloom:insert(Bloom, <>), - 1; - {missing, Bin} -> - %% remote has a key we don't have. Ignore it. - {Bucket,Key} = binary_to_term(Bin), - lager:debug("Keydiff: remote partition ~p local missing: ~p:~p (ignored)", - [Partition, Bucket, Key]), - 0; - Other -> - lager:warning("Unexpected error keydiff: ~p (ignored)", [Other]), - 0 - end, - [Count+NumObjects]. +accumulate_diff(KeyDiff, {DiffCount, Bloom} = Acc, #state{index=Partition}) -> + case KeyDiff of + {remote_missing, Bin} -> + %% send object and related objects to remote + {Bucket,Key} = binary_to_term(Bin), + lager:debug("Keydiff: remote partition ~p remote missing: ~p:~p", + [Partition, Bucket, Key]), + ebloom:insert(Bloom, <>), + {DiffCount + 1, Bloom}; + {different, Bin} -> + %% send object and related objects to remote + {Bucket,Key} = binary_to_term(Bin), + lager:debug("Keydiff: remote partition ~p different: ~p:~p", + [Partition, Bucket, Key]), + ebloom:insert(Bloom, <>), + {DiffCount + 1, Bloom}; + {missing, Bin} -> + %% remote has a key we don't have. Ignore it. + {Bucket,Key} = binary_to_term(Bin), + lager:debug("Keydiff: remote partition ~p local missing: ~p:~p (ignored)", + [Partition, Bucket, Key]), + Acc; + Other -> + lager:warning("Unexpected error keydiff: ~p (ignored)", [Other]), + Acc + end. send_missing(RObj, State=#state{client=Client, wire_ver=Ver, proto=Proto}) -> %% we don't actually have the vclock to compare, so just send the @@ -684,3 +692,4 @@ get_reply(State=#state{transport=Transport, socket=Socket}) -> %% display and possibly retry the partition from. throw({stop, Reason, State}) end. + From e6d1ec19762529a7ae735e004b53d530a8c01d4f Mon Sep 17 00:00:00 2001 From: Mikael Lixenstrand Date: Mon, 15 Sep 2014 10:15:09 +0100 Subject: [PATCH 02/47] Add timeout to natmap_test_ --- src/riak_repl2_ip.erl | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/riak_repl2_ip.erl b/src/riak_repl2_ip.erl index 7748470c..51bdc1e3 100644 --- a/src/riak_repl2_ip.erl +++ b/src/riak_repl2_ip.erl @@ -539,7 +539,7 @@ determine_netmask_test_() -> natmap_test_() -> error_logger:tty(false), - [ + [{timeout, 30, {"forward lookups work", fun() -> Map = [ @@ -555,8 +555,8 @@ natmap_test_() -> maybe_apply_nat_map({10, 0, 0, 10}, 9090, Map)), ok end - }, - {"forward lookups with ports work", + }}, + {timeout, 30, {"forward lookups with ports work", fun() -> Map = [ {{{65, 172, 243, 10}, 10080}, {10, 0, 0, 10}}, @@ -571,8 +571,8 @@ natmap_test_() -> maybe_apply_nat_map({10, 0, 0, 10}, 9090, Map)), ok end - }, - {"reverse lookups work", + }}, + {timeout, 30, {"reverse lookups work", fun() -> Map = [ {{65, 172, 243, 10}, {10, 0, 0, 10}}, @@ -587,8 +587,8 @@ natmap_test_() -> apply_reverse_nat_map({10, 0, 0, 20}, 9090, Map)), ok end - }, - {"reverse lookups with ports work", + }}, + {timeout, 30, {"reverse lookups with ports work", fun() -> Map = [ {{{65, 172, 243, 10}, 10080}, {10, 0, 0, 10}}, @@ -603,8 +603,8 @@ natmap_test_() -> apply_reverse_nat_map({10, 0, 0, 20}, 9080, Map)), ok end - }, - {"forward lookups with hostnames work", + }}, + {timeout, 30, {"forward lookups with hostnames work", fun() -> Map = [ {{65, 172, 243, 10}, "localhost"}, @@ -615,8 +615,8 @@ natmap_test_() -> maybe_apply_nat_map({65, 172, 243, 10}, 9080, Map)), ok end - }, - {"reverse lookups with hostnames work", + }}, + {timeout, 30, {"reverse lookups with hostnames work", fun() -> {ok, {hostent, "basho.com", _, inet, 4, Addresses}} = inet_res:gethostbyname("basho.com"), Map = [ @@ -628,7 +628,7 @@ natmap_test_() -> apply_reverse_nat_map(hd(Addresses), 9080, Map)), ok end - } + }} ]. -endif. From c8ec0d929942336cfd83145d8b409509a10f08ee Mon Sep 17 00:00:00 2001 From: Mikael Lixenstrand Date: Wed, 17 Sep 2014 10:14:36 +0100 Subject: [PATCH 03/47] Use deduce_wire_version_from_proto for AAE --- src/riak_repl2_fssource.erl | 2 +- src/riak_repl_aae_source.erl | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/riak_repl2_fssource.erl b/src/riak_repl2_fssource.erl index 013cf1ee..1ccb232f 100644 --- a/src/riak_repl2_fssource.erl +++ b/src/riak_repl2_fssource.erl @@ -118,7 +118,7 @@ handle_call({connected, Socket, Transport, _Endpoint, Proto, Props}, {ok, FullsyncWorker} = riak_repl_aae_source:start_link(Cluster, Client, Transport, Socket, Partition, - self(), ClientVer), + self(), Proto), %% Give control of socket to AAE worker. It will consume all TCP messages. ok = Transport:controlling_process(Socket, FullsyncWorker), riak_repl_aae_source:start_exchange(FullsyncWorker), diff --git a/src/riak_repl_aae_source.erl b/src/riak_repl_aae_source.erl index c36d3fc9..dab53828 100644 --- a/src/riak_repl_aae_source.erl +++ b/src/riak_repl_aae_source.erl @@ -84,6 +84,7 @@ init([Cluster, Client, Transport, Socket, Partition, OwnerPid, Proto]) -> lager:info("AAE fullsync source worker started for partition ~p", [Partition]), + Ver = riak_repl_util:deduce_wire_version_from_proto(Proto), State = #state{cluster=Cluster, client=Client, transport=Transport, @@ -91,7 +92,7 @@ init([Cluster, Client, Transport, Socket, Partition, OwnerPid, Proto]) -> index=Partition, built=0, owner=OwnerPid, - wire_ver=w1, + wire_ver=Ver, proto=Proto}, {ok, prepare_exchange, State}. From a13c865193e71093fd05ea6c2a2d7e0c83173e84 Mon Sep 17 00:00:00 2001 From: Mikael Lixenstrand Date: Wed, 17 Sep 2014 10:41:47 +0100 Subject: [PATCH 04/47] Keep legacy for riak_repl_aae_source state.proto --- src/riak_repl_aae_source.erl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/riak_repl_aae_source.erl b/src/riak_repl_aae_source.erl index dab53828..b0f8ab3d 100644 --- a/src/riak_repl_aae_source.erl +++ b/src/riak_repl_aae_source.erl @@ -85,6 +85,7 @@ init([Cluster, Client, Transport, Socket, Partition, OwnerPid, Proto]) -> [Partition]), Ver = riak_repl_util:deduce_wire_version_from_proto(Proto), + {_, ClientVer, _} = Proto, State = #state{cluster=Cluster, client=Client, transport=Transport, @@ -93,7 +94,7 @@ init([Cluster, Client, Transport, Socket, Partition, OwnerPid, Proto]) -> built=0, owner=OwnerPid, wire_ver=Ver, - proto=Proto}, + proto=ClientVer}, {ok, prepare_exchange, State}. handle_event(_Event, StateName, State) -> From 30452821d9fc68fd60f232b50c4b4b43ddf99f0b Mon Sep 17 00:00:00 2001 From: Kresten Krab Thorup Date: Thu, 18 Sep 2014 18:27:19 +0200 Subject: [PATCH 05/47] Rename fullsync config parameters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit … this needs to be followed up by appropriate changes to riak_test, doc, etc. --- src/riak_repl_aae_source.erl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/riak_repl_aae_source.erl b/src/riak_repl_aae_source.erl index b0f8ab3d..7552cdce 100644 --- a/src/riak_repl_aae_source.erl +++ b/src/riak_repl_aae_source.erl @@ -354,8 +354,8 @@ key_exchange(start_key_exchange, State=#state{cluster=Cluster, %% keys that differed. We can't prime the accumulator. It %% always starts as the empty list. KeyDiffs is a list of hashtree::keydiff() EstimatedNrKeys = State#state.estimated_nr_keys, - Limit = app_helper:get_env(riak_repl, fullsync_direct, ?GET_OBJECT_LIMIT), - PercentLimit = app_helper:get_env(riak_repl, diff_percentage, ?DIFF_PERCENTAGE), + Limit = app_helper:get_env(riak_repl, fullsync_direct_limit, ?GET_OBJECT_LIMIT), + PercentLimit = app_helper:get_env(riak_repl, fullsync_direct_percentage_limit, ?DIFF_PERCENTAGE), UsedLimit = max(Limit, EstimatedNrKeys * PercentLimit div 100), From 779c1ecd942d91beb644824a1b6d6eecea0a53f1 Mon Sep 17 00:00:00 2001 From: Joseph Blomstedt Date: Mon, 6 Oct 2014 15:22:01 -0700 Subject: [PATCH 06/47] Remove unused riak_repl_aae_source:replicate_diff/3 --- src/riak_repl_aae_source.erl | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/src/riak_repl_aae_source.erl b/src/riak_repl_aae_source.erl index 7dd9e004..b6804282 100644 --- a/src/riak_repl_aae_source.erl +++ b/src/riak_repl_aae_source.erl @@ -25,8 +25,6 @@ -export([init/1, handle_event/3, handle_sync_event/4, handle_info/3, terminate/3, code_change/4]). --export([replicate_diff/3]). - -type index() :: non_neg_integer(). -type index_n() :: {index(), pos_integer()}. @@ -517,34 +515,6 @@ bloom_fold({B, K}, V, {MPid, Bloom}) -> end, {MPid, Bloom}. -%% @private -%% Returns accumulator as a list of one element that is the count of -%% keys that differed. Initial value of Acc is always []. -replicate_diff(KeyDiff, {DiffCount, Bloom} = Acc, State=#state{index=Partition}) -> - case KeyDiff of - {remote_missing, Bin} -> - %% send object and related objects to remote - {Bucket,Key} = binary_to_term(Bin), - lager:debug("Keydiff: remote partition ~p remote missing: ~p:~p", - [Partition, Bucket, Key]), - {DiffCount + send_missing(Bucket, Key, State), Bloom}; - {different, Bin} -> - %% send object and related objects to remote - {Bucket,Key} = binary_to_term(Bin), - lager:debug("Keydiff: remote partition ~p different: ~p:~p", - [Partition, Bucket, Key]), - {DiffCount + send_missing(Bucket, Key, State), Bloom}; - {missing, Bin} -> - %% remote has a key we don't have. Ignore it. - {Bucket,Key} = binary_to_term(Bin), - lager:debug("Keydiff: remote partition ~p local missing: ~p:~p (ignored)", - [Partition, Bucket, Key]), - Acc; - Other -> - lager:warning("Unexpected error keydiff: ~p (ignored)", [Other]), - Acc - end. - accumulate_diff(KeyDiff, Exchange, State=#state{index=Partition}) -> case KeyDiff of {remote_missing, Bin} -> From 83dd1929aff4697fecee109627f891de64398e20 Mon Sep 17 00:00:00 2001 From: Mikael Lixenstrand Date: Thu, 16 Oct 2014 11:48:22 +0100 Subject: [PATCH 07/47] Remove redundant and unnecessary logging --- src/riak_repl2_fscoordinator.erl | 2 -- src/riak_repl2_fssource.erl | 5 ++--- src/riak_repl_aae_source.erl | 15 ++++----------- 3 files changed, 6 insertions(+), 16 deletions(-) diff --git a/src/riak_repl2_fscoordinator.erl b/src/riak_repl2_fscoordinator.erl index d0722a24..b23cf38f 100644 --- a/src/riak_repl2_fscoordinator.erl +++ b/src/riak_repl2_fscoordinator.erl @@ -768,8 +768,6 @@ remote_node_available({_Partition, _, RemoteNode}, Busies) -> start_fssource(Partition2={Partition,_,_} = PartitionVal, Ip, Port, State) -> #state{owners = Owners} = State, LocalNode = proplists:get_value(Partition, Owners), - lager:info("Starting fssource for ~p on ~p to ~p", [Partition, LocalNode, - Ip]), case riak_repl2_fssource_sup:enable(LocalNode, Partition, {Ip, Port}) of {ok, Pid} -> link(Pid), diff --git a/src/riak_repl2_fssource.erl b/src/riak_repl2_fssource.erl index 1ccb232f..b01cb82d 100644 --- a/src/riak_repl2_fssource.erl +++ b/src/riak_repl2_fssource.erl @@ -81,9 +81,8 @@ init([Partition, IP]) -> end. handle_call({connected, Socket, Transport, _Endpoint, Proto, Props}, - _From, State=#state{ip=IP, partition=Partition, strategy=RequestedStrategy}) -> + _From, State=#state{partition=Partition, strategy=RequestedStrategy}) -> Cluster = proplists:get_value(clustername, Props), - lager:info("Fullsync connection to ~p for ~p",[IP, Partition]), SocketTag = riak_repl_util:generate_socket_tag("fs_source", Transport, Socket), lager:debug("Keeping stats for " ++ SocketTag), @@ -287,7 +286,7 @@ maybe_exchange_caps(_, Caps, Socket, Transport) -> %% Start a connection to the remote sink node at IP, using the given fullsync strategy, %% for the given partition. The protocol version will be determined from the strategy. connect(IP, Strategy, Partition) -> - lager:info("Connecting to remote ~p for partition ~p", [IP, Partition]), + lager:debug("Connecting to remote ~p for partition ~p", [IP, Partition]), TcpOptions = [{keepalive, true}, {nodelay, true}, {packet, 4}, diff --git a/src/riak_repl_aae_source.erl b/src/riak_repl_aae_source.erl index b6804282..43fb9bb0 100644 --- a/src/riak_repl_aae_source.erl +++ b/src/riak_repl_aae_source.erl @@ -87,8 +87,7 @@ cancel_fullsync(Pid) -> %%%=================================================================== init([Cluster, Client, Transport, Socket, Partition, OwnerPid, Proto]) -> - lager:info("AAE fullsync source worker started for partition ~p", - [Partition]), + Ver = riak_repl_util:deduce_wire_version_from_proto(Proto), {_, ClientVer, _} = Proto, @@ -230,7 +229,6 @@ update_trees(cancel_fullsync, State) -> update_trees(start_exchange, State=#state{tree_pid=TreePid, index=Partition, indexns=IndexNs}) -> - lager:info("Start update for partition,IndexN ~p,~p", [Partition, IndexNs]), lists:foreach(fun(IndexN) -> update_request(TreePid, {Partition, undefined}, IndexN), case send_synchronous_msg(?MSG_UPDATE_TREE, IndexN, State) of @@ -253,7 +251,6 @@ update_trees({tree_built, _, _}, State = #state{indexns=IndexNs}) -> NeededBuilts -> %% Trees built now we can estimate how many keys {ok, EstimatedNrKeys} = riak_kv_index_hashtree:estimate_keys(State#state.tree_pid), - lager:info("EstimatedNrKeys ~p for partition ~p", [EstimatedNrKeys, State#state.index]), lager:debug("Moving to key exchange state"), key_exchange(init, State#state{built=Built, estimated_nr_keys = EstimatedNrKeys}); @@ -288,8 +285,6 @@ key_exchange(cancel_fullsync, State) -> {stop, normal, State}; key_exchange(finish_fullsync, State=#state{owner=Owner}) -> send_complete(State), - lager:info("AAE fullsync source completed partition ~p", - [State#state.index]), riak_repl2_fssource:fullsync_complete(Owner), %% TODO: Why stay in key_exchange? Should we stop instead? {next_state, key_exchange, State}; @@ -368,8 +363,8 @@ key_exchange(start_key_exchange, State=#state{cluster=Cluster, spawn_link(fun() -> StageStart=os:timestamp(), Exchange2 = riak_kv_index_hashtree:compare(IndexN, Remote, AccFun, Exchange, TreePid), - lager:info("Full-sync with site ~p; fullsync difference generator for ~p complete (completed in ~p secs)", - [State#state.cluster, Partition, riak_repl_util:elapsed_secs(StageStart)]), + lager:debug("Full-sync with site ~p; fullsync difference generator for ~p complete (completed in ~p secs)", + [State#state.cluster, Partition, riak_repl_util:elapsed_secs(StageStart)]), gen_fsm:send_event(SourcePid, {'$aae_src', done, Exchange2}) end), @@ -413,12 +408,10 @@ maybe_send_direct(#exchange{mode=inline, count=Count, limit=Limit}, lager:info("Directly sent ~b differences inline for partition ~p", [Sent, Partition]), ok; -maybe_send_direct(#exchange{buffer=Buffer}, State=#state{index=Partition}) -> +maybe_send_direct(#exchange{buffer=Buffer}, State) -> Keys = [{Bucket, Key} || {_, {Bucket, Key}} <- ets:tab2list(Buffer)], true = ets:delete(Buffer), Sorted = lists:sort(Keys), - Count = length(Sorted), - lager:info("Directly sending ~p differences for partition ~p", [Count, Partition]), _ = [send_missing(Bucket, Key, State) || {Bucket, Key} <- Sorted], ok. From e43257a45b1bf6f25f0134c2b9bb39140ecf2c40 Mon Sep 17 00:00:00 2001 From: Mikael Lixenstrand Date: Thu, 16 Oct 2014 17:18:56 +0100 Subject: [PATCH 08/47] Revert some of "Remove redundant and unnecessary logging" Keep needed for testing as debug. --- src/riak_repl2_fssource.erl | 3 ++- src/riak_repl_aae_source.erl | 10 ++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/riak_repl2_fssource.erl b/src/riak_repl2_fssource.erl index b01cb82d..fffb69dc 100644 --- a/src/riak_repl2_fssource.erl +++ b/src/riak_repl2_fssource.erl @@ -81,8 +81,9 @@ init([Partition, IP]) -> end. handle_call({connected, Socket, Transport, _Endpoint, Proto, Props}, - _From, State=#state{partition=Partition, strategy=RequestedStrategy}) -> + _From, State=#state{ip=IP, partition=Partition, strategy=RequestedStrategy}) -> Cluster = proplists:get_value(clustername, Props), + lager:info("Fullsync connection to ~p for ~p",[IP, Partition]), SocketTag = riak_repl_util:generate_socket_tag("fs_source", Transport, Socket), lager:debug("Keeping stats for " ++ SocketTag), diff --git a/src/riak_repl_aae_source.erl b/src/riak_repl_aae_source.erl index 43fb9bb0..23126c48 100644 --- a/src/riak_repl_aae_source.erl +++ b/src/riak_repl_aae_source.erl @@ -87,7 +87,8 @@ cancel_fullsync(Pid) -> %%%=================================================================== init([Cluster, Client, Transport, Socket, Partition, OwnerPid, Proto]) -> - + lager:debug("AAE fullsync source worker started for partition ~p", + [Partition]), Ver = riak_repl_util:deduce_wire_version_from_proto(Proto), {_, ClientVer, _} = Proto, @@ -251,6 +252,7 @@ update_trees({tree_built, _, _}, State = #state{indexns=IndexNs}) -> NeededBuilts -> %% Trees built now we can estimate how many keys {ok, EstimatedNrKeys} = riak_kv_index_hashtree:estimate_keys(State#state.tree_pid), + lager:debug("EstimatedNrKeys ~p for partition ~p", [EstimatedNrKeys, State#state.index]), lager:debug("Moving to key exchange state"), key_exchange(init, State#state{built=Built, estimated_nr_keys = EstimatedNrKeys}); @@ -285,6 +287,8 @@ key_exchange(cancel_fullsync, State) -> {stop, normal, State}; key_exchange(finish_fullsync, State=#state{owner=Owner}) -> send_complete(State), + lager:debug("AAE fullsync source completed partition ~p", + [State#state.index]), riak_repl2_fssource:fullsync_complete(Owner), %% TODO: Why stay in key_exchange? Should we stop instead? {next_state, key_exchange, State}; @@ -408,10 +412,12 @@ maybe_send_direct(#exchange{mode=inline, count=Count, limit=Limit}, lager:info("Directly sent ~b differences inline for partition ~p", [Sent, Partition]), ok; -maybe_send_direct(#exchange{buffer=Buffer}, State) -> +maybe_send_direct(#exchange{buffer=Buffer}, State=#state{index=Partition}) -> Keys = [{Bucket, Key} || {_, {Bucket, Key}} <- ets:tab2list(Buffer)], true = ets:delete(Buffer), Sorted = lists:sort(Keys), + Count = length(Sorted), + lager:debug("Directly sending ~p differences for partition ~p", [Count, Partition]), _ = [send_missing(Bucket, Key, State) || {Bucket, Key} <- Sorted], ok. From 67b95356e04a8ab2a03f16c3d0b85ad4928334eb Mon Sep 17 00:00:00 2001 From: Mikael Lixenstrand Date: Tue, 21 Oct 2014 10:17:58 +0100 Subject: [PATCH 09/47] Fix xref and dialyzer --- rebar.config | 2 +- src/riak_repl_aae_source.erl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/rebar.config b/rebar.config index 7a076b92..9c9ebdb9 100644 --- a/rebar.config +++ b/rebar.config @@ -10,6 +10,6 @@ {lager, "2.0.3", {git, "git://github.com/basho/lager.git", {tag, "2.0.3"}}}, {ranch, "0.4.0-p1", {git, "git://github.com/basho/ranch.git", {tag, "0.4.0-p1"}}}, {ebloom, ".*", {git, "git://github.com/basho/ebloom.git", {tag, "2.0.0"}}}, - {riak_kv, ".*", {git, "git://github.com/basho/riak_kv.git", {branch, "2.0"}}}, + {riak_kv, ".*", {git, "git://github.com/basho/riak_kv.git", {branch, "feature/aae-estimate-keys"}}}, {riak_repl_pb_api, ".*", {git, "git@github.com:basho/riak_repl_pb_api.git", {branch, "2.0"}}} ]}. diff --git a/src/riak_repl_aae_source.erl b/src/riak_repl_aae_source.erl index 23126c48..9a4fa7cb 100644 --- a/src/riak_repl_aae_source.erl +++ b/src/riak_repl_aae_source.erl @@ -236,7 +236,7 @@ update_trees(start_exchange, State=#state{tree_pid=TreePid, ok -> gen_fsm:send_event(self(), {tree_built, Partition, IndexN}); not_responsible -> - gen_fsm:send_event(self(), {not_responsible, Partition, IndexN}, State) + gen_fsm:send_event(self(), {not_responsible, Partition, IndexN}) end end, IndexNs), {next_state, update_trees, State}; From 1114a70b0a790102f507bdd2c54eedf4f4b4936a Mon Sep 17 00:00:00 2001 From: Micah Warren Date: Mon, 29 Sep 2014 15:13:26 -0500 Subject: [PATCH 10/47] Implement soft_exit, primarily for aae_fullsyn. Problem: transient failures of aae, such as trees not yet built or locks not being aquired, would cause an aae fullsync process to exit abnormally. This could happen several times in a row, creating log spam. Resolution: the concept of soft_exit. A soft_exit is a message sent from a soon to be exiting process to a soft_linked process. The exiting process would then exit normally, while any soft_linked processes could handle the soft_exit message in a similar fashion as an exit message. This would indicate an exit reason that should be handled, but not bad enough to have the system logger know about it. The soft_exit message sent from the aae worker to the fscoordinator is as simple as `{soft_exit, pid(), term()}'. The current implementation is not generic. There can only one soft_link to the aae, and there's no general mechanism to use soft_link's or soft_exits elsewhere in the code base. Sorry. Another change rolled into this is consistent use of a #partition_info record in the fscoordinator, and error tracking the fscoordinator's state. By swapping to useing a single data structure in the partition queue, whereis waiting list, and purgatory queues it makes it easier to understand the fscordinator (as there is less code modify structures). This is a forward port of the fix done for 1.4. Conflicts favor existing code where it does not directly effect the fix. Conflicts: Makefile rebar.config src/riak_repl2_fssource.erl src/riak_repl2_rtq_proxy.erl src/riak_repl_aae_source.erl test/riak_core_cluster_mgr_tests.erl --- include/riak_repl.hrl | 3 + src/riak_repl2_fscoordinator.erl | 393 +++++++++++++++++++------------ src/riak_repl2_fssource.erl | 84 ++++++- src/riak_repl_aae_source.erl | 9 +- src/riak_repl_wm_stats.erl | 1 - 5 files changed, 319 insertions(+), 171 deletions(-) diff --git a/include/riak_repl.hrl b/include/riak_repl.hrl index 2dd83a58..fae6a6e7 100644 --- a/include/riak_repl.hrl +++ b/include/riak_repl.hrl @@ -28,6 +28,9 @@ -define(DEFAULT_SOURCE_RETRIES, infinity). %% How many times we should retry when failing a reservation -define(DEFAULT_RESERVE_RETRIES, 0). +%% How many times during a fullsync we should retry a partion that has sent +%% a 'soft_exit' message to the coordinator +-define(DEFAULT_SOURCE_SOFT_RETRIES, infinity). %% 20 seconds. sources should claim within 5 seconds, but give them a little more time -define(RESERVATION_TIMEOUT, (20 * 1000)). -define(DEFAULT_MAX_FS_BUSIES_TOLERATED, 10). diff --git a/src/riak_repl2_fscoordinator.erl b/src/riak_repl2_fscoordinator.erl index a2b60f75..acf4fddb 100644 --- a/src/riak_repl2_fscoordinator.erl +++ b/src/riak_repl2_fscoordinator.erl @@ -61,12 +61,16 @@ partition_queue = queue:new(), retries = dict:new(), reserve_retries = dict:new(), + soft_retries = dict:new(), whereis_waiting = [], busy_nodes = sets:new(), running_sources = [], + purgatory = queue:new(), + dropped = [], successful_exits = 0, error_exits = 0, retry_exits = 0, + soft_retry_exits = 0, pending_fullsync = false, dirty_nodes = ordsets:new(), % these nodes should run fullsync dirty_nodes_during_fs = ordsets:new(), % these nodes reported realtime errors @@ -77,6 +81,13 @@ stat_cache = #stat_cache{} }). +-record(partition_info, { + index, + node, + running_source, + whereis_tref +}). + %% ------------------------------------------------------------------ %% API Function Exports %% ------------------------------------------------------------------ @@ -253,10 +264,12 @@ handle_call(status, _From, State = #state{socket=Socket}) -> {cluster, State#state.other_cluster}, {queued, queue:len(State#state.partition_queue)}, {in_progress, length(State#state.running_sources)}, + {waiting_for_retry, queue:len(State#state.purgatory)}, {starting, length(State#state.whereis_waiting)}, {successful_exits, State#state.successful_exits}, {error_exits, State#state.error_exits}, {retry_exits, State#state.retry_exits}, + {soft_retry_exits, State#state.soft_retry_exits}, {busy_nodes, sets:size(State#state.busy_nodes)}, {last_running_refresh, StatFreshness}, {running_stats, SourceStats}, @@ -321,7 +334,7 @@ handle_cast({connected, Socket, Transport, _Endpoint, _Proto}, State) -> handle_cast({connect_failed, _From, Why}, State) -> lager:warning("Fullsync remote connection to ~p failed due to ~p, retrying", [State#state.other_cluster, Why]), - {stop, normal, State}; + {stop, connect_failed, State}; handle_cast(start_fullsync, #state{socket=undefined} = State) -> %% not connected yet... @@ -345,6 +358,9 @@ handle_cast(start_fullsync, State) -> partition_queue = queue:from_list(Partitions), retries = dict:new(), reserve_retries = dict:new(), + purgatory = queue:new(), + soft_retries = dict:new(), + dropped = [], successful_exits = 0, error_exits = 0, retry_exits = 0, @@ -356,18 +372,20 @@ handle_cast(start_fullsync, State) -> handle_cast(stop_fullsync, State) -> % exit all running, cancel all timers, and reset the state. - _ = [erlang:cancel_timer(Tref) || {_, {_, Tref}} <- State#state.whereis_waiting], - _ = [begin + [erlang:cancel_timer(Tref) || #partition_info{whereis_tref = Tref} <- State#state.whereis_waiting], + [begin unlink(Pid), riak_repl2_fssource:stop_fullsync(Pid), riak_repl2_fssource_sup:disable(node(Pid), Part) - end || {Pid, {Part, _PartN}} <- State#state.running_sources], + end || #partition_info{index = Part, running_source = Pid} <- State#state.running_sources], State2 = State#state{ largest_n = undefined, owners = [], partition_queue = queue:new(), retries = dict:new(), reserve_retries = dict:new(), + purgatory = queue:new(), + dropped = [], whereis_waiting = [], running_sources = [] }, @@ -380,17 +398,16 @@ handle_cast(_Msg, State) -> %% @hidden handle_info({'EXIT', Pid, Cause}, #state{socket=Socket, transport=Transport}=State) when Cause =:= normal; Cause =:= shutdown -> - lager:debug("Fssource ~p exited normally", [Pid]), - PartitionEntry = lists:keytake(Pid, 1, State#state.running_sources), + lager:debug("fssource ~p exited normally", [Pid]), + PartitionEntry = lists:keytake(Pid, #partition_info.running_source, State#state.running_sources), case PartitionEntry of false -> % late exit or otherwise non-existant {noreply, State}; - {value, {Pid, {Index, _, _}=Partition}, Running} -> - + {value, Partition, Running} -> + #partition_info{index = Index, node = Node} = Partition, % likely a slot on the remote node opened up, so re-enable that % remote node for whereis requests. - {_, _, Node} = Partition, NewBusies = sets:del_element(Node, State#state.busy_nodes), % ensure we unreserve the partition on the remote node @@ -406,68 +423,22 @@ handle_info({'EXIT', Pid, Cause}, maybe_complete_fullsync(Running, State2) end; -handle_info({'EXIT', Pid, Cause}, - #state{socket=Socket, transport=Transport}=State) -> - lager:warning("Fssource ~p exited abnormally: ~p", [Pid, Cause]), - PartitionEntry = lists:keytake(Pid, 1, State#state.running_sources), - case PartitionEntry of - false -> - % late exit - {noreply, State}; - {value, {Pid, {Index, _, _}=Partition}, Running} -> +handle_info({soft_exit, Pid, Cause}, State) -> + lager:info("fssource ~p soft exit with reason ~p", [Pid, Cause]), + handle_abnormal_exit(soft_exit, Pid, Cause, State); - % even a bad exit opens a slot on the remote node - {_, _, Node} = Partition, - NewBusies = sets:del_element(Node, State#state.busy_nodes), - - % ensure we unreserve the partition on the remote node - % instead of waiting for a timeout. - Transport:send(Socket, term_to_binary({unreserve, Index})), - - % stats - #state{partition_queue = PQueue, retries = Retries0} = State, - - RetryLimit = app_helper:get_env(riak_repl, max_fssource_retries, - ?DEFAULT_SOURCE_RETRIES), - Retries = dict:update_counter(Partition, 1, Retries0), - - case dict:fetch(Partition, Retries) of - N when N > RetryLimit, is_integer(RetryLimit) -> - lager:warning("Fullsync dropping partition: ~p, ~p" - " failed retries", - [Partition, RetryLimit]), - ErrorExits = State#state.error_exits + 1, - State2 = State#state{busy_nodes = NewBusies, - retries = Retries, - running_sources = Running, - error_exits = ErrorExits}, - maybe_complete_fullsync(Running, State2); - _ -> %% have not run out of retries yet - % reset for retry later - lager:debug("Fssource rescheduling partition: ~p", - [Partition]), - PQueue2 = queue:in(Partition, PQueue), - RetryExits = State#state.retry_exits + 1, - State2 = State#state{partition_queue = PQueue2, - retries = Retries, - busy_nodes = NewBusies, - running_sources = Running, - retry_exits = RetryExits}, - State3 = start_up_reqs(State2), - {noreply, State3} - end - end; +handle_info({'EXIT', Pid, Cause}, State) -> + lager:info("fssource ~p exited abnormally: ~p", [Pid, Cause]), + handle_abnormal_exit('EXIT', Pid, Cause, State); handle_info({Partition, whereis_timeout}, State) -> #state{whereis_waiting = Waiting} = State, - case proplists:get_value(Partition, Waiting) of - undefined -> + case lists:keytake(Partition, #partition_info.index, Waiting) of + false -> % late timeout. {noreply, State}; - {N, NodeData, _Tref} -> - Waiting2 = proplists:delete(Partition, Waiting), - Partition1 = {Partition, N, NodeData}, - Q = queue:in(Partition1, State#state.partition_queue), + {value, PartitionInfo, Waiting2} -> + Q = queue:in(PartitionInfo#partition_info{whereis_tref = undefined}, State#state.partition_queue), State2 = State#state{whereis_waiting = Waiting2, partition_queue = Q}, State3 = start_up_reqs(State2), {noreply, State3} @@ -523,6 +494,80 @@ handle_info({'DOWN', Mon, process, Pid, Why}, #state{stat_cache = #stat_cache{wo handle_info(_Info, State) -> {noreply, State}. +handle_abnormal_exit(ExitType, Pid, Cause, State) -> + PartitionEntry = lists:keytake(Pid, #partition_info.running_source, State#state.running_sources), + handle_abnormal_exit(ExitType, Pid, Cause, PartitionEntry, State). + +handle_abnormal_exit(_ExtiType, _Pid, _Cause, false, State) -> + % late exit + {noreply, State}; + +handle_abnormal_exit(ExitType, Pid, _Cause, {value, PartitionWithSource, Running}, State) -> + + Partition = PartitionWithSource#partition_info{running_source = undefined}, + + #partition_info{index = Index, node = Node} = Partition, + #state{socket = Socket, transport = Transport} = State, + % even a bad exit opens a slot on the remote node + NewBusies = sets:del_element(Node, State#state.busy_nodes), + + % ensure we unreserve the partition on the remote node + % instead of waiting for a timeout. + Transport:send(Socket, term_to_binary({unreserve, Index})), + + % stats + #state{partition_queue = PQueue} = State, + + State2 = State#state{busy_nodes = NewBusies, running_sources = Running}, + {ErrorCount, State3} = increment_error_dict(Partition, ExitType, State2), + + case ExitType of + soft_exit -> + lager:debug("putting partition ~p in purgatory due to soft exit of ~p", [Index, Pid]), + _ = flush_exit_message(Pid), + State4 = start_up_reqs(State3), + SoftRetryLimit = app_helper:get_env(riak_repl, max_fssource_soft_retries, ?DEFAULT_SOURCE_SOFT_RETRIES), + SoftRetryCount = State4#state.soft_retry_exits + 1, + if + SoftRetryLimit =:= infinity -> + Purgatory = queue:in(Partition, State4#state.purgatory), + {noreply, State4#state{purgatory = Purgatory, soft_retry_exits = SoftRetryCount}}; + + SoftRetryLimit < ErrorCount -> + lager:info("Discaring partition ~p since it has reached the soft exit retry limit of ~p", [Partition#partition_info.index, SoftRetryLimit]), + ErrorExits1 = State4#state.error_exits + 1, + Dropped = [Partition#partition_info.index | State4#state.dropped], + {noreply, State4#state{error_exits = ErrorExits1, dropped = Dropped}}; + true -> + Purgatory = queue:in(Partition, State4#state.purgatory), + {noreply, State4#state{purgatory = Purgatory, soft_retry_exits = SoftRetryCount}} + end; + + 'EXIT' -> + lager:debug("Incrementing retries for partition ~p due to error exit of ~p", [Index, Pid]), + RetryLimit = app_helper:get_env(riak_repl, max_fssource_retries, + ?DEFAULT_SOURCE_RETRIES), + + if + ErrorCount > RetryLimit -> + lager:warning("fssource dropping partition: ~p, ~p failed" + "retries", [Partition, RetryLimit]), + ErrorExits = State#state.error_exits + 1, + State4 = State3#state{ error_exits = ErrorExits}, + Dropped = [Partition#partition_info.index | State4#state.dropped], + maybe_complete_fullsync(Running, State4#state{dropped = Dropped}); + true -> %% have not run out of retries yet + % reset for retry later + lager:info("fssource rescheduling partition: ~p", + [Partition]), + PQueue2 = queue:in(Partition, PQueue), + RetryExits = State3#state.retry_exits + 1, + State4 = State3#state{partition_queue = PQueue2, + retry_exits = RetryExits}, + State5 = start_up_reqs(State4), + {noreply, State5} + end + end. %% @hidden terminate(_Reason, _State) -> @@ -541,109 +586,91 @@ code_change(_OldVsn, State, _Extra) -> % we stash on our side what nodes gave a busy reply so we don't send too many % pointless whereis requests. handle_socket_msg({location, Partition, {Node, Ip, Port}}, #state{whereis_waiting = Waiting} = State) -> - case proplists:get_value(Partition, Waiting) of - undefined -> + case lists:keytake(Partition, #partition_info.index, Waiting) of + false -> State; - {N, _OldNode, Tref} -> - _ = erlang:cancel_timer(Tref), - Waiting2 = proplists:delete(Partition, Waiting), + {value, PartitionInfo, Waiting2} -> + Tref = PartitionInfo#partition_info.whereis_tref, + erlang:cancel_timer(Tref), % we don't know for sure it's no longer busy until we get a busy reply NewBusies = sets:del_element(Node, State#state.busy_nodes), State2 = State#state{whereis_waiting = Waiting2, busy_nodes = NewBusies}, - Partition2 = {Partition, N, Node}, + Partition2 = PartitionInfo#partition_info{node = Node, whereis_tref = undefined}, State3 = start_fssource(Partition2, Ip, Port, State2), start_up_reqs(State3) end; handle_socket_msg({location_busy, Partition}, #state{whereis_waiting = Waiting} = State) -> - lager:debug("Location_busy, partition = ~p", [Partition]), - case proplists:get_value(Partition, Waiting) of - undefined -> + lager:debug("anya location_busy, partition = ~p", [Partition]), + case lists:keytake(Partition, #partition_info.index, Waiting) of + false -> State; - {N, OldNode, Tref} -> - lager:info("Partition ~p is too busy on cluster ~p at node ~p", - [Partition, State#state.other_cluster, OldNode]), - _ = erlang:cancel_timer(Tref), - Waiting2 = proplists:delete(Partition, Waiting), + {value, PartitionInfo, Waiting2} -> + lager:info("anya Partition ~p is too busy on cluster ~p at node ~p", + [Partition, State#state.other_cluster, PartitionInfo#partition_info.node]), + Tref = PartitionInfo#partition_info.whereis_tref, + erlang:cancel_timer(Tref), State2 = State#state{whereis_waiting = Waiting2}, - Partition2 = {Partition, N, OldNode}, + Partition2 = PartitionInfo#partition_info{whereis_tref = undefined}, PQueue = State2#state.partition_queue, PQueue2 = queue:in(Partition2, PQueue), - NewBusies = sets:add_element(OldNode, State#state.busy_nodes), + NewBusies = sets:add_element(Partition2#partition_info.node, State#state.busy_nodes), State3 = State2#state{partition_queue = PQueue2, busy_nodes = NewBusies}, start_up_reqs(State3) end; handle_socket_msg({location_busy, Partition, Node}, #state{whereis_waiting = Waiting} = State) -> - case proplists:get_value(Partition, Waiting) of - undefined -> + case lists:keytake(Partition, #partition_info.index, Waiting) of + false -> State; - {N, _OldNode, Tref} -> + {value, PartitionInfo, Waiting2} -> lager:info("Partition ~p is too busy on cluster ~p at node ~p", [Partition, State#state.other_cluster, Node]), - _ = erlang:cancel_timer(Tref), + Tref = PartitionInfo#partition_info.whereis_tref, + erlang:cancel_timer(Tref), - Waiting2 = proplists:delete(Partition, Waiting), State2 = State#state{whereis_waiting = Waiting2}, - Partition2 = {Partition, N, Node}, + Partition2 = PartitionInfo#partition_info{node = Node}, PQueue = State2#state.partition_queue, PQueue2 = queue:in(Partition2, PQueue), NewBusies = sets:add_element(Node, State#state.busy_nodes), State3 = State2#state{partition_queue = PQueue2, busy_nodes = NewBusies}, start_up_reqs(State3) end; -handle_socket_msg({location_down, Partition}, - #state{whereis_waiting=Waiting0} = State) -> - case proplists:get_value(Partition, Waiting0) of - undefined -> + +handle_socket_msg({location_down, Partition}, #state{whereis_waiting=Waiting} = State) -> + lager:warning("anya location_down, partition = ~p", [Partition]), + case lists:keytake(Partition, #partition_info.index, Waiting) of + false -> State; - {N, OldNode, Tref} -> - handle_location_down({Partition, N, OldNode, Tref}, State) + {value, PartitionInfo, Waiting2} -> + lager:info("Partition ~p is unavailable on cluster ~p", + [Partition, State#state.other_cluster]), + Tref = PartitionInfo#partition_info.whereis_tref, + erlang:cancel_timer(Tref), + Dropped = [Partition | State#state.dropped], + State2 = State#state{whereis_waiting = Waiting2, dropped = Dropped}, + start_up_reqs(State2) end; -handle_socket_msg({location_down, Partition, Node}, - #state{whereis_waiting=Waiting0} = State) -> - case proplists:get_value(Partition, Waiting0) of - undefined -> +handle_socket_msg({location_down, Partition, _Node}, #state{whereis_waiting=Waiting} = State) -> + case lists:keytake(Partition, #partition_info.index, Waiting) of + false -> State; - {N, _OldNode, Tref} -> - handle_location_down({Partition, N, Node, Tref}, State) - end. - -handle_location_down({Partition, N, Node, Tref}, - #state{reserve_retries=Retries0, - partition_queue=PQueue0, - whereis_waiting=Waiting0} = State) -> - lager:info("Partition ~p is unavailable on cluster ~p", - [Partition, State#state.other_cluster]), - - RetryLimit = app_helper:get_env(riak_repl, - max_reserve_retries, - ?DEFAULT_RESERVE_RETRIES), - - Retries = dict:update_counter(Partition, 1, Retries0), - - _ = erlang:cancel_timer(Tref), - - case dict:fetch(Partition, Retries) of - X when X > RetryLimit, is_integer(RetryLimit) -> - lager:warning("Fullsync dropping partition: ~p, ~p location_down failed retries", - [Partition, RetryLimit]), - Waiting = proplists:delete(Partition, Waiting0), - ErrorExits = State#state.error_exits + 1, - State2 = State#state{whereis_waiting = Waiting, - error_exits = ErrorExits, - reserve_retries = Retries}, - start_up_reqs(State2); - _ -> - lager:warning("Fssource rescheduling partition after location_down: ~p ~p < ~p", - [Partition, N, RetryLimit]), - Waiting = proplists:delete(Partition, Waiting0), - Partition2 = {Partition, N, Node}, - PQueue = queue:in(Partition2, PQueue0), - RetryExits = State#state.retry_exits + 1, - State2 = State#state{whereis_waiting=Waiting, - partition_queue = PQueue, - reserve_retries = Retries, - retry_exits = RetryExits}, - start_up_reqs(State2) + {value, PartitionInfo, Waiting2} -> + Tref = PartitionInfo#partition_info.whereis_tref, + erlang:cancel_timer(Tref), + RetryLimit = app_helper:get_env(riak_repl, max_reserve_retries, ?DEFAULT_RESERVE_RETRIES), + lager:info("Partition ~p is unavailable on cluster ~p", [Partition, State#state.other_cluster]), + State2 = State#state{whereis_waiting = Waiting2}, + {RetriedCount, State3} = increment_error_dict(PartitionInfo, State#state.reserve_retries, State2), + State4 = case RetriedCount of + N when N > RetryLimit, is_integer(N) -> + lager:warning("Fullsync dropping partition ~p, ~p location_down failed retries", [PartitionInfo#partition_info.index, RetryLimit]), + Dropped = [Partition | State#state.dropped], + State3#state{dropped = Dropped}; + _ -> + PQueue = queue:in(PartitionInfo, State3#state.partition_queue), + State3#state{partition_queue = PQueue} + end, + start_up_reqs(State4) end. % try our best to reach maximum capacity by sending as many whereis requests @@ -654,7 +681,17 @@ start_up_reqs(State) -> Running = length(State#state.running_sources), Waiting = length(State#state.whereis_waiting), StartupCount = Max - Running - Waiting, - start_up_reqs(State, StartupCount). + State2 = maybe_pop_from_purgatory(State), + start_up_reqs(State2, StartupCount). + +maybe_pop_from_purgatory(State) -> + case queue:out(State#state.purgatory) of + {empty, _} -> + State; + {{value, Partition}, NewPurgatory} -> + PartitionQ2 = queue:in(Partition, State#state.partition_queue), + State#state{purgatory = NewPurgatory, partition_queue = PartitionQ2} + end. start_up_reqs(State, N) when N < 1 -> State; @@ -693,14 +730,16 @@ send_next_whereis_req(State) -> % one of those to finish and try again {defer, State#state{partition_queue = Queue}}; - {Pval, N, RemoteNode} = P -> + P when is_record(P, partition_info) -> + #partition_info{index = Pval} = P, #state{transport = Transport, socket = Socket, whereis_waiting = Waiting} = State, Tref = erlang:send_after(?WAITING_TIMEOUT, self(), {Pval, whereis_timeout}), - Waiting2 = [{Pval, {N, RemoteNode, Tref}} | Waiting], + PartitionInfo2 = P#partition_info{whereis_tref = Tref}, + Waiting2 = [PartitionInfo2 | Waiting], {ok, {PeerIP, PeerPort}} = Transport:peername(Socket), lager:debug("Sending whereis request for partition ~p", [P]), Transport:send(Socket, - term_to_binary({whereis, element(1, P), PeerIP, PeerPort})), + term_to_binary({whereis, Pval, PeerIP, PeerPort})), {ok, State#state{partition_queue = Queue, whereis_waiting = Waiting2}} end @@ -740,13 +779,14 @@ below_max_sources(State) -> Max = app_helper:get_env(riak_repl, max_fssource_cluster, ?DEFAULT_SOURCE_PER_CLUSTER), ( length(State#state.running_sources) + length(State#state.whereis_waiting) ) < Max. -node_available({Partition, _, _}, Owners, Waiting) -> +node_available(PartitionInfo, Owners, Waiting) -> + Partition = PartitionInfo#partition_info.index, LocalNode = proplists:get_value(Partition, Owners), Max = app_helper:get_env(riak_repl, max_fssource_node, ?DEFAULT_SOURCE_PER_NODE), try riak_repl2_fssource_sup:enabled(LocalNode) of RunningList -> PartsSameNode = [Part || {Part, PNode} <- Owners, PNode =:= LocalNode], - PartsWaiting = [Part || {Part, _} <- Waiting, lists:member(Part, PartsSameNode)], + PartsWaiting = [Part || #partition_info{index = Part} <- Waiting, lists:member(Part, PartsSameNode)], if ( length(PartsWaiting) + length(RunningList) ) < Max -> case proplists:get_value(Partition, RunningList) of @@ -765,12 +805,13 @@ node_available({Partition, _, _}, Owners, Waiting) -> skip end. -remote_node_available({_Partition, _, undefined}, _Busies) -> +remote_node_available(Partition, _Busies) when Partition#partition_info.node =:= undefined -> true; -remote_node_available({_Partition, _, RemoteNode}, Busies) -> - not sets:is_element(RemoteNode, Busies). +remote_node_available(Partition, Busies) -> + not sets:is_element(Partition#partition_info.node, Busies). -start_fssource(Partition2={Partition,_,_} = PartitionVal, Ip, Port, State) -> +start_fssource(PartitionVal, Ip, Port, State) -> + Partition = PartitionVal#partition_info.index, #state{owners = Owners} = State, LocalNode = proplists:get_value(Partition, Owners), lager:info("Starting fssource for ~p on ~p to ~p", [Partition, LocalNode, @@ -778,7 +819,8 @@ start_fssource(Partition2={Partition,_,_} = PartitionVal, Ip, Port, State) -> case riak_repl2_fssource_sup:enable(LocalNode, Partition, {Ip, Port}) of {ok, Pid} -> link(Pid), - Running = orddict:store(Pid, PartitionVal, State#state.running_sources), + _ = riak_repl2_fssource:soft_link(Pid), + Running = lists:keystore(Pid, #partition_info.running_source, State#state.running_sources, PartitionVal#partition_info{running_source = Pid}), State#state{running_sources = Running}; {error, Reason} -> case Reason of @@ -796,7 +838,7 @@ start_fssource(Partition2={Partition,_,_} = PartitionVal, Ip, Port, State) -> end, #state{transport = Transport, socket = Socket} = State, Transport:send(Socket, term_to_binary({unreserve, Partition})), - PQueue = queue:in(Partition2, State#state.partition_queue), + PQueue = queue:in(PartitionVal, State#state.partition_queue), State#state{partition_queue=PQueue} end. @@ -822,7 +864,7 @@ sort_partitions(Ring) -> sort_partitions(OffsetPartitions, BigN, []). sort_partitions([], _, Acc) -> - [{P,N,undefined} || {P,N} <- lists:reverse(Acc)]; + [#partition_info{index = P} || {P,_N} <- lists:reverse(Acc)]; sort_partitions(In, N, Acc) -> Split = min(length(In), N) - 1, {A, [P|B]} = lists:split(Split, In), @@ -880,7 +922,8 @@ gather_source_stats(PDict) -> gather_source_stats([], Acc) -> lists:reverse(Acc); -gather_source_stats([{Pid, _} | Tail], Acc) -> +gather_source_stats([PartitionInfo | Tail], Acc) -> + Pid = PartitionInfo#partition_info.running_source, try riak_repl2_fssource:legacy_status(Pid, infinity) of Stats -> gather_source_stats(Tail, [{riak_repl_util:safe_pid_to_list(Pid), Stats} | Acc]) @@ -892,10 +935,11 @@ gather_source_stats([{Pid, _} | Tail], Acc) -> is_fullsync_in_progress(State) -> QEmpty = queue:is_empty(State#state.partition_queue), + PurgatoryEmpty = queue:is_empty(State#state.purgatory), Waiting = State#state.whereis_waiting, Running = State#state.running_sources, - case {QEmpty, Waiting, Running} of - {true, [], []} -> + case {QEmpty, PurgatoryEmpty, Waiting, Running} of + {true, true, [], []} -> false; _ -> true @@ -904,9 +948,10 @@ is_fullsync_in_progress(State) -> maybe_complete_fullsync(Running, State) -> EmptyRunning = Running == [], QEmpty = queue:is_empty(State#state.partition_queue), + PurgatoryEmpty = queue:is_empty(State#state.purgatory), Waiting = State#state.whereis_waiting, - case {EmptyRunning, QEmpty, Waiting} of - {true, true, []} -> + case {EmptyRunning, QEmpty, PurgatoryEmpty, Waiting} of + {true, true, true, []} -> MyClusterName = riak_core_connection:symbolic_clustername(), lager:info("Fullsync complete from ~s to ~s", [MyClusterName, State#state.other_cluster]), @@ -971,3 +1016,39 @@ notify_rt_dirty_nodes(State = #state{dirty_nodes = DirtyNodes, nodeset_to_string_list(Set) -> string:join([erlang:atom_to_list(V) || V <- ordsets:to_list(Set)],","). + +increment_error_dict(PartitionInfo, ExitType, State) when is_record(PartitionInfo, partition_info) -> + increment_error_dict(PartitionInfo#partition_info.index, ExitType, State); + +increment_error_dict(PartitionIndex, soft_exit, State) -> + increment_error_dict(PartitionIndex, #state.soft_retries, State); + +increment_error_dict(PartitionIndex, 'EXIT', State) -> + increment_error_dict(PartitionIndex, #state.retries, State); + +increment_error_dict(PartitionIndex, ElementN, State) when is_integer(ElementN) -> + Dict = element(ElementN, State), + Dict2 = dict:update_counter(PartitionIndex, 1, Dict), + State2 = setelement(ElementN, State, Dict2), + {dict:fetch(PartitionIndex, Dict2), State2}. + +% If we are linked to a remote pid, it is possible for the disterl to +% reconnect at a bad time and lose the exit message, thus we cannot +% rely solely on the exit message to flush it. What we can do is monitor +% the process. 'DOWN' messages always come in after the exit message, so +% if we get the 'DOWN' message first, we know the exit message is never +% going to arrive, and is effectively flushed. If we get the exit first, +% we can flush the down message next, since that will arrive as a noproc +% in any case. +flush_exit_message(Pid) -> + Mon = erlang:monitor(process, Pid), + receive + {'EXIT', Pid, _} -> + receive + {'DOWN', Mon, process, Pid, _} -> + ok + end; + {'DOWN', Mon, process, Pid, _} -> + ok + end. + diff --git a/src/riak_repl2_fssource.erl b/src/riak_repl2_fssource.erl index 013cf1ee..9e80c42b 100644 --- a/src/riak_repl2_fssource.erl +++ b/src/riak_repl2_fssource.erl @@ -3,8 +3,9 @@ -behaviour(gen_server). %% API --export([start_link/2, connected/6, connect_failed/3, start_fullsync/1, - stop_fullsync/1, cluster_name/1, legacy_status/2, fullsync_complete/1]). +-export([start_link/2, start_link/3, connected/6, connect_failed/3, + start_fullsync/1, stop_fullsync/1, fullsync_complete/1, + cluster_name/1, legacy_status/2, soft_link/1]). %% gen_server callbacks -export([init/1, handle_call/3, handle_cast/2, handle_info/2, @@ -19,11 +20,15 @@ connection_ref, fullsync_worker, work_dir, - strategy + strategy, + owner }). start_link(Partition, IP) -> - gen_server:start_link(?MODULE, [Partition, IP], []). + start_link(Partition, IP, undefined). + +start_link(Partition, IP, Owner) -> + gen_server:start_link(?MODULE, [Partition, IP, Owner], []). %% connection manager callbacks connected(Socket, Transport, Endpoint, Proto, Pid, Props) -> @@ -51,9 +56,33 @@ cluster_name(Pid) -> legacy_status(Pid, Timeout) -> gen_server:call(Pid, legacy_status, Timeout). +%% @doc Create a 'soft link' between the calling process and the fssource. +%% A soft-link allows for a soft_exit message to be sent before a normal +%% exit to any process that has created a soft link. Only one link is +%% held at a time, and alink is in only one direction (the fssource +%% reports to calling process). +soft_link(Pid) -> + % not using default long timeout because this is primarily used by the + % fscoordinator, and we don't want to potentially block that for up to + % 2 minutes. 15 seconds is bad enough in a worst case scenario. + try gen_server:call(Pid, {soft_link, self()}, timer:seconds(15)) of + ok -> % older versions returned 'ok' for the catchall + false; + true -> + true + catch + _What:Reason -> + lager:debug("Could not create soft link to ~p from ~p due to ~p", [Pid, self(), Reason]), + {error, Reason} + end. + %% gen server init([Partition, IP]) -> + init([Partition, IP, undefined]); + +init([Partition, IP, Owner]) -> + RequestedStrategy = app_helper:get_env(riak_repl, fullsync_strategy, ?DEFAULT_FULLSYNC_STRATEGY), @@ -77,7 +106,9 @@ init([Partition, IP]) -> end; {error, Reason} -> %% the vnode is probably busy. Try again later. - {stop, Reason} + {stop, Reason}; + {ok, State}-> + {ok, State#state{owner = Owner}} end. handle_call({connected, Socket, Transport, _Endpoint, Proto, Props}, @@ -149,9 +180,22 @@ handle_call(stop_fullsync, _From, State=#state{fullsync_worker=FSW, handle_call(legacy_status, _From, State=#state{fullsync_worker=FSW, socket=Socket, strategy=Strategy}) -> - Res = case is_pid(FSW) andalso is_process_alive(FSW) of - true -> gen_fsm:sync_send_all_state_event(FSW, status, infinity); - false -> [] + lager:debug("Sending status to ~p", [FSW]), + Res = case is_pid(FSW) of + true -> + % try/catch because there may be a message in the pid's + % mailbox that will cause it to exit before it gets to our + % status request message. + try gen_fsm:sync_send_all_state_event(FSW, status, infinity) of + SyncSendRes -> + SyncSendRes + catch + What:Why -> + lager:notice("Error getting fullsync worker ~p status: ~p:~p", [FSW, What, Why]), + [] + end; + false -> + [] end, SocketStats = riak_core_tcp_mon:format_socket_stats( riak_core_tcp_mon:socket_status(Socket), []), @@ -172,6 +216,10 @@ handle_call(cluster_name, _From, State) -> ClusterName end, {reply, Name, State}; +handle_call({soft_link, NewOwner}, _From, State) -> + lager:debug("Changing soft_link from ~p to ~p", [State#state.owner, NewOwner]), + State2 = State#state{owner = NewOwner}, + {reply, true, State2}; handle_call(_Msg, _From, State) -> {reply, ok, State}. @@ -221,16 +269,20 @@ handle_info({Proto, Socket, Data}, gen_fsm:send_event(State#state.fullsync_worker, Msg), {noreply, State} end; +handle_info({soft_exit, Pid, Reason}, State = #state{fullsync_worker = Pid}) -> + lager:debug("Fullsync worker exited normally, but really wanted it to be ~p", [Reason]), + maybe_soft_exit(Reason, State); handle_info(_Msg, State) -> {noreply, State}. terminate(_Reason, #state{fullsync_worker=FSW, work_dir=WorkDir}) -> - %% check if process alive only if it's defined - case is_pid(FSW) andalso is_process_alive(FSW) of + %% try to exit the fullsync worker; if we're dying because it did, + %% don't worry about the error (cause it's already dead). + case is_pid(FSW) of false -> ok; true -> - gen_fsm:sync_send_all_state_event(FSW, stop) + catch gen_fsm:sync_send_all_state_event(FSW, stop) end, case WorkDir of undefined -> ok; @@ -306,3 +358,13 @@ connect(IP, Strategy, Partition) -> lager:warning("Error connecting to remote ~p for partition ~p", [IP, Partition]), {error, Reason} end. + +maybe_soft_exit(Reason, State) -> + case State#state.owner of + undefined -> + {stop, Reason, State}; + Owner -> + Owner ! {soft_exit, self(), Reason}, + {stop, normal, State} + end. + diff --git a/src/riak_repl_aae_source.erl b/src/riak_repl_aae_source.erl index cae1fa7e..125b7381 100644 --- a/src/riak_repl_aae_source.erl +++ b/src/riak_repl_aae_source.erl @@ -120,7 +120,8 @@ handle_info({'DOWN', TreeMref, process, Pid, Why}, _StateName, State=#state{tree %% Local hashtree process went down. Stop exchange. lager:info("Monitored pid ~p, AAE Hashtree process went down because: ~p", [Pid, Why]), send_complete(State), - {stop, {aae_hashtree_went_down, Why}, State}; + State#state.owner ! {soft_exit, self(), {aae_hastree_went_down, Why}}, + {stop, normal, State}; handle_info(Error={'DOWN', _, _, _, _}, _StateName, State) -> %% Something else exited. Stop exchange. lager:info("Something went down ~p", [Error]), @@ -195,7 +196,8 @@ prepare_exchange(start_exchange, State0=#state{transport=Transport, Error -> lager:info("AAE source failed get_lock for partition ~p, got ~p", [Partition, Error]), - {stop, Error, State} + State#state.owner ! {soft_exit, self(), Error}, + {stop, normal, State} end; {error, wrong_node} -> {stop, wrong_node, State0} @@ -209,7 +211,8 @@ prepare_exchange(start_exchange, State=#state{index=Partition}) -> lager:info("Remote lock tree for partition ~p failed, got ~p", [Partition, Error]), send_complete(State), - {stop, {remote, Error}, State} + State#state.owner ! {soft_exit, self(), {remote, Error}}, + {stop, normal, State} end. %% @doc Now that locks have been acquired, ask both the local and remote diff --git a/src/riak_repl_wm_stats.erl b/src/riak_repl_wm_stats.erl index 4302080e..02e9a43c 100644 --- a/src/riak_repl_wm_stats.erl +++ b/src/riak_repl_wm_stats.erl @@ -125,7 +125,6 @@ format_pid_stat(Pair) -> jsonify_stats([], Acc) -> - %?debugFmt("Got []: Acc: ~w", [Acc]), lists:flatten(lists:reverse(Acc)); jsonify_stats([{fullsync, Num, _Left}|T], Acc) -> From a1166e1bff09ca3f3f89383632cba450c1815f34 Mon Sep 17 00:00:00 2001 From: Micah Warren Date: Fri, 21 Nov 2014 14:56:24 -0600 Subject: [PATCH 11/47] Fixed invalid call to update error count track. Increment_error_dict expects the partition, elementN of error dict, and the state. It pulls the dict out of the state so it put it back in place, thus just returning the state. So this call that passed the dict in was wrong. --- src/riak_repl2_fscoordinator.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/riak_repl2_fscoordinator.erl b/src/riak_repl2_fscoordinator.erl index acf4fddb..bffc68dc 100644 --- a/src/riak_repl2_fscoordinator.erl +++ b/src/riak_repl2_fscoordinator.erl @@ -660,7 +660,7 @@ handle_socket_msg({location_down, Partition, _Node}, #state{whereis_waiting=Wait RetryLimit = app_helper:get_env(riak_repl, max_reserve_retries, ?DEFAULT_RESERVE_RETRIES), lager:info("Partition ~p is unavailable on cluster ~p", [Partition, State#state.other_cluster]), State2 = State#state{whereis_waiting = Waiting2}, - {RetriedCount, State3} = increment_error_dict(PartitionInfo, State#state.reserve_retries, State2), + {RetriedCount, State3} = increment_error_dict(PartitionInfo, #state.reserve_retries, State2), State4 = case RetriedCount of N when N > RetryLimit, is_integer(N) -> lager:warning("Fullsync dropping partition ~p, ~p location_down failed retries", [PartitionInfo#partition_info.index, RetryLimit]), From c36c3ca876cabac73826fe29f6e94b5acdf55f8e Mon Sep 17 00:00:00 2001 From: "Engel A. Sanchez" Date: Wed, 26 Nov 2014 14:15:08 -0500 Subject: [PATCH 12/47] Fix error/retry exit counts on location down msgs When a partition is not available, perhaps after a number of retries, the error exits stat should be incremented. Also, the retry exits stat should be incremented on each retry. This was discovered when backporting the repl_location_failures riak_test. --- src/riak_repl2_fscoordinator.erl | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/riak_repl2_fscoordinator.erl b/src/riak_repl2_fscoordinator.erl index bffc68dc..ae434920 100644 --- a/src/riak_repl2_fscoordinator.erl +++ b/src/riak_repl2_fscoordinator.erl @@ -647,7 +647,10 @@ handle_socket_msg({location_down, Partition}, #state{whereis_waiting=Waiting} = Tref = PartitionInfo#partition_info.whereis_tref, erlang:cancel_timer(Tref), Dropped = [Partition | State#state.dropped], - State2 = State#state{whereis_waiting = Waiting2, dropped = Dropped}, + #state{retry_exits = RetryExits, error_exits = ErrorExits} = State, + State2 = State#state{whereis_waiting = Waiting2, dropped = Dropped, + retry_exits = RetryExits + 1, + error_exits = ErrorExits + 1}, start_up_reqs(State2) end; handle_socket_msg({location_down, Partition, _Node}, #state{whereis_waiting=Waiting} = State) -> @@ -665,10 +668,16 @@ handle_socket_msg({location_down, Partition, _Node}, #state{whereis_waiting=Wait N when N > RetryLimit, is_integer(N) -> lager:warning("Fullsync dropping partition ~p, ~p location_down failed retries", [PartitionInfo#partition_info.index, RetryLimit]), Dropped = [Partition | State#state.dropped], - State3#state{dropped = Dropped}; + #state{retry_exits = RetryExits, + error_exits = ErrorExits} = State, + State3#state{dropped = Dropped, + error_exits = ErrorExits + 1, + retry_exits = RetryExits + 1}; _ -> PQueue = queue:in(PartitionInfo, State3#state.partition_queue), - State3#state{partition_queue = PQueue} + #state{retry_exits = RetryExits} = State, + State3#state{partition_queue = PQueue, + retry_exits = RetryExits + 1} end, start_up_reqs(State4) end. From 5479089f99605f99dfc6a8665561e072c5fc707a Mon Sep 17 00:00:00 2001 From: "Engel A. Sanchez" Date: Fri, 5 Dec 2014 11:54:57 -0500 Subject: [PATCH 13/47] Fix dialyzer warnings The one in riak_repl2_fssource is a legit bug in the code --- src/riak_repl2_fscoordinator.erl | 24 +++++++++++++----------- src/riak_repl2_fssource.erl | 8 +++----- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/riak_repl2_fscoordinator.erl b/src/riak_repl2_fscoordinator.erl index ae434920..fba95217 100644 --- a/src/riak_repl2_fscoordinator.erl +++ b/src/riak_repl2_fscoordinator.erl @@ -372,12 +372,14 @@ handle_cast(start_fullsync, State) -> handle_cast(stop_fullsync, State) -> % exit all running, cancel all timers, and reset the state. - [erlang:cancel_timer(Tref) || #partition_info{whereis_tref = Tref} <- State#state.whereis_waiting], - [begin - unlink(Pid), - riak_repl2_fssource:stop_fullsync(Pid), - riak_repl2_fssource_sup:disable(node(Pid), Part) - end || #partition_info{index = Part, running_source = Pid} <- State#state.running_sources], + _ = [erlang:cancel_timer(Tref) || #partition_info{whereis_tref = Tref} + <- State#state.whereis_waiting], + _ = [begin + unlink(Pid), + riak_repl2_fssource:stop_fullsync(Pid), + riak_repl2_fssource_sup:disable(node(Pid), Part) + end || #partition_info{index = Part, running_source = Pid} + <- State#state.running_sources], State2 = State#state{ largest_n = undefined, owners = [], @@ -591,7 +593,7 @@ handle_socket_msg({location, Partition, {Node, Ip, Port}}, #state{whereis_waitin State; {value, PartitionInfo, Waiting2} -> Tref = PartitionInfo#partition_info.whereis_tref, - erlang:cancel_timer(Tref), + _ = erlang:cancel_timer(Tref), % we don't know for sure it's no longer busy until we get a busy reply NewBusies = sets:del_element(Node, State#state.busy_nodes), State2 = State#state{whereis_waiting = Waiting2, busy_nodes = NewBusies}, @@ -608,7 +610,7 @@ handle_socket_msg({location_busy, Partition}, #state{whereis_waiting = Waiting} lager:info("anya Partition ~p is too busy on cluster ~p at node ~p", [Partition, State#state.other_cluster, PartitionInfo#partition_info.node]), Tref = PartitionInfo#partition_info.whereis_tref, - erlang:cancel_timer(Tref), + _ = erlang:cancel_timer(Tref), State2 = State#state{whereis_waiting = Waiting2}, Partition2 = PartitionInfo#partition_info{whereis_tref = undefined}, PQueue = State2#state.partition_queue, @@ -624,7 +626,7 @@ handle_socket_msg({location_busy, Partition, Node}, #state{whereis_waiting = Wai {value, PartitionInfo, Waiting2} -> lager:info("Partition ~p is too busy on cluster ~p at node ~p", [Partition, State#state.other_cluster, Node]), Tref = PartitionInfo#partition_info.whereis_tref, - erlang:cancel_timer(Tref), + _ = erlang:cancel_timer(Tref), State2 = State#state{whereis_waiting = Waiting2}, @@ -645,7 +647,7 @@ handle_socket_msg({location_down, Partition}, #state{whereis_waiting=Waiting} = lager:info("Partition ~p is unavailable on cluster ~p", [Partition, State#state.other_cluster]), Tref = PartitionInfo#partition_info.whereis_tref, - erlang:cancel_timer(Tref), + _ = erlang:cancel_timer(Tref), Dropped = [Partition | State#state.dropped], #state{retry_exits = RetryExits, error_exits = ErrorExits} = State, State2 = State#state{whereis_waiting = Waiting2, dropped = Dropped, @@ -659,7 +661,7 @@ handle_socket_msg({location_down, Partition, _Node}, #state{whereis_waiting=Wait State; {value, PartitionInfo, Waiting2} -> Tref = PartitionInfo#partition_info.whereis_tref, - erlang:cancel_timer(Tref), + _ = erlang:cancel_timer(Tref), RetryLimit = app_helper:get_env(riak_repl, max_reserve_retries, ?DEFAULT_RESERVE_RETRIES), lager:info("Partition ~p is unavailable on cluster ~p", [Partition, State#state.other_cluster]), State2 = State#state{whereis_waiting = Waiting2}, diff --git a/src/riak_repl2_fssource.erl b/src/riak_repl2_fssource.erl index 9e80c42b..d193f8af 100644 --- a/src/riak_repl2_fssource.erl +++ b/src/riak_repl2_fssource.erl @@ -101,14 +101,12 @@ init([Partition, IP, Owner]) -> case connect(IP, SupportedStrategy, Partition) of {error, Reason} -> {stop, Reason}; - Result -> - Result + {ok, State}-> + {ok, State#state{owner = Owner}} end; {error, Reason} -> %% the vnode is probably busy. Try again later. - {stop, Reason}; - {ok, State}-> - {ok, State#state{owner = Owner}} + {stop, Reason} end. handle_call({connected, Socket, Transport, _Endpoint, Proto, Props}, From 4bf87a66840420b4c7a47cc347ce5f4dd82218a1 Mon Sep 17 00:00:00 2001 From: Micah Warren Date: Mon, 8 Dec 2014 15:57:43 -0600 Subject: [PATCH 14/47] Added last_fullsync_completed stat tracking. --- src/riak_repl2_fscoordinator.erl | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/riak_repl2_fscoordinator.erl b/src/riak_repl2_fscoordinator.erl index a2b60f75..718e69a8 100644 --- a/src/riak_repl2_fscoordinator.erl +++ b/src/riak_repl2_fscoordinator.erl @@ -74,6 +74,7 @@ fullsyncs_completed = 0, fullsync_start_time = undefined, last_fullsync_duration = undefined, + last_fullsync_completed = undefined, stat_cache = #stat_cache{} }). @@ -249,6 +250,11 @@ handle_call(status, _From, State = #state{socket=Socket}) -> undefined -> undefined; _N -> calendar:gregorian_seconds_to_datetime(State#state.fullsync_start_time) end, + FinishTime = + case State#state.last_fullsync_completed of + undefined -> undefined; + LastFSCompleted -> calendar:gregorian_seconds_to_datetime(LastFSCompleted) + end, SelfStats = [ {cluster, State#state.other_cluster}, {queued, queue:len(State#state.partition_queue)}, @@ -264,6 +270,7 @@ handle_call(status, _From, State = #state{socket=Socket}) -> {fullsyncs_completed, State#state.fullsyncs_completed}, {last_fullsync_started, StartTime}, {last_fullsync_duration, State#state.last_fullsync_duration}, + {last_fullsync_completed, FinishTime}, {fullsync_suggested, nodeset_to_string_list(State#state.dirty_nodes)}, {fullsync_suggested_during_fs, @@ -922,7 +929,8 @@ maybe_complete_fullsync(Running, State) -> {noreply, State2#state{running_sources = Running, fullsyncs_completed = TotalFullsyncs, fullsync_start_time = undefined, - last_fullsync_duration=ElapsedSeconds + last_fullsync_duration=ElapsedSeconds, + last_fullsync_completed = Finish }}; _ -> % there's something waiting for a response. From 38b7b4b1e357e4abae3ed827ad72ea5da559db24 Mon Sep 17 00:00:00 2001 From: Mikael Lixenstrand Date: Tue, 9 Dec 2014 10:43:43 +0000 Subject: [PATCH 15/47] Revert riak_kv deps changes --- rebar.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rebar.config b/rebar.config index 9c9ebdb9..7a076b92 100644 --- a/rebar.config +++ b/rebar.config @@ -10,6 +10,6 @@ {lager, "2.0.3", {git, "git://github.com/basho/lager.git", {tag, "2.0.3"}}}, {ranch, "0.4.0-p1", {git, "git://github.com/basho/ranch.git", {tag, "0.4.0-p1"}}}, {ebloom, ".*", {git, "git://github.com/basho/ebloom.git", {tag, "2.0.0"}}}, - {riak_kv, ".*", {git, "git://github.com/basho/riak_kv.git", {branch, "feature/aae-estimate-keys"}}}, + {riak_kv, ".*", {git, "git://github.com/basho/riak_kv.git", {branch, "2.0"}}}, {riak_repl_pb_api, ".*", {git, "git@github.com:basho/riak_repl_pb_api.git", {branch, "2.0"}}} ]}. From ce0c1b46740cbbf932b4326ba94981682a992f83 Mon Sep 17 00:00:00 2001 From: Mikael Lixenstrand Date: Wed, 10 Dec 2014 16:53:02 +0000 Subject: [PATCH 16/47] Update logging after review --- src/riak_repl_aae_source.erl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/riak_repl_aae_source.erl b/src/riak_repl_aae_source.erl index 9a4fa7cb..c5de2b5e 100644 --- a/src/riak_repl_aae_source.erl +++ b/src/riak_repl_aae_source.erl @@ -242,7 +242,8 @@ update_trees(start_exchange, State=#state{tree_pid=TreePid, {next_state, update_trees, State}; update_trees({not_responsible, Partition, IndexN}, State = #state{owner=Owner}) -> - lager:debug("VNode ~p does not cover preflist ~p", [Partition, IndexN]), + lager:debug("Skipping AAE fullsync tree update for vnode ~p because" + " it is not responsible for the preflist ~p", [Partition, IndexN]), gen_server:cast(Owner, not_responsible), {stop, normal, State}; update_trees({tree_built, _, _}, State = #state{indexns=IndexNs}) -> @@ -449,7 +450,7 @@ send_diffs(diff_done, State) -> %%%=================================================================== finish_sending_differences(#exchange{bloom=undefined, count=DiffCnt}, #state{index=Partition, estimated_nr_keys=EstimatedNrKeys}) -> - lager:info("No Bloom folding over ~p/~p differences for partition ~p with EstimatedNrKeys ~p", + lager:info("Syncing without bloom ~p/~p differences for partition ~p with EstimatedNrKeys ~p", [0, DiffCnt, Partition, EstimatedNrKeys]), gen_fsm:send_event(self(), diff_done); @@ -457,7 +458,7 @@ finish_sending_differences(#exchange{bloom=Bloom, count=DiffCnt}, #state{index=Partition, estimated_nr_keys=EstimatedNrKeys}) -> case ebloom:elements(Bloom) of Count = 0 -> - lager:info("No Bloom folding over ~p/~p differences for partition ~p with EstimatedNrKeys ~p", + lager:info("Syncing without bloom ~p/~p differences for partition ~p with EstimatedNrKeys ~p", [Count, DiffCnt, Partition, EstimatedNrKeys]), gen_fsm:send_event(self(), diff_done); Count -> From da47f0a18a53847ea8f220eca0cecc84b1d637b0 Mon Sep 17 00:00:00 2001 From: Mikael Lixenstrand Date: Mon, 15 Dec 2014 10:43:36 +0000 Subject: [PATCH 17/47] Cancel directly on not_responsible from remote cluster Remove loop so we can receive cancel_fullsync during update of remote trees. --- src/riak_repl_aae_source.erl | 41 +++++++++++++++++------------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/src/riak_repl_aae_source.erl b/src/riak_repl_aae_source.erl index c5de2b5e..70f4a4e2 100644 --- a/src/riak_repl_aae_source.erl +++ b/src/riak_repl_aae_source.erl @@ -221,32 +221,28 @@ prepare_exchange(start_exchange, State=#state{index=Partition}) -> %% a timely manner, the exchange will timeout. Since the trees will %% continue to finish the update even after the exchange times out, %% a future exchange should eventually make progress. -update_trees(init, State) -> - update_trees(start_exchange, State); +update_trees(init, State=#state{indexns=IndexNs}) -> + update_trees({start_exchange, IndexNs}, State); update_trees(cancel_fullsync, State) -> lager:info("AAE fullsync source cancelled for partition ~p", [State#state.index]), send_complete(State), {stop, normal, State}; -update_trees(start_exchange, State=#state{tree_pid=TreePid, - index=Partition, - indexns=IndexNs}) -> - lists:foreach(fun(IndexN) -> - update_request(TreePid, {Partition, undefined}, IndexN), - case send_synchronous_msg(?MSG_UPDATE_TREE, IndexN, State) of - ok -> - gen_fsm:send_event(self(), {tree_built, Partition, IndexN}); - not_responsible -> - gen_fsm:send_event(self(), {not_responsible, Partition, IndexN}) - end - end, IndexNs), - {next_state, update_trees, State}; - -update_trees({not_responsible, Partition, IndexN}, State = #state{owner=Owner}) -> - lager:debug("Skipping AAE fullsync tree update for vnode ~p because" - " it is not responsible for the preflist ~p", [Partition, IndexN]), - gen_server:cast(Owner, not_responsible), - {stop, normal, State}; -update_trees({tree_built, _, _}, State = #state{indexns=IndexNs}) -> +update_trees({start_exchange, [IndexHead|IndexTail]}, State=#state{tree_pid=TreePid, + index=Partition, + owner=Owner}) -> + update_request(TreePid, {Partition, undefined}, IndexHead), + case send_synchronous_msg(?MSG_UPDATE_TREE, IndexHead, State) of + ok -> + gen_fsm:send_event(self(), {tree_built, Partition, IndexHead, IndexTail}), + {next_state, update_trees, State}; + not_responsible -> + lager:debug("Skipping AAE fullsync tree update for vnode ~p because" + " it is not responsible for the preflist ~p", [Partition, IndexHead]), + gen_server:cast(Owner, not_responsible), + {stop, normal, State} + end; + +update_trees({tree_built, _, _, IndexTail}, State = #state{indexns=IndexNs}) -> Built = State#state.built + 1, NeededBuilts = length(IndexNs) * 2, %% All local and remote case Built of @@ -258,6 +254,7 @@ update_trees({tree_built, _, _}, State = #state{indexns=IndexNs}) -> lager:debug("Moving to key exchange state"), key_exchange(init, State#state{built=Built, estimated_nr_keys = EstimatedNrKeys}); _ -> + gen_fsm:send_event(self(), {start_exchange, IndexTail}), {next_state, update_trees, State#state{built=Built}} end. From 2da227cfd7a3bd2f61eeeda50753d18631520c97 Mon Sep 17 00:00:00 2001 From: Kelly McLaughlin Date: Mon, 15 Dec 2014 09:19:58 -0700 Subject: [PATCH 18/47] Address some minor bugs around establishing SSL connections A few minor bugs were discovered while investigating riak_test failures. * The ssl application is explicitly started in riak_core_connection:try_ssl/0. The statement in the function expects the call to ssl:start/0 to always return ok, but in some cases the ssl application is already started and the call returns {error, {already_started, ssl}} instead. This should not represent an error condition, but as written an exception is generated in this case. This resulted in riak_test runs of replication tests that exercise SSL to stall. Really there is no reason to attempt to start the ssl application at this point in the code. The ssl application is an application dependency of the riak_repl application and should be started by the call to riak_core_util:start_app_deps in riak_repl_app:start/2. Removing the attempt to start ssl in riak_core_connection to avoid confusion. * The first handle_info function clause in riak_core_connection that handles a message received while in the wait_for_capabilities state attempts to use SSL by calling the try_ssl/4 function. If it succeeds a pair is returned whose elements are the name of the new transport and a new socket for the SSL connection. However the new socket was not being used for subsequent calls to send and setopts and this caused failure of several riak_tests. * The non-test function clause of riak_core_cluster_conn:request_cluster_name/1 contained assumptions about the transport in use and explicitly called inet:setopts/2. This does not work when SSL is used and also caused several test failures. The function has been changed so that the specified transport is used for the call to setopts instead. --- src/riak_core_cluster_conn.erl | 2 +- src/riak_core_connection.erl | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/riak_core_cluster_conn.erl b/src/riak_core_cluster_conn.erl index 51c08b26..0bad9bd2 100644 --- a/src/riak_core_cluster_conn.erl +++ b/src/riak_core_cluster_conn.erl @@ -362,7 +362,7 @@ code_change(_OldVsn, StateName, State, _Extra) -> request_cluster_name(#state{mode=test}) -> ok; request_cluster_name(#state{socket=Socket, transport=Transport}) -> - _ = inet:setopts(Socket, [{active, once}]), + _ = Transport:setopts(Socket, [{active, once}]), Transport:send(Socket, ?CTRL_ASK_NAME). -spec request_member_ips(state()) -> ok | {error, term()}. diff --git a/src/riak_core_connection.erl b/src/riak_core_connection.erl index df2f4749..b2bc586d 100644 --- a/src/riak_core_connection.erl +++ b/src/riak_core_connection.erl @@ -205,8 +205,8 @@ handle_info({_Transport, Socket, Data}, wait_for_capabilities, State = #state{so {stop, Error, State}; {NewTransport, NewSocket} -> FullProto = {State#state.protocol, State#state.protovers}, - NewTransport:send(Socket, erlang:term_to_binary(FullProto)), - NewTransport:setopts(Socket, [{active, once}]), + NewTransport:send(NewSocket, erlang:term_to_binary(FullProto)), + NewTransport:setopts(NewSocket, [{active, once}]), State2 = State#state{transport = NewTransport, socket = NewSocket, remote_capabilities = TheirCaps}, {next_state, wait_for_protocol, State2} end; @@ -260,7 +260,6 @@ try_ssl(Socket, Transport, MyCaps, TheirCaps) -> {Transport, Socket}; {true, true} -> lager:info("~p and ~p agreed to use SSL", [MyName, TheirName]), - ok = ssl:start(), case riak_core_ssl_util:upgrade_client_to_ssl(Socket, riak_core) of {ok, SSLSocket} -> {ranch_ssl, SSLSocket}; @@ -270,4 +269,3 @@ try_ssl(Socket, Transport, MyCaps, TheirCaps) -> {error, Reason} end end. - From 6bdc099c9f09ebb3b0bc2f039585001ebd9a8b01 Mon Sep 17 00:00:00 2001 From: Mikael Lixenstrand Date: Tue, 16 Dec 2014 10:19:55 +0000 Subject: [PATCH 19/47] handle not_responsible for local partitions --- src/riak_repl_aae_source.erl | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/riak_repl_aae_source.erl b/src/riak_repl_aae_source.erl index 70f4a4e2..a405d1e8 100644 --- a/src/riak_repl_aae_source.erl +++ b/src/riak_repl_aae_source.erl @@ -233,16 +233,25 @@ update_trees({start_exchange, [IndexHead|IndexTail]}, State=#state{tree_pid=Tree update_request(TreePid, {Partition, undefined}, IndexHead), case send_synchronous_msg(?MSG_UPDATE_TREE, IndexHead, State) of ok -> - gen_fsm:send_event(self(), {tree_built, Partition, IndexHead, IndexTail}), + gen_fsm:send_event(self(), tree_built), + case IndexTail of + [] -> ok; + _ -> gen_fsm:send_event(self(), {start_exchange, IndexTail}) + end, {next_state, update_trees, State}; not_responsible -> lager:debug("Skipping AAE fullsync tree update for vnode ~p because" " it is not responsible for the preflist ~p", [Partition, IndexHead]), gen_server:cast(Owner, not_responsible), {stop, normal, State} - end; + end; + +update_trees({not_responsible, Partition, IndexN}, State=#state{owner=Owner}) -> + lager:debug("VNode ~p does not cover preflist ~p", [Partition, IndexN]), + gen_server:cast(Owner, not_responsible), + {stop, normal, State}; -update_trees({tree_built, _, _, IndexTail}, State = #state{indexns=IndexNs}) -> +update_trees(tree_built, State = #state{indexns=IndexNs}) -> Built = State#state.built + 1, NeededBuilts = length(IndexNs) * 2, %% All local and remote case Built of @@ -254,7 +263,6 @@ update_trees({tree_built, _, _, IndexTail}, State = #state{indexns=IndexNs}) -> lager:debug("Moving to key exchange state"), key_exchange(init, State#state{built=Built, estimated_nr_keys = EstimatedNrKeys}); _ -> - gen_fsm:send_event(self(), {start_exchange, IndexTail}), {next_state, update_trees, State#state{built=Built}} end. @@ -641,7 +649,7 @@ update_request(Tree, {Index, _}, IndexN) -> as_event(fun() -> case riak_kv_index_hashtree:update(IndexN, Tree) of ok -> - {tree_built, Index, IndexN}; + tree_built; not_responsible -> {not_responsible, Index, IndexN} end From 90733b65592e71a7e0a90693d594701fe318c49f Mon Sep 17 00:00:00 2001 From: Micah Warren Date: Wed, 17 Dec 2014 12:48:36 -0600 Subject: [PATCH 20/47] Added test and fix to coord_serv not givin list for status. --- src/riak_repl2_fscoordinator_serv.erl | 2 +- test/riak_repl2_fscoordinator_serv_tests.erl | 30 ++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 test/riak_repl2_fscoordinator_serv_tests.erl diff --git a/src/riak_repl2_fscoordinator_serv.erl b/src/riak_repl2_fscoordinator_serv.erl index 71216d75..4a3db511 100644 --- a/src/riak_repl2_fscoordinator_serv.erl +++ b/src/riak_repl2_fscoordinator_serv.erl @@ -51,7 +51,7 @@ status() -> LeaderNode = riak_repl2_leader:leader_node(), case LeaderNode of undefined -> - {[], []}; + []; _ -> case riak_repl2_fscoordinator_serv_sup:started() of [] -> diff --git a/test/riak_repl2_fscoordinator_serv_tests.erl b/test/riak_repl2_fscoordinator_serv_tests.erl new file mode 100644 index 00000000..ce7a86f4 --- /dev/null +++ b/test/riak_repl2_fscoordinator_serv_tests.erl @@ -0,0 +1,30 @@ +-module(riak_repl2_fscoordinator_serv_tests). + +-ifdef(TEST). +-include_lib("eunit/include/eunit.hrl"). + +status_no_leader_test_() -> + {setup, fun() -> + ok = meck:new(riak_repl2_leader, [non_strict]), + ok = meck:expect(riak_repl2_leader, leader_node, fun() -> + undefined + end), + ok = meck:new(riak_repl2_fscoordinator_serv_sup, [non_strict]), + ok = meck:expect(riak_repl2_fscoordinator_serv_sup, started, fun() -> + [] + end) + end, + fun(_) -> + meck:unload(riak_repl2_fscoordinator_serv_sup), + meck:unload(riak_repl2_leader) + end, + fun(_) -> [ + + {"without a leader, status/0 returns an empty list", fun() -> + Got = riak_repl2_fscoordinator_serv:status(), + ?assertEqual([], Got) + end} + + ] end}. + +-endif. From 39fa59f9c5982c8c575e0de85b31e45efa4b62f3 Mon Sep 17 00:00:00 2001 From: Micah Warren Date: Wed, 17 Dec 2014 13:19:57 -0600 Subject: [PATCH 21/47] Removed pointless check for leader. --- src/riak_repl2_fscoordinator_serv.erl | 14 ++------- test/riak_repl2_fscoordinator_serv_tests.erl | 30 -------------------- 2 files changed, 2 insertions(+), 42 deletions(-) delete mode 100644 test/riak_repl2_fscoordinator_serv_tests.erl diff --git a/src/riak_repl2_fscoordinator_serv.erl b/src/riak_repl2_fscoordinator_serv.erl index 4a3db511..a11bd32b 100644 --- a/src/riak_repl2_fscoordinator_serv.erl +++ b/src/riak_repl2_fscoordinator_serv.erl @@ -48,18 +48,8 @@ start_link(Socket, Transport, Proto, Props) -> %% @doc Get the stats for every serv. %% @see status/1 status() -> - LeaderNode = riak_repl2_leader:leader_node(), - case LeaderNode of - undefined -> - []; - _ -> - case riak_repl2_fscoordinator_serv_sup:started() of - [] -> - []; - Repls -> - [status(Pid) || {_Remote, Pid} <- Repls] - end - end. + Repls = riak_repl2_fscoordinator_serv_sup:started(), + [status(Pid) || {_Remove, Pid} <- Repls]. %% @doc Get the status for the given serv. -spec status(Pid :: pid()) -> [tuple()]. diff --git a/test/riak_repl2_fscoordinator_serv_tests.erl b/test/riak_repl2_fscoordinator_serv_tests.erl deleted file mode 100644 index ce7a86f4..00000000 --- a/test/riak_repl2_fscoordinator_serv_tests.erl +++ /dev/null @@ -1,30 +0,0 @@ --module(riak_repl2_fscoordinator_serv_tests). - --ifdef(TEST). --include_lib("eunit/include/eunit.hrl"). - -status_no_leader_test_() -> - {setup, fun() -> - ok = meck:new(riak_repl2_leader, [non_strict]), - ok = meck:expect(riak_repl2_leader, leader_node, fun() -> - undefined - end), - ok = meck:new(riak_repl2_fscoordinator_serv_sup, [non_strict]), - ok = meck:expect(riak_repl2_fscoordinator_serv_sup, started, fun() -> - [] - end) - end, - fun(_) -> - meck:unload(riak_repl2_fscoordinator_serv_sup), - meck:unload(riak_repl2_leader) - end, - fun(_) -> [ - - {"without a leader, status/0 returns an empty list", fun() -> - Got = riak_repl2_fscoordinator_serv:status(), - ?assertEqual([], Got) - end} - - ] end}. - --endif. From 8f079a2afba0efce5bf1428975302940ad3f3acc Mon Sep 17 00:00:00 2001 From: "Engel A. Sanchez" Date: Tue, 23 Dec 2014 18:04:45 -0500 Subject: [PATCH 22/47] Remove partition from purgatory when giving up When a partition has hit the soft exit limit, we add it to the dropped list, but forgot to remove it from the purgatory list. So it may actually be retried later. --- src/riak_repl2_fscoordinator.erl | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/riak_repl2_fscoordinator.erl b/src/riak_repl2_fscoordinator.erl index ef9870b5..aa6f391f 100644 --- a/src/riak_repl2_fscoordinator.erl +++ b/src/riak_repl2_fscoordinator.erl @@ -543,10 +543,14 @@ handle_abnormal_exit(ExitType, Pid, _Cause, {value, PartitionWithSource, Running {noreply, State4#state{purgatory = Purgatory, soft_retry_exits = SoftRetryCount}}; SoftRetryLimit < ErrorCount -> - lager:info("Discaring partition ~p since it has reached the soft exit retry limit of ~p", [Partition#partition_info.index, SoftRetryLimit]), + lager:info("Discarding partition ~p since it has reached the soft exit retry limit of ~p", [Partition#partition_info.index, SoftRetryLimit]), ErrorExits1 = State4#state.error_exits + 1, Dropped = [Partition#partition_info.index | State4#state.dropped], - {noreply, State4#state{error_exits = ErrorExits1, dropped = Dropped}}; + Purgatory = queue:filter(fun(P) -> P =/= Partition end, + State4#state.purgatory), + {noreply, State4#state{error_exits = ErrorExits1, + purgatory = Purgatory, + dropped = Dropped}}; true -> Purgatory = queue:in(Partition, State4#state.purgatory), {noreply, State4#state{purgatory = Purgatory, soft_retry_exits = SoftRetryCount}} From 31a1657d43820bf411c3e54e38195945cfe85fb4 Mon Sep 17 00:00:00 2001 From: "Engel A. Sanchez" Date: Tue, 23 Dec 2014 18:22:30 -0500 Subject: [PATCH 23/47] Silence noisy lager output during tests --- test/riak_repl_test_util.erl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/riak_repl_test_util.erl b/test/riak_repl_test_util.erl index 3e0616ea..176389ed 100644 --- a/test/riak_repl_test_util.erl +++ b/test/riak_repl_test_util.erl @@ -105,9 +105,9 @@ maybe_start_lager(_) -> start_lager(). start_lager() -> - %error_logger:tty(false), {ok, Started} = application:ensure_all_started(lager), - lager:set_loglevel(lager_console_backend, debug), + % But keep it quiet, please + lager:set_loglevel(lager_console_backend, '=emergency'), Started. %% @doc Stop the applications listsed. The list is assumed to be in the From 982eb528b5562d91021b5f7194dd6b849bd995ba Mon Sep 17 00:00:00 2001 From: "John R. Daily" Date: Tue, 19 Aug 2014 10:30:13 -0400 Subject: [PATCH 24/47] Fixes to make edoc happy --- src/riak_core_service_mgr.erl | 2 +- src/riak_repl2_rtq.erl | 2 +- src/riak_repl_fullsync_helper.erl | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/riak_core_service_mgr.erl b/src/riak_core_service_mgr.erl index 42d308c6..af492cf3 100644 --- a/src/riak_core_service_mgr.erl +++ b/src/riak_core_service_mgr.erl @@ -144,7 +144,7 @@ is_registered(ProtocolId) -> %% @doc Register a callback function that will get called periodically or %% when the connection status of services changes. The function will -%% receive a list of tuples: `{, }' where stats +%% receive a list of tuples: {<protocol-id>, <stats>} where stats %% holds the number of open connections that have been accepted for that %% protocol type. This can be used to report load, in the form of %% connected-ness, for each protocol type, to remote clusters, e.g., diff --git a/src/riak_repl2_rtq.erl b/src/riak_repl2_rtq.erl index 1085617c..2cb84d26 100644 --- a/src/riak_repl2_rtq.erl +++ b/src/riak_repl2_rtq.erl @@ -100,7 +100,7 @@ start_link() -> -type start_option() :: overload_threshold_option() | overload_recover_option(). -type start_options() :: [start_option()]. %% @doc Start linked, registers to module name, with given options. This makes -%% testing some options a bit easier as it removes a dependence on `app_helper'. +%% testing some options a bit easier as it removes a dependance on app_helper. -spec start_link(Options :: start_options()) -> {'ok', pid()}. start_link(Options) -> case ets:info(?overload_ets) of diff --git a/src/riak_repl_fullsync_helper.erl b/src/riak_repl_fullsync_helper.erl index d23386e6..ecbfb32e 100644 --- a/src/riak_repl_fullsync_helper.erl +++ b/src/riak_repl_fullsync_helper.erl @@ -393,7 +393,7 @@ missing_key(PBKey, DiffState) -> %% the same, then a badfun error will occur since the MD5s of the %% modules are not the same. %% -%% See [http://www.javalimit.com/2010/05/passing-funs-to-other-erlang-nodes.html] +%% See http://www.javalimit.com/2010/05/passing-funs-to-other-erlang-nodes.html keylist_fold({B,Key}=K, V, {MPid, Count, Total}) -> try H = hash_object(B,Key,V), From 05822f4b7132032cafd04361c4f13bdeeebf4014 Mon Sep 17 00:00:00 2001 From: Kresten Krab Thorup Date: Mon, 6 Oct 2014 13:55:58 +0200 Subject: [PATCH 25/47] Add new cluster membership function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This implementes riak_core_cluster_serv {1,1} with new membership function on the server side to give list of {node(), {IP,Port} | unreachable} for *all* members of remote cluster. Nodes for which the cluster_serv cannot RPC to the given node yield ‘unreachable’ in stead of an IP/Port. --- include/riak_core_connection.hrl | 3 ++- src/riak_core_cluster_conn.erl | 20 ++++++++++------ src/riak_core_cluster_mgr.erl | 34 +++++++++++++++++++++++---- src/riak_core_cluster_mgr_sup.erl | 1 + src/riak_core_cluster_serv.erl | 11 +++++++++ src/riak_repl_app.erl | 39 ++++++++++++++++++++----------- 6 files changed, 83 insertions(+), 25 deletions(-) diff --git a/include/riak_core_connection.hrl b/include/riak_core_connection.hrl index 2ce0c5c8..1006efb3 100644 --- a/include/riak_core_connection.hrl +++ b/include/riak_core_connection.hrl @@ -22,12 +22,13 @@ %% handshake messages to safely initiate a connection. Let's not accept %% a connection to a telnet session by accident! --define(CTRL_REV, {1,0}). +-define(CTRL_REV, {1,1}). -define(CTRL_HELLO, <<"riak-ctrl:hello">>). -define(CTRL_TELL_IP_ADDR, <<"riak-ctrl:ip_addr">>). -define(CTRL_ACK, <<"riak-ctrl:ack">>). -define(CTRL_ASK_NAME, <<"riak-ctrl:ask_name">>). -define(CTRL_ASK_MEMBERS, <<"riak-ctrl:ask_members">>). +-define(CTRL_ALL_MEMBERS, <<"riak-ctrl:all_members">>). -define(CONNECTION_SETUP_TIMEOUT, 10000). diff --git a/src/riak_core_cluster_conn.erl b/src/riak_core_cluster_conn.erl index 0bad9bd2..c3830877 100644 --- a/src/riak_core_cluster_conn.erl +++ b/src/riak_core_cluster_conn.erl @@ -86,8 +86,9 @@ connection_timeout :: timeout(), transport :: atom(), address :: peer_address(), - connection_props :: proplists:proplist(), - transport_msgs :: ranch_transport_messages()}). + connection_props :: proplist:proplist(), + transport_msgs :: ranch_transport_messages(), + proto_version :: {non_neg_integer(), non_neg_integer()} }). -type state() :: #state{}. %%%=================================================================== @@ -121,14 +122,17 @@ status(Ref, Timeout) -> connected(Socket, Transport, Addr, - {?REMOTE_CLUSTER_PROTO_ID, _MyVer, _RemoteVer}, + {?REMOTE_CLUSTER_PROTO_ID, + _MyVer ={CommonMajor,LocalMinor}, + _RemoteVer={CommonMajor,RemoteMinor}}, {_Remote, Client}, Props) -> %% give control over the socket to the `Client' process. %% tell client we're connected and to whom Transport:controlling_process(Socket, Client), gen_fsm:send_event(Client, - {connected_to_remote, Socket, Transport, Addr, Props}). + {connected_to_remote, Socket, Transport, Addr, Props, + {CommonMajor, min(LocalMinor,RemoteMinor)}}). -spec connect_failed({term(), term()}, {error, term()}, {_, atom() | pid() | port() | {atom(), _} | {via, _, _}}) -> ok. connect_failed({_Proto, _Vers}, {error, _}=Error, {_Remote, Client}) -> @@ -189,7 +193,7 @@ connecting({connect_failed, Error}, State=#state{remote=Remote}) -> %% This is fatal! We are being supervised by conn_sup and if we %% die, it will restart us. {stop, Error, State}; -connecting({connected_to_remote, Socket, Transport, Addr, Props}, State) -> +connecting({connected_to_remote, Socket, Transport, Addr, Props, ProtoVersion}, State) -> RemoteName = proplists:get_value(clustername, Props), _ = lager:debug("Cluster Manager control channel client connected to" " remote ~p at ~p named ~p", @@ -202,7 +206,9 @@ connecting({connected_to_remote, Socket, Transport, Addr, Props}, State) -> transport=Transport, address=Addr, connection_props=Props, - transport_msgs = TransportMsgs}, + transport_msgs = TransportMsgs, + proto_version=ProtoVersion + }, _ = request_cluster_name(UpdState), {next_state, waiting_for_cluster_name, UpdState, ?CONNECTION_SETUP_TIMEOUT}; connecting(poll_cluster, State) -> @@ -383,7 +389,7 @@ initiate_connection(State=#state{remote=Remote}) -> %% `riak_core_connection_mgr::connect/4' is incorrect. {ok, Ref} = riak_core_connection_mgr:connect( Remote, - {{?REMOTE_CLUSTER_PROTO_ID, [{1,0}]}, + {{?REMOTE_CLUSTER_PROTO_ID, [{1,1},{1,0}]}, {?CTRL_OPTIONS, ?MODULE, {Remote, self()}}}, default), State#state{connection_ref=Ref}. diff --git a/src/riak_core_cluster_mgr.erl b/src/riak_core_cluster_mgr.erl index 8a70241c..6ad59a0b 100644 --- a/src/riak_core_cluster_mgr.erl +++ b/src/riak_core_cluster_mgr.erl @@ -70,6 +70,7 @@ leader_node = undefined :: undefined | node(), gc_interval = infinity, member_fun = fun(_Addr) -> [] end, % return members of local cluster + all_member_fun = fun(_Addr) -> [] end, % return members of local cluster restore_targets_fun = fun() -> [] end, % returns persisted cluster targets save_members_fun = fun(_C,_M) -> ok end, % persists remote cluster members balancer_fun = fun(Addrs) -> Addrs end, % registered balancer function @@ -82,6 +83,7 @@ get_leader/0, get_is_leader/0, register_member_fun/1, + register_all_member_fun/1, register_restore_cluster_targets_fun/1, register_save_cluster_members_fun/1, add_remote_cluster/1, remove_remote_cluster/1, @@ -100,7 +102,7 @@ %% internal functions -export([%ctrlService/5, ctrlServiceProcess/5, round_robin_balancer/1, cluster_mgr_sites_fun/0, - get_my_members/1]). + get_my_members/1, get_all_members/1]). -export([ensure_valid_ip_addresses/1]). @@ -139,6 +141,12 @@ get_is_leader() -> register_member_fun(MemberFun) -> gen_server:cast(?SERVER, {register_member_fun, MemberFun}). +%% @doc Register a function that will get called to get out local riak node +%% member's IP addrs. MemberFun(ip_addr()) -> [{node(),{IP,Port}}] were IP is a string +-spec register_all_member_fun(MemberFun :: fun((ip_addr()) -> [{atom(),{string(),pos_integer()}}])) -> 'ok'. +register_all_member_fun(MemberFun) -> + gen_server:cast(?SERVER, {register_all_member_fun, MemberFun}). + register_restore_cluster_targets_fun(ReadClusterFun) -> gen_server:cast(?SERVER, {register_restore_cluster_targets_fun, ReadClusterFun}). @@ -168,6 +176,9 @@ get_connections() -> get_my_members(MyAddr) -> gen_server:call(?SERVER, {get_my_members, MyAddr}, infinity). +get_all_members(MyAddr) -> + gen_server:call(?SERVER, {get_all_members, MyAddr}, infinity). + %% @doc Return a list of the known IP addresses of all nodes in the remote cluster. get_ipaddrs_of_cluster(ClusterName) -> gen_server:call(?SERVER, {get_known_ipaddrs_of_cluster, {name,ClusterName}}, infinity). @@ -190,7 +201,7 @@ init(Defaults) -> %% start our cluster_mgr service if not already started. case riak_core_service_mgr:is_registered(?CLUSTER_PROTO_ID) of false -> - ServiceProto = {?CLUSTER_PROTO_ID, [{1,0}]}, + ServiceProto = {?CLUSTER_PROTO_ID, [{1,1}, {1,0}]}, %ServiceSpec = {ServiceProto, {?CTRL_OPTIONS, ?MODULE, ctrlService, []}}, ServiceSpec = {ServiceProto, {?CTRL_OPTIONS, riak_core_cluster_serv, start_link, []}}, riak_core_service_mgr:sync_register_service(ServiceSpec, {round_robin,?MAX_CONS}); @@ -226,7 +237,18 @@ handle_call(get_is_leader, _From, State) -> handle_call({get_my_members, MyAddr}, _From, State) -> %% This doesn't need to call the leader. MemberFun = State#state.member_fun, - MyMembers = [{string_of_ip(IP),Port} || {IP,Port} <- MemberFun(MyAddr)], + MyMembers = [{string_of_ip(IP),Port} || {IP,Port} <- MemberFun(MyAddr), is_integer(Port)], + {reply, MyMembers, State}; + +handle_call({get_all_members, MyAddr}, _From, State) -> + %% This doesn't need to call the leader. + AllMemberFun = State#state.all_member_fun, + MyMembers = lists:map(fun({Node,{IP,Port}}) when is_integer(Port) -> + {Node,{string_of_ip(IP),Port}}; + ({Node,_}) -> + {Node, unreachable} + end, + AllMemberFun(MyAddr)), {reply, MyMembers, State}; handle_call(leader_node, _From, State) -> @@ -309,6 +331,9 @@ handle_cast({set_gc_interval, Interval}, State) -> handle_cast({register_member_fun, Fun}, State) -> {noreply, State#state{member_fun=Fun}}; +handle_cast({register_all_member_fun, Fun}, State) -> + {noreply, State#state{all_member_fun=Fun}}; + handle_cast({register_save_cluster_members_fun, Fun}, State) -> {noreply, State#state{save_members_fun=Fun}}; @@ -422,9 +447,10 @@ register_defaults(Defaults, State) -> case Defaults of [] -> State; - [MembersFun, SaveFun, RestoreFun] -> + [MembersFun, AllMembersFun, SaveFun, RestoreFun] -> lager:debug("Registering default cluster manager functions."), State#state{member_fun=MembersFun, + all_member_fun=AllMembersFun, save_members_fun=SaveFun, restore_targets_fun=RestoreFun} end. diff --git a/src/riak_core_cluster_mgr_sup.erl b/src/riak_core_cluster_mgr_sup.erl index 0b476c6b..454b31a0 100644 --- a/src/riak_core_cluster_mgr_sup.erl +++ b/src/riak_core_cluster_mgr_sup.erl @@ -19,6 +19,7 @@ start_link() -> %% @doc supervisor callback. init([]) -> ClusterMgrDefaults = [fun riak_repl_app:cluster_mgr_member_fun/1, + fun riak_repl_app:cluster_mgr_all_member_fun/1, fun riak_repl_app:cluster_mgr_write_cluster_members_to_ring/2, fun riak_repl_app:cluster_mgr_read_cluster_targets_from_ring/0], Processes = diff --git a/src/riak_core_cluster_serv.erl b/src/riak_core_cluster_serv.erl index 915142dd..5df5d93d 100644 --- a/src/riak_core_cluster_serv.erl +++ b/src/riak_core_cluster_serv.erl @@ -115,6 +115,17 @@ handle_socket_info(?CTRL_ASK_MEMBERS, Transport, Socket, State) -> {stop, Else, State} end; +handle_socket_info(?CTRL_ALL_MEMBERS, Transport, Socket, State) -> + case read_ip_address(Socket, Transport, State#state.remote_addr) of + {ok, RemoteConnectedToIp} -> + Members = gen_server:call(?CLUSTER_MANAGER_SERVER, {get_all_members, RemoteConnectedToIp}, infinity), + ok = Transport:send(Socket, term_to_binary(Members)), + ok = Transport:setopts(Socket, [{active, once}]), + {noreply, State}; + Else -> + {stop, Else, State} + end; + handle_socket_info(OtherData, _Transport, _Socket, State) -> ok = lager:warning("Some other data from the socket: ~p", [OtherData]), {stop, {error, unrecognized_request}, State}. diff --git a/src/riak_repl_app.erl b/src/riak_repl_app.erl index 9d3bc88c..8d05f497 100644 --- a/src/riak_repl_app.erl +++ b/src/riak_repl_app.erl @@ -8,6 +8,7 @@ % versions < 1.3 -export([get_matching_address/2, cluster_mgr_member_fun/1, + cluster_mgr_all_member_fun/1, cluster_mgr_write_cluster_members_to_ring/2, cluster_mgr_read_cluster_targets_from_ring/0]). @@ -156,7 +157,19 @@ prune_old_workdirs(WorkRoot) -> end. %% Get the list of nodes of our ring +%% This list includes all up-nodes, that host the riak_kv service cluster_mgr_member_fun({IP, Port}) -> + lists_shuffle([ {XIP,XPort} || {_Node,{XIP,XPort}} <- cluster_mgr_members({IP, Port}, riak_core_node_watcher:nodes(riak_kv)), + is_integer(XPort) ]). + +%% this list includes *all* members of the ring (even those marked down). +%% returns a list [ { node(), {IP, Port} | unreachable }, ... ] +cluster_mgr_all_member_fun({IP, Port}) -> + {ok, Ring} = riak_core_ring_manager:get_my_ring(), + cluster_mgr_members({IP, Port}, riak_core_ring:all_members(Ring)). + +cluster_mgr_members({IP, Port}, Nodes) -> + %% find the subnet for the interface we connected to {ok, MyIPs} = inet:getifaddrs(), {ok, NormIP} = riak_repl_util:normalize_ip(IP), @@ -170,12 +183,11 @@ cluster_mgr_member_fun({IP, Port}) -> lager:warning("Connected IP not present locally, must be NAT. Returning ~p", [{IP,Port}]), %% might as well return the one IP we know will work - [{IP, Port}]; + [{node(), {IP, Port}}]; CIDR -> ?TRACE(lager:notice("CIDR is ~p", [CIDR])), %AddressMask = riak_repl2_ip:mask_address(NormIP, MyMask), %?TRACE(lager:notice("address mask is ~p", [AddressMask])), - Nodes = riak_core_node_watcher:nodes(riak_kv), {Results, BadNodes} = rpc:multicall(Nodes, riak_repl2_ip, get_matching_address, [RealIP, CIDR]), % when this code was written, a multicall will list the results @@ -187,27 +199,28 @@ cluster_mgr_member_fun({IP, Port}) -> case RealIP == NormIP of true -> %% No nat, just return the results - lists_shuffle(Results2); + Results2; false -> %% NAT is in effect - NatRes = lists:foldl(fun({XIP, XPort}, Acc) -> + NatRes = lists:foldl(fun({XNode, {XIP, XPort}}, Acc) -> case riak_repl2_ip:apply_reverse_nat_map(XIP, XPort, Map) of error -> %% there's no NAT configured for this IP! %% location_down is the closest thing we %% can reply with. lager:warning("There's no NAT mapping for" - "~p:~b to an external IP", - [XIP, XPort]), + "~p:~b to an external IP (node: ~p)", + [XIP, XPort, XNode]), Acc; {ExternalIP, ExternalPort} -> - [{ExternalIP, ExternalPort}|Acc]; + [{XNode, {ExternalIP, ExternalPort}} | Acc]; ExternalIP -> - [{ExternalIP, XPort}|Acc] + [{XNode, {ExternalIP, XPort}}|Acc] end end, [], Results2), - lager:debug("~p -> ~p", [Results2, NatRes]), - lists_shuffle(NatRes) + Results3 = NatRes ++ [ {Node, unreachable} || Node <- BadNodes ], + lager:debug("nat: ~p -> ~p", [Results2, Results3]), + Results3 end end. @@ -218,9 +231,9 @@ maybe_retry_ip_rpc(Results, Nodes, BadNodes, Args) -> ({{badrpc, {'EXIT', {undef, _StrackTrace}}}, Node}) -> RPCResult = riak_core_util:safe_rpc(Node, riak_repl_app, get_matching_address, Args), lager:debug("rpc to get_matching_address: ~p", [RPCResult]), - RPCResult; - ({Result, _Node}) -> - Result + {Node, RPCResult}; + ({Result, Node}) -> + {Node, Result} end, lists:map(MaybeRetry, Zipped). From b0a173f3ec4bfd1f11b4e112c6bdc4f6915231f3 Mon Sep 17 00:00:00 2001 From: Kresten Krab Thorup Date: Mon, 6 Oct 2014 15:35:22 +0200 Subject: [PATCH 26/47] Add client code for {1,1} cluster protocol Use new all_members message if remote is 1,1+ For 1,0 emulate new semantics by keeping old IP:PORTs around until cluster_mgr restart. Implement new seeded sort+shuffle for result of calling cluster_mgr:get_ipaddrs_of_cluster/1. --- src/riak_core_cluster_conn.erl | 64 ++++++++++++++++++++++++++++++---- src/riak_core_cluster_mgr.erl | 45 ++++++++++++++++++++---- 2 files changed, 96 insertions(+), 13 deletions(-) diff --git a/src/riak_core_cluster_conn.erl b/src/riak_core_cluster_conn.erl index c3830877..0b1e2db6 100644 --- a/src/riak_core_cluster_conn.erl +++ b/src/riak_core_cluster_conn.erl @@ -75,13 +75,14 @@ -type remote() :: {cluster_by_name, clustername()} | {cluster_by_addr, ip_addr()}. -type peer_address() :: {string(), pos_integer()}. +-type node_address() :: {atom(), peer_address()}. -type ranch_transport_messages() :: {atom(), atom(), atom()}. -record(state, {mode :: atom(), remote :: remote(), socket :: port(), name :: clustername(), previous_name="undefined" :: clustername(), - members=[] :: [peer_address()], + members=[] :: [peer_address() | node_address()], connection_ref :: reference(), connection_timeout :: timeout(), transport :: atom(), @@ -246,13 +247,20 @@ waiting_for_cluster_name(_, _From, _State) -> {reply, ok, waiting_for_cluster_name, _State}. %% Async message handling for the `waiting_for_cluster_members' state -waiting_for_cluster_members({cluster_members, Members}, State) -> +waiting_for_cluster_members({cluster_members, NewMembers}, State = #state{ proto_version={1,0} }) -> #state{address=Addr, name=Name, previous_name=PreviousName, + members=OldMembers, remote=Remote} = State, - %% This is the first time we're updating the cluster manager - %% with the name of this cluster, so it's old name is undefined. + %% this is 1.0 code. NewMembers is list of {IP,Port} + + SortedNew = ordsets:from_list(NewMembers), + Members = NewMembers ++ lists:filter( fun(Mem) -> + not ordsets:is_element(Mem, SortedNew) + end, + OldMembers ), + ClusterUpdatedMsg = {cluster_updated, PreviousName, Name, @@ -261,6 +269,36 @@ waiting_for_cluster_members({cluster_members, Members}, State) -> Remote}, gen_server:cast(?CLUSTER_MANAGER_SERVER, ClusterUpdatedMsg), {next_state, connected, State#state{members=Members}}; +waiting_for_cluster_members({all_cluster_members, NewMembers}, State) -> + #state{address=Addr, + name=Name, + previous_name=PreviousName, + members=OldMembers, + remote=Remote} = State, + + %% this is 1.1+ code. Members is list of {node,{IP,Port}} + + Members = lists:foldl( fun(Elm={_Node,{_Ip,Port}}, Acc) when is_integer(Port) -> + [Elm|Acc]; + ({Node,_}, Acc) -> + case lists:keyfind(Node, 1, OldMembers) of + Elm={Node,{_IP,Port}} when is_integer(Port) -> + [Elm|Acc]; + _ -> + Acc + end + end, + [], + NewMembers ), + + ClusterUpdatedMsg = {cluster_updated, + PreviousName, + Name, + [Member || {_Node,Member} <- Members], + Addr, + Remote}, + gen_server:cast(?CLUSTER_MANAGER_SERVER, ClusterUpdatedMsg), + {next_state, connected, State#state{members=Members}}; waiting_for_cluster_members(_, _State) -> {next_state, waiting_for_cluster_members, _State}. @@ -313,11 +351,18 @@ handle_info({TransOK, Socket, Name}, {next_state, waiting_for_cluster_name, State}; handle_info({TransOK, Socket, Members}, waiting_for_cluster_members, - State=#state{socket=Socket, transport_msgs = {TransOK, _, _}}) -> + State=#state{socket=Socket, transport_msgs = {TransOK, _, _}, proto_version={1,0}}) -> Transport = State#state.transport, gen_fsm:send_event(self(), {cluster_members, binary_to_term(Members)}), _ = Transport:setopts(Socket, [{active, once}]), {next_state, waiting_for_cluster_members, State}; +handle_info({TransOK, Socket, Members}, + waiting_for_cluster_members, + State=#state{socket=Socket, transport_msgs = {TransOK, _, _}}) -> + Transport = State#state.transport, + gen_fsm:send_event(self(), {all_cluster_members, binary_to_term(Members)}), + _ = Transport:setopts(Socket, [{active, once}]), + {next_state, waiting_for_cluster_members, State}; handle_info({TransOK, Socket, Data}, StateName, State=#state{address=Addr, @@ -374,12 +419,19 @@ request_cluster_name(#state{socket=Socket, transport=Transport}) -> -spec request_member_ips(state()) -> ok | {error, term()}. request_member_ips(#state{mode=test}) -> ok; -request_member_ips(#state{socket=Socket, transport=Transport}) -> +request_member_ips(#state{socket=Socket, transport=Transport, proto_version={1,0}}) -> Transport:send(Socket, ?CTRL_ASK_MEMBERS), %% get the IP we think we've connected to {ok, {PeerIP, PeerPort}} = Transport:peername(Socket), %% make it a string PeerIPStr = inet_parse:ntoa(PeerIP), + Transport:send(Socket, term_to_binary({PeerIPStr, PeerPort})); +request_member_ips(#state{socket=Socket, transport=Transport, proto_version={1,1}}) -> + Transport:send(Socket, ?CTRL_ALL_MEMBERS), + %% get the IP we think we've connected to + {ok, {PeerIP, PeerPort}} = Transport:peername(Socket), + %% make it a string + PeerIPStr = inet_parse:ntoa(PeerIP), Transport:send(Socket, term_to_binary({PeerIPStr, PeerPort})). initiate_connection(State=#state{mode=test}) -> diff --git a/src/riak_core_cluster_mgr.erl b/src/riak_core_cluster_mgr.erl index 6ad59a0b..0da54ded 100644 --- a/src/riak_core_cluster_mgr.erl +++ b/src/riak_core_cluster_mgr.erl @@ -78,7 +78,7 @@ }). -export([start_link/0, - start_link/3, + start_link/4, set_leader/2, get_leader/0, get_is_leader/0, @@ -113,8 +113,8 @@ start_link() -> gen_server:start_link({local, ?SERVER}, ?MODULE, [], []). -start_link(DefaultLocator, DefaultSave, DefaultRestore) -> - Args = [DefaultLocator, DefaultSave, DefaultRestore], +start_link(DefaultLocator, DefaultAllLocator, DefaultSave, DefaultRestore) -> + Args = [DefaultLocator, DefaultAllLocator, DefaultSave, DefaultRestore], Options = [], gen_server:start_link({local, ?SERVER}, ?MODULE, Args, Options). @@ -181,7 +181,12 @@ get_all_members(MyAddr) -> %% @doc Return a list of the known IP addresses of all nodes in the remote cluster. get_ipaddrs_of_cluster(ClusterName) -> - gen_server:call(?SERVER, {get_known_ipaddrs_of_cluster, {name,ClusterName}}, infinity). + case gen_server:call(?SERVER, {get_known_ipaddrs_of_cluster, {name,ClusterName}}, infinity) of + {ok, Reply} -> + shuffle_remote_ipaddrs(Reply); + Reply -> + Reply + end. %% @doc stops the local server. -spec stop() -> 'ok'. @@ -372,9 +377,6 @@ handle_cast({remove_remote_cluster, Cluster}, State) -> {noreply, State2}; %% The client connection recived (or polled for) an update from the remote cluster. -%% Note that here, the Members are sorted in least connected order. Preserve that. -%% TODO: we really want to keep all nodes we have every seen, except remove nodes -%% that explicitly leave the cluster or show up in other clusters. handle_cast({cluster_updated, "undefined", NewName, Members, Addr, {cluster_by_addr, _CAddr}=Remote}, State) -> %% replace connection by address with connection by clustername if that would be safe. @@ -753,3 +755,32 @@ connect_to_persisted_clusters(State) -> _ -> ok end. + +shuffle_with_seed(List, Seed={_,_,_}) -> + _ = random:seed(Seed), + [E || {E, _} <- lists:keysort(2, [{Elm, random:uniform()} || Elm <- List])]; +shuffle_with_seed(List, Seed) -> + <<_:10,S1:50,S2:50,S3:50>> = crypto:hash(sha, term_to_binary(Seed)), + shuffle_with_seed(List, {S1,S2,S3}). + + +shuffle_remote_ipaddrs(RemoteUnsorted) -> + {ok, MyRing} = riak_core_ring_manager:get_my_ring(), + SortedNodes = lists:sort(riak_core_ring:all_members(MyRing)), + NodesTagged = lists:zip( lists:seq(1, length(SortedNodes)), SortedNodes ), + case lists:keyfind(node(), 2, NodesTagged) of + {MyPos, _} -> + OurClusterName = riak_core_connection:symbolic_clustername(), + RemoteAddrs = shuffle_with_seed( lists:sort( RemoteUnsorted ), [OurClusterName] ), + + %% MyPos is the position if *this* node in the sorted list of + %% all nodes in my ring. Now choose the node at the corresponding + %% index in RemoteAddrs as out "buddy" + SplitPos = ((MyPos-1) rem length(RemoteAddrs)), + case lists:split(SplitPos,RemoteAddrs) of + {BeforeBuddy,[Buddy|AfterBuddy]} -> + {ok, [Buddy | shuffle_with_seed(AfterBuddy ++ BeforeBuddy, node()) ]} + end; + false -> + {ok, shuffle_with_seed(lists:sort( RemoteUnsorted ), node())} + end. From 3c9f7ee6c709de5752f157e23749d393f9d7aef4 Mon Sep 17 00:00:00 2001 From: Kresten Krab Thorup Date: Tue, 7 Oct 2014 13:53:25 +0200 Subject: [PATCH 27/47] Register default all_member_fun --- src/riak_repl_app.erl | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/riak_repl_app.erl b/src/riak_repl_app.erl index 8d05f497..4b536598 100644 --- a/src/riak_repl_app.erl +++ b/src/riak_repl_app.erl @@ -74,6 +74,11 @@ start(_Type, _StartArgs) -> riak_core_cluster_mgr:register_member_fun( fun cluster_mgr_member_fun/1), + %% register functions for cluster manager to find it's own + %% nodes' ip addrs + riak_core_cluster_mgr:register_all_member_fun( + fun cluster_mgr_all_member_fun/1), + %% cluster manager leader will follow repl leader riak_repl2_leader:register_notify_fun( fun riak_core_cluster_mgr:set_leader/2), From 570420d9d05fe7e559cd47d8eb4747d78149b474 Mon Sep 17 00:00:00 2001 From: Kresten Krab Thorup Date: Tue, 7 Oct 2014 15:25:33 +0200 Subject: [PATCH 28/47] Add rtsource_conn:address, and :reconnect The first tells the caller the address currently connected to. The second tells the rtsource_conn to try (if possible) to switch to an alternative connection. --- src/riak_repl2_rt.erl | 6 +++-- src/riak_repl2_rtsource_conn.erl | 41 +++++++++++++++++++++++++++++++- 2 files changed, 44 insertions(+), 3 deletions(-) diff --git a/src/riak_repl2_rt.erl b/src/riak_repl2_rt.erl index 49bf504f..7624ced4 100644 --- a/src/riak_repl2_rt.erl +++ b/src/riak_repl2_rt.erl @@ -124,8 +124,10 @@ ensure_rt(WantEnabled0, WantStarted0) -> end. register_remote_locator() -> - Locator = fun(Name, _Policy) -> - riak_core_cluster_mgr:get_ipaddrs_of_cluster(Name) + Locator = fun (_, {use_only, Addrs}) -> + {ok, Addrs}; + (Name, _Policy) -> + riak_core_cluster_mgr:get_ipaddrs_of_cluster(Name) end, ok = riak_core_connection_mgr:register_locator(rt_repl, Locator). diff --git a/src/riak_repl2_rtsource_conn.erl b/src/riak_repl2_rtsource_conn.erl index a6dddb63..41cb8638 100644 --- a/src/riak_repl2_rtsource_conn.erl +++ b/src/riak_repl2_rtsource_conn.erl @@ -44,6 +44,7 @@ -export([start_link/1, stop/1, status/1, status/2, + address/1, maybe_reconnect/2, legacy_status/1, legacy_status/2]). %% connection manager callbacks @@ -81,6 +82,12 @@ start_link(Remote) -> stop(Pid) -> gen_server:call(Pid, stop, ?LONG_TIMEOUT). +address(Pid) -> + gen_server:call(Pid, address, ?LONG_TIMEOUT). + +maybe_reconnect(Pid, BetterAddrs) -> + gen_server:cast(Pid, {maybe_reconnect, BetterAddrs}). + status(Pid) -> status(Pid, infinity). @@ -143,6 +150,8 @@ init([Remote]) -> handle_call(stop, _From, State) -> {stop, normal, ok, State}; +handle_call(address, _From, State = #state{ address=A }) -> + {reply, {ok, A}, State}; handle_call(status, _From, State = #state{remote = R, address = _A, transport = T, socket = S, helper_pid = H, @@ -246,7 +255,37 @@ handle_cast({connect_failed, _HelperPid, Reason}, State = #state{remote = Remote}) -> lager:warning("Realtime replication connection to site ~p failed - ~p\n", [Remote, Reason]), - {stop, normal, State}. + {stop, normal, State}; +handle_cast({maybe_reconnect, BetterAddrs}, State=#state{ remote=Remote }) -> + + lager:info("trying reconnect to one of: ~p", [BetterAddrs]), + + %% if we have a pending connection attempt - drop that + riak_core_connection_mgr:disconnect({rt_repl, Remote}), + + TcpOptions = [{keepalive, true}, + {nodelay, true}, + {packet, 0}, + {active, false}], + % nodes running 1.3.1 have a bug in the service_mgr module. + % this bug prevents it from being able to negotiate a version list longer + % than 2. Until we no longer support communicating with that version, + % we need to artifically truncate the version list. + % TODO: expand version list or remove comment when we no + % longer support 1.3.1 + % prefered version list: [{2,0}, {1,5}, {1,1}, {1,0}] + ClientSpec = {{realtime,[{3,0}, {2,0}, {1,5}]}, {TcpOptions, ?MODULE, self()}}, + + %% Todo: check for bad remote name + lager:debug("re-connecting to remote ~p", [Remote]), + case riak_core_connection_mgr:connect({rt_repl, Remote}, ClientSpec, {use_only, BetterAddrs}) of + {ok, Ref} -> + lager:debug("connecting ref ~p", [Ref]), + {noreply, State#state{ connection_ref = Ref}}; + {error, Reason}-> + lager:warning("Error connecting to remote ~p (ignoring as we're reconnecting)", [Reason]), + {noreply, State} + end. handle_info({Proto, _S, TcpBin}, State= #state{cont = Cont}) when Proto == tcp; Proto == ssl -> From 3efd0d7821ba237f34c0a9b22d80cd1ca16f8fdf Mon Sep 17 00:00:00 2001 From: Kresten Krab Thorup Date: Tue, 7 Oct 2014 15:26:56 +0200 Subject: [PATCH 29/47] Expose filter_blacklisted_ipaddrs function --- src/riak_core_connection_mgr.erl | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/riak_core_connection_mgr.erl b/src/riak_core_connection_mgr.erl index 7abc1a41..b46b059b 100644 --- a/src/riak_core_connection_mgr.erl +++ b/src/riak_core_connection_mgr.erl @@ -113,6 +113,7 @@ reset_backoff/0, get_request_states/0, get_connection_errors/1, + filter_blacklisted_ipaddrs/1, stop/0 ]). @@ -204,6 +205,10 @@ get_request_states() -> get_connection_errors(Addr) -> gen_server:call(?SERVER, {get_connection_errors, Addr}). +%% @doc Remove the blacklisted addresses from given list +filter_blacklisted_ipaddrs(Addrs) -> + gen_server:call(?SERVER, {filter_blacklisted_ipaddrs, Addrs}). + %% doc Stop the server and sever all connections. stop() -> gen_server:call(?SERVER, stop). @@ -287,6 +292,10 @@ handle_call({get_connection_errors, Addr}, _From, State = #state{endpoints=Endpo {reply, [], State} end; +handle_call({filter_blacklisted_ipaddrs, Addrs}, _From, State=#state{ endpoints=Eps }) -> + Answer = filter_blacklisted_endpoints(Addrs, Eps), + {reply, Answer, State}; + handle_call(_Unhandled, _From, State) -> lager:debug("Unhandled gen_server call: ~p", [_Unhandled]), {reply, {error, unhandled}, State}. From 06ebbe576f63cff95599ec11de2b6c66c9a2a5b4 Mon Sep 17 00:00:00 2001 From: Kresten Krab Thorup Date: Tue, 7 Oct 2014 15:27:34 +0200 Subject: [PATCH 30/47] Extra code in ring change hook to maybe reconnect --- src/riak_repl2_rt.erl | 50 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/src/riak_repl2_rt.erl b/src/riak_repl2_rt.erl index 7624ced4..dfe7c292 100644 --- a/src/riak_repl2_rt.erl +++ b/src/riak_repl2_rt.erl @@ -79,7 +79,8 @@ ensure_rt(WantEnabled0, WantStarted0) -> Status = riak_repl2_rtq:status(), CStatus = proplists:get_value(consumers, Status, []), Enabled = lists:sort([Remote || {Remote, _Stats} <- CStatus]), - Started = lists:sort([Remote || {Remote, _Pid} <- riak_repl2_rtsource_conn_sup:enabled()]), + Connections = riak_repl2_rtsource_conn_sup:enabled(), + Started = lists:sort([Remote || {Remote, _Pid} <- Connections]), ToEnable = WantEnabled -- Enabled, ToDisable = Enabled -- WantEnabled, @@ -96,6 +97,40 @@ ensure_rt(WantEnabled0, WantStarted0) -> application:set_env(riak_repl, rtenabled, true) end, + %% Check if we're connected to our preferred peer(s). If not, initate reconnect + {ok, Ring} = riak_core_ring_manager:get_my_ring(), + ToValidate = Started -- ToStop, + _ = [case lists:keyfind(Remote, 1, Connections) of + {_, PID} -> + {ok, ConnectedAddr} = riak_repl2_rtsource_conn:address(PID), + Addrs = riak_repl_ring:get_clusterIpAddrs(Ring, Remote), + {ok, ShuffledAddrs} = riak_core_cluster_mgr:shuffle_remote_ipaddrs(Addrs), + case ShuffledAddrs of + [ConnectedAddr|_] -> + ok; % we're already connected to the ideal buddy + + _ -> + %% compute the addrs that are "better" than the currently connected addr + BetterAddrs = keepwhile(fun(A) -> A /= ConnectedAddr end, + ShuffledAddrs), + + %% remove those that are blacklisted anyway + case riak_core_connection_mgr:filter_blacklisted_ipaddrs(BetterAddrs) of + [ConnectedAddr|_] -> + ok; % this is as good as it gets ... + + [] -> + ok; % hmm? Good thing we're already connected to someone! + + BetterRemoteAddrs-> + %% should try to reconnect to one of those... + riak_repl2_rtsource_conn:maybe_reconnect(PID, BetterRemoteAddrs) + end + end; + false -> + ok + end || Remote <- ToValidate ], + case ToEnable ++ ToDisable ++ ToStart ++ ToStop of [] -> []; @@ -240,3 +275,16 @@ set_bucket_meta(Obj) -> _B -> orddict:store(?BT_META_TYPED_BUCKET, false, M) end. + +keepwhile(Pred, List) -> + keepwhile(Pred, List, []). + +keepwhile(_, [], Acc) -> + lists:reverse(Acc); +keepwhile(Pred, [E|Rest], Acc) -> + case Pred(E) of + true -> + keepwhile(Pred, Rest, [E|Acc]); + false -> + lists:reverse(Acc) + end. From eb0bd2cbedd624e0fdcd517e8a43bedd4be97e0c Mon Sep 17 00:00:00 2001 From: Kresten Krab Thorup Date: Tue, 7 Oct 2014 15:37:35 +0200 Subject: [PATCH 31/47] improve ip address comparison --- src/riak_repl2_rt.erl | 32 ++++++++++++++++++++------------ src/riak_repl2_rtsource_conn.erl | 3 ++- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/src/riak_repl2_rt.erl b/src/riak_repl2_rt.erl index dfe7c292..6bb8e1fc 100644 --- a/src/riak_repl2_rt.erl +++ b/src/riak_repl2_rt.erl @@ -105,26 +105,27 @@ ensure_rt(WantEnabled0, WantStarted0) -> {ok, ConnectedAddr} = riak_repl2_rtsource_conn:address(PID), Addrs = riak_repl_ring:get_clusterIpAddrs(Ring, Remote), {ok, ShuffledAddrs} = riak_core_cluster_mgr:shuffle_remote_ipaddrs(Addrs), - case ShuffledAddrs of - [ConnectedAddr|_] -> + case (ShuffledAddrs /= []) andalso same_ipaddr(ConnectedAddr, hd(ShuffledAddrs)) of + true -> ok; % we're already connected to the ideal buddy - _ -> + false -> %% compute the addrs that are "better" than the currently connected addr - BetterAddrs = keepwhile(fun(A) -> A /= ConnectedAddr end, + BetterAddrs = keepwhile(fun(A) -> not same_ipaddr(A, ConnectedAddr) end, ShuffledAddrs), %% remove those that are blacklisted anyway case riak_core_connection_mgr:filter_blacklisted_ipaddrs(BetterAddrs) of - [ConnectedAddr|_] -> - ok; % this is as good as it gets ... - [] -> - ok; % hmm? Good thing we're already connected to someone! - - BetterRemoteAddrs-> - %% should try to reconnect to one of those... - riak_repl2_rtsource_conn:maybe_reconnect(PID, BetterRemoteAddrs) + ok; + UsefulAddrs -> + case same_ipaddr(ConnectedAddr, hd(UsefulAddrs)) of + true -> + ok; % this is as good as it gets ... + false -> + %% should try to reconnect to one of those... + riak_repl2_rtsource_conn:maybe_reconnect(PID, UsefulAddrs) + end end end; false -> @@ -288,3 +289,10 @@ keepwhile(Pred, [E|Rest], Acc) -> false -> lists:reverse(Acc) end. + + +same_ipaddr({IP,Port}, {IP,Port}) -> + true; +same_ipaddr(_,_) -> + %% TODO: make sure we support string format + false. diff --git a/src/riak_repl2_rtsource_conn.erl b/src/riak_repl2_rtsource_conn.erl index 41cb8638..47053774 100644 --- a/src/riak_repl2_rtsource_conn.erl +++ b/src/riak_repl2_rtsource_conn.erl @@ -276,7 +276,8 @@ handle_cast({maybe_reconnect, BetterAddrs}, State=#state{ remote=Remote }) -> % prefered version list: [{2,0}, {1,5}, {1,1}, {1,0}] ClientSpec = {{realtime,[{3,0}, {2,0}, {1,5}]}, {TcpOptions, ?MODULE, self()}}, - %% Todo: check for bad remote name + %% TODO: SCHEDULE THIS RECONNECT SOME RANDOM TIME IN THE FUTURE??? + lager:debug("re-connecting to remote ~p", [Remote]), case riak_core_connection_mgr:connect({rt_repl, Remote}, ClientSpec, {use_only, BetterAddrs}) of {ok, Ref} -> From 651ee80fc0e20691e4f399068f3b24acedecac30 Mon Sep 17 00:00:00 2001 From: rsltrifork Date: Tue, 7 Oct 2014 16:27:09 +0200 Subject: [PATCH 32/47] fixes to make riak start --- src/riak_core_cluster_mgr.erl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/riak_core_cluster_mgr.erl b/src/riak_core_cluster_mgr.erl index 0da54ded..dfe17c77 100644 --- a/src/riak_core_cluster_mgr.erl +++ b/src/riak_core_cluster_mgr.erl @@ -92,7 +92,8 @@ get_ipaddrs_of_cluster/1, set_gc_interval/1, stop/0, - connect_to_clusters/0 + connect_to_clusters/0, + shuffle_remote_ipaddrs/1 ]). %% gen_server callbacks @@ -764,6 +765,8 @@ shuffle_with_seed(List, Seed) -> shuffle_with_seed(List, {S1,S2,S3}). +shuffle_remote_ipaddrs([]) -> + {ok, []}; shuffle_remote_ipaddrs(RemoteUnsorted) -> {ok, MyRing} = riak_core_ring_manager:get_my_ring(), SortedNodes = lists:sort(riak_core_ring:all_members(MyRing)), From 981ec6abab656cd0da96ea4a18e4bff6ad5d6c4b Mon Sep 17 00:00:00 2001 From: rsltrifork Date: Thu, 9 Oct 2014 12:43:49 +0200 Subject: [PATCH 33/47] improved logging --- src/riak_core_cluster_mgr.erl | 4 +-- src/riak_core_service_conn.erl | 2 +- src/riak_repl2_rt.erl | 57 +++++++++++++--------------------- 3 files changed, 24 insertions(+), 39 deletions(-) diff --git a/src/riak_core_cluster_mgr.erl b/src/riak_core_cluster_mgr.erl index dfe17c77..872ce28c 100644 --- a/src/riak_core_cluster_mgr.erl +++ b/src/riak_core_cluster_mgr.erl @@ -550,8 +550,8 @@ save_cluster(NewName, OldMembers, ReturnedMembers, State) -> [NewName, Members]); _ -> persist_members_to_ring(State, NewName, Members), - lager:info("Cluster Manager: updated ~p with members: ~p", - [NewName, Members]) + lager:info("Cluster Manager: updated ~p with members: ~p OldMembers ~p", + [NewName, Members, OldMembers]) end end, %% clear out these IPs from other clusters diff --git a/src/riak_core_service_conn.erl b/src/riak_core_service_conn.erl index 838a1c3b..be3caafe 100644 --- a/src/riak_core_service_conn.erl +++ b/src/riak_core_service_conn.erl @@ -217,7 +217,7 @@ choose_version({ClientProto,ClientVersions}=_CProtocol, HostProtocols) -> case [H || {{HostProto,_Versions},_Rest}=H <- HostProtocols, ClientProto == HostProto] of [] -> %% oops! The host does not support this sub protocol type - lager:error("Failed to find host support for protocol: ~p", [ClientProto]), + lager:error("Failed to find host support for protocol: ~p, HostProtocols = ~p", [ClientProto, HostProtocols]), lager:debug("choose_version: no common protocols"), {error,protocol_not_supported}; [{{_HostProto,HostVersions},Rest}=_Matched | _DuplicatesIgnored] -> diff --git a/src/riak_repl2_rt.erl b/src/riak_repl2_rt.erl index 6bb8e1fc..dd603cef 100644 --- a/src/riak_repl2_rt.erl +++ b/src/riak_repl2_rt.erl @@ -105,27 +105,24 @@ ensure_rt(WantEnabled0, WantStarted0) -> {ok, ConnectedAddr} = riak_repl2_rtsource_conn:address(PID), Addrs = riak_repl_ring:get_clusterIpAddrs(Ring, Remote), {ok, ShuffledAddrs} = riak_core_cluster_mgr:shuffle_remote_ipaddrs(Addrs), + lager:debug("ShuffledAddrs: ~p, ConnectedAddr: ~p", [ShuffledAddrs, ConnectedAddr]), case (ShuffledAddrs /= []) andalso same_ipaddr(ConnectedAddr, hd(ShuffledAddrs)) of true -> ok; % we're already connected to the ideal buddy false -> %% compute the addrs that are "better" than the currently connected addr - BetterAddrs = keepwhile(fun(A) -> not same_ipaddr(A, ConnectedAddr) end, - ShuffledAddrs), + BetterAddrs = lists:takewhile(fun(A) -> not same_ipaddr(ConnectedAddr, A) end, + ShuffledAddrs), %% remove those that are blacklisted anyway - case riak_core_connection_mgr:filter_blacklisted_ipaddrs(BetterAddrs) of + UsefulAddrs = riak_core_connection_mgr:filter_blacklisted_ipaddrs(BetterAddrs), + lager:debug("BetterAddrs: ~p, UsefulAddrs ~p", [BetterAddrs, UsefulAddrs]), + case UsefulAddrs of [] -> ok; UsefulAddrs -> - case same_ipaddr(ConnectedAddr, hd(UsefulAddrs)) of - true -> - ok; % this is as good as it gets ... - false -> - %% should try to reconnect to one of those... - riak_repl2_rtsource_conn:maybe_reconnect(PID, UsefulAddrs) - end + riak_repl2_rtsource_conn:maybe_reconnect(PID, UsefulAddrs) end end; false -> @@ -146,9 +143,9 @@ ensure_rt(WantEnabled0, WantStarted0) -> %% Stop running sources, re-register to get rid of pending %% deliver functions _ = [begin - _ = riak_repl2_rtsource_conn_sup:disable(Remote), - riak_repl2_rtq:register(Remote) - end || Remote <- ToStop], + _ = riak_repl2_rtsource_conn_sup:disable(Remote), + riak_repl2_rtq:register(Remote) + end || Remote <- ToStop], %% Unregister disabled sources, freeing up the queue _ = [riak_repl2_rtq:unregister(Remote) || Remote <- ToDisable], @@ -164,7 +161,7 @@ register_remote_locator() -> {ok, Addrs}; (Name, _Policy) -> riak_core_cluster_mgr:get_ipaddrs_of_cluster(Name) - end, + end, ok = riak_core_connection_mgr:register_locator(rt_repl, Locator). %% Register an active realtime sink (supervised under ranch) @@ -188,11 +185,11 @@ postcommit(RObj) -> Meta = set_bucket_meta(RObj), BinObjs = case orddict:fetch(?BT_META_TYPED_BUCKET, Meta) of - false -> - riak_repl_util:to_wire(w1, Objects); - true -> - riak_repl_util:to_wire(w2, Objects) - end, + false -> + riak_repl_util:to_wire(w1, Objects); + true -> + riak_repl_util:to_wire(w2, Objects) + end, %% try the proxy first, avoids race conditions with unregister() %% during shutdown case whereis(riak_repl2_rtq_proxy) of @@ -208,7 +205,7 @@ postcommit(RObj) -> %% gen_server callbacks init([]) -> - {ok, #state{}}. + {ok, #state{}}. handle_call(status, _From, State = #state{sinks = SinkPids}) -> Timeout = app_helper:get_env(riak_repl, status_timeout, 5000), @@ -277,22 +274,10 @@ set_bucket_meta(Obj) -> orddict:store(?BT_META_TYPED_BUCKET, false, M) end. -keepwhile(Pred, List) -> - keepwhile(Pred, List, []). - -keepwhile(_, [], Acc) -> - lists:reverse(Acc); -keepwhile(Pred, [E|Rest], Acc) -> - case Pred(E) of - true -> - keepwhile(Pred, Rest, [E|Acc]); - false -> - lists:reverse(Acc) - end. - - same_ipaddr({IP,Port}, {IP,Port}) -> true; -same_ipaddr(_,_) -> - %% TODO: make sure we support string format +same_ipaddr({_IP1,_Port1}, {_IP2,_Port2}) -> + false; +same_ipaddr(X,Y) -> + lager:warning("ipaddrs have unexpected format! ~p, ~p", [X,Y]), false. From c1b18ab0f3217db39a7442021496ebf519bec384 Mon Sep 17 00:00:00 2001 From: rsltrifork Date: Thu, 9 Oct 2014 13:25:46 +0200 Subject: [PATCH 34/47] change blacklist behaviour to not blacklist unkown endpoints --- src/riak_core_connection_mgr.erl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/riak_core_connection_mgr.erl b/src/riak_core_connection_mgr.erl index b46b059b..e133e80f 100644 --- a/src/riak_core_connection_mgr.erl +++ b/src/riak_core_connection_mgr.erl @@ -335,7 +335,7 @@ handle_info({backoff_timer, Addr}, State = #state{endpoints = EPs}) -> {noreply, State#state{endpoints = orddict:store(Addr,EP2,EPs)}}; error -> %% TODO: Should never happen because the Addr came from the EP list. - {norepy, State} + {noreply, State} end; handle_info({retry_req, Ref}, State = #state{pending = Pending}) -> case lists:keyfind(Ref, #req.ref, Pending) of @@ -653,12 +653,14 @@ update_endpoints(Addrs, Endpoints) -> %% Return the addresses of non-blacklisted endpoints that are also %% members of the list EpAddrs. filter_blacklisted_endpoints(EpAddrs, AllEps) -> + lager:info("filter_blacklisted_endpoints(EpAddrs ~p, AllEps ~p) ->", [EpAddrs, AllEps]), PredicateFun = (fun(Addr) -> case orddict:find(Addr, AllEps) of {ok, EP} -> EP#ep.is_black_listed == false; error -> - false + %% If we don't know this endpoint, it is not blacklisted. + true end end), lists:filter(PredicateFun, EpAddrs). From b4df4c91689ee9ae39c23fb31335eb3de28550a5 Mon Sep 17 00:00:00 2001 From: rsltrifork Date: Thu, 9 Oct 2014 14:30:35 +0200 Subject: [PATCH 35/47] delayed rebalance of rt connections --- src/riak_repl2_rt.erl | 35 +--------- src/riak_repl2_rtsource_conn.erl | 108 ++++++++++++++++++++++--------- 2 files changed, 78 insertions(+), 65 deletions(-) diff --git a/src/riak_repl2_rt.erl b/src/riak_repl2_rt.erl index dd603cef..09365423 100644 --- a/src/riak_repl2_rt.erl +++ b/src/riak_repl2_rt.erl @@ -97,34 +97,11 @@ ensure_rt(WantEnabled0, WantStarted0) -> application:set_env(riak_repl, rtenabled, true) end, - %% Check if we're connected to our preferred peer(s). If not, initate reconnect - {ok, Ring} = riak_core_ring_manager:get_my_ring(), + %% For each connection to validate, call maybe_rebalance_delayed to handle the potential need to rebalance connections. ToValidate = Started -- ToStop, _ = [case lists:keyfind(Remote, 1, Connections) of {_, PID} -> - {ok, ConnectedAddr} = riak_repl2_rtsource_conn:address(PID), - Addrs = riak_repl_ring:get_clusterIpAddrs(Ring, Remote), - {ok, ShuffledAddrs} = riak_core_cluster_mgr:shuffle_remote_ipaddrs(Addrs), - lager:debug("ShuffledAddrs: ~p, ConnectedAddr: ~p", [ShuffledAddrs, ConnectedAddr]), - case (ShuffledAddrs /= []) andalso same_ipaddr(ConnectedAddr, hd(ShuffledAddrs)) of - true -> - ok; % we're already connected to the ideal buddy - - false -> - %% compute the addrs that are "better" than the currently connected addr - BetterAddrs = lists:takewhile(fun(A) -> not same_ipaddr(ConnectedAddr, A) end, - ShuffledAddrs), - - %% remove those that are blacklisted anyway - UsefulAddrs = riak_core_connection_mgr:filter_blacklisted_ipaddrs(BetterAddrs), - lager:debug("BetterAddrs: ~p, UsefulAddrs ~p", [BetterAddrs, UsefulAddrs]), - case UsefulAddrs of - [] -> - ok; - UsefulAddrs -> - riak_repl2_rtsource_conn:maybe_reconnect(PID, UsefulAddrs) - end - end; + riak_repl2_rtsource_conn:maybe_rebalance_delayed(PID); false -> ok end || Remote <- ToValidate ], @@ -273,11 +250,3 @@ set_bucket_meta(Obj) -> _B -> orddict:store(?BT_META_TYPED_BUCKET, false, M) end. - -same_ipaddr({IP,Port}, {IP,Port}) -> - true; -same_ipaddr({_IP1,_Port1}, {_IP2,_Port2}) -> - false; -same_ipaddr(X,Y) -> - lager:warning("ipaddrs have unexpected format! ~p, ~p", [X,Y]), - false. diff --git a/src/riak_repl2_rtsource_conn.erl b/src/riak_repl2_rtsource_conn.erl index 47053774..6e76bbb3 100644 --- a/src/riak_repl2_rtsource_conn.erl +++ b/src/riak_repl2_rtsource_conn.erl @@ -44,7 +44,7 @@ -export([start_link/1, stop/1, status/1, status/2, - address/1, maybe_reconnect/2, + address/1, maybe_rebalance_delayed/1, legacy_status/1, legacy_status/2]). %% connection manager callbacks @@ -85,8 +85,12 @@ stop(Pid) -> address(Pid) -> gen_server:call(Pid, address, ?LONG_TIMEOUT). -maybe_reconnect(Pid, BetterAddrs) -> - gen_server:cast(Pid, {maybe_reconnect, BetterAddrs}). +%% @doc Check if we need to rebalance. +%% If we do, delay some time, recheck that we still +%% need to rebalance, and if we still do, then execute +%% reconnection to the better sink node. +maybe_rebalance_delayed(Pid) -> + gen_server:cast(Pid, rebalance_delayed). status(Pid) -> status(Pid, infinity). @@ -256,37 +260,10 @@ handle_cast({connect_failed, _HelperPid, Reason}, lager:warning("Realtime replication connection to site ~p failed - ~p\n", [Remote, Reason]), {stop, normal, State}; -handle_cast({maybe_reconnect, BetterAddrs}, State=#state{ remote=Remote }) -> - lager:info("trying reconnect to one of: ~p", [BetterAddrs]), +handle_cast(rebalance_delayed, State) -> + {noreply, rebalance(State, delayed)}. - %% if we have a pending connection attempt - drop that - riak_core_connection_mgr:disconnect({rt_repl, Remote}), - - TcpOptions = [{keepalive, true}, - {nodelay, true}, - {packet, 0}, - {active, false}], - % nodes running 1.3.1 have a bug in the service_mgr module. - % this bug prevents it from being able to negotiate a version list longer - % than 2. Until we no longer support communicating with that version, - % we need to artifically truncate the version list. - % TODO: expand version list or remove comment when we no - % longer support 1.3.1 - % prefered version list: [{2,0}, {1,5}, {1,1}, {1,0}] - ClientSpec = {{realtime,[{3,0}, {2,0}, {1,5}]}, {TcpOptions, ?MODULE, self()}}, - - %% TODO: SCHEDULE THIS RECONNECT SOME RANDOM TIME IN THE FUTURE??? - - lager:debug("re-connecting to remote ~p", [Remote]), - case riak_core_connection_mgr:connect({rt_repl, Remote}, ClientSpec, {use_only, BetterAddrs}) of - {ok, Ref} -> - lager:debug("connecting ref ~p", [Ref]), - {noreply, State#state{ connection_ref = Ref}}; - {error, Reason}-> - lager:warning("Error connecting to remote ~p (ignoring as we're reconnecting)", [Reason]), - {noreply, State} - end. handle_info({Proto, _S, TcpBin}, State= #state{cont = Cont}) when Proto == tcp; Proto == ssl -> @@ -337,6 +314,8 @@ handle_info({heartbeat_timeout, HBSent}, State = #state{hb_sent_q = HBSentQ, lager:debug("hb_sent_q_len after heartbeat_timeout: ~p", [queue:len(HBSentQ)]), {stop, normal, State} end; +handle_info(rebalance_now, State) -> + {noreply, rebalance(State, now)}; handle_info(Msg, State) -> lager:warning("Unhandled info: ~p", [Msg]), {noreply, State}. @@ -348,6 +327,63 @@ terminate(_Reason, #state{helper_pid=_HelperPid, remote=Remote}) -> code_change(_OldVsn, State, _Extra) -> {ok, State}. +rebalance(State=#state{address=ConnectedAddr, remote=Remote}, NowOrDelayed) -> + {ok, Ring} = riak_core_ring_manager:get_my_ring(), + Addrs = riak_repl_ring:get_clusterIpAddrs(Ring, Remote), + {ok, ShuffledAddrs} = riak_core_cluster_mgr:shuffle_remote_ipaddrs(Addrs), + lager:debug("ShuffledAddrs: ~p, ConnectedAddr: ~p", [ShuffledAddrs, ConnectedAddr]), + case (ShuffledAddrs /= []) andalso same_ipaddr(ConnectedAddr, hd(ShuffledAddrs)) of + true -> + State; % we're already connected to the ideal buddy + + false -> + %% compute the addrs that are "better" than the currently connected addr + BetterAddrs = lists:takewhile(fun(A) -> not same_ipaddr(ConnectedAddr, A) end, + ShuffledAddrs), + + %% remove those that are blacklisted anyway + UsefulAddrs = riak_core_connection_mgr:filter_blacklisted_ipaddrs(BetterAddrs), + lager:debug("BetterAddrs: ~p, UsefulAddrs ~p", [BetterAddrs, UsefulAddrs]), + case UsefulAddrs of + [] -> + State; + UsefulAddrs -> + case NowOrDelayed of + now -> + do_maybe_reconnect(State, UsefulAddrs); + delayed -> + MaxDelaySecs = application:get_env(riak_repl, realtime_connection_rebalance_max_delay_secs, 5*60), + DelayMillis = round(MaxDelaySecs * 1000 * crypto:rand_uniform(0, 1000)), + erlang:send_after(DelayMillis, self(), rebalance_now), + State + end + end + end. + + +do_maybe_reconnect(State=#state{remote=Remote}, BetterAddrs) -> + lager:info("trying reconnect to one of: ~p", [BetterAddrs]), + + %% if we have a pending connection attempt - drop that + riak_core_connection_mgr:disconnect({rt_repl, Remote}), + + TcpOptions = [{keepalive, true}, + {nodelay, true}, + {packet, 0}, + {active, false}], + ClientSpec = {{realtime,[{3,0}, {2,0}, {1,5}]}, {TcpOptions, ?MODULE, self()}}, + + lager:debug("re-connecting to remote ~p", [Remote]), + case riak_core_connection_mgr:connect({rt_repl, Remote}, ClientSpec, {use_only, BetterAddrs}) of + {ok, Ref} -> + lager:debug("connecting ref ~p", [Ref]), + {noreply, State#state{ connection_ref = Ref}}; + {error, Reason}-> + lager:warning("Error connecting to remote ~p (ignoring as we're reconnecting)", [Reason]), + {noreply, State} + end. + + cancel_timer(undefined) -> ok; cancel_timer(TRef) -> _ = erlang:cancel_timer(TRef). @@ -434,6 +470,14 @@ schedule_heartbeat(State) -> lager:warning("Heartbeat is misconfigured and is not a valid integer."), State. +same_ipaddr({IP,Port}, {IP,Port}) -> + true; +same_ipaddr({_IP1,_Port1}, {_IP2,_Port2}) -> + false; +same_ipaddr(X,Y) -> + lager:warning("ipaddrs have unexpected format! ~p, ~p", [X,Y]), + false. + %% =================================================================== %% EUnit tests %% =================================================================== From ef29773a1655e17b64a2e35ea696d0d3156cf9f9 Mon Sep 17 00:00:00 2001 From: rsltrifork Date: Thu, 9 Oct 2014 14:41:23 +0200 Subject: [PATCH 36/47] refac --- src/riak_repl2_rtsource_conn.erl | 39 ++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/src/riak_repl2_rtsource_conn.erl b/src/riak_repl2_rtsource_conn.erl index 6e76bbb3..2a265470 100644 --- a/src/riak_repl2_rtsource_conn.erl +++ b/src/riak_repl2_rtsource_conn.erl @@ -262,7 +262,7 @@ handle_cast({connect_failed, _HelperPid, Reason}, {stop, normal, State}; handle_cast(rebalance_delayed, State) -> - {noreply, rebalance(State, delayed)}. + {noreply, maybe_rebalance(State, delayed)}. handle_info({Proto, _S, TcpBin}, State= #state{cont = Cont}) @@ -315,7 +315,7 @@ handle_info({heartbeat_timeout, HBSent}, State = #state{hb_sent_q = HBSentQ, {stop, normal, State} end; handle_info(rebalance_now, State) -> - {noreply, rebalance(State, now)}; + {noreply, maybe_rebalance(State, now)}; handle_info(Msg, State) -> lager:warning("Unhandled info: ~p", [Msg]), {noreply, State}. @@ -327,14 +327,28 @@ terminate(_Reason, #state{helper_pid=_HelperPid, remote=Remote}) -> code_change(_OldVsn, State, _Extra) -> {ok, State}. -rebalance(State=#state{address=ConnectedAddr, remote=Remote}, NowOrDelayed) -> +maybe_rebalance(State, NowOrDelayed) -> + case should_rebalance(State) of + no -> + State; + {yes, UsefulAddrs} -> + case NowOrDelayed of + now -> + reconnect(State, UsefulAddrs); + delayed -> + erlang:send_after(rebalance_delay_millis(), self(), rebalance_now), + State + end + end. + +should_rebalance(#state{address=ConnectedAddr, remote=Remote}) -> {ok, Ring} = riak_core_ring_manager:get_my_ring(), Addrs = riak_repl_ring:get_clusterIpAddrs(Ring, Remote), {ok, ShuffledAddrs} = riak_core_cluster_mgr:shuffle_remote_ipaddrs(Addrs), lager:debug("ShuffledAddrs: ~p, ConnectedAddr: ~p", [ShuffledAddrs, ConnectedAddr]), case (ShuffledAddrs /= []) andalso same_ipaddr(ConnectedAddr, hd(ShuffledAddrs)) of true -> - State; % we're already connected to the ideal buddy + no; % we're already connected to the ideal buddy false -> %% compute the addrs that are "better" than the currently connected addr @@ -346,22 +360,17 @@ rebalance(State=#state{address=ConnectedAddr, remote=Remote}, NowOrDelayed) -> lager:debug("BetterAddrs: ~p, UsefulAddrs ~p", [BetterAddrs, UsefulAddrs]), case UsefulAddrs of [] -> - State; + no; UsefulAddrs -> - case NowOrDelayed of - now -> - do_maybe_reconnect(State, UsefulAddrs); - delayed -> - MaxDelaySecs = application:get_env(riak_repl, realtime_connection_rebalance_max_delay_secs, 5*60), - DelayMillis = round(MaxDelaySecs * 1000 * crypto:rand_uniform(0, 1000)), - erlang:send_after(DelayMillis, self(), rebalance_now), - State - end + {yes, UsefulAddrs} end end. +rebalance_delay_millis() -> + MaxDelaySecs = application:get_env(riak_repl, realtime_connection_rebalance_max_delay_secs, 5*60), + round(MaxDelaySecs * 1000 * crypto:rand_uniform(0, 1000)). -do_maybe_reconnect(State=#state{remote=Remote}, BetterAddrs) -> +reconnect(State=#state{remote=Remote}, BetterAddrs) -> lager:info("trying reconnect to one of: ~p", [BetterAddrs]), %% if we have a pending connection attempt - drop that From 3d33b54947a1b4bf9c484eea17584c55d016fa7b Mon Sep 17 00:00:00 2001 From: rsltrifork Date: Thu, 9 Oct 2014 14:41:37 +0200 Subject: [PATCH 37/47] indent --- src/riak_repl2_rtsource_conn.erl | 180 +++++++++++++++---------------- 1 file changed, 90 insertions(+), 90 deletions(-) diff --git a/src/riak_repl2_rtsource_conn.erl b/src/riak_repl2_rtsource_conn.erl index 2a265470..36a5388b 100644 --- a/src/riak_repl2_rtsource_conn.erl +++ b/src/riak_repl2_rtsource_conn.erl @@ -90,7 +90,7 @@ address(Pid) -> %% need to rebalance, and if we still do, then execute %% reconnection to the better sink node. maybe_rebalance_delayed(Pid) -> - gen_server:cast(Pid, rebalance_delayed). + gen_server:cast(Pid, rebalance_delayed). status(Pid) -> status(Pid, infinity). @@ -132,13 +132,13 @@ init([Remote]) -> {nodelay, true}, {packet, 0}, {active, false}], - % nodes running 1.3.1 have a bug in the service_mgr module. - % this bug prevents it from being able to negotiate a version list longer - % than 2. Until we no longer support communicating with that version, - % we need to artifically truncate the version list. - % TODO: expand version list or remove comment when we no - % longer support 1.3.1 - % prefered version list: [{2,0}, {1,5}, {1,1}, {1,0}] + % nodes running 1.3.1 have a bug in the service_mgr module. + % this bug prevents it from being able to negotiate a version list longer + % than 2. Until we no longer support communicating with that version, + % we need to artifically truncate the version list. + % TODO: expand version list or remove comment when we no + % longer support 1.3.1 + % prefered version list: [{2,0}, {1,5}, {1,1}, {1,0}] ClientSpec = {{realtime,[{3,0}, {2,0}, {1,5}]}, {TcpOptions, ?MODULE, self()}}, %% Todo: check for bad remote name @@ -223,7 +223,7 @@ handle_call({connected, Socket, Transport, EndPoint, Proto}, _From, SocketTag = riak_repl_util:generate_socket_tag("rt_source", Transport, Socket), lager:debug("Keeping stats for " ++ SocketTag), riak_core_tcp_mon:monitor(Socket, {?TCP_MON_RT_APP, source, - SocketTag}, Transport), + SocketTag}, Transport), State2 = State#state{transport = Transport, socket = Socket, address = EndPoint, @@ -232,7 +232,7 @@ handle_call({connected, Socket, Transport, EndPoint, Proto}, _From, helper_pid = HelperPid, ver = Ver}, lager:info("Established realtime connection to site ~p address ~s", - [Remote, peername(State2)]), + [Remote, peername(State2)]), case Proto of {realtime, _OurVer, {1, 0}} -> @@ -262,15 +262,15 @@ handle_cast({connect_failed, _HelperPid, Reason}, {stop, normal, State}; handle_cast(rebalance_delayed, State) -> - {noreply, maybe_rebalance(State, delayed)}. + {noreply, maybe_rebalance(State, delayed)}. handle_info({Proto, _S, TcpBin}, State= #state{cont = Cont}) - when Proto == tcp; Proto == ssl -> + when Proto == tcp; Proto == ssl -> recv(<>, State); handle_info({Closed, _S}, State = #state{remote = Remote, cont = Cont}) - when Closed == tcp_closed; Closed == ssl_closed -> + when Closed == tcp_closed; Closed == ssl_closed -> case size(Cont) of 0 -> ok; @@ -285,7 +285,7 @@ handle_info({Closed, _S}, {stop, normal, State}; handle_info({Error, _S, Reason}, State = #state{remote = Remote, cont = Cont}) - when Error == tcp_error; Error == ssl_error -> + when Error == tcp_error; Error == ssl_error -> riak_repl_stats:rt_source_errors(), lager:warning("Realtime connection ~s to ~p network error ~p - ~b bytes pending\n", [peername(State), Remote, Reason, size(Cont)]), @@ -315,7 +315,7 @@ handle_info({heartbeat_timeout, HBSent}, State = #state{hb_sent_q = HBSentQ, {stop, normal, State} end; handle_info(rebalance_now, State) -> - {noreply, maybe_rebalance(State, now)}; + {noreply, maybe_rebalance(State, now)}; handle_info(Msg, State) -> lager:warning("Unhandled info: ~p", [Msg]), {noreply, State}. @@ -328,69 +328,69 @@ code_change(_OldVsn, State, _Extra) -> {ok, State}. maybe_rebalance(State, NowOrDelayed) -> - case should_rebalance(State) of - no -> - State; - {yes, UsefulAddrs} -> - case NowOrDelayed of - now -> - reconnect(State, UsefulAddrs); - delayed -> - erlang:send_after(rebalance_delay_millis(), self(), rebalance_now), - State - end - end. + case should_rebalance(State) of + no -> + State; + {yes, UsefulAddrs} -> + case NowOrDelayed of + now -> + reconnect(State, UsefulAddrs); + delayed -> + erlang:send_after(rebalance_delay_millis(), self(), rebalance_now), + State + end + end. should_rebalance(#state{address=ConnectedAddr, remote=Remote}) -> - {ok, Ring} = riak_core_ring_manager:get_my_ring(), - Addrs = riak_repl_ring:get_clusterIpAddrs(Ring, Remote), - {ok, ShuffledAddrs} = riak_core_cluster_mgr:shuffle_remote_ipaddrs(Addrs), - lager:debug("ShuffledAddrs: ~p, ConnectedAddr: ~p", [ShuffledAddrs, ConnectedAddr]), - case (ShuffledAddrs /= []) andalso same_ipaddr(ConnectedAddr, hd(ShuffledAddrs)) of - true -> - no; % we're already connected to the ideal buddy - - false -> - %% compute the addrs that are "better" than the currently connected addr - BetterAddrs = lists:takewhile(fun(A) -> not same_ipaddr(ConnectedAddr, A) end, - ShuffledAddrs), - - %% remove those that are blacklisted anyway - UsefulAddrs = riak_core_connection_mgr:filter_blacklisted_ipaddrs(BetterAddrs), - lager:debug("BetterAddrs: ~p, UsefulAddrs ~p", [BetterAddrs, UsefulAddrs]), - case UsefulAddrs of - [] -> - no; - UsefulAddrs -> - {yes, UsefulAddrs} - end - end. + {ok, Ring} = riak_core_ring_manager:get_my_ring(), + Addrs = riak_repl_ring:get_clusterIpAddrs(Ring, Remote), + {ok, ShuffledAddrs} = riak_core_cluster_mgr:shuffle_remote_ipaddrs(Addrs), + lager:debug("ShuffledAddrs: ~p, ConnectedAddr: ~p", [ShuffledAddrs, ConnectedAddr]), + case (ShuffledAddrs /= []) andalso same_ipaddr(ConnectedAddr, hd(ShuffledAddrs)) of + true -> + no; % we're already connected to the ideal buddy + + false -> + %% compute the addrs that are "better" than the currently connected addr + BetterAddrs = lists:takewhile(fun(A) -> not same_ipaddr(ConnectedAddr, A) end, + ShuffledAddrs), + + %% remove those that are blacklisted anyway + UsefulAddrs = riak_core_connection_mgr:filter_blacklisted_ipaddrs(BetterAddrs), + lager:debug("BetterAddrs: ~p, UsefulAddrs ~p", [BetterAddrs, UsefulAddrs]), + case UsefulAddrs of + [] -> + no; + UsefulAddrs -> + {yes, UsefulAddrs} + end + end. rebalance_delay_millis() -> MaxDelaySecs = application:get_env(riak_repl, realtime_connection_rebalance_max_delay_secs, 5*60), round(MaxDelaySecs * 1000 * crypto:rand_uniform(0, 1000)). reconnect(State=#state{remote=Remote}, BetterAddrs) -> - lager:info("trying reconnect to one of: ~p", [BetterAddrs]), + lager:info("trying reconnect to one of: ~p", [BetterAddrs]), - %% if we have a pending connection attempt - drop that - riak_core_connection_mgr:disconnect({rt_repl, Remote}), + %% if we have a pending connection attempt - drop that + riak_core_connection_mgr:disconnect({rt_repl, Remote}), - TcpOptions = [{keepalive, true}, - {nodelay, true}, - {packet, 0}, - {active, false}], - ClientSpec = {{realtime,[{3,0}, {2,0}, {1,5}]}, {TcpOptions, ?MODULE, self()}}, + TcpOptions = [{keepalive, true}, + {nodelay, true}, + {packet, 0}, + {active, false}], + ClientSpec = {{realtime,[{3,0}, {2,0}, {1,5}]}, {TcpOptions, ?MODULE, self()}}, - lager:debug("re-connecting to remote ~p", [Remote]), - case riak_core_connection_mgr:connect({rt_repl, Remote}, ClientSpec, {use_only, BetterAddrs}) of - {ok, Ref} -> - lager:debug("connecting ref ~p", [Ref]), - {noreply, State#state{ connection_ref = Ref}}; - {error, Reason}-> - lager:warning("Error connecting to remote ~p (ignoring as we're reconnecting)", [Reason]), - {noreply, State} - end. + lager:debug("re-connecting to remote ~p", [Remote]), + case riak_core_connection_mgr:connect({rt_repl, Remote}, ClientSpec, {use_only, BetterAddrs}) of + {ok, Ref} -> + lager:debug("connecting ref ~p", [Ref]), + {noreply, State#state{ connection_ref = Ref}}; + {error, Reason}-> + lager:warning("Error connecting to remote ~p (ignoring as we're reconnecting)", [Reason]), + {noreply, State} + end. cancel_timer(undefined) -> ok; @@ -458,7 +458,7 @@ send_heartbeat(State = #state{hb_timeout = HBTimeout, hb_sent_q = SentQ, helper_pid = HelperPid}) when is_integer(HBTimeout) -> Now = now(), % using now as need a unique reference for this heartbeat - % to spot late heartbeat timeout messages + % to spot late heartbeat timeout messages riak_repl2_rtsource_helper:send_heartbeat(HelperPid), TRef = erlang:send_after(timer:seconds(HBTimeout), self(), {heartbeat_timeout, Now}), State2 = State#state{hb_interval_tref = undefined, hb_timeout_tref = TRef, @@ -471,7 +471,7 @@ send_heartbeat(State) -> %% Schedule the next heartbeat schedule_heartbeat(State = #state{hb_interval_tref = undefined, - hb_interval = HBInterval}) when is_integer(HBInterval) -> + hb_interval = HBInterval}) when is_integer(HBInterval) -> TRef = erlang:send_after(timer:seconds(HBInterval), self(), send_heartbeat), State#state{hb_interval_tref = TRef}; @@ -480,12 +480,12 @@ schedule_heartbeat(State) -> State. same_ipaddr({IP,Port}, {IP,Port}) -> - true; + true; same_ipaddr({_IP1,_Port1}, {_IP2,_Port2}) -> - false; + false; same_ipaddr(X,Y) -> - lager:warning("ipaddrs have unexpected format! ~p, ~p", [X,Y]), - false. + lager:warning("ipaddrs have unexpected format! ~p, ~p", [X,Y]), + false. %% =================================================================== %% EUnit tests @@ -548,29 +548,29 @@ cache_peername_test_case() -> setup_connection_for_peername() -> riak_repl_test_util:reset_meck(riak_core_connection_mgr, [no_link, passthrough]), meck:expect(riak_core_connection_mgr, connect, fun(_ServiceAndRemote, ClientSpec) -> - proc_lib:spawn_link(fun() -> - Version = stateful:version(), - {_Proto, {TcpOpts, Module, Pid}} = ClientSpec, - {ok, Socket} = gen_tcp:connect("127.0.0.1", ?SINK_PORT, [binary | TcpOpts]), + proc_lib:spawn_link(fun() -> + Version = stateful:version(), + {_Proto, {TcpOpts, Module, Pid}} = ClientSpec, + {ok, Socket} = gen_tcp:connect("127.0.0.1", ?SINK_PORT, [binary | TcpOpts]), - ok = Module:connected(Socket, gen_tcp, {"127.0.0.1", ?SINK_PORT}, Version, Pid, []), + ok = Module:connected(Socket, gen_tcp, {"127.0.0.1", ?SINK_PORT}, Version, Pid, []), - % simulate local socket problem - inet:close(Socket), + % simulate local socket problem + inet:close(Socket), - % get the State from the source connection. - {status,Pid,_,[_,_,_,_,[_,_,{data,[{_,State}]}]]} = sys:get_status(Pid), + % get the State from the source connection. + {status,Pid,_,[_,_,_,_,[_,_,{data,[{_,State}]}]]} = sys:get_status(Pid), - % getting the peername from the socket should produce error string - ?assertEqual("error:einval", peername(inet, Socket)), + % getting the peername from the socket should produce error string + ?assertEqual("error:einval", peername(inet, Socket)), - % while getting the peername from the State should produce the cached string - ?assertEqual("127.0.0.1:5007", peername(State)) + % while getting the peername from the State should produce the cached string + ?assertEqual("127.0.0.1:5007", peername(State)) - end), + end), - {ok, make_ref()} - end). + {ok, make_ref()} + end). %% Connect to the 'fake' sink connect(RemoteName) -> @@ -593,7 +593,7 @@ connect(RemoteName) -> {sink_started, SinkPid} -> {SourcePid, SinkPid} after 1000 -> - {error, timeout} + {error, timeout} end. -endif. From 4b8b350b0565edf94ff307e9fe0c0f18c01d41af Mon Sep 17 00:00:00 2001 From: rsltrifork Date: Thu, 9 Oct 2014 18:24:19 +0200 Subject: [PATCH 38/47] remove spamming log statement --- src/riak_core_connection_mgr.erl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/riak_core_connection_mgr.erl b/src/riak_core_connection_mgr.erl index e133e80f..831e8254 100644 --- a/src/riak_core_connection_mgr.erl +++ b/src/riak_core_connection_mgr.erl @@ -653,7 +653,6 @@ update_endpoints(Addrs, Endpoints) -> %% Return the addresses of non-blacklisted endpoints that are also %% members of the list EpAddrs. filter_blacklisted_endpoints(EpAddrs, AllEps) -> - lager:info("filter_blacklisted_endpoints(EpAddrs ~p, AllEps ~p) ->", [EpAddrs, AllEps]), PredicateFun = (fun(Addr) -> case orddict:find(Addr, AllEps) of {ok, EP} -> From b54555eff9bc167464958ee65eda047ef0cd030f Mon Sep 17 00:00:00 2001 From: Mikael Lixenstrand Date: Mon, 13 Oct 2014 13:32:47 +0100 Subject: [PATCH 39/47] Only send one rebalance_now. --- src/riak_repl2_rtsource_conn.erl | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/riak_repl2_rtsource_conn.erl b/src/riak_repl2_rtsource_conn.erl index 36a5388b..fc9d0d3e 100644 --- a/src/riak_repl2_rtsource_conn.erl +++ b/src/riak_repl2_rtsource_conn.erl @@ -73,6 +73,7 @@ hb_timeout_tref,% heartbeat timeout timer reference hb_sent_q, % queue of heartbeats now() that were sent hb_rtt, % RTT in milliseconds for last completed heartbeat + rb_timeout_tref,% Rebalance timeout timer reference cont = <<>>}). % continuation from previous TCP buffer %% API - start trying to send realtime repl to remote site @@ -315,7 +316,7 @@ handle_info({heartbeat_timeout, HBSent}, State = #state{hb_sent_q = HBSentQ, {stop, normal, State} end; handle_info(rebalance_now, State) -> - {noreply, maybe_rebalance(State, now)}; + {noreply, maybe_rebalance(State#state{rb_timeout_tref = undefined}, now)}; handle_info(Msg, State) -> lager:warning("Unhandled info: ~p", [Msg]), {noreply, State}. @@ -336,8 +337,14 @@ maybe_rebalance(State, NowOrDelayed) -> now -> reconnect(State, UsefulAddrs); delayed -> - erlang:send_after(rebalance_delay_millis(), self(), rebalance_now), - State + case State#state.rb_timeout_tref of + undefined -> + RbTimeoutTref = erlang:send_after(rebalance_delay_millis(), self(), rebalance_now), + State#state{rb_timeout_tref = RbTimeoutTref}; + _ -> + %% Already sent a "rebalance_now" + State + end end end. From 51c417b311bf092c52f3b85e92a5f834503abcfe Mon Sep 17 00:00:00 2001 From: Mikael Lixenstrand Date: Mon, 13 Oct 2014 14:26:53 +0100 Subject: [PATCH 40/47] Remove extra multiplication with 1000 --- src/riak_repl2_rtsource_conn.erl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/riak_repl2_rtsource_conn.erl b/src/riak_repl2_rtsource_conn.erl index fc9d0d3e..b473bd10 100644 --- a/src/riak_repl2_rtsource_conn.erl +++ b/src/riak_repl2_rtsource_conn.erl @@ -374,8 +374,8 @@ should_rebalance(#state{address=ConnectedAddr, remote=Remote}) -> end. rebalance_delay_millis() -> - MaxDelaySecs = application:get_env(riak_repl, realtime_connection_rebalance_max_delay_secs, 5*60), - round(MaxDelaySecs * 1000 * crypto:rand_uniform(0, 1000)). + MaxDelaySecs = app_helper:get_env(riak_repl, realtime_connection_rebalance_max_delay_secs, 5*60), + round(MaxDelaySecs * crypto:rand_uniform(0, 1000)). reconnect(State=#state{remote=Remote}, BetterAddrs) -> lager:info("trying reconnect to one of: ~p", [BetterAddrs]), From a6873bd5192b33b2fad2e96de21353134c3a5582 Mon Sep 17 00:00:00 2001 From: Mikael Lixenstrand Date: Mon, 13 Oct 2014 14:52:41 +0100 Subject: [PATCH 41/47] Remove duplicated code --- src/riak_repl2_rt.erl | 3 ++- src/riak_repl2_rtsource_conn.erl | 40 +++++++++++++++----------------- 2 files changed, 21 insertions(+), 22 deletions(-) diff --git a/src/riak_repl2_rt.erl b/src/riak_repl2_rt.erl index 09365423..30056b70 100644 --- a/src/riak_repl2_rt.erl +++ b/src/riak_repl2_rt.erl @@ -97,7 +97,8 @@ ensure_rt(WantEnabled0, WantStarted0) -> application:set_env(riak_repl, rtenabled, true) end, - %% For each connection to validate, call maybe_rebalance_delayed to handle the potential need to rebalance connections. + %% For each connection to validate, call maybe_rebalance_delayed to handle + %% the potential need to rebalance connections. ToValidate = Started -- ToStop, _ = [case lists:keyfind(Remote, 1, Connections) of {_, PID} -> diff --git a/src/riak_repl2_rtsource_conn.erl b/src/riak_repl2_rtsource_conn.erl index b473bd10..36b623da 100644 --- a/src/riak_repl2_rtsource_conn.erl +++ b/src/riak_repl2_rtsource_conn.erl @@ -58,6 +58,23 @@ -define(DEFAULT_HBINTERVAL, 15). -define(DEFAULT_HBTIMEOUT, 15). +-define(TCP_OPTIONS, [{keepalive, true}, + {nodelay, true}, + {packet, 0}, + {active, false}]). + +%% nodes running 1.3.1 have a bug in the service_mgr module. +%% this bug prevents it from being able to negotiate a version list longer +%% than 2. Until we no longer support communicating with that version, +%% we need to artifically truncate the version list. +%% TODO: expand version list or remove comment when we no +%% longer support 1.3.1 +%% prefered version list: [{2,0}, {1,5}, {1,1}, {1,0}] + + +-define(CLIENT_SPEC, {{realtime,[{3,0}, {2,0}, {1,5}]}, + {?TCP_OPTIONS, ?MODULE, self()}}). + -record(state, {remote, % remote name address, % {IP, Port} connection_ref, % reference handed out by connection manager @@ -129,22 +146,9 @@ connect_failed(_ClientProto, Reason, RtSourcePid) -> %% Initialize init([Remote]) -> - TcpOptions = [{keepalive, true}, - {nodelay, true}, - {packet, 0}, - {active, false}], - % nodes running 1.3.1 have a bug in the service_mgr module. - % this bug prevents it from being able to negotiate a version list longer - % than 2. Until we no longer support communicating with that version, - % we need to artifically truncate the version list. - % TODO: expand version list or remove comment when we no - % longer support 1.3.1 - % prefered version list: [{2,0}, {1,5}, {1,1}, {1,0}] - ClientSpec = {{realtime,[{3,0}, {2,0}, {1,5}]}, {TcpOptions, ?MODULE, self()}}, - %% Todo: check for bad remote name lager:debug("connecting to remote ~p", [Remote]), - case riak_core_connection_mgr:connect({rt_repl, Remote}, ClientSpec) of + case riak_core_connection_mgr:connect({rt_repl, Remote}, ?CLIENT_SPEC) of {ok, Ref} -> lager:debug("connection ref ~p", [Ref]), {ok, #state{remote = Remote, connection_ref = Ref}}; @@ -383,14 +387,8 @@ reconnect(State=#state{remote=Remote}, BetterAddrs) -> %% if we have a pending connection attempt - drop that riak_core_connection_mgr:disconnect({rt_repl, Remote}), - TcpOptions = [{keepalive, true}, - {nodelay, true}, - {packet, 0}, - {active, false}], - ClientSpec = {{realtime,[{3,0}, {2,0}, {1,5}]}, {TcpOptions, ?MODULE, self()}}, - lager:debug("re-connecting to remote ~p", [Remote]), - case riak_core_connection_mgr:connect({rt_repl, Remote}, ClientSpec, {use_only, BetterAddrs}) of + case riak_core_connection_mgr:connect({rt_repl, Remote}, ?CLIENT_SPEC, {use_only, BetterAddrs}) of {ok, Ref} -> lager:debug("connecting ref ~p", [Ref]), {noreply, State#state{ connection_ref = Ref}}; From 610508ed802cc44fa4afef8137a93d507903f3eb Mon Sep 17 00:00:00 2001 From: Mikael Lixenstrand Date: Mon, 13 Oct 2014 17:40:24 +0100 Subject: [PATCH 42/47] Refactor maybe_rebalance --- src/riak_core_cluster_conn.erl | 34 ++++++++------- src/riak_repl2_rt.erl | 21 ++++----- src/riak_repl2_rtsource_conn.erl | 75 ++++++++++++++++---------------- 3 files changed, 66 insertions(+), 64 deletions(-) diff --git a/src/riak_core_cluster_conn.erl b/src/riak_core_cluster_conn.erl index 0b1e2db6..39fa03d0 100644 --- a/src/riak_core_cluster_conn.erl +++ b/src/riak_core_cluster_conn.erl @@ -256,10 +256,11 @@ waiting_for_cluster_members({cluster_members, NewMembers}, State = #state{ proto %% this is 1.0 code. NewMembers is list of {IP,Port} SortedNew = ordsets:from_list(NewMembers), - Members = NewMembers ++ lists:filter( fun(Mem) -> - not ordsets:is_element(Mem, SortedNew) - end, - OldMembers ), + Members = + NewMembers ++ lists:filter(fun(Mem) -> + not ordsets:is_element(Mem, SortedNew) + end, + OldMembers), ClusterUpdatedMsg = {cluster_updated, PreviousName, @@ -278,18 +279,19 @@ waiting_for_cluster_members({all_cluster_members, NewMembers}, State) -> %% this is 1.1+ code. Members is list of {node,{IP,Port}} - Members = lists:foldl( fun(Elm={_Node,{_Ip,Port}}, Acc) when is_integer(Port) -> - [Elm|Acc]; - ({Node,_}, Acc) -> - case lists:keyfind(Node, 1, OldMembers) of - Elm={Node,{_IP,Port}} when is_integer(Port) -> - [Elm|Acc]; - _ -> - Acc - end - end, - [], - NewMembers ), + Members = + lists:foldl(fun(Elm={_Node,{_Ip,Port}}, Acc) when is_integer(Port) -> + [Elm|Acc]; + ({Node,_}, Acc) -> + case lists:keyfind(Node, 1, OldMembers) of + Elm={Node,{_IP,Port}} when is_integer(Port) -> + [Elm|Acc]; + _ -> + Acc + end + end, + [], + NewMembers ), ClusterUpdatedMsg = {cluster_updated, PreviousName, diff --git a/src/riak_repl2_rt.erl b/src/riak_repl2_rt.erl index 30056b70..8935eca1 100644 --- a/src/riak_repl2_rt.erl +++ b/src/riak_repl2_rt.erl @@ -135,10 +135,10 @@ ensure_rt(WantEnabled0, WantStarted0) -> end. register_remote_locator() -> - Locator = fun (_, {use_only, Addrs}) -> - {ok, Addrs}; - (Name, _Policy) -> - riak_core_cluster_mgr:get_ipaddrs_of_cluster(Name) + Locator = fun(_, {use_only, Addrs}) -> + {ok, Addrs}; + (Name, _Policy) -> + riak_core_cluster_mgr:get_ipaddrs_of_cluster(Name) end, ok = riak_core_connection_mgr:register_locator(rt_repl, Locator). @@ -162,12 +162,13 @@ postcommit(RObj) -> Objects = Objects0 ++ [RObj], Meta = set_bucket_meta(RObj), - BinObjs = case orddict:fetch(?BT_META_TYPED_BUCKET, Meta) of - false -> - riak_repl_util:to_wire(w1, Objects); - true -> - riak_repl_util:to_wire(w2, Objects) - end, + BinObjs = + case orddict:fetch(?BT_META_TYPED_BUCKET, Meta) of + false -> + riak_repl_util:to_wire(w1, Objects); + true -> + riak_repl_util:to_wire(w2, Objects) + end, %% try the proxy first, avoids race conditions with unregister() %% during shutdown case whereis(riak_repl2_rtq_proxy) of diff --git a/src/riak_repl2_rtsource_conn.erl b/src/riak_repl2_rtsource_conn.erl index 36b623da..661e9a67 100644 --- a/src/riak_repl2_rtsource_conn.erl +++ b/src/riak_repl2_rtsource_conn.erl @@ -38,6 +38,7 @@ -ifdef(TEST). -include_lib("eunit/include/eunit.hrl"). +-export([riak_core_connection_mgr_connect/1]). -endif. %% API @@ -332,24 +333,21 @@ terminate(_Reason, #state{helper_pid=_HelperPid, remote=Remote}) -> code_change(_OldVsn, State, _Extra) -> {ok, State}. -maybe_rebalance(State, NowOrDelayed) -> +maybe_rebalance(State, now) -> case should_rebalance(State) of no -> State; {yes, UsefulAddrs} -> - case NowOrDelayed of - now -> - reconnect(State, UsefulAddrs); - delayed -> - case State#state.rb_timeout_tref of - undefined -> - RbTimeoutTref = erlang:send_after(rebalance_delay_millis(), self(), rebalance_now), - State#state{rb_timeout_tref = RbTimeoutTref}; - _ -> - %% Already sent a "rebalance_now" - State - end - end + reconnect(State, UsefulAddrs) + end; +maybe_rebalance(State, delayed) -> + case State#state.rb_timeout_tref of + undefined -> + RbTimeoutTref = erlang:send_after(rebalance_delay_millis(), self(), rebalance_now), + State#state{rb_timeout_tref = RbTimeoutTref}; + _ -> + %% Already sent a "rebalance_now" + State end. should_rebalance(#state{address=ConnectedAddr, remote=Remote}) -> @@ -360,12 +358,10 @@ should_rebalance(#state{address=ConnectedAddr, remote=Remote}) -> case (ShuffledAddrs /= []) andalso same_ipaddr(ConnectedAddr, hd(ShuffledAddrs)) of true -> no; % we're already connected to the ideal buddy - false -> %% compute the addrs that are "better" than the currently connected addr BetterAddrs = lists:takewhile(fun(A) -> not same_ipaddr(ConnectedAddr, A) end, ShuffledAddrs), - %% remove those that are blacklisted anyway UsefulAddrs = riak_core_connection_mgr:filter_blacklisted_ipaddrs(BetterAddrs), lager:debug("BetterAddrs: ~p, UsefulAddrs ~p", [BetterAddrs, UsefulAddrs]), @@ -378,7 +374,8 @@ should_rebalance(#state{address=ConnectedAddr, remote=Remote}) -> end. rebalance_delay_millis() -> - MaxDelaySecs = app_helper:get_env(riak_repl, realtime_connection_rebalance_max_delay_secs, 5*60), + MaxDelaySecs = + app_helper:get_env(riak_repl, realtime_connection_rebalance_max_delay_secs, 5*60), round(MaxDelaySecs * crypto:rand_uniform(0, 1000)). reconnect(State=#state{remote=Remote}, BetterAddrs) -> @@ -462,8 +459,11 @@ send_heartbeat(State = #state{hb_interval = undefined}) -> send_heartbeat(State = #state{hb_timeout = HBTimeout, hb_sent_q = SentQ, helper_pid = HelperPid}) when is_integer(HBTimeout) -> - Now = now(), % using now as need a unique reference for this heartbeat - % to spot late heartbeat timeout messages + + % Using now as need a unique reference for this heartbeat + % to spot late heartbeat timeout messages + Now = now(), + riak_repl2_rtsource_helper:send_heartbeat(HelperPid), TRef = erlang:send_after(timer:seconds(HBTimeout), self(), {heartbeat_timeout, Now}), State2 = State#state{hb_interval_tref = undefined, hb_timeout_tref = TRef, @@ -552,30 +552,29 @@ cache_peername_test_case() -> %% Set up the test setup_connection_for_peername() -> riak_repl_test_util:reset_meck(riak_core_connection_mgr, [no_link, passthrough]), - meck:expect(riak_core_connection_mgr, connect, fun(_ServiceAndRemote, ClientSpec) -> - proc_lib:spawn_link(fun() -> - Version = stateful:version(), - {_Proto, {TcpOpts, Module, Pid}} = ClientSpec, - {ok, Socket} = gen_tcp:connect("127.0.0.1", ?SINK_PORT, [binary | TcpOpts]), - - ok = Module:connected(Socket, gen_tcp, {"127.0.0.1", ?SINK_PORT}, Version, Pid, []), - - % simulate local socket problem - inet:close(Socket), + meck:expect(riak_core_connection_mgr, connect, + fun(_ServiceAndRemote, ClientSpec) -> + proc_lib:spawn_link(?MODULE, riak_core_connection_mgr_connect, [ClientSpec]), + {ok, make_ref()} + end). +riak_core_connection_mgr_connect(ClientSpec) -> + Version = stateful:version(), + {_Proto, {TcpOpts, Module, Pid}} = ClientSpec, + {ok, Socket} = gen_tcp:connect("127.0.0.1", ?SINK_PORT, [binary | TcpOpts]), - % get the State from the source connection. - {status,Pid,_,[_,_,_,_,[_,_,{data,[{_,State}]}]]} = sys:get_status(Pid), + ok = Module:connected(Socket, gen_tcp, {"127.0.0.1", ?SINK_PORT}, Version, Pid, []), - % getting the peername from the socket should produce error string - ?assertEqual("error:einval", peername(inet, Socket)), + % simulate local socket problem + inet:close(Socket), - % while getting the peername from the State should produce the cached string - ?assertEqual("127.0.0.1:5007", peername(State)) + % get the State from the source connection. + {status,Pid,_,[_,_,_,_,[_,_,{data,[{_,State}]}]]} = sys:get_status(Pid), - end), + % getting the peername from the socket should produce error string + ?assertEqual("error:einval", peername(inet, Socket)), - {ok, make_ref()} - end). + % while getting the peername from the State should produce the cached string + ?assertEqual("127.0.0.1:5007", peername(State)). %% Connect to the 'fake' sink connect(RemoteName) -> From 5188180baf5c1faa07a2c754b75246a2fa18bfbb Mon Sep 17 00:00:00 2001 From: Mikael Lixenstrand Date: Wed, 15 Oct 2014 12:44:39 +0100 Subject: [PATCH 43/47] Fix eunit tests. --- src/riak_core_cluster_conn.erl | 7 +++++-- test/riak_core_cluster_conn_eqc.erl | 6 ++++-- test/riak_core_cluster_mgr_tests.erl | 12 ++++++------ 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/src/riak_core_cluster_conn.erl b/src/riak_core_cluster_conn.erl index 39fa03d0..4c53d895 100644 --- a/src/riak_core_cluster_conn.erl +++ b/src/riak_core_cluster_conn.erl @@ -307,8 +307,11 @@ waiting_for_cluster_members(_, _State) -> %% Sync message handling for the `waiting_for_cluster_members' state waiting_for_cluster_members(status, _From, State) -> {reply, {waiting_for_cluster_members, State#state.name}, waiting_for_cluster_members, State}; -waiting_for_cluster_members(_, _From, _State) -> - {reply, ok, waiting_for_cluster_members, _State}. +waiting_for_cluster_members(Other, _From, State) -> + _ = lager:error("cluster_conn: client got unexpected " + "msg from remote: ~p, ~p", + [State#state.remote, Other]), + {reply, ok, waiting_for_cluster_members, State}. %% Async message handling for the `connected' state connected(poll_cluster, State) -> diff --git a/test/riak_core_cluster_conn_eqc.erl b/test/riak_core_cluster_conn_eqc.erl index 79348e81..e6486be0 100644 --- a/test/riak_core_cluster_conn_eqc.erl +++ b/test/riak_core_cluster_conn_eqc.erl @@ -175,7 +175,8 @@ postcondition(connected, connected, _S ,{call, ?MODULE, status, _}, R) -> ExpectedStatus = {fake_socket, ranch_tcp, "overtherainbow", - [{clustername, "FARFARAWAY"}]}, + [{clustername, "FARFARAWAY"}], + {1,0}}, {_, status, Status} = R, ?P(Status =:= ExpectedStatus); postcondition(State, State, _S ,{call, ?MODULE, status, [Pid]}, R) -> @@ -226,7 +227,8 @@ connected_to_remote(Pid) -> fake_socket, ranch_tcp, "overtherainbow", - [{clustername, "FARFARAWAY"}]}, + [{clustername, "FARFARAWAY"}], + {1,0}}, gen_fsm:send_event(Pid, Event). cluster_name(Pid) -> diff --git a/test/riak_core_cluster_mgr_tests.erl b/test/riak_core_cluster_mgr_tests.erl index 980793fe..7340edd4 100644 --- a/test/riak_core_cluster_mgr_tests.erl +++ b/test/riak_core_cluster_mgr_tests.erl @@ -106,13 +106,13 @@ single_node_test_() -> ?assertEqual({ok, [?REMOTE_CLUSTER_NAME]}, Knowners) end}, + %% We should get "127.0.0.1",5002 every time + %% since local is always nonode@nohost {"get ipaddres of cluster", fun() -> - Original = [{"127.0.0.1",5001}, {"127.0.0.1",5002}, {"127.0.0.1",5003}], - Rotated1 = [{"127.0.0.1",5002}, {"127.0.0.1",5003}, {"127.0.0.1",5001}], - Rotated2 = [{"127.0.0.1",5003}, {"127.0.0.1",5001}, {"127.0.0.1",5002}], - ?assert({ok,Original} == riak_core_cluster_mgr:get_ipaddrs_of_cluster(?REMOTE_CLUSTER_NAME)), - ?assert({ok,Rotated1} == riak_core_cluster_mgr:get_ipaddrs_of_cluster(?REMOTE_CLUSTER_NAME)), - ?assert({ok,Rotated2} == riak_core_cluster_mgr:get_ipaddrs_of_cluster(?REMOTE_CLUSTER_NAME)) + Original = [{"127.0.0.1",5002}, {"127.0.0.1",5001}, {"127.0.0.1",5003}], + ?assertEqual({ok,Original},riak_core_cluster_mgr:get_ipaddrs_of_cluster(?REMOTE_CLUSTER_NAME)), + ?assertEqual({ok,Original},riak_core_cluster_mgr:get_ipaddrs_of_cluster(?REMOTE_CLUSTER_NAME)), + ?assertEqual({ok,Original},riak_core_cluster_mgr:get_ipaddrs_of_cluster(?REMOTE_CLUSTER_NAME)) end} ] end }. From 5c71490ea8010e6064c83ba356d9130de14178f4 Mon Sep 17 00:00:00 2001 From: Mikael Lixenstrand Date: Wed, 22 Oct 2014 09:14:33 +0100 Subject: [PATCH 44/47] Indent & Refac --- src/riak_core_cluster_mgr.erl | 8 ++++---- src/riak_core_connection.erl | 30 ++++++++++++++++++++++++------ 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/src/riak_core_cluster_mgr.erl b/src/riak_core_cluster_mgr.erl index 872ce28c..afba88f4 100644 --- a/src/riak_core_cluster_mgr.erl +++ b/src/riak_core_cluster_mgr.erl @@ -770,11 +770,11 @@ shuffle_remote_ipaddrs([]) -> shuffle_remote_ipaddrs(RemoteUnsorted) -> {ok, MyRing} = riak_core_ring_manager:get_my_ring(), SortedNodes = lists:sort(riak_core_ring:all_members(MyRing)), - NodesTagged = lists:zip( lists:seq(1, length(SortedNodes)), SortedNodes ), + NodesTagged = lists:zip(lists:seq(1, length(SortedNodes)), SortedNodes), case lists:keyfind(node(), 2, NodesTagged) of {MyPos, _} -> OurClusterName = riak_core_connection:symbolic_clustername(), - RemoteAddrs = shuffle_with_seed( lists:sort( RemoteUnsorted ), [OurClusterName] ), + RemoteAddrs = shuffle_with_seed(lists:sort(RemoteUnsorted), [OurClusterName]), %% MyPos is the position if *this* node in the sorted list of %% all nodes in my ring. Now choose the node at the corresponding @@ -782,8 +782,8 @@ shuffle_remote_ipaddrs(RemoteUnsorted) -> SplitPos = ((MyPos-1) rem length(RemoteAddrs)), case lists:split(SplitPos,RemoteAddrs) of {BeforeBuddy,[Buddy|AfterBuddy]} -> - {ok, [Buddy | shuffle_with_seed(AfterBuddy ++ BeforeBuddy, node()) ]} + {ok, [Buddy | shuffle_with_seed(AfterBuddy ++ BeforeBuddy, node())]} end; false -> - {ok, shuffle_with_seed(lists:sort( RemoteUnsorted ), node())} + {ok, shuffle_with_seed(lists:sort(RemoteUnsorted), node())} end. diff --git a/src/riak_core_connection.erl b/src/riak_core_connection.erl index b2bc586d..0f00bc46 100644 --- a/src/riak_core_connection.erl +++ b/src/riak_core_connection.erl @@ -29,7 +29,8 @@ %%
  • The server sends `term_to_binary({?CTRL_ACK, ?CTRL_REV, TheirCaps}).'
  • %%
  • If the server and client agree on SSL, the session is upgraded.
  • %%
  • The client sends `term_to_binary({Protocol, Version}).'
  • -%%
  • The server sends `term_to_binary({ok, {ProtoName, {CommonMajor, RemoteMinor, LocalMinor}}})', after which we call ?MODULE:connect/6 and exit.
  • +%%
  • The server sends `term_to_binary({ok, {ProtoName, {CommonMajor, RemoteMinor, LocalMinor}}})',
  • +%%
  • after which we call ?MODULE:connect/6 and exit.
  • -module(riak_core_connection). -behavior(gen_fsm). @@ -170,7 +171,17 @@ init({IP, Port, Protocol, ProtoVers, SocketOptions, Mod, ModArgs}) -> Hello = term_to_binary({?CTRL_HELLO, ?CTRL_REV, MyCaps}), ok = ranch_tcp:send(Socket, Hello), ranch_tcp:setopts(Socket, [{active, once}]), - State = #state{transport = ranch_tcp, socket = Socket, protocol = Protocol, protovers = ProtoVers, socket_opts = SocketOptions, mod = Mod, mod_args = ModArgs, cluster_name = MyName, local_capabilities = MyCaps, ip = IP, port = Port}, + State = #state{transport = ranch_tcp, + socket = Socket, + protocol = Protocol, + protovers = ProtoVers, + socket_opts = SocketOptions, + mod = Mod, + mod_args = ModArgs, + cluster_name = MyName, + local_capabilities = MyCaps, + ip = IP, + port = Port}, {ok, wait_for_capabilities, State}; Else -> lager:warning("Could not connect ~p:~p due to ~p", [IP, Port, Else]), @@ -205,9 +216,11 @@ handle_info({_Transport, Socket, Data}, wait_for_capabilities, State = #state{so {stop, Error, State}; {NewTransport, NewSocket} -> FullProto = {State#state.protocol, State#state.protovers}, - NewTransport:send(NewSocket, erlang:term_to_binary(FullProto)), - NewTransport:setopts(NewSocket, [{active, once}]), - State2 = State#state{transport = NewTransport, socket = NewSocket, remote_capabilities = TheirCaps}, + NewTransport:send(Socket, erlang:term_to_binary(FullProto)), + NewTransport:setopts(Socket, [{active, once}]), + State2 = State#state{transport = NewTransport, + socket = NewSocket, + remote_capabilities = TheirCaps}, {next_state, wait_for_protocol, State2} end; Else -> @@ -222,7 +235,12 @@ handle_info({_TransTag, Socket, Data}, wait_for_protocol, State = #state{socket IpPort = {State#state.ip, State#state.port}, NegotiatedProto = {ProtoName, {CommonMajor, LocalMinor}, {CommonMajor, RemoteMinor}}, _ = Transport:setopts(Socket, State#state.socket_opts), - _ModStarted = Module:connected(Socket, Transport, IpPort, NegotiatedProto, ModArgs, State#state.remote_capabilities), + _ModStarted = Module:connected(Socket, + Transport, + IpPort, + NegotiatedProto, + ModArgs, + State#state.remote_capabilities), {stop, normal, State}; Else -> lager:warning("Invalid version returned: ~p", [Else]), From 6e61348083a50d102b7745f9388afef25117e6bf Mon Sep 17 00:00:00 2001 From: Mikael Lixenstrand Date: Fri, 24 Oct 2014 15:12:05 +0100 Subject: [PATCH 45/47] Fix dialyzer --- dialyzer.ignore-warnings | 4 +--- src/riak_core_cluster_conn.erl | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/dialyzer.ignore-warnings b/dialyzer.ignore-warnings index 3487cc7d..33aa0b34 100644 --- a/dialyzer.ignore-warnings +++ b/dialyzer.ignore-warnings @@ -4,9 +4,7 @@ riak_repl_keylist_client.erl:267: The call application:unset_env('riak_repl',{'p riak_repl_keylist_client.erl:120: The call application:unset_env('riak_repl',{'progress',_}) breaks the contract (Application,Par) -> 'ok' when is_subtype(Application,atom()), is_subtype(Par,atom()) riak_repl_keylist_client.erl:132: The call application:set_env('riak_repl',{'progress',_},nonempty_maybe_improper_list()) breaks the contract (Application,Par,Val) -> 'ok' when is_subtype(Application,atom()), is_subtype(Par,atom()), is_subtype(Val,term()) riak_core_connection.erl:108: Function exchange_handshakes_with/4 has no local return -riak_core_connection.erl:171: The call ranch_tcp:send(Socket::port(),Hello::binary()) breaks the contract (inet:socket(),iolist()) -> 'ok' | {'error',atom()} -riak_core_connection.erl:189: Function try_ssl/4 will never be called -riak_core_connection.erl:225: Function negotiate_proto_with_server/3 will never be called +riak_core_connection.erl:172: The call ranch_tcp:send(Socket::port(),Hello::binary()) breaks the contract (inet:socket(),iolist()) -> 'ok' | {'error',atom()} riak_repl_keylist_client.erl:106: The call application:unset_env('riak_repl',{'progress',_}) breaks the contract (Application,Par) -> 'ok' when is_subtype(Application,atom()), is_subtype(Par,atom()) riak_repl_keylist_client.erl:216: The call application:unset_env('riak_repl',{'progress',_}) breaks the contract (Application,Par) -> 'ok' when is_subtype(Application,atom()), is_subtype(Par,atom()) riak_repl_keylist_client.erl:265: The call application:unset_env('riak_repl',{'progress',_}) breaks the contract (Application,Par) -> 'ok' when is_subtype(Application,atom()), is_subtype(Par,atom()) diff --git a/src/riak_core_cluster_conn.erl b/src/riak_core_cluster_conn.erl index 4c53d895..295d2e72 100644 --- a/src/riak_core_cluster_conn.erl +++ b/src/riak_core_cluster_conn.erl @@ -87,7 +87,7 @@ connection_timeout :: timeout(), transport :: atom(), address :: peer_address(), - connection_props :: proplist:proplist(), + connection_props :: proplists:proplist(), transport_msgs :: ranch_transport_messages(), proto_version :: {non_neg_integer(), non_neg_integer()} }). -type state() :: #state{}. From 1bbea3704473ff9e39fee9e687c9e24692000253 Mon Sep 17 00:00:00 2001 From: Mikael Lixenstrand Date: Wed, 26 Nov 2014 15:56:48 +0000 Subject: [PATCH 46/47] Bug: takewhile -> filter Possible to lose some addresses when ConnectedAddr are early in the list. [{"127.0.0.1",10106},{"127.0.0.1",10066},{"127.0.0.1",10096},{"127.0.0.1",10076},{"127.0.0.1",10086}], ConnectedAddr: {"127.0.0.1",10066} [{"127.0.0.1",10106}], UsefulAddrs [{"127.0.0.1",10106}] --- src/riak_repl2_rtsource_conn.erl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/riak_repl2_rtsource_conn.erl b/src/riak_repl2_rtsource_conn.erl index 661e9a67..100d70a3 100644 --- a/src/riak_repl2_rtsource_conn.erl +++ b/src/riak_repl2_rtsource_conn.erl @@ -360,8 +360,8 @@ should_rebalance(#state{address=ConnectedAddr, remote=Remote}) -> no; % we're already connected to the ideal buddy false -> %% compute the addrs that are "better" than the currently connected addr - BetterAddrs = lists:takewhile(fun(A) -> not same_ipaddr(ConnectedAddr, A) end, - ShuffledAddrs), + BetterAddrs = lists:filter(fun(A) -> not same_ipaddr(ConnectedAddr, A) end, + ShuffledAddrs), %% remove those that are blacklisted anyway UsefulAddrs = riak_core_connection_mgr:filter_blacklisted_ipaddrs(BetterAddrs), lager:debug("BetterAddrs: ~p, UsefulAddrs ~p", [BetterAddrs, UsefulAddrs]), From 23e81b5ffb43335799fbb99a609a2035cdecd48f Mon Sep 17 00:00:00 2001 From: "Engel A. Sanchez" Date: Mon, 15 Dec 2014 16:40:00 -0500 Subject: [PATCH 47/47] Fix exometer induced dialyzer errors Stats function now never returns error code. --- src/riak_core_connection_mgr_stats.erl | 7 ++----- src/riak_repl_stats.erl | 7 ++----- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/src/riak_core_connection_mgr_stats.erl b/src/riak_core_connection_mgr_stats.erl index a20fc983..163814c4 100644 --- a/src/riak_core_connection_mgr_stats.erl +++ b/src/riak_core_connection_mgr_stats.erl @@ -59,11 +59,8 @@ register_stats() -> %% When the cache needs to get the latest values, it will call our %% `produce_stats()' function. get_stats() -> - case riak_core_stat_cache:get_stats(?APP) of - {ok, Stats, _TS} -> - Stats; - Error -> Error - end. + {ok, Stats, _TS} = riak_core_stat_cache:get_stats(?APP), + Stats. get_consolidated_stats() -> Strings = [format_stat(Stat) || Stat <- get_stats()], diff --git a/src/riak_repl_stats.erl b/src/riak_repl_stats.erl index f654ce49..8924c99b 100644 --- a/src/riak_repl_stats.erl +++ b/src/riak_repl_stats.erl @@ -151,11 +151,8 @@ rt_dirty() -> get_stats() -> case erlang:whereis(riak_repl_stats) of Pid when is_pid(Pid) -> - case riak_core_stat_cache:get_stats(?APP) of - {ok, Stats, _TS} -> - Stats; - Error -> Error - end; + {ok, Stats, _TS} = riak_core_stat_cache:get_stats(?APP), + Stats; _ -> [] end.