55import struct
66import time
77import uuid
8- from typing import Coroutine
8+ from typing import Coroutine , Tuple
99
1010from hazelcast import __version__
1111from hazelcast .config import ReconnectMode
@@ -307,37 +307,18 @@ async def connect_to_all_cluster_members(self, sync_start):
307307
308308 self ._start_connect_all_members_timer ()
309309
310- async def on_connection_close (self , closed_connection ):
311- remote_uuid = closed_connection .remote_uuid
312- remote_address = closed_connection .remote_address
313-
314- if not remote_address :
310+ async def on_connection_close (self , closed_connection , unsafe = False ):
311+ if not closed_connection .remote_address :
315312 _logger .debug (
316313 "Destroying %s, but it has no remote address, hence nothing is "
317314 "removed from the connection dictionary" ,
318315 closed_connection ,
319316 )
320317 return
321318
322- disconnected = False
323- removed = False
324- trigger_reconnection = False
325- async with self ._lock :
326- connection = self .active_connections .get (remote_uuid , None )
327- if connection == closed_connection :
328- self .active_connections .pop (remote_uuid , None )
329- removed = True
330- _logger .info (
331- "Removed connection to %s:%s, connection: %s" ,
332- remote_address ,
333- remote_uuid ,
334- connection ,
335- )
336-
337- if not self .active_connections :
338- trigger_reconnection = True
339- if self ._client_state == ClientState .INITIALIZED_ON_CLUSTER :
340- disconnected = True
319+ disconnected , removed , trigger_reconnection = await self ._determine_connection_state (
320+ closed_connection , unsafe = unsafe
321+ )
341322
342323 if disconnected :
343324 self ._lifecycle_service .fire_lifecycle_event (LifecycleState .DISCONNECTED )
@@ -359,9 +340,40 @@ async def on_connection_close(self, closed_connection):
359340 _logger .debug (
360341 "Destroying %s, but there is no mapping for %s in the connection dictionary" ,
361342 closed_connection ,
343+ closed_connection .remote_uuid ,
344+ )
345+
async def _determine_connection_state(
    self, closed_connection, unsafe=False
) -> Tuple[bool, bool, bool]:
    """Work out the close-time flags for ``closed_connection``.

    The actual bookkeeping is done by ``_determine_connection_state_unsafe``;
    this wrapper only decides whether ``self._lock`` is held around that call.

    Args:
        closed_connection: The connection that has just been closed.
        unsafe (bool): When true, ``self._lock`` is not acquired here and the
            bookkeeping runs directly.

    Returns:
        The ``(disconnected, removed, trigger_reconnection)`` tuple produced by
        ``_determine_connection_state_unsafe``.
    """
    if not unsafe:
        # Default path: do the bookkeeping while holding the manager lock.
        async with self._lock:
            return self._determine_connection_state_unsafe(closed_connection)

    # The caller asked for the lock to be skipped.
    return self._determine_connection_state_unsafe(closed_connection)
353+
def _determine_connection_state_unsafe(self, closed_connection) -> Tuple[bool, bool, bool]:
    """Drop ``closed_connection`` from ``active_connections`` and report the outcome.

    This method takes no lock itself; ``_determine_connection_state`` wraps it
    in ``self._lock`` unless asked not to.

    Args:
        closed_connection: The connection that has just been closed. Its
            ``remote_uuid`` is the key looked up in ``active_connections``.

    Returns:
        A ``(disconnected, removed, trigger_reconnection)`` tuple:

        * ``removed`` - the entry registered under the connection's UUID was
          this connection and has been popped.
        * ``trigger_reconnection`` - the removal left ``active_connections``
          empty.
        * ``disconnected`` - additionally, the client state was
          ``ClientState.INITIALIZED_ON_CLUSTER`` at that moment.
    """
    member_uuid = closed_connection.remote_uuid
    registered = self.active_connections.get(member_uuid, None)

    is_registered = registered == closed_connection
    if not is_registered:
        # A different (or no) connection is mapped to this UUID: leave the
        # dictionary untouched.
        return False, False, False

    self.active_connections.pop(member_uuid, None)
    _logger.info(
        "Removed connection to %s:%s, connection: %s",
        closed_connection.remote_address,
        member_uuid,
        registered,
    )

    if self.active_connections:
        # Other member connections are still alive.
        return False, True, False

    # That was the last connection.
    disconnected = self._client_state == ClientState.INITIALIZED_ON_CLUSTER
    return disconnected, True, True
376+
365377 def check_invocation_allowed (self ):
366378 state = self ._client_state
367379 if state == ClientState .INITIALIZED_ON_CLUSTER and self .active_connections :
@@ -464,6 +476,12 @@ def _init_wait_strategy(self, config):
464476 def _start_connect_all_members_timer (self ):
465477 connecting_uuids = set ()
466478
479+ async def connect_to_member (member ):
480+ try :
481+ await self ._get_or_connect_to_member (member )
482+ except Exception :
483+ _logger .debug ("Error connecting to %s in reconnect timer" , member , exc_info = True )
484+
467485 async def run ():
468486 await asyncio .sleep (1 )
469487 if not self ._lifecycle_service .running :
@@ -480,7 +498,7 @@ async def run():
480498 connecting_uuids .add (member_uuid )
481499 if not self ._lifecycle_service .running :
482500 break
483- tg .create_task (self . _get_or_connect_to_member (member ))
501+ tg .create_task (connect_to_member (member ))
484502 member_uuids .append (member_uuid )
485503
486504 for item in member_uuids :
@@ -658,49 +676,54 @@ async def _handle_successful_auth(self, response, connection):
658676
659677 existing = self .active_connections .get (remote_uuid , None )
660678
661- if existing :
662- await connection .close_connection (
663- "Duplicate connection to same member with UUID: %s" % remote_uuid , None
664- )
665- return existing
666-
667- new_cluster_id = response ["cluster_id" ]
668- changed_cluster = self ._cluster_id is not None and self ._cluster_id != new_cluster_id
669- if changed_cluster :
670- await self ._check_client_state_on_cluster_change (connection )
671- _logger .warning (
672- "Switching from current cluster: %s to new cluster: %s" ,
673- self ._cluster_id ,
674- new_cluster_id ,
675- )
676- self ._on_cluster_restart ()
679+ if existing :
680+ await connection .close_connection (
681+ "Duplicate connection to same member with UUID: %s" % remote_uuid ,
682+ None ,
683+ unsafe = True ,
684+ )
685+ return existing
686+
687+ new_cluster_id = response ["cluster_id" ]
688+ changed_cluster = self ._cluster_id is not None and self ._cluster_id != new_cluster_id
689+ if changed_cluster :
690+ await self ._check_client_state_on_cluster_change (connection )
691+ _logger .warning (
692+ "Switching from current cluster: %s to new cluster: %s" ,
693+ self ._cluster_id ,
694+ new_cluster_id ,
695+ )
696+ self ._on_cluster_restart ()
677697
678- async with self ._lock :
679698 is_initial_connection = not self .active_connections
680699 self .active_connections [remote_uuid ] = connection
681700 fire_connected_lifecycle_event = False
682701
683- if is_initial_connection :
684- self ._cluster_id = new_cluster_id
685- # In split brain, the client might connect to the one half
686- # of the cluster, and then later might reconnect to the
687- # other half, after the half it was connected to is
688- # completely dead. Since the cluster id is preserved in
689- # split brain scenarios, it is impossible to distinguish
690- # reconnection to the same cluster vs reconnection to the
691- # other half of the split brain. However, in the latter,
692- # we might need to send some state to the other half of
693- # the split brain (like Compact schemas). That forces us
694- # to send the client state to the cluster after the first
695- # cluster connection, regardless the cluster id is
696- # changed or not.
697- if self ._established_initial_cluster_connection :
698- self ._client_state = ClientState .CONNECTED_TO_CLUSTER
699- await self ._initialize_on_cluster (new_cluster_id )
700- else :
701- fire_connected_lifecycle_event = True
702- self ._established_initial_cluster_connection = True
703- self ._client_state = ClientState .INITIALIZED_ON_CLUSTER
702+ init_on_cluster = False
703+ if is_initial_connection :
704+ self ._cluster_id = new_cluster_id
705+ # In split brain, the client might connect to the one half
706+ # of the cluster, and then later might reconnect to the
707+ # other half, after the half it was connected to is
708+ # completely dead. Since the cluster id is preserved in
709+ # split brain scenarios, it is impossible to distinguish
710+ # reconnection to the same cluster vs reconnection to the
711+ # other half of the split brain. However, in the latter,
712+ # we might need to send some state to the other half of
713+ # the split brain (like Compact schemas). That forces us
714+ # to send the client state to the cluster after the first
715+ # cluster connection, regardless the cluster id is
716+ # changed or not.
717+ if self ._established_initial_cluster_connection :
718+ self ._client_state = ClientState .CONNECTED_TO_CLUSTER
719+ init_on_cluster = True
720+ else :
721+ fire_connected_lifecycle_event = True
722+ self ._established_initial_cluster_connection = True
723+ self ._client_state = ClientState .INITIALIZED_ON_CLUSTER
724+
725+ if init_on_cluster :
726+ await self ._initialize_on_cluster (new_cluster_id )
704727
705728 if fire_connected_lifecycle_event :
706729 self ._lifecycle_service .fire_lifecycle_event (LifecycleState .CONNECTED )
@@ -777,7 +800,7 @@ async def _check_client_state_on_cluster_change(self, connection):
777800 # we can operate on. In those scenarios, we rely on the fact that we will
778801 # reopen the connections.
779802 reason = "Connection does not belong to the cluster %s" % self ._cluster_id
780- await connection .close_connection (reason , None )
803+ await connection .close_connection (reason , None , unsafe = True )
781804 raise ValueError (reason )
782805
783806 def _on_cluster_restart (self ):
@@ -985,13 +1008,13 @@ def send_message(self, message):
9851008 self ._write (message .buf )
9861009 return True
9871010
988- # Not named close to distinguish it from the asyncore.dispatcher.close.
989- async def close_connection (self , reason , cause ):
1011+ async def close_connection (self , reason , cause , unsafe = False ):
9901012 """Closes the connection.
9911013
9921014 Args:
9931015 reason (str): The reason this connection is going to be closed. Is allowed to be None.
9941016 cause (Exception): The exception responsible for closing this connection. Is allowed to be None.
1017+ unsafe (bool): Do not acquire a lock
9951018 """
9961019 if not self .live :
9971020 return
@@ -1003,7 +1026,7 @@ async def close_connection(self, reason, cause):
10031026 self ._inner_close ()
10041027 except Exception :
10051028 _logger .exception ("Error while closing the the connection %s" , self )
1006- await self ._connection_manager .on_connection_close (self )
1029+ await self ._connection_manager .on_connection_close (self , unsafe = unsafe )
10071030
10081031 def _log_close (self , reason , cause ):
10091032 msg = "%s closed. Reason: %s"
0 commit comments