@ -32,12 +32,19 @@ from synapse.util.metrics import Measure
# All periods below are expressed in seconds.

# period to cache .well-known results for by default (i.e. when the response
# headers do not yield a cache period)
WELL_KNOWN_DEFAULT_CACHE_PERIOD = 24 * 3600

# jitter factor to add to the .well-known default cache ttls. This is a
# fraction, not a duration: TTLs are multiplied by a random value drawn from
# [1 - JITTER, 1 + JITTER] so that entries do not all expire at once.
WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER = 0.1

# period to cache failure to fetch .well-known for
WELL_KNOWN_INVALID_CACHE_PERIOD = 1 * 3600

# period to cache failure to fetch .well-known if there has recently been a
# valid well-known for that domain. Deliberately much shorter than
# WELL_KNOWN_INVALID_CACHE_PERIOD.
WELL_KNOWN_DOWN_CACHE_PERIOD = 2 * 60

# period to remember there was a valid well-known after valid record expires
WELL_KNOWN_REMEMBER_DOMAIN_HAD_VALID = 2 * 3600

# cap for .well-known cache period
WELL_KNOWN_MAX_CACHE_PERIOD = 48 * 3600
@ -49,11 +56,16 @@ WELL_KNOWN_MIN_CACHE_PERIOD = 5 * 60
# we'll start trying to refetch 1 minute before it expires.
WELL_KNOWN_GRACE_PERIOD_FACTOR = 0.2

# Number of times we retry fetching a well-known for a domain we know recently
# had a valid entry.
WELL_KNOWN_RETRY_ATTEMPTS = 3

logger = logging.getLogger(__name__)

# Module-level caches, used as the defaults when a resolver is constructed
# without explicit cache objects.
#
# Maps server name -> cached .well-known lookup result.
_well_known_cache = TTLCache("well-known")
# Maps server name -> whether that server recently served a valid .well-known.
_had_valid_well_known_cache = TTLCache("had-valid-well-known")
@attr . s ( slots = True , frozen = True )
@ -65,14 +77,20 @@ class WellKnownResolver(object):
""" Handles well-known lookups for matrix servers.
"""
def __init__(
    self, reactor, agent, well_known_cache=None, had_well_known_cache=None
):
    """Set up the resolver.

    Args:
        reactor: The reactor; also used to build the internal Clock.
        agent: The HTTP agent used for .well-known requests. It is wrapped
            in a RedirectAgent before use.
        well_known_cache (TTLCache|None): Cache of .well-known lookup
            results. Defaults to the shared module-level cache.
        had_well_known_cache (TTLCache|None): Cache recording which servers
            recently had a valid .well-known. Defaults to the shared
            module-level cache.
    """
    self._reactor = reactor
    self._clock = Clock(reactor)

    # Fall back to the module-level caches when none are supplied.
    if well_known_cache is None:
        well_known_cache = _well_known_cache

    if had_well_known_cache is None:
        had_well_known_cache = _had_valid_well_known_cache

    self._well_known_cache = well_known_cache
    self._had_valid_well_known_cache = had_well_known_cache
    self._well_known_agent = RedirectAgent(agent)
@defer . inlineCallbacks
@ -100,7 +118,7 @@ class WellKnownResolver(object):
# requests for the same server in parallel?
try :
with Measure ( self . _clock , " get_well_known " ) :
result , cache_period = yield self . _do_get _well_known ( server_name )
result , cache_period = yield self . _fetch _well_known ( server_name )
except _FetchWellKnownFailure as e :
if prev_result and e . temporary :
@ -111,10 +129,18 @@ class WellKnownResolver(object):
result = None
# add some randomness to the TTL to avoid a stampeding herd every hour
# after startup
cache_period = WELL_KNOWN_INVALID_CACHE_PERIOD
cache_period + = random . uniform ( 0 , WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER )
if self . _had_valid_well_known_cache . get ( server_name , False ) :
# We have recently seen a valid well-known record for this
# server, so we cache the lack of well-known for a shorter time.
cache_period = WELL_KNOWN_DOWN_CACHE_PERIOD
else :
cache_period = WELL_KNOWN_INVALID_CACHE_PERIOD
# add some randomness to the TTL to avoid a stampeding herd
cache_period * = random . uniform (
1 - WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER ,
1 + WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER ,
)
if cache_period > 0 :
self . _well_known_cache . set ( server_name , result , cache_period )
@ -122,7 +148,7 @@ class WellKnownResolver(object):
return WellKnownLookupResult ( delegated_server = result )
@defer . inlineCallbacks
def _do_get _well_known ( self , server_name ) :
def _fetch _well_known ( self , server_name ) :
""" Actually fetch and parse a .well-known, without checking the cache
Args :
@ -134,24 +160,15 @@ class WellKnownResolver(object):
Returns :
Deferred [ Tuple [ bytes , int ] ] : The lookup result and cache period .
"""
uri = b " https:// %s /.well-known/matrix/server " % ( server_name , )
uri_str = uri . decode ( " ascii " )
logger . info ( " Fetching %s " , uri_str )
had_valid_well_known = self . _had_valid_well_known_cache . get ( server_name , False )
# We do this in two steps to differentiate between possibly transient
# errors (e.g. can't connect to host, 503 response) and more permanent
# errors (such as getting a 404 response).
try :
response = yield make_deferred_yieldable (
self . _well_known_agent . request ( b " GET " , uri )
)
body = yield make_deferred_yieldable ( readBody ( response ) )
if 500 < = response . code < 600 :
raise Exception ( " Non-200 response %s " % ( response . code , ) )
except Exception as e :
logger . info ( " Error fetching %s : %s " , uri_str , e )
raise _FetchWellKnownFailure ( temporary = True )
response , body = yield self . _make_well_known_request (
server_name , retry = had_valid_well_known
)
try :
if response . code != 200 :
@ -161,8 +178,11 @@ class WellKnownResolver(object):
logger . info ( " Response from .well-known: %s " , parsed_body )
result = parsed_body [ " m.server " ] . encode ( " ascii " )
except defer . CancelledError :
# Bail if we've been cancelled
raise
except Exception as e :
logger . info ( " Error fetching %s : %s " , uri_str , e )
logger . info ( " Error parsing well-known for %s : %s " , server_name , e )
raise _FetchWellKnownFailure ( temporary = False )
cache_period = _cache_period_from_headers (
@ -172,13 +192,69 @@ class WellKnownResolver(object):
cache_period = WELL_KNOWN_DEFAULT_CACHE_PERIOD
# add some randomness to the TTL to avoid a stampeding herd every 24 hours
# after startup
cache_period + = random . uniform ( 0 , WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER )
cache_period * = random . uniform (
1 - WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER ,
1 + WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER ,
)
else :
cache_period = min ( cache_period , WELL_KNOWN_MAX_CACHE_PERIOD )
cache_period = max ( cache_period , WELL_KNOWN_MIN_CACHE_PERIOD )
# We got a success, mark as such in the cache
self . _had_valid_well_known_cache . set (
server_name ,
bool ( result ) ,
cache_period + WELL_KNOWN_REMEMBER_DOMAIN_HAD_VALID ,
)
return ( result , cache_period )
@defer.inlineCallbacks
def _make_well_known_request(self, server_name, retry):
    """Make the well known request.

    This will retry the request if requested and it fails (with unable
    to connect or receives a 5xx error).

    Args:
        server_name (bytes)
        retry (bool): Whether to retry the request if it fails.

    Returns:
        Deferred[tuple[IResponse, bytes]] Returns the response object and
        body. Response may be a non-200 response.

    Raises:
        _FetchWellKnownFailure: with ``temporary=True`` if the request
            fails (or returns a 5xx) and either ``retry`` is false or
            WELL_KNOWN_RETRY_ATTEMPTS attempts have been made.
        defer.CancelledError: propagated unchanged if the request is
            cancelled.
    """
    uri = b"https://%s/.well-known/matrix/server" % (server_name,)
    uri_str = uri.decode("ascii")

    # 1-based count of attempts made so far.
    i = 0
    while True:
        i += 1

        logger.info("Fetching %s", uri_str)
        try:
            response = yield make_deferred_yieldable(
                self._well_known_agent.request(b"GET", uri)
            )
            body = yield make_deferred_yieldable(readBody(response))

            # Treat server errors like connection failures so that they go
            # through the retry / temporary-failure path below. Other
            # non-200 responses are returned to the caller to interpret.
            if 500 <= response.code < 600:
                raise Exception("Non-200 response %s" % (response.code,))

            return response, body
        except defer.CancelledError:
            # Bail if we've been cancelled
            raise
        except Exception as e:
            if not retry or i >= WELL_KNOWN_RETRY_ATTEMPTS:
                logger.info("Error fetching %s: %s", uri_str, e)
                raise _FetchWellKnownFailure(temporary=True)

            logger.info("Error fetching %s: %s. Retrying", uri_str, e)

        # Sleep briefly in the hopes that they come back up
        yield self._clock.sleep(0.5)
def _cache_period_from_headers ( headers , time_now = time . time ) :
cache_controls = _parse_cache_control ( headers )