Enable Faster Remote Room Joins against worker-mode Synapse. (#14752)

* Enable Complement tests for Faster Remote Room Joins on worker-mode

* (dangerous) Add an override to allow Complement to use FRRJ under workers

* Newsfile

Signed-off-by: Olivier Wilkinson (reivilibre) <oliverw@matrix.org>

* Fix race where we didn't send out a replication notification

* MORE HACKS

* Fix get_un_partial_stated_rooms_token to take instance_name

* Fix bad merge

* Remove warning

* Correctly advance un_partial_stated_room_stream

* Fix merge

* Add another notify_replication

* Fixups

* Create a separate ReplicationNotifier

* Fix test

* Fix portdb

* Fix presence test

* Newsfile

* Apply suggestions from code review

* Update changelog.d/14752.misc

Co-authored-by: Erik Johnston <erik@matrix.org>

* lint

Signed-off-by: Olivier Wilkinson (reivilibre) <oliverw@matrix.org>
Co-authored-by: Erik Johnston <erik@matrix.org>
commit 22cc93afe3 (parent d329a566df)
committed by reivilibre via GitHub
10 files changed (changed line counts in parentheses):

  changelog.d/14752.misc (1)
  docker/complement/conf/workers-shared-extra.yaml.j2 (2)
  scripts-dev/complement.sh (11)
  synapse/app/generic_worker.py (7)
  synapse/handlers/device.py (2)
  synapse/handlers/federation.py (7)
  synapse/replication/tcp/streams/partial_state.py (7)
  synapse/storage/databases/main/events_worker.py (13)
  synapse/storage/databases/main/room.py (19)
  synapse/storage/databases/main/state.py (2)

changelog.d/14752.misc
@@ -0,0 +1 @@
+Enable Complement tests for Faster Remote Room Joins against worker-mode Synapse.

docker/complement/conf/workers-shared-extra.yaml.j2
@@ -94,10 +94,8 @@ allow_device_name_lookup_over_federation: true
 experimental_features:
   # Enable history backfilling support
   msc2716_enabled: true
-  {% if not workers_in_use %}
   # client-side support for partial state in /send_join responses
   faster_joins: true
-  {% endif %}
   # Enable support for polls
   msc3381_polls_enabled: true
   # Enable deleting device-specific notification settings stored in account data

scripts-dev/complement.sh
@@ -190,7 +190,7 @@ fi
 extra_test_args=()

-test_tags="synapse_blacklist,msc3787,msc3874,msc3890,msc3391,msc3930"
+test_tags="synapse_blacklist,msc3787,msc3874,msc3890,msc3391,msc3930,faster_joins"

 # All environment variables starting with PASS_ will be shared.
 # (The prefix is stripped off before reaching the container.)
@@ -223,12 +223,9 @@ else
     export PASS_SYNAPSE_COMPLEMENT_DATABASE=sqlite
   fi

-  # We only test faster room joins on monoliths, because they are purposefully
-  # being developed without worker support to start with.
-  #
-  # The tests for importing historical messages (MSC2716) also only pass with monoliths,
-  # currently.
-  test_tags="$test_tags,faster_joins,msc2716"
+  # The tests for importing historical messages (MSC2716)
+  # only pass with monoliths, currently.
+  test_tags="$test_tags,msc2716"
 fi

synapse/app/generic_worker.py
@@ -282,13 +282,6 @@ def start(config_options: List[str]) -> None:
         "synapse.app.user_dir",
     )

-    if config.experimental.faster_joins_enabled:
-        raise ConfigError(
-            "You have enabled the experimental `faster_joins` config option, but it is "
-            "not compatible with worker deployments yet. Please disable `faster_joins` "
-            "or run Synapse as a single process deployment instead."
-        )
-
     synapse.events.USE_FROZEN_DICTS = config.server.use_frozen_dicts
     synapse.util.caches.TRACK_MEMORY_USAGE = config.caches.track_memory_usage

synapse/handlers/device.py
@@ -974,6 +974,7 @@ class DeviceListUpdater(DeviceListWorkerUpdater):
         self.federation = hs.get_federation_client()
         self.clock = hs.get_clock()
         self.device_handler = device_handler
+        self._notifier = hs.get_notifier()

         self._remote_edu_linearizer = Linearizer(name="remote_device_list")
@@ -1054,6 +1055,7 @@ class DeviceListUpdater(DeviceListWorkerUpdater):
             user_id,
             device_id,
         )
+        self._notifier.notify_replication()

         room_ids = await self.store.get_rooms_for_user(user_id)
         if not room_ids:
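The device-list hunk above is the "race where we didn't send out a replication
notification" from the commit message: a worker persisted a row that advanced a
replication stream but never woke the replication streamer, so other workers
only saw the write on the next unrelated notification. A minimal sketch of the
pattern being applied (write_stream_row is a hypothetical helper, not Synapse's
actual API):

async def persist_and_notify(store, notifier, row) -> None:
    # Persist a row that advances a replication stream position.
    await store.write_stream_row(row)  # hypothetical helper
    # Wake the replication streamer immediately, so connected workers
    # receive the new row now rather than after some unrelated write.
    notifier.notify_replication()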

synapse/handlers/federation.py
@@ -1870,14 +1870,15 @@ class FederationHandler:
             logger.info("Clearing partial-state flag for %s", room_id)
             success = await self.store.clear_partial_state_room(room_id)

+            # Poke the notifier so that other workers see the write to
+            # the un-partial-stated rooms stream.
+            self._notifier.notify_replication()
+
             if success:
                 logger.info("State resync complete for %s", room_id)
                 self._storage_controllers.state.notify_room_un_partial_stated(
                     room_id
                 )

-                # Poke the notifier so that other workers see the write to
-                # the un-partial-stated rooms stream.
-                self._notifier.notify_replication()
-
                 # TODO(faster_joins) update room stats and user directory?
                 # https://github.com/matrix-org/synapse/issues/12814

synapse/replication/tcp/streams/partial_state.py
@@ -16,7 +16,6 @@ from typing import TYPE_CHECKING
 import attr

 from synapse.replication.tcp.streams import Stream
-from synapse.replication.tcp.streams._base import current_token_without_instance

 if TYPE_CHECKING:
     from synapse.server import HomeServer
@@ -42,8 +41,7 @@ class UnPartialStatedRoomStream(Stream):
         store = hs.get_datastores().main
         super().__init__(
             hs.get_instance_name(),
-            # TODO(faster_joins, multiple writers): we need to account for instance names
-            current_token_without_instance(store.get_un_partial_stated_rooms_token),
+            store.get_un_partial_stated_rooms_token,
             store.get_un_partial_stated_rooms_from_stream,
         )
@@ -70,7 +68,6 @@ class UnPartialStatedEventStream(Stream):
         store = hs.get_datastores().main
         super().__init__(
             hs.get_instance_name(),
-            # TODO(faster_joins, multiple writers): we need to account for instance names
-            current_token_without_instance(store.get_un_partial_stated_events_token),
+            store.get_un_partial_stated_events_token,
             store.get_un_partial_stated_events_from_stream,
         )
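For context: current_token_without_instance, which these streams no longer
need, adapts a zero-argument token getter to the (instance_name) -> token
signature that Stream expects, throwing the instance name away. Roughly (a
sketch, not necessarily the exact Synapse source):

from typing import Callable

def current_token_without_instance(
    current_token: Callable[[], int]
) -> Callable[[str], int]:
    # Every caller sees the same global token, regardless of which
    # writer instance is asked about; per-writer tokens replace this.
    return lambda instance_name: current_token()

Since the store methods now take instance_name themselves, the wrapper (and its
TODO about multiple writers) can be dropped.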

synapse/storage/databases/main/events_worker.py
@@ -322,11 +322,12 @@ class EventsWorkerStore(SQLBaseStore):
             "stream_id",
         )

-    def get_un_partial_stated_events_token(self) -> int:
-        # TODO(faster_joins, multiple writers): This is inappropriate if there are multiple
-        # writers because workers that don't write often will hold all
-        # readers up.
-        return self._un_partial_stated_events_stream_id_gen.get_current_token()
+    def get_un_partial_stated_events_token(self, instance_name: str) -> int:
+        return (
+            self._un_partial_stated_events_stream_id_gen.get_current_token_for_writer(
+                instance_name
+            )
+        )

     async def get_un_partial_stated_events_from_stream(
         self, instance_name: str, last_id: int, current_id: int, limit: int
@@ -416,6 +417,8 @@ class EventsWorkerStore(SQLBaseStore):
             self._stream_id_gen.advance(instance_name, token)
         elif stream_name == BackfillStream.NAME:
             self._backfill_id_gen.advance(instance_name, -token)
+        elif stream_name == UnPartialStatedEventStream.NAME:
+            self._un_partial_stated_events_stream_id_gen.advance(instance_name, token)

         super().process_replication_position(stream_name, instance_name, token)

     async def have_censored_event(self, event_id: str) -> bool:
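Switching from get_current_token() to get_current_token_for_writer(...) is the
core of the multi-writer fix: a single global token can only advance as far as
the slowest writer's persisted position, so a worker that rarely writes holds
every reader back. A toy illustration of the difference (not the real
MultiWriterIdGenerator):

from typing import Dict

class ToyMultiWriterIdGen:
    """Tracks one stream position per writer instance."""

    def __init__(self) -> None:
        self._positions: Dict[str, int] = {}

    def advance(self, instance_name: str, token: int) -> None:
        # Record the furthest position seen for this writer, as the
        # process_replication_position overrides above do.
        current = self._positions.get(instance_name, 0)
        self._positions[instance_name] = max(current, token)

    def get_current_token(self) -> int:
        # Global token: bounded by the quietest writer's position.
        return min(self._positions.values(), default=0)

    def get_current_token_for_writer(self, instance_name: str) -> int:
        # Per-writer token: only this writer's own position matters.
        return self._positions.get(instance_name, 0)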

synapse/storage/databases/main/room.py
@@ -43,6 +43,7 @@ from synapse.api.errors import StoreError
 from synapse.api.room_versions import RoomVersion, RoomVersions
 from synapse.config.homeserver import HomeServerConfig
 from synapse.events import EventBase
+from synapse.replication.tcp.streams.partial_state import UnPartialStatedRoomStream
 from synapse.storage._base import SQLBaseStore, db_to_json, make_in_list_sql_clause
 from synapse.storage.database import (
     DatabasePool,
@@ -144,6 +145,13 @@ class RoomWorkerStore(CacheInvalidationWorkerStore):
             "stream_id",
         )

+    def process_replication_position(
+        self, stream_name: str, instance_name: str, token: int
+    ) -> None:
+        if stream_name == UnPartialStatedRoomStream.NAME:
+            self._un_partial_stated_rooms_stream_id_gen.advance(instance_name, token)
+        return super().process_replication_position(stream_name, instance_name, token)
+
     async def store_room(
         self,
         room_id: str,
@@ -1281,13 +1289,10 @@ class RoomWorkerStore(CacheInvalidationWorkerStore):
         )
         return result["join_event_id"], result["device_lists_stream_id"]

-    def get_un_partial_stated_rooms_token(self) -> int:
-        # TODO(faster_joins, multiple writers): This is inappropriate if there
-        # are multiple writers because workers that don't write often will
-        # hold all readers up.
-        # (See `MultiWriterIdGenerator.get_persisted_upto_position` for an
-        # explanation.)
-        return self._un_partial_stated_rooms_stream_id_gen.get_current_token()
+    def get_un_partial_stated_rooms_token(self, instance_name: str) -> int:
+        return self._un_partial_stated_rooms_stream_id_gen.get_current_token_for_writer(
+            instance_name
+        )

     async def get_un_partial_stated_rooms_from_stream(
         self, instance_name: str, last_id: int, current_id: int, limit: int

synapse/storage/databases/main/state.py
@@ -95,6 +95,7 @@ class StateGroupWorkerStore(EventsWorkerStore, SQLBaseStore):
             for row in rows:
                 assert isinstance(row, UnPartialStatedEventStreamRow)
                 self._get_state_group_for_event.invalidate((row.event_id,))
+                self.is_partial_state_event.invalidate((row.event_id,))

         super().process_replication_rows(stream_name, instance_name, token, rows)
@@ -485,6 +486,7 @@ class StateGroupWorkerStore(EventsWorkerStore, SQLBaseStore):
                 "rejection_status_changed": rejection_status_changed,
             },
         )
+        txn.call_after(self.hs.get_notifier().on_new_replication_data)


 class MainStateBackgroundUpdateStore(RoomMemberWorkerStore):
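The txn.call_after(...) line leans on a Synapse convention worth spelling out:
callbacks registered on a transaction run only after it commits, so the
notifier never announces a write that could still be rolled back. A simplified
sketch of that mechanism (an assumed shape, not the real LoggingTransaction):

from typing import Any, Callable, List, Tuple

class ToyTransaction:
    def __init__(self) -> None:
        self._after_callbacks: List[Tuple[Callable[..., None], Tuple[Any, ...]]] = []

    def call_after(self, callback: Callable[..., None], *args: Any) -> None:
        # Defer the callback until the transaction has committed.
        self._after_callbacks.append((callback, args))

    def commit(self) -> None:
        # ... perform the database writes and commit here ...
        for callback, args in self._after_callbacks:
            callback(*args)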
