Shuffle persist event data store functions. (#7440)

The aim here is to get to a stage where we have a `PersistEventsStore` that holds all the write methods used during event persistence, so that we can take that class out of the `DataStore` mixin and instantiate it separately. This will allow us to instantiate it on processes other than the master, while ensuring it is only available on processes that are configured to write to the events stream.

This is a bit of an architectural change: we end up with multiple classes per data store (rather than the single class per data store we have now). We end up having (sketched in code below the list):

1. Storage classes that provide high level APIs that can talk to multiple data stores.
2. Data store modules that consist of classes that must point at the same database instance.
3. Classes within a data store that are only instantiated on certain processes, depending on config.
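As a rough guide, here is a minimal sketch of how the pieces fit together after this change. The class and attribute names mirror the diff below, but the constructor arguments and method bodies are illustrative only:

```python
# Sketch only: mirrors the structure introduced by this change, not the exact API.

class EventsWorkerStore:
    """Read-side event methods; safe to run on any process."""

    def get_event(self, event_id):
        ...


class PersistEventsStore:
    """Write-side event methods; only built on processes that persist events."""

    def __init__(self, hs, database, main_data_store):
        self.db = database
        self.store = main_data_store  # read helpers stay on the main store

    def _persist_events_and_state_updates(self, events_and_contexts, **kwargs):
        ...


class DataStores:
    """Holds the store objects for each configured database."""

    def __init__(self, main_store_class, database, db_conn, hs):
        self.main = main_store_class(database, db_conn, hs)
        self.persist_events = None
        # Only the process configured to write to the events stream
        # (currently the master) gets the write-side store.
        if hs.config.worker.worker_app is None:
            self.persist_events = PersistEventsStore(hs, database, self.main)
```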
Erik Johnston (committed via GitHub)
commit 782e4e64df, parent 7ee24c5674
Changed files (lines changed in parentheses):

1. changelog.d/7440.misc (1)
2. synapse/storage/data_stores/__init__.py (9)
3. synapse/storage/data_stores/main/__init__.py (8)
4. synapse/storage/data_stores/main/censor_events.py (213)
5. synapse/storage/data_stores/main/event_federation.py (83)
6. synapse/storage/data_stores/main/event_push_actions.py (74)
7. synapse/storage/data_stores/main/events.py (1216)
8. synapse/storage/data_stores/main/events_worker.py (152)
9. synapse/storage/data_stores/main/metrics.py (128)
10. synapse/storage/data_stores/main/purge_events.py (399)
11. synapse/storage/data_stores/main/rejections.py (11)
12. synapse/storage/data_stores/main/relations.py (60)
13. synapse/storage/data_stores/main/room.py (49)
14. synapse/storage/data_stores/main/roommember.py (90)
15. synapse/storage/data_stores/main/search.py (23)
16. synapse/storage/data_stores/main/signatures.py (30)
17. synapse/storage/data_stores/main/state.py (35)
18. synapse/storage/persist_events.py (27)
19. tests/storage/test_event_push_actions.py (3)

changelog.d/7440.misc
@@ -0,0 +1 @@
Refactor event persistence database functions in preparation for allowing them to be run on non-master processes.

synapse/storage/data_stores/__init__.py
@@ -15,6 +15,7 @@
import logging
from synapse.storage.data_stores.main.events import PersistEventsStore
from synapse.storage.data_stores.state import StateGroupDataStore
from synapse.storage.database import Database, make_conn
from synapse.storage.engines import create_engine
@@ -39,6 +40,7 @@ class DataStores(object):
self.databases = []
self.main = None
self.state = None
self.persist_events = None
for database_config in hs.config.database.databases:
db_name = database_config.name
@@ -64,6 +66,13 @@ class DataStores(object):
self.main = main_store_class(database, db_conn, hs)
# If we're on a process that can persist events (currently
# master), also instantiate a `PersistEventsStore`
if hs.config.worker.worker_app is None:
self.persist_events = PersistEventsStore(
hs, database, self.main
)
if "state" in database_config.data_stores:
logger.info("Starting 'state' data store")

synapse/storage/data_stores/main/__init__.py
@@ -34,6 +34,7 @@ from synapse.util.caches.stream_change_cache import StreamChangeCache
from .account_data import AccountDataStore
from .appservice import ApplicationServiceStore, ApplicationServiceTransactionStore
from .cache import CacheInvalidationWorkerStore
from .censor_events import CensorEventsStore
from .client_ips import ClientIpStore
from .deviceinbox import DeviceInboxStore
from .devices import DeviceStore
@@ -42,16 +43,17 @@ from .e2e_room_keys import EndToEndRoomKeyStore
from .end_to_end_keys import EndToEndKeyStore
from .event_federation import EventFederationStore
from .event_push_actions import EventPushActionsStore
from .events import EventsStore
from .events_bg_updates import EventsBackgroundUpdatesStore
from .filtering import FilteringStore
from .group_server import GroupServerStore
from .keys import KeyStore
from .media_repository import MediaRepositoryStore
from .metrics import ServerMetricsStore
from .monthly_active_users import MonthlyActiveUsersStore
from .openid import OpenIdStore
from .presence import PresenceStore, UserPresenceState
from .profile import ProfileStore
from .purge_events import PurgeEventsStore
from .push_rule import PushRuleStore
from .pusher import PusherStore
from .receipts import ReceiptsStore
@@ -88,7 +90,7 @@ class DataStore(
StateStore,
SignatureStore,
ApplicationServiceStore,
EventsStore,
PurgeEventsStore,
EventFederationStore,
MediaRepositoryStore,
RejectionsStore,
@@ -113,8 +115,10 @@ class DataStore(
MonthlyActiveUsersStore,
StatsStore,
RelationsStore,
CensorEventsStore,
UIAuthStore,
CacheInvalidationWorkerStore,
ServerMetricsStore,
):
def __init__(self, database: Database, db_conn, hs):
self.hs = hs

synapse/storage/data_stores/main/censor_events.py
@@ -0,0 +1,213 @@
# -*- coding: utf-8 -*-
# Copyright 2020 The Matrix.org Foundation C.I.C.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from typing import TYPE_CHECKING
from twisted.internet import defer
from synapse.events.utils import prune_event_dict
from synapse.metrics.background_process_metrics import run_as_background_process
from synapse.storage._base import SQLBaseStore
from synapse.storage.data_stores.main.cache import CacheInvalidationWorkerStore
from synapse.storage.data_stores.main.events import encode_json
from synapse.storage.data_stores.main.events_worker import EventsWorkerStore
from synapse.storage.database import Database
if TYPE_CHECKING:
from synapse.server import HomeServer
logger = logging.getLogger(__name__)
class CensorEventsStore(CacheInvalidationWorkerStore, EventsWorkerStore, SQLBaseStore):
def __init__(self, database: Database, db_conn, hs: "HomeServer"):
super().__init__(database, db_conn, hs)
# This should only exist on master for now
assert (
hs.config.worker.worker_app is None
), "Can only instantiate CensorEventsStore on master"
def _censor_redactions():
return run_as_background_process(
"_censor_redactions", self._censor_redactions
)
if self.hs.config.redaction_retention_period is not None:
hs.get_clock().looping_call(_censor_redactions, 5 * 60 * 1000)
async def _censor_redactions(self):
"""Censors all redactions older than the configured period that haven't
been censored yet.
By censor we mean update the event_json table with the redacted event.
"""
if self.hs.config.redaction_retention_period is None:
return
if not (
await self.db.updates.has_completed_background_update(
"redactions_have_censored_ts_idx"
)
):
# We don't want to run this until the appropriate index has been
# created.
return
before_ts = self._clock.time_msec() - self.hs.config.redaction_retention_period
# We fetch all redactions that:
# 1. point to an event we have,
# 2. has a received_ts from before the cut off, and
# 3. we haven't yet censored.
#
# This is limited to 100 events to ensure that we don't try and do too
# much at once. We'll get called again so this should eventually catch
# up.
sql = """
SELECT redactions.event_id, redacts FROM redactions
LEFT JOIN events AS original_event ON (
redacts = original_event.event_id
)
WHERE NOT have_censored
AND redactions.received_ts <= ?
ORDER BY redactions.received_ts ASC
LIMIT ?
"""
rows = await self.db.execute(
"_censor_redactions_fetch", None, sql, before_ts, 100
)
updates = []
for redaction_id, event_id in rows:
redaction_event = await self.get_event(redaction_id, allow_none=True)
original_event = await self.get_event(
event_id, allow_rejected=True, allow_none=True
)
# The SQL above ensures that we have both the redaction and
# original event, so if the `get_event` calls return None it
# means that the redaction wasn't allowed. Either way we know that
# the result won't change so we mark the fact that we've checked.
if (
redaction_event
and original_event
and original_event.internal_metadata.is_redacted()
):
# Redaction was allowed
pruned_json = encode_json(
prune_event_dict(
original_event.room_version, original_event.get_dict()
)
)
else:
# Redaction wasn't allowed
pruned_json = None
updates.append((redaction_id, event_id, pruned_json))
def _update_censor_txn(txn):
for redaction_id, event_id, pruned_json in updates:
if pruned_json:
self._censor_event_txn(txn, event_id, pruned_json)
self.db.simple_update_one_txn(
txn,
table="redactions",
keyvalues={"event_id": redaction_id},
updatevalues={"have_censored": True},
)
await self.db.runInteraction("_update_censor_txn", _update_censor_txn)
def _censor_event_txn(self, txn, event_id, pruned_json):
"""Censor an event by replacing its JSON in the event_json table with the
provided pruned JSON.
Args:
txn (LoggingTransaction): The database transaction.
event_id (str): The ID of the event to censor.
pruned_json (str): The pruned JSON
"""
self.db.simple_update_one_txn(
txn,
table="event_json",
keyvalues={"event_id": event_id},
updatevalues={"json": pruned_json},
)
@defer.inlineCallbacks
def expire_event(self, event_id):
"""Retrieve and expire an event that has expired, and delete its associated
expiry timestamp. If the event can't be retrieved, delete its associated
timestamp so we don't try to expire it again in the future.
Args:
event_id (str): The ID of the event to delete.
"""
# Try to retrieve the event's content from the database or the event cache.
event = yield self.get_event(event_id)
def delete_expired_event_txn(txn):
# Delete the expiry timestamp associated with this event from the database.
self._delete_event_expiry_txn(txn, event_id)
if not event:
# If we can't find the event, log a warning and delete the expiry date
# from the database so that we don't try to expire it again in the
# future.
logger.warning(
"Can't expire event %s because we don't have it.", event_id
)
return
# Prune the event's dict then convert it to JSON.
pruned_json = encode_json(
prune_event_dict(event.room_version, event.get_dict())
)
# Update the event_json table to replace the event's JSON with the pruned
# JSON.
self._censor_event_txn(txn, event.event_id, pruned_json)
# We need to invalidate the event cache entry for this event because we
# changed its content in the database. We can't call
# self._invalidate_cache_and_stream because self.get_event_cache isn't of the
# right type.
txn.call_after(self._get_event_cache.invalidate, (event.event_id,))
# Send that invalidation to replication so that other workers also invalidate
# the event cache.
self._send_invalidation_to_replication(
txn, "_get_event_cache", (event.event_id,)
)
yield self.db.runInteraction("delete_expired_event", delete_expired_event_txn)
def _delete_event_expiry_txn(self, txn, event_id):
"""Delete the expiry timestamp associated with an event ID without deleting the
actual event.
Args:
txn (LoggingTransaction): The transaction to use to perform the deletion.
event_id (str): The event ID to delete the associated expiry timestamp of.
"""
return self.db.simple_delete_txn(
txn=txn, table="event_expiry", keyvalues={"event_id": event_id}
)
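Both `_censor_redactions` and `expire_event` above boil down to the same transformation: replace the stored JSON with a pruned copy. A small sketch of that step in isolation (the helper name `pruned_json_for` is made up for illustration; `prune_event_dict` and `encode_json` are the imports at the top of this new file):

```python
from synapse.events.utils import prune_event_dict
from synapse.storage.data_stores.main.events import encode_json


def pruned_json_for(original_event):
    # Strip everything that must not survive redaction, keyed off the event's
    # room version, then re-serialise so the result can overwrite event_json.json.
    return encode_json(
        prune_event_dict(original_event.room_version, original_event.get_dict())
    )
```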

synapse/storage/data_stores/main/event_federation.py
@@ -640,89 +640,6 @@ class EventFederationStore(EventFederationWorkerStore):
self._delete_old_forward_extrem_cache, 60 * 60 * 1000
)
def _update_min_depth_for_room_txn(self, txn, room_id, depth):
min_depth = self._get_min_depth_interaction(txn, room_id)
if min_depth is not None and depth >= min_depth:
return
self.db.simple_upsert_txn(
txn,
table="room_depth",
keyvalues={"room_id": room_id},
values={"min_depth": depth},
)
def _handle_mult_prev_events(self, txn, events):
"""
For the given event, update the event edges table and forward and
backward extremities tables.
"""
self.db.simple_insert_many_txn(
txn,
table="event_edges",
values=[
{
"event_id": ev.event_id,
"prev_event_id": e_id,
"room_id": ev.room_id,
"is_state": False,
}
for ev in events
for e_id in ev.prev_event_ids()
],
)
self._update_backward_extremeties(txn, events)
def _update_backward_extremeties(self, txn, events):
"""Updates the event_backward_extremities tables based on the new/updated
events being persisted.
This is called for new events *and* for events that were outliers, but
are now being persisted as non-outliers.
Forward extremities are handled when we first start persisting the events.
"""
events_by_room = {}
for ev in events:
events_by_room.setdefault(ev.room_id, []).append(ev)
query = (
"INSERT INTO event_backward_extremities (event_id, room_id)"
" SELECT ?, ? WHERE NOT EXISTS ("
" SELECT 1 FROM event_backward_extremities"
" WHERE event_id = ? AND room_id = ?"
" )"
" AND NOT EXISTS ("
" SELECT 1 FROM events WHERE event_id = ? AND room_id = ? "
" AND outlier = ?"
" )"
)
txn.executemany(
query,
[
(e_id, ev.room_id, e_id, ev.room_id, e_id, ev.room_id, False)
for ev in events
for e_id in ev.prev_event_ids()
if not ev.internal_metadata.is_outlier()
],
)
query = (
"DELETE FROM event_backward_extremities"
" WHERE event_id = ? AND room_id = ?"
)
txn.executemany(
query,
[
(ev.event_id, ev.room_id)
for ev in events
if not ev.internal_metadata.is_outlier()
],
)
def _delete_old_forward_extrem_cache(self):
def _delete_old_forward_extrem_cache_txn(txn):
# Delete entries older than a month, while making sure we don't delete

synapse/storage/data_stores/main/event_push_actions.py
@@ -652,69 +652,6 @@ class EventPushActionsStore(EventPushActionsWorkerStore):
self._start_rotate_notifs, 30 * 60 * 1000
)
def _set_push_actions_for_event_and_users_txn(
self, txn, events_and_contexts, all_events_and_contexts
):
"""Handles moving push actions from staging table to main
event_push_actions table for all events in `events_and_contexts`.
Also ensures that all events in `all_events_and_contexts` are removed
from the push action staging area.
Args:
events_and_contexts (list[(EventBase, EventContext)]): events
we are persisting
all_events_and_contexts (list[(EventBase, EventContext)]): all
events that we were going to persist. This includes events
we've already persisted, etc, that wouldn't appear in
events_and_context.
"""
sql = """
INSERT INTO event_push_actions (
room_id, event_id, user_id, actions, stream_ordering,
topological_ordering, notif, highlight
)
SELECT ?, event_id, user_id, actions, ?, ?, notif, highlight
FROM event_push_actions_staging
WHERE event_id = ?
"""
if events_and_contexts:
txn.executemany(
sql,
(
(
event.room_id,
event.internal_metadata.stream_ordering,
event.depth,
event.event_id,
)
for event, _ in events_and_contexts
),
)
for event, _ in events_and_contexts:
user_ids = self.db.simple_select_onecol_txn(
txn,
table="event_push_actions_staging",
keyvalues={"event_id": event.event_id},
retcol="user_id",
)
for uid in user_ids:
txn.call_after(
self.get_unread_event_push_actions_by_room_for_user.invalidate_many,
(event.room_id, uid),
)
# Now we delete the staging area for *all* events that were being
# persisted.
txn.executemany(
"DELETE FROM event_push_actions_staging WHERE event_id = ?",
((event.event_id,) for event, _ in all_events_and_contexts),
)
@defer.inlineCallbacks
def get_push_actions_for_user(
self, user_id, before=None, limit=50, only_highlight=False
@@ -763,17 +700,6 @@ class EventPushActionsStore(EventPushActionsWorkerStore):
)
return result[0] or 0
def _remove_push_actions_for_event_id_txn(self, txn, room_id, event_id):
# Sad that we have to blow away the cache for the whole room here
txn.call_after(
self.get_unread_event_push_actions_by_room_for_user.invalidate_many,
(room_id,),
)
txn.execute(
"DELETE FROM event_push_actions WHERE room_id = ? AND event_id = ?",
(room_id, event_id),
)
def _remove_old_push_actions_before_txn(
self, txn, room_id, user_id, stream_ordering
):

synapse/storage/data_stores/main/events.py: file diff suppressed because it is too large.

synapse/storage/data_stores/main/events_worker.py
@@ -27,7 +27,7 @@ from constantly import NamedConstant, Names
from twisted.internet import defer
from synapse.api.constants import EventTypes
from synapse.api.errors import NotFoundError
from synapse.api.errors import NotFoundError, SynapseError
from synapse.api.room_versions import (
KNOWN_ROOM_VERSIONS,
EventFormatVersions,
@@ -40,7 +40,7 @@ from synapse.metrics.background_process_metrics import run_as_background_process
from synapse.storage._base import SQLBaseStore, make_in_list_sql_clause
from synapse.storage.database import Database
from synapse.types import get_domain_from_id
from synapse.util.caches.descriptors import Cache
from synapse.util.caches.descriptors import Cache, cached, cachedInlineCallbacks
from synapse.util.iterutils import batch_iter
from synapse.util.metrics import Measure
@@ -1157,4 +1157,152 @@ class EventsWorkerStore(SQLBaseStore):
rows = await self.db.runInteraction(
"get_deltas_for_stream_id", get_deltas_for_stream_id_txn, to_token
)
return rows, to_token, True
@cached(num_args=5, max_entries=10)
def get_all_new_events(
self,
last_backfill_id,
last_forward_id,
current_backfill_id,
current_forward_id,
limit,
):
"""Get all the new events that have arrived at the server either as
new events or as backfilled events"""
have_backfill_events = last_backfill_id != current_backfill_id
have_forward_events = last_forward_id != current_forward_id
if not have_backfill_events and not have_forward_events:
return defer.succeed(AllNewEventsResult([], [], [], [], []))
def get_all_new_events_txn(txn):
sql = (
"SELECT e.stream_ordering, e.event_id, e.room_id, e.type,"
" state_key, redacts"
" FROM events AS e"
" LEFT JOIN redactions USING (event_id)"
" LEFT JOIN state_events USING (event_id)"
" WHERE ? < stream_ordering AND stream_ordering <= ?"
" ORDER BY stream_ordering ASC"
" LIMIT ?"
)
if have_forward_events:
txn.execute(sql, (last_forward_id, current_forward_id, limit))
new_forward_events = txn.fetchall()
if len(new_forward_events) == limit:
upper_bound = new_forward_events[-1][0]
else:
upper_bound = current_forward_id
sql = (
"SELECT event_stream_ordering, event_id, state_group"
" FROM ex_outlier_stream"
" WHERE ? > event_stream_ordering"
" AND event_stream_ordering >= ?"
" ORDER BY event_stream_ordering DESC"
)
txn.execute(sql, (last_forward_id, upper_bound))
forward_ex_outliers = txn.fetchall()
else:
new_forward_events = []
forward_ex_outliers = []
sql = (
"SELECT -e.stream_ordering, e.event_id, e.room_id, e.type,"
" state_key, redacts"
" FROM events AS e"
" LEFT JOIN redactions USING (event_id)"
" LEFT JOIN state_events USING (event_id)"
" WHERE ? > stream_ordering AND stream_ordering >= ?"
" ORDER BY stream_ordering DESC"
" LIMIT ?"
)
if have_backfill_events:
txn.execute(sql, (-last_backfill_id, -current_backfill_id, limit))
new_backfill_events = txn.fetchall()
if len(new_backfill_events) == limit:
upper_bound = new_backfill_events[-1][0]
else:
upper_bound = current_backfill_id
sql = (
"SELECT -event_stream_ordering, event_id, state_group"
" FROM ex_outlier_stream"
" WHERE ? > event_stream_ordering"
" AND event_stream_ordering >= ?"
" ORDER BY event_stream_ordering DESC"
)
txn.execute(sql, (-last_backfill_id, -upper_bound))
backward_ex_outliers = txn.fetchall()
else:
new_backfill_events = []
backward_ex_outliers = []
return AllNewEventsResult(
new_forward_events,
new_backfill_events,
forward_ex_outliers,
backward_ex_outliers,
)
return self.db.runInteraction("get_all_new_events", get_all_new_events_txn)
async def is_event_after(self, event_id1, event_id2):
"""Returns True if event_id1 is after event_id2 in the stream
"""
to_1, so_1 = await self._get_event_ordering(event_id1)
to_2, so_2 = await self._get_event_ordering(event_id2)
return (to_1, so_1) > (to_2, so_2)
@cachedInlineCallbacks(max_entries=5000)
def _get_event_ordering(self, event_id):
res = yield self.db.simple_select_one(
table="events",
retcols=["topological_ordering", "stream_ordering"],
keyvalues={"event_id": event_id},
allow_none=True,
)
if not res:
raise SynapseError(404, "Could not find event %s" % (event_id,))
return (int(res["topological_ordering"]), int(res["stream_ordering"]))
def get_next_event_to_expire(self):
"""Retrieve the entry with the lowest expiry timestamp in the event_expiry
table, or None if there's no more event to expire.
Returns: Deferred[Optional[Tuple[str, int]]]
A tuple containing the event ID as its first element and an expiry timestamp
as its second one, if there's at least one row in the event_expiry table.
None otherwise.
"""
def get_next_event_to_expire_txn(txn):
txn.execute(
"""
SELECT event_id, expiry_ts FROM event_expiry
ORDER BY expiry_ts ASC LIMIT 1
"""
)
return txn.fetchone()
return self.db.runInteraction(
desc="get_next_event_to_expire", func=get_next_event_to_expire_txn
)
AllNewEventsResult = namedtuple(
"AllNewEventsResult",
[
"new_forward_events",
"new_backfill_events",
"forward_ex_outliers",
"backward_ex_outliers",
],
)

synapse/storage/data_stores/main/metrics.py
@@ -0,0 +1,128 @@
# -*- coding: utf-8 -*-
# Copyright 2020 The Matrix.org Foundation C.I.C.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import typing
from collections import Counter
from twisted.internet import defer
from synapse.metrics import BucketCollector
from synapse.metrics.background_process_metrics import run_as_background_process
from synapse.storage._base import SQLBaseStore
from synapse.storage.data_stores.main.event_push_actions import (
EventPushActionsWorkerStore,
)
from synapse.storage.database import Database
class ServerMetricsStore(EventPushActionsWorkerStore, SQLBaseStore):
"""Functions to pull various metrics from the DB, for e.g. phone home
stats and prometheus metrics.
"""
def __init__(self, database: Database, db_conn, hs):
super().__init__(database, db_conn, hs)
# Collect metrics on the number of forward extremities that exist.
# Counter of number of extremities to count
self._current_forward_extremities_amount = (
Counter()
) # type: typing.Counter[int]
BucketCollector(
"synapse_forward_extremities",
lambda: self._current_forward_extremities_amount,
buckets=[1, 2, 3, 5, 7, 10, 15, 20, 50, 100, 200, 500, "+Inf"],
)
# Read the extrems every 60 minutes
def read_forward_extremities():
# run as a background process to make sure that the database transactions
# have a logcontext to report to
return run_as_background_process(
"read_forward_extremities", self._read_forward_extremities
)
hs.get_clock().looping_call(read_forward_extremities, 60 * 60 * 1000)
async def _read_forward_extremities(self):
def fetch(txn):
txn.execute(
"""
select count(*) c from event_forward_extremities
group by room_id
"""
)
return txn.fetchall()
res = await self.db.runInteraction("read_forward_extremities", fetch)
self._current_forward_extremities_amount = Counter([x[0] for x in res])
@defer.inlineCallbacks
def count_daily_messages(self):
"""
Returns an estimate of the number of messages sent in the last day.
If it has been significantly less or more than one day since the last
call to this function, it will return None.
"""
def _count_messages(txn):
sql = """
SELECT COALESCE(COUNT(*), 0) FROM events
WHERE type = 'm.room.message'
AND stream_ordering > ?
"""
txn.execute(sql, (self.stream_ordering_day_ago,))
(count,) = txn.fetchone()
return count
ret = yield self.db.runInteraction("count_messages", _count_messages)
return ret
@defer.inlineCallbacks
def count_daily_sent_messages(self):
def _count_messages(txn):
# This is good enough as if you have silly characters in your own
# hostname then thats your own fault.
like_clause = "%:" + self.hs.hostname
sql = """
SELECT COALESCE(COUNT(*), 0) FROM events
WHERE type = 'm.room.message'
AND sender LIKE ?
AND stream_ordering > ?
"""
txn.execute(sql, (like_clause, self.stream_ordering_day_ago))
(count,) = txn.fetchone()
return count
ret = yield self.db.runInteraction("count_daily_sent_messages", _count_messages)
return ret
@defer.inlineCallbacks
def count_daily_active_rooms(self):
def _count(txn):
sql = """
SELECT COALESCE(COUNT(DISTINCT room_id), 0) FROM events
WHERE type = 'm.room.message'
AND stream_ordering > ?
"""
txn.execute(sql, (self.stream_ordering_day_ago,))
(count,) = txn.fetchone()
return count
ret = yield self.db.runInteraction("count_daily_active_rooms", _count)
return ret

synapse/storage/data_stores/main/purge_events.py
@@ -0,0 +1,399 @@
# -*- coding: utf-8 -*-
# Copyright 2020 The Matrix.org Foundation C.I.C.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from typing import Any, Tuple
from synapse.api.errors import SynapseError
from synapse.storage._base import SQLBaseStore
from synapse.storage.data_stores.main.state import StateGroupWorkerStore
from synapse.types import RoomStreamToken
logger = logging.getLogger(__name__)
class PurgeEventsStore(StateGroupWorkerStore, SQLBaseStore):
def purge_history(self, room_id, token, delete_local_events):
"""Deletes room history before a certain point
Args:
room_id (str):
token (str): A topological token to delete events before
delete_local_events (bool):
if True, we will delete local events as well as remote ones
(instead of just marking them as outliers and deleting their
state groups).
Returns:
Deferred[set[int]]: The set of state groups that are referenced by
deleted events.
"""
return self.db.runInteraction(
"purge_history",
self._purge_history_txn,
room_id,
token,
delete_local_events,
)
def _purge_history_txn(self, txn, room_id, token_str, delete_local_events):
token = RoomStreamToken.parse(token_str)
# Tables that should be pruned:
# event_auth
# event_backward_extremities
# event_edges
# event_forward_extremities
# event_json
# event_push_actions
# event_reference_hashes
# event_search
# event_to_state_groups
# events
# rejections
# room_depth
# state_groups
# state_groups_state
# we will build a temporary table listing the events so that we don't
# have to keep shovelling the list back and forth across the
# connection. Annoyingly the python sqlite driver commits the
# transaction on CREATE, so let's do this first.
#
# furthermore, we might already have the table from a previous (failed)
# purge attempt, so let's drop the table first.
txn.execute("DROP TABLE IF EXISTS events_to_purge")
txn.execute(
"CREATE TEMPORARY TABLE events_to_purge ("
" event_id TEXT NOT NULL,"
" should_delete BOOLEAN NOT NULL"
")"
)
# First ensure that we're not about to delete all the forward extremeties
txn.execute(
"SELECT e.event_id, e.depth FROM events as e "
"INNER JOIN event_forward_extremities as f "
"ON e.event_id = f.event_id "
"AND e.room_id = f.room_id "
"WHERE f.room_id = ?",
(room_id,),
)
rows = txn.fetchall()
max_depth = max(row[1] for row in rows)
if max_depth < token.topological:
# We need to ensure we don't delete all the events from the database
# otherwise we wouldn't be able to send any events (due to not
# having any backwards extremeties)
raise SynapseError(
400, "topological_ordering is greater than forward extremeties"
)
logger.info("[purge] looking for events to delete")
should_delete_expr = "state_key IS NULL"
should_delete_params = () # type: Tuple[Any, ...]
if not delete_local_events:
should_delete_expr += " AND event_id NOT LIKE ?"
# We include the parameter twice since we use the expression twice
should_delete_params += ("%:" + self.hs.hostname, "%:" + self.hs.hostname)
should_delete_params += (room_id, token.topological)
# Note that we insert events that are outliers and aren't going to be
# deleted, as nothing will happen to them.
txn.execute(
"INSERT INTO events_to_purge"
" SELECT event_id, %s"
" FROM events AS e LEFT JOIN state_events USING (event_id)"
" WHERE (NOT outlier OR (%s)) AND e.room_id = ? AND topological_ordering < ?"
% (should_delete_expr, should_delete_expr),
should_delete_params,
)
# We create the indices *after* insertion as that's a lot faster.
# create an index on should_delete because later we'll be looking for
# the should_delete / shouldn't_delete subsets
txn.execute(
"CREATE INDEX events_to_purge_should_delete"
" ON events_to_purge(should_delete)"
)
# We do joins against events_to_purge for e.g. calculating state
# groups to purge, etc., so lets make an index.
txn.execute("CREATE INDEX events_to_purge_id ON events_to_purge(event_id)")
txn.execute("SELECT event_id, should_delete FROM events_to_purge")
event_rows = txn.fetchall()
logger.info(
"[purge] found %i events before cutoff, of which %i can be deleted",
len(event_rows),
sum(1 for e in event_rows if e[1]),
)
logger.info("[purge] Finding new backward extremities")
# We calculate the new entries for the backward extremeties by finding
# events to be purged that are pointed to by events we're not going to
# purge.
txn.execute(
"SELECT DISTINCT e.event_id FROM events_to_purge AS e"
" INNER JOIN event_edges AS ed ON e.event_id = ed.prev_event_id"
" LEFT JOIN events_to_purge AS ep2 ON ed.event_id = ep2.event_id"
" WHERE ep2.event_id IS NULL"
)
new_backwards_extrems = txn.fetchall()
logger.info("[purge] replacing backward extremities: %r", new_backwards_extrems)
txn.execute(
"DELETE FROM event_backward_extremities WHERE room_id = ?", (room_id,)
)
# Update backward extremeties
txn.executemany(
"INSERT INTO event_backward_extremities (room_id, event_id)"
" VALUES (?, ?)",
[(room_id, event_id) for event_id, in new_backwards_extrems],
)
logger.info("[purge] finding state groups referenced by deleted events")
# Get all state groups that are referenced by events that are to be
# deleted.
txn.execute(
"""
SELECT DISTINCT state_group FROM events_to_purge
INNER JOIN event_to_state_groups USING (event_id)
"""
)
referenced_state_groups = {sg for sg, in txn}
logger.info(
"[purge] found %i referenced state groups", len(referenced_state_groups)
)
logger.info("[purge] removing events from event_to_state_groups")
txn.execute(
"DELETE FROM event_to_state_groups "
"WHERE event_id IN (SELECT event_id from events_to_purge)"
)
for event_id, _ in event_rows:
txn.call_after(self._get_state_group_for_event.invalidate, (event_id,))
# Delete all remote non-state events
for table in (
"events",
"event_json",
"event_auth",
"event_edges",
"event_forward_extremities",
"event_reference_hashes",
"event_search",
"rejections",
):
logger.info("[purge] removing events from %s", table)
txn.execute(
"DELETE FROM %s WHERE event_id IN ("
" SELECT event_id FROM events_to_purge WHERE should_delete"
")" % (table,)
)
# event_push_actions lacks an index on event_id, and has one on
# (room_id, event_id) instead.
for table in ("event_push_actions",):
logger.info("[purge] removing events from %s", table)
txn.execute(
"DELETE FROM %s WHERE room_id = ? AND event_id IN ("
" SELECT event_id FROM events_to_purge WHERE should_delete"
")" % (table,),
(room_id,),
)
# Mark all state and own events as outliers
logger.info("[purge] marking remaining events as outliers")
txn.execute(
"UPDATE events SET outlier = ?"
" WHERE event_id IN ("
" SELECT event_id FROM events_to_purge "
" WHERE NOT should_delete"
")",
(True,),
)
# synapse tries to take out an exclusive lock on room_depth whenever it
# persists events (because upsert), and once we run this update, we
# will block that for the rest of our transaction.
#
# So, let's stick it at the end so that we don't block event
# persistence.
#
# We do this by calculating the minimum depth of the backwards
# extremities. However, the events in event_backward_extremities
# are ones we don't have yet so we need to look at the events that
# point to it via event_edges table.
txn.execute(
"""
SELECT COALESCE(MIN(depth), 0)
FROM event_backward_extremities AS eb
INNER JOIN event_edges AS eg ON eg.prev_event_id = eb.event_id
INNER JOIN events AS e ON e.event_id = eg.event_id
WHERE eb.room_id = ?
""",
(room_id,),
)
(min_depth,) = txn.fetchone()
logger.info("[purge] updating room_depth to %d", min_depth)
txn.execute(
"UPDATE room_depth SET min_depth = ? WHERE room_id = ?",
(min_depth, room_id),
)
# finally, drop the temp table. this will commit the txn in sqlite,
# so make sure to keep this actually last.
txn.execute("DROP TABLE events_to_purge")
logger.info("[purge] done")
return referenced_state_groups
def purge_room(self, room_id):
"""Deletes all record of a room
Args:
room_id (str)
Returns:
Deferred[List[int]]: The list of state groups to delete.
"""
return self.db.runInteraction("purge_room", self._purge_room_txn, room_id)
def _purge_room_txn(self, txn, room_id):
# First we fetch all the state groups that should be deleted, before
# we delete that information.
txn.execute(
"""
SELECT DISTINCT state_group FROM events
INNER JOIN event_to_state_groups USING(event_id)
WHERE events.room_id = ?
""",
(room_id,),
)
state_groups = [row[0] for row in txn]
# Now we delete tables which lack an index on room_id but have one on event_id
for table in (
"event_auth",
"event_edges",
"event_push_actions_staging",
"event_reference_hashes",
"event_relations",
"event_to_state_groups",
"redactions",
"rejections",
"state_events",
):
logger.info("[purge] removing %s from %s", room_id, table)
txn.execute(
"""
DELETE FROM %s WHERE event_id IN (
SELECT event_id FROM events WHERE room_id=?
)
"""
% (table,),
(room_id,),
)
# and finally, the tables with an index on room_id (or no useful index)
for table in (
"current_state_events",
"event_backward_extremities",
"event_forward_extremities",
"event_json",
"event_push_actions",
"event_search",
"events",
"group_rooms",
"public_room_list_stream",
"receipts_graph",
"receipts_linearized",
"room_aliases",
"room_depth",
"room_memberships",
"room_stats_state",
"room_stats_current",
"room_stats_historical",
"room_stats_earliest_token",
"rooms",
"stream_ordering_to_exterm",
"users_in_public_rooms",
"users_who_share_private_rooms",
# no useful index, but let's clear them anyway
"appservice_room_list",
"e2e_room_keys",
"event_push_summary",
"pusher_throttle",
"group_summary_rooms",
"local_invites",
"room_account_data",
"room_tags",
"local_current_membership",
):
logger.info("[purge] removing %s from %s", room_id, table)
txn.execute("DELETE FROM %s WHERE room_id=?" % (table,), (room_id,))
# Other tables we do NOT need to clear out:
#
# - blocked_rooms
# This is important, to make sure that we don't accidentally rejoin a blocked
# room after it was purged
#
# - user_directory
# This has a room_id column, but it is unused
#
# Other tables that we might want to consider clearing out include:
#
# - event_reports
# Given that these are intended for abuse management my initial
# inclination is to leave them in place.
#
# - current_state_delta_stream
# - ex_outlier_stream
# - room_tags_revisions
# The problem with these is that they are largeish and there is no room_id
# index on them. In any case we should be clearing out 'stream' tables
# periodically anyway (#5888)
# TODO: we could probably usefully do a bunch of cache invalidation here
logger.info("[purge] done")
return state_groups

synapse/storage/data_stores/main/rejections.py
@@ -21,17 +21,6 @@ logger = logging.getLogger(__name__)
class RejectionsStore(SQLBaseStore):
def _store_rejections_txn(self, txn, event_id, reason):
self.db.simple_insert_txn(
txn,
table="rejections",
values={
"event_id": event_id,
"reason": reason,
"last_check": self._clock.time_msec(),
},
)
def get_rejection_reason(self, event_id):
return self.db.simple_select_one_onecol(
table="rejections",

synapse/storage/data_stores/main/relations.py
@@ -324,62 +324,4 @@ class RelationsWorkerStore(SQLBaseStore):
class RelationsStore(RelationsWorkerStore):
def _handle_event_relations(self, txn, event):
"""Handles inserting relation data during peristence of events
Args:
txn
event (EventBase)
"""
relation = event.content.get("m.relates_to")
if not relation:
# No relations
return
rel_type = relation.get("rel_type")
if rel_type not in (
RelationTypes.ANNOTATION,
RelationTypes.REFERENCE,
RelationTypes.REPLACE,
):
# Unknown relation type
return
parent_id = relation.get("event_id")
if not parent_id:
# Invalid relation
return
aggregation_key = relation.get("key")
self.db.simple_insert_txn(
txn,
table="event_relations",
values={
"event_id": event.event_id,
"relates_to_id": parent_id,
"relation_type": rel_type,
"aggregation_key": aggregation_key,
},
)
txn.call_after(self.get_relations_for_event.invalidate_many, (parent_id,))
txn.call_after(
self.get_aggregation_groups_for_event.invalidate_many, (parent_id,)
)
if rel_type == RelationTypes.REPLACE:
txn.call_after(self.get_applicable_edit.invalidate, (parent_id,))
def _handle_redaction(self, txn, redacted_event_id):
"""Handles receiving a redaction and checking whether we need to remove
any redacted relations from the database.
Args:
txn
redacted_event_id (str): The event that was redacted.
"""
self.db.simple_delete_txn(
txn, table="event_relations", keyvalues={"event_id": redacted_event_id}
)
pass

synapse/storage/data_stores/main/room.py
@@ -21,8 +21,6 @@ from abc import abstractmethod
from enum import Enum
from typing import Any, Dict, List, Optional, Tuple
from six import integer_types
from canonicaljson import json
from twisted.internet import defer
@@ -1302,53 +1300,6 @@ class RoomStore(RoomBackgroundUpdateStore, RoomWorkerStore, SearchStore):
return self.db.runInteraction("get_rooms", f)
def _store_room_topic_txn(self, txn, event):
if hasattr(event, "content") and "topic" in event.content:
self.store_event_search_txn(
txn, event, "content.topic", event.content["topic"]
)
def _store_room_name_txn(self, txn, event):
if hasattr(event, "content") and "name" in event.content:
self.store_event_search_txn(
txn, event, "content.name", event.content["name"]
)
def _store_room_message_txn(self, txn, event):
if hasattr(event, "content") and "body" in event.content:
self.store_event_search_txn(
txn, event, "content.body", event.content["body"]
)
def _store_retention_policy_for_room_txn(self, txn, event):
if hasattr(event, "content") and (
"min_lifetime" in event.content or "max_lifetime" in event.content
):
if (
"min_lifetime" in event.content
and not isinstance(event.content.get("min_lifetime"), integer_types)
) or (
"max_lifetime" in event.content
and not isinstance(event.content.get("max_lifetime"), integer_types)
):
# Ignore the event if one of the value isn't an integer.
return
self.db.simple_insert_txn(
txn=txn,
table="room_retention",
values={
"room_id": event.room_id,
"event_id": event.event_id,
"min_lifetime": event.content.get("min_lifetime"),
"max_lifetime": event.content.get("max_lifetime"),
},
)
self._invalidate_cache_and_stream(
txn, self.get_retention_policy_for_room, (event.room_id,)
)
def add_event_report(
self, room_id, event_id, user_id, reason, content, received_ts
):

synapse/storage/data_stores/main/roommember.py
@@ -1050,96 +1050,6 @@ class RoomMemberStore(RoomMemberWorkerStore, RoomMemberBackgroundUpdateStore):
def __init__(self, database: Database, db_conn, hs):
super(RoomMemberStore, self).__init__(database, db_conn, hs)
def _store_room_members_txn(self, txn, events, backfilled):
"""Store a room member in the database.
"""
self.db.simple_insert_many_txn(
txn,
table="room_memberships",
values=[
{
"event_id": event.event_id,
"user_id": event.state_key,
"sender": event.user_id,
"room_id": event.room_id,
"membership": event.membership,
"display_name": event.content.get("displayname", None),
"avatar_url": event.content.get("avatar_url", None),
}
for event in events
],
)
for event in events:
txn.call_after(
self._membership_stream_cache.entity_has_changed,
event.state_key,
event.internal_metadata.stream_ordering,
)
txn.call_after(
self.get_invited_rooms_for_local_user.invalidate, (event.state_key,)
)
# We update the local_invites table only if the event is "current",
# i.e., its something that has just happened. If the event is an
# outlier it is only current if its an "out of band membership",
# like a remote invite or a rejection of a remote invite.
is_new_state = not backfilled and (
not event.internal_metadata.is_outlier()
or event.internal_metadata.is_out_of_band_membership()
)
is_mine = self.hs.is_mine_id(event.state_key)
if is_new_state and is_mine:
if event.membership == Membership.INVITE:
self.db.simple_insert_txn(
txn,
table="local_invites",
values={
"event_id": event.event_id,
"invitee": event.state_key,
"inviter": event.sender,
"room_id": event.room_id,
"stream_id": event.internal_metadata.stream_ordering,
},
)
else:
sql = (
"UPDATE local_invites SET stream_id = ?, replaced_by = ? WHERE"
" room_id = ? AND invitee = ? AND locally_rejected is NULL"
" AND replaced_by is NULL"
)
txn.execute(
sql,
(
event.internal_metadata.stream_ordering,
event.event_id,
event.room_id,
event.state_key,
),
)
# We also update the `local_current_membership` table with
# latest invite info. This will usually get updated by the
# `current_state_events` handling, unless its an outlier.
if event.internal_metadata.is_outlier():
# This should only happen for out of band memberships, so
# we add a paranoia check.
assert event.internal_metadata.is_out_of_band_membership()
self.db.simple_upsert_txn(
txn,
table="local_current_membership",
keyvalues={
"room_id": event.room_id,
"user_id": event.state_key,
},
values={
"event_id": event.event_id,
"membership": event.membership,
},
)
@defer.inlineCallbacks
def locally_reject_invite(self, user_id, room_id):
sql = (

synapse/storage/data_stores/main/search.py
@@ -347,29 +347,6 @@ class SearchStore(SearchBackgroundUpdateStore):
def __init__(self, database: Database, db_conn, hs):
super(SearchStore, self).__init__(database, db_conn, hs)
def store_event_search_txn(self, txn, event, key, value):
"""Add event to the search table
Args:
txn (cursor):
event (EventBase):
key (str):
value (str):
"""
self.store_search_entries_txn(
txn,
(
SearchEntry(
key=key,
value=value,
event_id=event.event_id,
room_id=event.room_id,
stream_ordering=event.internal_metadata.stream_ordering,
origin_server_ts=event.origin_server_ts,
),
),
)
@defer.inlineCallbacks
def search_msgs(self, room_ids, search_term, keys):
"""Performs a full text search over events with given keys.

synapse/storage/data_stores/main/signatures.py
@@ -13,23 +13,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import six
from unpaddedbase64 import encode_base64
from twisted.internet import defer
from synapse.crypto.event_signing import compute_event_reference_hash
from synapse.storage._base import SQLBaseStore
from synapse.util.caches.descriptors import cached, cachedList
# py2 sqlite has buffer hardcoded as only binary type, so we must use it,
# despite being deprecated and removed in favor of memoryview
if six.PY2:
db_binary_type = six.moves.builtins.buffer
else:
db_binary_type = memoryview
class SignatureWorkerStore(SQLBaseStore):
@cached()
@@ -79,23 +69,3 @@ class SignatureWorkerStore(SQLBaseStore):
class SignatureStore(SignatureWorkerStore):
"""Persistence for event signatures and hashes"""
def _store_event_reference_hashes_txn(self, txn, events):
"""Store a hash for a PDU
Args:
txn (cursor):
events (list): list of Events.
"""
vals = []
for event in events:
ref_alg, ref_hash_bytes = compute_event_reference_hash(event)
vals.append(
{
"event_id": event.event_id,
"algorithm": ref_alg,
"hash": db_binary_type(ref_hash_bytes),
}
)
self.db.simple_insert_many_txn(txn, table="event_reference_hashes", values=vals)

synapse/storage/data_stores/main/state.py
@@ -16,17 +16,12 @@
import collections.abc
import logging
from collections import namedtuple
from typing import Iterable, Tuple
from six import iteritems
from twisted.internet import defer
from synapse.api.constants import EventTypes, Membership
from synapse.api.errors import NotFoundError, UnsupportedRoomVersionError
from synapse.api.room_versions import KNOWN_ROOM_VERSIONS, RoomVersion
from synapse.events import EventBase
from synapse.events.snapshot import EventContext
from synapse.storage._base import SQLBaseStore
from synapse.storage.data_stores.main.events_worker import EventsWorkerStore
from synapse.storage.data_stores.main.roommember import RoomMemberWorkerStore
@@ -473,33 +468,3 @@ class StateStore(StateGroupWorkerStore, MainStateBackgroundUpdateStore):
def __init__(self, database: Database, db_conn, hs):
super(StateStore, self).__init__(database, db_conn, hs)
def _store_event_state_mappings_txn(
self, txn, events_and_contexts: Iterable[Tuple[EventBase, EventContext]]
):
state_groups = {}
for event, context in events_and_contexts:
if event.internal_metadata.is_outlier():
continue
# if the event was rejected, just give it the same state as its
# predecessor.
if context.rejected:
state_groups[event.event_id] = context.state_group_before_event
continue
state_groups[event.event_id] = context.state_group
self.db.simple_insert_many_txn(
txn,
table="event_to_state_groups",
values=[
{"state_group": state_group_id, "event_id": event_id}
for event_id, state_group_id in iteritems(state_groups)
],
)
for event_id, state_group_id in iteritems(state_groups):
txn.call_after(
self._get_state_group_for_event.prefill, (event_id,), state_group_id
)

synapse/storage/persist_events.py
@@ -23,7 +23,6 @@ from typing import Iterable, List, Optional, Set, Tuple
from six import iteritems
from six.moves import range
import attr
from prometheus_client import Counter, Histogram
from twisted.internet import defer
@@ -35,6 +34,7 @@ from synapse.logging.context import PreserveLoggingContext, make_deferred_yielda
from synapse.metrics.background_process_metrics import run_as_background_process
from synapse.state import StateResolutionStore
from synapse.storage.data_stores import DataStores
from synapse.storage.data_stores.main.events import DeltaState
from synapse.types import StateMap
from synapse.util.async_helpers import ObservableDeferred
from synapse.util.metrics import Measure
@@ -73,22 +73,6 @@ stale_forward_extremities_counter = Histogram(
)
@attr.s(slots=True)
class DeltaState:
"""Deltas to use to update the `current_state_events` table.
Attributes:
to_delete: List of type/state_keys to delete from current state
to_insert: Map of state to upsert into current state
no_longer_in_room: The server is no longer in the room, so the room
should e.g. be removed from `current_state_events` table.
"""
to_delete = attr.ib(type=List[Tuple[str, str]])
to_insert = attr.ib(type=StateMap[str])
no_longer_in_room = attr.ib(type=bool, default=False)
class _EventPeristenceQueue(object):
"""Queues up events so that they can be persisted in bulk with only one
concurrent transaction per room.
@@ -205,6 +189,7 @@ class EventsPersistenceStorage(object):
# store for now.
self.main_store = stores.main
self.state_store = stores.state
self.persist_events_store = stores.persist_events
self._clock = hs.get_clock()
self.is_mine_id = hs.is_mine_id
@@ -445,7 +430,7 @@ class EventsPersistenceStorage(object):
if current_state is not None:
current_state_for_room[room_id] = current_state
await self.main_store._persist_events_and_state_updates(
await self.persist_events_store._persist_events_and_state_updates(
chunk,
current_state_for_room=current_state_for_room,
state_delta_for_room=state_delta_for_room,
@@ -491,13 +476,15 @@ class EventsPersistenceStorage(object):
)
# Remove any events which are prev_events of any existing events.
existing_prevs = await self.main_store._get_events_which_are_prevs(result)
existing_prevs = await self.persist_events_store._get_events_which_are_prevs(
result
)
result.difference_update(existing_prevs)
# Finally handle the case where the new events have soft-failed prev
# events. If they do we need to remove them and their prev events,
# otherwise we end up with dangling extremities.
existing_prevs = await self.main_store._get_prevs_before_rejected(
existing_prevs = await self.persist_events_store._get_prevs_before_rejected(
e_id for event in new_events for e_id in event.prev_event_ids()
)
result.difference_update(existing_prevs)

tests/storage/test_event_push_actions.py
@@ -35,6 +35,7 @@ class EventPushActionsStoreTestCase(tests.unittest.TestCase):
def setUp(self):
hs = yield tests.utils.setup_test_homeserver(self.addCleanup)
self.store = hs.get_datastore()
self.persist_events_store = hs.get_datastores().persist_events
@defer.inlineCallbacks
def test_get_unread_push_actions_for_user_in_range_for_http(self):
@@ -76,7 +77,7 @@ class EventPushActionsStoreTestCase(tests.unittest.TestCase):
)
yield self.store.db.runInteraction(
"",
self.store._set_push_actions_for_event_and_users_txn,
self.persist_events_store._set_push_actions_for_event_and_users_txn,
[(event, None)],
[(event, None)],
)
