// loki/pkg/compactor/compactor.go
package compactor

import (
	"context"
	"flag"
	"fmt"
	"net/http"
	"os"
	"path/filepath"
	"sort"
"sync"
"time"
"github.com/go-kit/log/level"
"github.com/grafana/dskit/kv"
"github.com/grafana/dskit/ring"
"github.com/grafana/dskit/services"
"github.com/pkg/errors"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/model"
"github.com/grafana/loki/v3/pkg/analytics"
"github.com/grafana/loki/v3/pkg/compactor/deletion"
"github.com/grafana/loki/v3/pkg/compactor/retention"
"github.com/grafana/loki/v3/pkg/storage/chunk/client"
"github.com/grafana/loki/v3/pkg/storage/chunk/client/local"
chunk_util "github.com/grafana/loki/v3/pkg/storage/chunk/client/util"
"github.com/grafana/loki/v3/pkg/storage/config"
"github.com/grafana/loki/v3/pkg/storage/stores/shipper/indexshipper/storage"
"github.com/grafana/loki/v3/pkg/util/filter"
util_log "github.com/grafana/loki/v3/pkg/util/log"
lokiring "github.com/grafana/loki/v3/pkg/util/ring"
"github.com/grafana/loki/v3/pkg/validation"
)

// Here is how the generic compactor works:
// 1. Find the index type from the table name using schemaPeriodForTable.
// 2. Find the registered IndexCompactor for the index type.
// 3. Build an instance of TableCompactor using IndexCompactor.NewTableCompactor, with all the required information to do a compaction.
// 4. Run the compaction using TableCompactor.CompactTable, which would set the new/updated CompactedIndex for each IndexSet.
// 5. If retention is enabled, run retention on the CompactedIndex using its retention.IndexProcessor implementation.
// 6. Convert the CompactedIndex to a file using the IndexCompactor.ToIndexFile for uploading.
// 7. If we uploaded successfully, delete the old index files.
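//
// As an illustrative sketch only (the helper names mirror the steps above;
// the exact signatures in this package may differ):
//
//	period, err := schemaPeriodForTable(schemaConfig, tableName) // step 1
//	if err != nil {
//		return err
//	}
//	indexCompactor, ok := c.indexCompactors[period.IndexType] // step 2
//	if !ok {
//		return fmt.Errorf("no index compactor registered for %s", period.IndexType)
//	}
//	tableCompactor := indexCompactor.NewTableCompactor(ctx, ...) // step 3
//	if err := tableCompactor.CompactTable(); err != nil {        // step 4
//		return err
//	}
//	// Steps 5-7: apply retention to the CompactedIndex, convert it with
//	// IndexCompactor.ToIndexFile, upload it, and only then delete the old
//	// index files.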

const (
	// ringAutoForgetUnhealthyPeriods is how many consecutive timeout periods an unhealthy instance
	// in the ring is kept around before being automatically removed.
	ringAutoForgetUnhealthyPeriods = 10

	// ringKey is the key under which we store the compactors ring in the KVStore.
	ringKey = "compactor"

	// ringNameForServer is the name of the ring used by the compactor server.
	ringNameForServer = "compactor"

	// ringKeyOfLeader is a somewhat arbitrary ID to pull from the ring to see who will be elected the leader.
	ringKeyOfLeader = 0

	// ringReplicationFactor should be 1 because we only want to pull back one node from the ring.
	ringReplicationFactor = 1

	// ringNumTokens sets our single token in the ring; we only need to insert one token to be
	// used for leader election purposes.
	ringNumTokens = 1
)
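
// Illustrative sketch of how these ring constants support leader election
// (an assumption based on the single-token, RF=1 setup above; the actual
// call sites live elsewhere in this file):
//
//	rs, err := c.ring.Get(ringKeyOfLeader, ring.Write, nil, nil, nil)
//	if err != nil {
//		return false, err
//	}
//	// With ringNumTokens = 1 and ringReplicationFactor = 1, exactly one
//	// instance owns the leader key; only that instance runs compactions.
//	isLeader := rs.Instances[0].Addr == c.ringLifecycler.GetInstanceAddr()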

var (
	retentionEnabledStats = analytics.NewString("compactor_retention_enabled")
	defaultRetentionStats = analytics.NewString("compactor_default_retention")
)

type Config struct {
	WorkingDirectory            string        `yaml:"working_directory"`
	CompactionInterval          time.Duration `yaml:"compaction_interval"`
	ApplyRetentionInterval      time.Duration `yaml:"apply_retention_interval"`
	RetentionEnabled            bool          `yaml:"retention_enabled"`
	RetentionDeleteDelay        time.Duration `yaml:"retention_delete_delay"`
	RetentionDeleteWorkCount    int           `yaml:"retention_delete_worker_count"`
	RetentionTableTimeout       time.Duration `yaml:"retention_table_timeout"`
	DeleteRequestStore          string        `yaml:"delete_request_store"`
	DeleteRequestStoreKeyPrefix string        `yaml:"delete_request_store_key_prefix"`
	DeleteBatchSize             int           `yaml:"delete_batch_size"`
	DeleteRequestCancelPeriod   time.Duration `yaml:"delete_request_cancel_period"`
	DeleteMaxInterval           time.Duration `yaml:"delete_max_interval"`
	MaxCompactionParallelism    int           `yaml:"max_compaction_parallelism"`
	UploadParallelism           int           `yaml:"upload_parallelism"`
	CompactorRing               lokiring.RingConfig `yaml:"compactor_ring,omitempty" doc:"description=The hash ring configuration used by compactors to elect a single instance for running compactions. The CLI flags prefix for this block config is: compactor.ring"`
	RunOnce                     bool                `yaml:"-" doc:"hidden"`
	TablesToCompact             int                 `yaml:"tables_to_compact"`
	SkipLatestNTables           int                 `yaml:"skip_latest_n_tables"`
}
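
// Example YAML for this config block, as an illustrative sketch (keys follow
// the yaml tags above; the values shown are placeholders, not defaults):
//
//	compactor:
//	  working_directory: /loki/compactor
//	  compaction_interval: 10m
//	  retention_enabled: true
//	  retention_delete_delay: 2h
//	  delete_request_store: s3
//	  max_compaction_parallelism: 1
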
// RegisterFlags registers flags.
func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
f.StringVar(&cfg.WorkingDirectory, "compactor.working-directory", "", "Directory where files can be downloaded for compaction.")
f.DurationVar(&cfg.CompactionInterval, "compactor.compaction-interval", 10*time.Minute, "Interval at which to re-run the compaction operation.")
f.DurationVar(&cfg.ApplyRetentionInterval, "compactor.apply-retention-interval", 0, "Interval at which to apply/enforce retention. 0 means run at same interval as compaction. If non-zero, it should always be a multiple of compaction interval.")
f.DurationVar(&cfg.RetentionDeleteDelay, "compactor.retention-delete-delay", 2*time.Hour, "Delay after which chunks will be fully deleted during retention.")
f.BoolVar(&cfg.RetentionEnabled, "compactor.retention-enabled", false, "Activate custom (per-stream,per-tenant) retention.")
f.IntVar(&cfg.RetentionDeleteWorkCount, "compactor.retention-delete-worker-count", 150, "The total amount of worker to use to delete chunks.")
f.StringVar(&cfg.DeleteRequestStore, "compactor.delete-request-store", "", "Store used for managing delete requests.")
f.StringVar(&cfg.DeleteRequestStoreKeyPrefix, "compactor.delete-request-store.key-prefix", "index/", "Path prefix for storing delete requests.")
f.IntVar(&cfg.DeleteBatchSize, "compactor.delete-batch-size", 70, "The max number of delete requests to run per compaction cycle.")
f.DurationVar(&cfg.DeleteRequestCancelPeriod, "compactor.delete-request-cancel-period", 24*time.Hour, "Allow cancellation of delete request until duration after they are created. Data would be deleted only after delete requests have been older than this duration. Ideally this should be set to at least 24h.")
f.DurationVar(&cfg.DeleteMaxInterval, "compactor.delete-max-interval", 24*time.Hour, "Constrain the size of any single delete request with line filters. When a delete request > delete_max_interval is input, the request is sharded into smaller requests of no more than delete_max_interval")
f.DurationVar(&cfg.RetentionTableTimeout, "compactor.retention-table-timeout", 0, "The maximum amount of time to spend running retention and deletion on any given table in the index.")
f.IntVar(&cfg.MaxCompactionParallelism, "compactor.max-compaction-parallelism", 1, "Maximum number of tables to compact in parallel. While increasing this value, please make sure compactor has enough disk space allocated to be able to store and compact as many tables.")
f.IntVar(&cfg.UploadParallelism, "compactor.upload-parallelism", 10, "Number of upload/remove operations to execute in parallel when finalizing a compaction. NOTE: This setting is per compaction operation, which can be executed in parallel. The upper bound on the number of concurrent uploads is upload_parallelism * max_compaction_parallelism.")
f.BoolVar(&cfg.RunOnce, "compactor.run-once", false, "Run the compactor one time to cleanup and compact index files only (no retention applied)")
f.IntVar(&cfg.TablesToCompact, "compactor.tables-to-compact", 0, "Number of tables that compactor will try to compact. Newer tables are chosen when this is less than the number of tables available.")
f.IntVar(&cfg.SkipLatestNTables, "compactor.skip-latest-n-tables", 0, "Do not compact N latest tables. Together with -compactor.run-once and -compactor.tables-to-compact, this is useful when clearing compactor backlogs.")
// Ring
skipFlags := []string{
"compactor.ring.num-tokens",
"compactor.ring.replication-factor",
}
cfg.CompactorRing.RegisterFlagsWithPrefix("compactor.", "collectors/", f, skipFlags...)
f.IntVar(&cfg.CompactorRing.NumTokens, "compactor.ring.num-tokens", ringNumTokens, fmt.Sprintf("IGNORED: Num tokens is fixed to %d", ringNumTokens))
f.IntVar(&cfg.CompactorRing.ReplicationFactor, "compactor.ring.replication-factor", ringReplicationFactor, fmt.Sprintf("IGNORED: Replication factor is fixed to %d", ringReplicationFactor))
}
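
// Example invocation of the compactor target using the flags registered
// above (an illustrative sketch; flag values are placeholders):
//
//	loki -target=compactor \
//	  -compactor.working-directory=/loki/compactor \
//	  -compactor.retention-enabled=true \
//	  -compactor.delete-request-store=s3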

// Validate verifies the config does not contain inappropriate values.
func (cfg *Config) Validate() error {
	if cfg.MaxCompactionParallelism < 1 {
		return errors.New("max compaction parallelism must be >= 1")
	}

	if cfg.CompactorRing.NumTokens != ringNumTokens {
		return errors.New("num tokens must not be changed as it will not take effect")
	}

	if cfg.CompactorRing.ReplicationFactor != ringReplicationFactor {
		return errors.New("replication factor must not be changed as it will not take effect")
	}

	if cfg.RetentionEnabled {
		if cfg.DeleteRequestStore == "" {
			return fmt.Errorf("compactor.delete-request-store should be configured when retention is enabled")
		}

		if cfg.ApplyRetentionInterval == 0 {
			cfg.ApplyRetentionInterval = cfg.CompactionInterval
		}

		if cfg.ApplyRetentionInterval == cfg.CompactionInterval {
			// add some jitter to avoid running retention and compaction at the same time
			cfg.ApplyRetentionInterval += minDuration(10*time.Minute, cfg.ApplyRetentionInterval/2)
		}

		if err := config.ValidatePathPrefix(cfg.DeleteRequestStoreKeyPrefix); err != nil {
			return fmt.Errorf("validate delete store path prefix: %w", err)
		}
	}

	return nil
}
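
// Worked example of the interval handling above (illustrative): with
// compaction_interval = 10m and apply_retention_interval left at 0, retention
// first inherits 10m, then receives jitter of minDuration(10m, 5m) = 5m, so
// retention effectively runs every 15m and stays out of phase with compaction.
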
type Compactor struct {
	services.Service

	cfg                       Config
	indexStorageClient        storage.Client
	tableMarker               retention.TableMarker
	sweeper                   *retention.Sweeper
	deleteRequestsStore       deletion.DeleteRequestsStore
	DeleteRequestsHandler     *deletion.DeleteRequestHandler
	DeleteRequestsGRPCHandler *deletion.GRPCRequestHandler
	deleteRequestsManager     *deletion.DeleteRequestsManager
	expirationChecker         retention.ExpirationChecker
	metrics                   *metrics
	running                   bool
	wg                        sync.WaitGroup
	indexCompactors           map[string]IndexCompactor
	schemaConfig              config.SchemaConfig
	tableLocker               *tableLocker

	// Ring used for running a single compactor
	ringLifecycler *ring.BasicLifecycler
	ring           *ring.Ring
	ringPollPeriod time.Duration

	// Subservices manager.
	subservices        *services.Manager
	subservicesWatcher *services.FailureWatcher

	// one for each period
	storeContainers map[config.DayTime]storeContainer
}

type storeContainer struct {
	tableMarker        retention.TableMarker
	sweeper            *retention.Sweeper
	indexStorageClient storage.Client
}

type Limits interface {
	deletion.Limits
	retention.Limits
	DefaultLimits() *validation.Limits
}

func NewCompactor(cfg Config, objectStoreClients map[config.DayTime]client.ObjectClient, deleteStoreClient client.ObjectClient, schemaConfig config.SchemaConfig, limits Limits, r prometheus.Registerer, metricsNamespace string) (*Compactor, error) {
	retentionEnabledStats.Set("false")
	if cfg.RetentionEnabled {
		retentionEnabledStats.Set("true")
	}
	if limits != nil {
		defaultRetentionStats.Set(limits.DefaultLimits().RetentionPeriod.String())
	}

	compactor := &Compactor{
		cfg:             cfg,
		ringPollPeriod:  5 * time.Second,
		indexCompactors: map[string]IndexCompactor{},
		schemaConfig:    schemaConfig,
		tableLocker:     newTableLocker(),
	}

	ringStore, err := kv.NewClient(
		cfg.CompactorRing.KVStore,
		ring.GetCodec(),
		kv.RegistererWithKVName(prometheus.WrapRegistererWithPrefix("loki_", r), "compactor"),
		util_log.Logger,
	)
	if err != nil {
		return nil, errors.Wrap(err, "create KV store client")
	}

	lifecyclerCfg, err := cfg.CompactorRing.ToLifecyclerConfig(ringNumTokens, util_log.Logger)
	if err != nil {
		return nil, errors.Wrap(err, "invalid ring lifecycler config")
	}

	// Define lifecycler delegates in reverse order (last to be called defined first because they're
	// chained via "next delegate").
	delegate := ring.BasicLifecyclerDelegate(compactor)
	delegate = ring.NewLeaveOnStoppingDelegate(delegate, util_log.Logger)
	delegate = ring.NewTokensPersistencyDelegate(cfg.CompactorRing.TokensFilePath, ring.JOINING, delegate, util_log.Logger)
	delegate = ring.NewAutoForgetDelegate(ringAutoForgetUnhealthyPeriods*cfg.CompactorRing.HeartbeatTimeout, delegate, util_log.Logger)

	compactor.ringLifecycler, err = ring.NewBasicLifecycler(lifecyclerCfg, ringNameForServer, ringKey, ringStore, delegate, util_log.Logger, r)
	if err != nil {
		return nil, errors.Wrap(err, "create ring lifecycler")
	}

	ringCfg := cfg.CompactorRing.ToRingConfig(ringReplicationFactor)
	compactor.ring, err = ring.NewWithStoreClientAndStrategy(ringCfg, ringNameForServer, ringKey, ringStore, ring.NewIgnoreUnhealthyInstancesReplicationStrategy(), prometheus.WrapRegistererWithPrefix(metricsNamespace+"_", r), util_log.Logger)
	if err != nil {
		return nil, errors.Wrap(err, "create ring client")
	}

	compactor.subservices, err = services.NewManager(compactor.ringLifecycler, compactor.ring)
	if err != nil {
		return nil, err
	}
	compactor.subservicesWatcher = services.NewFailureWatcher()
	compactor.subservicesWatcher.WatchManager(compactor.subservices)

	if err := compactor.init(objectStoreClients, deleteStoreClient, schemaConfig, limits, r); err != nil {
		return nil, fmt.Errorf("init compactor: %w", err)
	}

	compactor.Service = services.NewBasicService(compactor.starting, compactor.loop, compactor.stopping)
	return compactor, nil
}
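
// Illustrative construction sketch (an assumption for how a caller might wire
// this up; in Loki proper the modules system performs this wiring):
//
//	objClients := map[config.DayTime]client.ObjectClient{periodStart: objClient}
//	c, err := NewCompactor(cfg, objClients, deleteClient, schemaCfg, limits,
//		prometheus.DefaultRegisterer, "loki")
//	if err != nil {
//		return err
//	}
//	// Run it as a dskit service:
//	if err := services.StartAndAwaitRunning(ctx, c); err != nil {
//		return err
//	}
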
func (c *Compactor) init(objectStoreClients map[config.DayTime]client.ObjectClient, deleteStoreClient client.ObjectClient, schemaConfig config.SchemaConfig, limits Limits, r prometheus.Registerer) error {
	err := chunk_util.EnsureDirectory(c.cfg.WorkingDirectory)
	if err != nil {
		return err
	}

	if c.cfg.RetentionEnabled {
		if deleteStoreClient == nil {
			return fmt.Errorf("delete store client not initialised when retention is enabled")
		}
if err := c.initDeletes(deleteStoreClient, r, limits); err != nil {
return fmt.Errorf("failed to init delete store: %w", err)
}
}
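// Build one store container per schema period: an index storage client for
// compaction and, when retention is enabled, a chunk sweeper and table
// marker scoped to that period's object store.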
legacyMarkerDirs := make(map[string]struct{})
c.storeContainers = make(map[config.DayTime]storeContainer, len(objectStoreClients))
for from, objectClient := range objectStoreClients {
period, err := schemaConfig.SchemaForTime(from.Time)
if err != nil {
return err
}
var sc storeContainer
sc.indexStorageClient = storage.NewIndexStorageClient(objectClient, period.IndexTables.PathPrefix)
if c.cfg.RetentionEnabled {
var (
raw client.ObjectClient
encoder client.KeyEncoder
name = fmt.Sprintf("%s_%s", period.ObjectType, period.From.String())
retentionWorkDir = filepath.Join(c.cfg.WorkingDirectory, "retention", name)
r = prometheus.WrapRegistererWith(prometheus.Labels{"from": name}, r)
)
// given that compaction can now run on multiple periods, marker files are stored under /retention/{objectStoreType}_{periodFrom}/markers/
// if any markers are found in the common markers dir (/retention/markers/) or the store-specific markers dir (/retention/{objectStoreType}/markers/), copy them to the period-specific dirs.
// a marked chunk is removed by the sweeper if it belongs to the given period; the delete is a no-op if the chunk doesn't exist in that store.
if err := retention.CopyMarkers(filepath.Join(c.cfg.WorkingDirectory, "retention"), retentionWorkDir); err != nil {
return fmt.Errorf("failed to move common markers to period specific dir: %w", err)
}
if err := retention.CopyMarkers(filepath.Join(c.cfg.WorkingDirectory, "retention", period.ObjectType), retentionWorkDir); err != nil {
return fmt.Errorf("failed to move store markers to period specific dir: %w", err)
}
// remove markers from the store dir after copying them to period specific dirs.
legacyMarkerDirs[period.ObjectType] = struct{}{}
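// Unwrap a prefixed object client to inspect the underlying store: local
// filesystem stores get the FSEncoder key encoding for chunk keys.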
if casted, ok := objectClient.(client.PrefixedObjectClient); ok {
raw = casted.GetDownstream()
} else {
raw = objectClient
}
if _, ok := raw.(*local.FSObjectClient); ok {
encoder = client.FSEncoder
}
chunkClient := client.NewClient(objectClient, encoder, schemaConfig)
sc.sweeper, err = retention.NewSweeper(retentionWorkDir, chunkClient, c.cfg.RetentionDeleteWorkCount, c.cfg.RetentionDeleteDelay, r)
if err != nil {
return fmt.Errorf("failed to init sweeper: %w", err)
}
sc.tableMarker, err = retention.NewMarker(retentionWorkDir, c.expirationChecker, c.cfg.RetentionTableTimeout, chunkClient, r)
if err != nil {
return fmt.Errorf("failed to init table marker: %w", err)
}
}
c.storeContainers[from] = sc
}
if c.cfg.RetentionEnabled {
// remove legacy markers
for store := range legacyMarkerDirs {
if err := os.RemoveAll(filepath.Join(c.cfg.WorkingDirectory, "retention", store, retention.MarkersFolder)); err != nil {
return fmt.Errorf("remove old markers from store dir: %w", err)
}
}
if err := os.RemoveAll(filepath.Join(c.cfg.WorkingDirectory, "retention", retention.MarkersFolder)); err != nil {
return fmt.Errorf("remove old markers: %w", err)
}
}
c.metrics = newMetrics(r)
return nil
}
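// initDeletes sets up the delete request store under the compactor's working
// directory, the HTTP and gRPC handlers that accept delete requests, the
// manager that batches and cancels them, and an expiration checker that
// combines retention limits with pending delete requests.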
func (c *Compactor) initDeletes(objectClient client.ObjectClient, r prometheus.Registerer, limits Limits) error {
deletionWorkDir := filepath.Join(c.cfg.WorkingDirectory, "deletion")
store, err := deletion.NewDeleteStore(deletionWorkDir, storage.NewIndexStorageClient(objectClient, c.cfg.DeleteRequestStoreKeyPrefix))
if err != nil {
return err
}
c.deleteRequestsStore = store
c.DeleteRequestsHandler = deletion.NewDeleteRequestHandler(
c.deleteRequestsStore,
c.cfg.DeleteMaxInterval,
r,
)
c.DeleteRequestsGRPCHandler = deletion.NewGRPCRequestHandler(c.deleteRequestsStore, limits)
c.deleteRequestsManager = deletion.NewDeleteRequestsManager(
c.deleteRequestsStore,
c.cfg.DeleteRequestCancelPeriod,
c.cfg.DeleteBatchSize,
limits,
r,
)
c.expirationChecker = newExpirationChecker(retention.NewExpirationChecker(limits), c.deleteRequestsManager)
return nil
}
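// starting implements the service start phase: it starts the compactor's
// subservices and waits for this instance to be seen as JOINING and then
// ACTIVE in the ring before declaring the service started.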
func (c *Compactor) starting(ctx context.Context) (err error) {
// In case this function returns an error we want to unregister the instance
// from the ring. We do it while ensuring dependencies are gracefully stopped if they
// were already started.
defer func() {
if err == nil || c.subservices == nil {
return
}
if stopErr := services.StopManagerAndAwaitStopped(context.Background(), c.subservices); stopErr != nil {
level.Error(util_log.Logger).Log("msg", "failed to gracefully stop compactor dependencies", "err", stopErr)
}
}()
if err := services.StartManagerAndAwaitHealthy(ctx, c.subservices); err != nil {
return errors.Wrap(err, "unable to start compactor subservices")
}
// The BasicLifecycler does not automatically move state to ACTIVE, so that any additional work
// that someone wants to do can be done before becoming ACTIVE. For the compactor we don't currently
// have any additional work, so we can become ACTIVE right away.
// Wait until the ring client detected this instance in the JOINING state to
// make sure that when we'll run the initial sync we already know the tokens
// assigned to this instance.
level.Info(util_log.Logger).Log("msg", "waiting until compactor is JOINING in the ring")
if err := ring.WaitInstanceState(ctx, c.ring, c.ringLifecycler.GetInstanceID(), ring.JOINING); err != nil {
return err
}
level.Info(util_log.Logger).Log("msg", "compactor is JOINING in the ring")
// Change ring state to ACTIVE
if err = c.ringLifecycler.ChangeState(ctx, ring.ACTIVE); err != nil {
return errors.Wrapf(err, "switch instance to %s in the ring", ring.ACTIVE)
}
// Wait until the ring client detected this instance in the ACTIVE state to
// make sure that when we'll run the loop it won't be detected as a ring
// topology change.
level.Info(util_log.Logger).Log("msg", "waiting until compactor is ACTIVE in the ring")
if err := ring.WaitInstanceState(ctx, c.ring, c.ringLifecycler.GetInstanceID(), ring.ACTIVE); err != nil {
return err
}
level.Info(util_log.Logger).Log("msg", "compactor is ACTIVE in the ring")
return nil
}
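// loop is the compactor's main service loop. In run-once mode it performs a
// single compaction and then waits for shutdown; otherwise it polls the ring
// and starts or stops the local compaction goroutines depending on whether
// this instance is the one elected to run the compactor.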
func (c *Compactor) loop(ctx context.Context) error {
if c.cfg.RunOnce {
level.Info(util_log.Logger).Log("msg", "running single compaction")
err := c.RunCompaction(ctx, false)
if err != nil {
level.Error(util_log.Logger).Log("msg", "compaction encountered an error", "err", err)
}
level.Info(util_log.Logger).Log("msg", "single compaction finished")
level.Info(util_log.Logger).Log("msg", "interrupt or terminate the process to finish")
// Wait for Loki to shutdown.
<-ctx.Done()
level.Info(util_log.Logger).Log("msg", "compactor exiting")
return nil
}
if c.cfg.RetentionEnabled {
if c.deleteRequestsStore != nil {
defer c.deleteRequestsStore.Stop()
}
if c.deleteRequestsManager != nil {
defer c.deleteRequestsManager.Stop()
}
}
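// Poll the ring on every tick to decide whether this instance should be
// running compactions.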
syncTicker := time.NewTicker(c.ringPollPeriod)
defer syncTicker.Stop()
var runningCtx context.Context
var runningCancel context.CancelFunc
for {
select {
case <-ctx.Done():
if runningCancel != nil {
runningCancel()
}
c.wg.Wait()
level.Info(util_log.Logger).Log("msg", "compactor exiting")
return nil
case <-syncTicker.C:
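// Ask the ring for the single instance that should own the compactor key.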
bufDescs, bufHosts, bufZones := ring.MakeBuffersForGet()
rs, err := c.ring.Get(ringKeyOfLeader, ring.Write, bufDescs, bufHosts, bufZones)
if err != nil {
level.Error(util_log.Logger).Log("msg", "error asking ring for who should run the compactor, will check again", "err", err)
continue
}
addrs := rs.GetAddresses()
if len(addrs) != 1 {
level.Error(util_log.Logger).Log("msg", "too many addresses (more that one) return when asking the ring who should run the compactor, will check again")
continue
}
if c.ringLifecycler.GetInstanceAddr() == addrs[0] {
// If not running, start
if !c.running {
level.Info(util_log.Logger).Log("msg", "this instance has been chosen to run the compactor, starting compactor")
runningCtx, runningCancel = context.WithCancel(ctx)
go c.runCompactions(runningCtx)
c.running = true
c.metrics.compactorRunning.Set(1)
}
} else {
// If running, shutdown
if c.running {
level.Info(util_log.Logger).Log("msg", "this instance should no longer run the compactor, stopping compactor")
runningCancel()
c.wg.Wait()
c.running = false
c.metrics.compactorRunning.Set(0)
level.Info(util_log.Logger).Log("msg", "compactor stopped")
}
}
}
}
}
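// runCompactions waits one compaction interval for the ring to settle, runs
// an initial compaction, and then starts the long-running workers: a
// compaction ticker, a retention ticker when retention is enabled, and one
// chunk sweeper per store container.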
func (c *Compactor) runCompactions(ctx context.Context) {
// To avoid races, wait 1 compaction interval before actually starting the compactor.
// This allows the ring to settle if there are a lot of ring changes and gives
// time for existing compactors to shut down before this one starts, to avoid
// multiple compactors running at the same time.
func() {
t := time.NewTimer(c.cfg.CompactionInterval)
defer t.Stop()
level.Info(util_log.Logger).Log("msg", fmt.Sprintf("waiting %v for ring to stay stable and previous compactions to finish before starting compactor", c.cfg.CompactionInterval))
select {
case <-ctx.Done():
return
case <-t.C:
level.Info(util_log.Logger).Log("msg", "compactor startup delay completed")
break
}
}()
// do the initial compaction
if err := c.RunCompaction(ctx, false); err != nil {
level.Error(util_log.Logger).Log("msg", "failed to run compaction", "err", err)
}
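// Compaction and retention run in separate goroutines on separate intervals
// so a slow retention pass (e.g. heavy delete requests) does not block
// compaction.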
c.wg.Add(1)
go func() {
defer c.wg.Done()
ticker := time.NewTicker(c.cfg.CompactionInterval)
defer ticker.Stop()
for {
select {
case <-ticker.C:
if err := c.RunCompaction(ctx, false); err != nil {
level.Error(util_log.Logger).Log("msg", "failed to run compaction", "err", err)
}
case <-ctx.Done():
return
}
}
}()
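// Retention gets its own ticker (ApplyRetentionInterval) plus the chunk
// sweepers that delete marked chunks in the background.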
if c.cfg.RetentionEnabled {
c.wg.Add(1)
go func() {
defer c.wg.Done()
if err := c.RunCompaction(ctx, true); err != nil {
level.Error(util_log.Logger).Log("msg", "failed to apply retention", "err", err)
}
ticker := time.NewTicker(c.cfg.ApplyRetentionInterval)
defer ticker.Stop()
for {
select {
case <-ticker.C:
if err := c.RunCompaction(ctx, true); err != nil {
level.Error(util_log.Logger).Log("msg", "failed to apply retention", "err", err)
}
case <-ctx.Done():
return
}
}
}()
for _, container := range c.storeContainers {
c.wg.Add(1)
go func(sc storeContainer) {
// starts the chunk sweeper
defer func() {
sc.sweeper.Stop()
c.wg.Done()
}()
sc.sweeper.Start()
<-ctx.Done()
}(container)
}
}
level.Info(util_log.Logger).Log("msg", "compactor started")
}
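// stopping stops the compactor's subservices and waits for them to terminate.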
func (c *Compactor) stopping(_ error) error {
return services.StopManagerAndAwaitStopped(context.Background(), c.subservices)
}
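// CompactTable compacts a single table and optionally applies retention. It
// resolves the schema period owning the table, picks the matching index
// compactor and store container, and acquires the per-table lock so that
// compaction and retention never process the same table at once.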
func (c *Compactor) CompactTable(ctx context.Context, tableName string, applyRetention bool) error {
schemaCfg, ok := SchemaPeriodForTable(c.schemaConfig, tableName)
if !ok {
level.Error(util_log.Logger).Log("msg", "skipping compaction since we can't find schema for table", "table", tableName)
return nil
}
indexCompactor, ok := c.indexCompactors[schemaCfg.IndexType]
if !ok {
return fmt.Errorf("index processor not found for index type %s", schemaCfg.IndexType)
}
sc, ok := c.storeContainers[schemaCfg.From]
if !ok {
return fmt.Errorf("index store client not found for period starting at %s", schemaCfg.From.String())
}
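// acquire the table lock so compaction and retention never process the same table concurrently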
for {
locked, lockWaiterChan := c.tableLocker.lockTable(tableName)
if locked {
break
}
// do not wait for the lock to be released if we are only compacting the table, since
// compaction should happen more frequently than retention, and retention compacts un-compacted files as well.
if !applyRetention {
hasUncompactedIndex, err := tableHasUncompactedIndex(ctx, tableName, sc.indexStorageClient)
if err != nil {
level.Error(util_log.Logger).Log("msg", "failed to check if table has uncompacted index", "table_name", tableName)
hasUncompactedIndex = true
}
if hasUncompactedIndex {
c.metrics.skippedCompactingLockedTables.WithLabelValues(tableName).Inc()
level.Warn(util_log.Logger).Log("msg", "skipped compacting table which likely has uncompacted index since it is locked by retention", "table_name", tableName)
}
return nil
}
// we are applying retention and processing delete requests, so
// wait for the lock to be released since we can't mark delete requests as processed without checking all the tables
select {
case <-lockWaiterChan:
case <-ctx.Done():
return nil
}
}
defer c.tableLocker.unlockTable(tableName)
table, err := newTable(ctx, filepath.Join(c.cfg.WorkingDirectory, tableName), sc.indexStorageClient, indexCompactor,
schemaCfg, sc.tableMarker, c.expirationChecker, c.cfg.UploadParallelism)
if err != nil {
level.Error(util_log.Logger).Log("msg", "failed to initialize table for compaction", "table", tableName, "err", err)
return err
}
interval := retention.ExtractIntervalFromTableName(tableName)
intervalMayHaveExpiredChunks := false
if applyRetention {
intervalMayHaveExpiredChunks = c.expirationChecker.IntervalMayHaveExpiredChunks(interval, "")
}
err = table.compact(intervalMayHaveExpiredChunks)
if err != nil {
level.Error(util_log.Logger).Log("msg", "failed to compact files", "table", tableName, "err", err)
return err
}
if !applyRetention {
c.metrics.skippedCompactingLockedTables.WithLabelValues(tableName).Set(0)
}
return nil
}
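// RegisterIndexCompactor registers an IndexCompactor for the given index type,
// to be used when compacting tables belonging to periods of that type.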
func (c *Compactor) RegisterIndexCompactor(indexType string, indexCompactor IndexCompactor) {
c.indexCompactors[indexType] = indexCompactor
}
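// Example (hypothetical) registration wiring, assuming a tsdb index compactor
// constructor like tsdb.NewIndexCompactor is available:
//
//	c.RegisterIndexCompactor(config.TSDBType, tsdb.NewIndexCompactor())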
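// RunCompaction lists all index tables and compacts them with up to
// MaxCompactionParallelism workers. When applyRetention is true, retention and
// delete requests are also applied while compacting.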
func (c *Compactor) RunCompaction(ctx context.Context, applyRetention bool) (err error) {
status := statusSuccess
start := time.Now()
if applyRetention {
c.expirationChecker.MarkPhaseStarted()
}
defer func() {
if err != nil {
status = statusFailure
}
if applyRetention {
c.metrics.applyRetentionOperationTotal.WithLabelValues(status).Inc()
} else {
c.metrics.compactTablesOperationTotal.WithLabelValues(status).Inc()
}
runtime := time.Since(start)
if status == statusSuccess {
if applyRetention {
c.metrics.applyRetentionOperationDurationSeconds.Set(runtime.Seconds())
c.metrics.applyRetentionLastSuccess.SetToCurrentTime()
} else {
c.metrics.compactTablesOperationDurationSeconds.Set(runtime.Seconds())
c.metrics.compactTablesOperationLastSuccess.SetToCurrentTime()
}
}
if applyRetention {
if status == statusSuccess {
c.expirationChecker.MarkPhaseFinished()
} else {
c.expirationChecker.MarkPhaseFailed()
}
}
if !applyRetention && runtime > c.cfg.CompactionInterval {
level.Warn(util_log.Logger).Log("msg", fmt.Sprintf("last compaction took %s which is longer than the compaction interval of %s, this can lead to duplicate compactors running if not running a standalone compactor instance.", runtime, c.cfg.CompactionInterval))
}
}()
var (
tables []string
// it is possible for two periods to use the same storage bucket and path prefix (different indexType or schema version),
// so more than one index storage client may end up listing the same set of tables;
// avoid including the same table twice in the list of tables to compact.
seen = make(map[string]struct{})
)
for _, sc := range c.storeContainers {
// refresh index list cache since previous compaction would have changed the index files in the object store
sc.indexStorageClient.RefreshIndexTableNamesCache(ctx)
tbls, err := sc.indexStorageClient.ListTables(ctx)
if err != nil {
return fmt.Errorf("failed to list tables: %w", err)
}
for _, table := range tbls {
if _, ok := seen[table]; ok {
continue
}
tables = append(tables, table)
seen[table] = struct{}{}
}
}
// process most recent tables first
SortTablesByRange(tables)
// apply passed in compaction limits
if c.cfg.SkipLatestNTables <= len(tables) {
tables = tables[c.cfg.SkipLatestNTables:]
}
if c.cfg.TablesToCompact > 0 && c.cfg.TablesToCompact < len(tables) {
tables = tables[:c.cfg.TablesToCompact]
}
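// fan the tables out to a bounded pool of workers: each worker pulls table
// names from compactTablesChan and reports its terminal error (or nil) on errChan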
compactTablesChan := make(chan string)
errChan := make(chan error)
for i := 0; i < c.cfg.MaxCompactionParallelism; i++ {
go func() {
var err error
defer func() {
errChan <- err
}()
for {
select {
case tableName, ok := <-compactTablesChan:
if !ok {
return
}
level.Info(util_log.Logger).Log("msg", "compacting table", "table-name", tableName)
err = c.CompactTable(ctx, tableName, applyRetention)
if err != nil {
return
}
level.Info(util_log.Logger).Log("msg", "finished compacting table", "table-name", tableName)
case <-ctx.Done():
return
}
}
}()
}
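// feed table names to the workers, then close the channel once all tables are queued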
go func() {
for _, tableName := range tables {
if tableName == deletion.DeleteRequestsTableName {
// we do not want to compact or apply retention on delete requests table
continue
}
select {
case compactTablesChan <- tableName:
case <-ctx.Done():
return
}
}
close(compactTablesChan)
}()
var firstErr error
// read all the errors
for i := 0; i < c.cfg.MaxCompactionParallelism; i++ {
err := <-errChan
if err != nil && firstErr == nil {
firstErr = err
}
}
if firstErr != nil {
return firstErr
}
return ctx.Err()
}
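// expirationChecker combines the retention-based and delete-request-based
// expiration checkers: a chunk is expired if either checker expires it, and
// phase transitions are propagated to both.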
type expirationChecker struct {
retentionExpiryChecker retention.ExpirationChecker
deletionExpiryChecker retention.ExpirationChecker
}
func newExpirationChecker(retentionExpiryChecker, deletionExpiryChecker retention.ExpirationChecker) retention.ExpirationChecker {
return &expirationChecker{retentionExpiryChecker, deletionExpiryChecker}
}
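// Expired gives precedence to retention: if retention expires the chunk, that
// result is returned; otherwise the delete-request checker decides.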
func (e *expirationChecker) Expired(ref retention.ChunkEntry, now model.Time) (bool, filter.Func) {
if expired, nonDeletedIntervals := e.retentionExpiryChecker.Expired(ref, now); expired {
return expired, nonDeletedIntervals
}
return e.deletionExpiryChecker.Expired(ref, now)
}
func (e *expirationChecker) MarkPhaseStarted() {
e.retentionExpiryChecker.MarkPhaseStarted()
e.deletionExpiryChecker.MarkPhaseStarted()
}
func (e *expirationChecker) MarkPhaseFailed() {
e.retentionExpiryChecker.MarkPhaseFailed()
e.deletionExpiryChecker.MarkPhaseFailed()
}
func (e *expirationChecker) MarkPhaseFinished() {
e.retentionExpiryChecker.MarkPhaseFinished()
e.deletionExpiryChecker.MarkPhaseFinished()
}
func (e *expirationChecker) MarkPhaseTimedOut() {
e.retentionExpiryChecker.MarkPhaseTimedOut()
e.deletionExpiryChecker.MarkPhaseTimedOut()
}
func (e *expirationChecker) IntervalMayHaveExpiredChunks(interval model.Interval, userID string) bool {
return e.retentionExpiryChecker.IntervalMayHaveExpiredChunks(interval, userID) || e.deletionExpiryChecker.IntervalMayHaveExpiredChunks(interval, userID)
}
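// DropFromIndex reports whether the chunk's entry can be dropped from the
// index, if either underlying checker allows dropping it.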
func (e *expirationChecker) DropFromIndex(ref retention.ChunkEntry, tableEndTime model.Time, now model.Time) bool {
return e.retentionExpiryChecker.DropFromIndex(ref, tableEndTime, now) || e.deletionExpiryChecker.DropFromIndex(ref, tableEndTime, now)
}
func (c *Compactor) OnRingInstanceRegister(_ *ring.BasicLifecycler, ringDesc ring.Desc, instanceExists bool, _ string, instanceDesc ring.InstanceDesc) (ring.InstanceState, ring.Tokens) {
// When we initialize the compactor instance in the ring we want to start from
// a clean situation, so whatever the previous state was, we set it to JOINING, while we keep existing
// tokens (if any) or the ones loaded from file.
var tokens []uint32
if instanceExists {
tokens = instanceDesc.GetTokens()
}
takenTokens := ringDesc.GetTokens()
gen := ring.NewRandomTokenGenerator()
newTokens := gen.GenerateTokens(ringNumTokens-len(tokens), takenTokens)
// Token sorting will be enforced by the parent caller.
tokens = append(tokens, newTokens...)
return ring.JOINING, tokens
}
func (c *Compactor) OnRingInstanceTokens(_ *ring.BasicLifecycler, _ ring.Tokens) {}
func (c *Compactor) OnRingInstanceStopping(_ *ring.BasicLifecycler) {}
func (c *Compactor) OnRingInstanceHeartbeat(_ *ring.BasicLifecycler, _ *ring.Desc, _ *ring.InstanceDesc) {
}
func (c *Compactor) ServeHTTP(w http.ResponseWriter, req *http.Request) {
c.ring.ServeHTTP(w, req)
}
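// SortTablesByRange sorts table names in place so that tables covering the
// most recent time ranges come first.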
func SortTablesByRange(tables []string) {
tableRanges := make(map[string]model.Interval)
for _, table := range tables {
tableRanges[table] = retention.ExtractIntervalFromTableName(table)
}
sort.Slice(tables, func(i, j int) bool {
// a later start time sorts as "less than", producing a most-recent-first order
return tableRanges[tables[i]].Start.After(tableRanges[tables[j]].Start)
})
}
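// SchemaPeriodForTable returns the PeriodConfig in effect for the given table
// along with true, or false if the table does not belong to any configured period.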
func SchemaPeriodForTable(cfg config.SchemaConfig, tableName string) (config.PeriodConfig, bool) {
tableInterval := retention.ExtractIntervalFromTableName(tableName)
schemaCfg, err := cfg.SchemaForTime(tableInterval.Start)
if err != nil || schemaCfg.IndexTables.TableFor(tableInterval.Start) != tableName {
return config.PeriodConfig{}, false
}
return schemaCfg, true
}
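// Example (hypothetical) usage, assuming a parsed config.SchemaConfig and a
// table name such as "index_19500":
//
//	if period, ok := SchemaPeriodForTable(schemaCfg, "index_19500"); ok {
//		level.Info(util_log.Logger).Log("msg", "resolved period", "index_type", period.IndexType)
//	}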
func minDuration(x time.Duration, y time.Duration) time.Duration {
if x < y {
return x
}
return y
}