level.Error(logger).Log("msg","unable to list WAL segments for old checkpoint cleanup, checkpoint cleanup cannot proceed, this is not expected and could lead to disk space exhaustion and may indicate disk I/O problems or corruption and should be investigated manually","err",err)
allSuccess=false
return
}
iffirstSegment<=0{
// No cleanup needed if we're starting from segment 0
return
}
files,err:=os.ReadDir(dir)
iferr!=nil{
level.Error(logger).Log("msg","unable to read WAL directory for old checkpoint cleanup, checkpoint cleanup cannot proceed, this is not expected and could lead to disk space exhaustion and may indicate disk I/O problems or corruption and should be investigated manually","err",err)
allSuccess=false
return
}
for_,fi:=rangefiles{
// Check if this is a completed checkpoint (not .tmp)
idx,err:=checkpointIndex(fi.Name(),false)
iferr!=nil||!fi.IsDir(){
continue
}
// Delete a checkpoint only when both conditions hold:
// 1. idx < firstSegment: the checkpoint index is below the first WAL segment (typical for old checkpoints).
// 2. idx < protectedCheckpointIdx: the checkpoint is older than, and therefore superseded by, the protected checkpoint.
// The second condition is the key safety check: the protected checkpoint itself is never deleted.
ifidx<firstSegment&&idx<protectedCheckpointIdx{
orphanedPath:=filepath.Join(dir,fi.Name())
iferr:=os.RemoveAll(orphanedPath);err!=nil{
level.Error(logger).Log("msg","unable to cleanup old checkpoint, this is not expected and could lead to disk space exhaustion and may indicate disk I/O problems or corruption and should be investigated manually","dir",orphanedPath,"err",err)
allSuccess=false
}else{
level.Info(logger).Log("msg","cleaned up old superseded checkpoint","dir",fi.Name(),"idx",idx,"firstSegment",firstSegment,"protectedCheckpoint",protectedCheckpointIdx)
}
}
}
}
// cleanupStaleTmpCheckpoints removes all .tmp checkpoint directories which represent
// incomplete/failed checkpoint operations. These are safe to delete because recovery
// only uses completed checkpoints (those without the .tmp suffix).
level.Error(logger).Log("msg","unable to read WAL directory for tmp checkpoint cleanup, checkpoint cleanup cannot proceed, this is not expected and could lead to disk space exhaustion and may indicate disk I/O problems or corruption and should be investigated manually","err",err)
level.Error(logger).Log("msg","unable to cleanup stale tmp checkpoint, this is not expected and could lead to disk space exhaustion and may indicate disk I/O problems or corruption and should be investigated manually","dir",tmpPath,"err",err)
// Continue cleaning up other .tmp directories even if one fails
allSuccess=false
}else{
level.Info(logger).Log("msg","cleaned up stale tmp checkpoint at startup","dir",fi.Name())
// First, clean up any stale .tmp checkpoint directories from failed checkpoint attempts.
// These are always safe to delete at startup since they represent incomplete operations.
cleanupStaleTmpCheckpoints(dir,logger)
// Find the most recent valid checkpoint to protect it from deletion
_,latestCheckpointIdx,err:=lastCheckpoint(dir)
iferr!=nil{
level.Error(logger).Log("msg","unable to find latest checkpoint for startup checkpoint cleanu, this is not expected and could lead to disk space exhaustion and may indicate disk I/O problems or corruption and should be investigated manually","err",err)
return
}
iflatestCheckpointIdx<0{
// No checkpoints exist, nothing to clean up
return
}
// Delegate to the shared cleanup function, protecting the latest checkpoint.
// This will remove any old checkpoints that are superseded by the latest one.