mirror of https://github.com/grafana/grafana
CloudWatch Logs: Add retry strategy for hitting max concurrent queries (#39290)
* Add error passing and retry strategy * Change generic error to specific one * Make the error more generic * Refactor retry strategy * Add retry that handles multiple queries * Rollback some backend changes * Remove simple retry strategy * Add comments * Add tests * Small test fixes * Add log timeout config * Fix tests * Fix tests * Add validation * Remove commented code and add comment * Fix snapshots * Remove unnecessary castpull/41842/head
parent
3dd73387fa
commit
e237ff20a9
@ -0,0 +1,225 @@ |
||||
import { runWithRetry } from './logsRetry'; |
||||
import { toArray } from 'rxjs/operators'; |
||||
import { lastValueFrom, of, throwError } from 'rxjs'; |
||||
import { dataFrameToJSON, MutableDataFrame } from '@grafana/data'; |
||||
import { DataResponse, FetchError } from '@grafana/runtime'; |
||||
import { StartQueryRequest } from '../types'; |
||||
|
||||
describe('runWithRetry', () => { |
||||
it('returns results if no retry is needed', async () => { |
||||
const queryFunc = jest.fn(); |
||||
queryFunc.mockReturnValueOnce(of([createResponseFrame('A')])); |
||||
const targets = [targetA]; |
||||
const values = await lastValueFrom(runWithRetry(queryFunc, targets).pipe(toArray())); |
||||
expect(queryFunc).toBeCalledTimes(1); |
||||
expect(queryFunc).toBeCalledWith(targets); |
||||
expect(values).toEqual([{ frames: [createResponseFrame('A')] }]); |
||||
}); |
||||
|
||||
it('retries if error', async () => { |
||||
jest.useFakeTimers(); |
||||
const targets = [targetA]; |
||||
const queryFunc = jest.fn(); |
||||
queryFunc.mockReturnValueOnce(throwError(() => createErrorResponse(targets))); |
||||
queryFunc.mockReturnValueOnce(of([createResponseFrame('A')])); |
||||
|
||||
const valuesPromise = lastValueFrom(runWithRetry(queryFunc, targets).pipe(toArray())); |
||||
jest.runAllTimers(); |
||||
const values = await valuesPromise; |
||||
|
||||
expect(queryFunc).toBeCalledTimes(2); |
||||
expect(queryFunc).nthCalledWith(1, targets); |
||||
expect(queryFunc).nthCalledWith(2, targets); |
||||
expect(values).toEqual([{ frames: [createResponseFrame('A')] }]); |
||||
}); |
||||
|
||||
it('fails if reaching timoeut and no data was retrieved', async () => { |
||||
jest.useFakeTimers(); |
||||
const targets = [targetA]; |
||||
const queryFunc = jest.fn(); |
||||
queryFunc.mockReturnValueOnce(throwError(() => createErrorResponse(targets))); |
||||
queryFunc.mockReturnValueOnce(of([createResponseFrame('A')])); |
||||
|
||||
const valuesPromise = lastValueFrom(runWithRetry(queryFunc, targets, { timeout: 0 }).pipe(toArray())); |
||||
jest.runAllTimers(); |
||||
let error; |
||||
try { |
||||
await valuesPromise; |
||||
} catch (e) { |
||||
error = e; |
||||
} |
||||
|
||||
expect(queryFunc).toBeCalledTimes(1); |
||||
expect(queryFunc).nthCalledWith(1, targets); |
||||
expect(error).toEqual({ message: 'LimitExceededException', refId: 'A' }); |
||||
}); |
||||
|
||||
it('fails if we get unexpected error', async () => { |
||||
jest.useFakeTimers(); |
||||
const targets = [targetA]; |
||||
const queryFunc = jest.fn(); |
||||
queryFunc.mockReturnValueOnce(throwError(() => 'random error')); |
||||
|
||||
const valuesPromise = lastValueFrom(runWithRetry(queryFunc, targets).pipe(toArray())); |
||||
jest.runAllTimers(); |
||||
let error; |
||||
try { |
||||
await valuesPromise; |
||||
} catch (e) { |
||||
error = e; |
||||
} |
||||
|
||||
expect(queryFunc).toBeCalledTimes(1); |
||||
expect(queryFunc).nthCalledWith(1, targets); |
||||
expect(error).toEqual('random error'); |
||||
}); |
||||
|
||||
it('works with multiple queries if there is no error', async () => { |
||||
const targets = [targetA, targetB]; |
||||
const queryFunc = jest.fn(); |
||||
queryFunc.mockReturnValueOnce(of([createResponseFrame('A'), createResponseFrame('B')])); |
||||
|
||||
const values = await lastValueFrom(runWithRetry(queryFunc, targets).pipe(toArray())); |
||||
|
||||
expect(queryFunc).toBeCalledTimes(1); |
||||
expect(queryFunc).nthCalledWith(1, targets); |
||||
expect(values).toEqual([{ frames: [createResponseFrame('A'), createResponseFrame('B')] }]); |
||||
}); |
||||
|
||||
it('works with multiple queries only one errors out', async () => { |
||||
jest.useFakeTimers(); |
||||
const targets = [targetA, targetB]; |
||||
const queryFunc = jest.fn(); |
||||
queryFunc.mockReturnValueOnce( |
||||
throwError(() => |
||||
createErrorResponse(targets, { |
||||
A: { frames: [dataFrameToJSON(createResponseFrame('A'))] }, |
||||
B: { error: 'LimitExceededException' }, |
||||
}) |
||||
) |
||||
); |
||||
|
||||
queryFunc.mockReturnValueOnce(of([createResponseFrame('B')])); |
||||
|
||||
const valuesPromise = lastValueFrom(runWithRetry(queryFunc, targets).pipe(toArray())); |
||||
jest.runAllTimers(); |
||||
const values = await valuesPromise; |
||||
|
||||
expect(queryFunc).toBeCalledTimes(2); |
||||
expect(queryFunc).nthCalledWith(1, targets); |
||||
expect(queryFunc).nthCalledWith(2, [targetB]); |
||||
// Bit more involved because dataFrameToJSON and dataFrameFromJSON are not symmetrical and add some attributes to the
|
||||
// dataframe fields
|
||||
expect(values.length).toBe(1); |
||||
expect(values[0].frames.length).toBe(2); |
||||
expect(values[0].frames[0].fields[0].values.get(0)).toBe('A'); |
||||
expect(values[0].frames[1].fields[0].values.get(0)).toBe('B'); |
||||
}); |
||||
|
||||
it('sends data and also error if only one query gets limit error', async () => { |
||||
jest.useFakeTimers(); |
||||
const targets = [targetA, targetB]; |
||||
const queryFunc = jest.fn(); |
||||
queryFunc.mockReturnValueOnce( |
||||
throwError(() => |
||||
createErrorResponse(targets, { |
||||
A: { frames: [dataFrameToJSON(createResponseFrame('A'))] }, |
||||
B: { error: 'LimitExceededException' }, |
||||
}) |
||||
) |
||||
); |
||||
|
||||
const valuesPromise = lastValueFrom(runWithRetry(queryFunc, targets, { timeout: 0 }).pipe(toArray())); |
||||
jest.runAllTimers(); |
||||
const values = await valuesPromise; |
||||
|
||||
expect(queryFunc).toBeCalledTimes(1); |
||||
expect(queryFunc).nthCalledWith(1, targets); |
||||
expect(values.length).toBe(1); |
||||
expect(values[0].frames.length).toBe(1); |
||||
expect(values[0].frames[0].fields[0].values.get(0)).toBe('A'); |
||||
expect(values[0].error).toEqual({ message: 'Some queries timed out: LimitExceededException' }); |
||||
}); |
||||
|
||||
it('sends all collected successful responses on timeout', async () => { |
||||
jest.useFakeTimers(); |
||||
const targets = [targetA, targetB, targetC]; |
||||
const queryFunc = jest.fn(); |
||||
queryFunc.mockReturnValueOnce( |
||||
throwError(() => |
||||
createErrorResponse(targets, { |
||||
A: { frames: [dataFrameToJSON(createResponseFrame('A'))] }, |
||||
B: { error: 'LimitExceededException' }, |
||||
C: { error: 'LimitExceededException' }, |
||||
}) |
||||
) |
||||
); |
||||
|
||||
queryFunc.mockReturnValueOnce( |
||||
throwError(() => |
||||
createErrorResponse(targets, { |
||||
B: { frames: [dataFrameToJSON(createResponseFrame('B'))] }, |
||||
C: { error: 'LimitExceededException' }, |
||||
}) |
||||
) |
||||
); |
||||
|
||||
queryFunc.mockReturnValueOnce( |
||||
throwError(() => |
||||
createErrorResponse(targets, { |
||||
C: { error: 'LimitExceededException' }, |
||||
}) |
||||
) |
||||
); |
||||
|
||||
const valuesPromise = lastValueFrom( |
||||
runWithRetry(queryFunc, targets, { timeoutFunc: (retry) => retry >= 2 }).pipe(toArray()) |
||||
); |
||||
jest.runAllTimers(); |
||||
const values = await valuesPromise; |
||||
|
||||
expect(queryFunc).toBeCalledTimes(3); |
||||
expect(queryFunc).nthCalledWith(1, targets); |
||||
expect(queryFunc).nthCalledWith(2, [targetB, targetC]); |
||||
expect(queryFunc).nthCalledWith(3, [targetC]); |
||||
expect(values.length).toBe(1); |
||||
expect(values[0].frames.length).toBe(2); |
||||
expect(values[0].frames[0].fields[0].values.get(0)).toBe('A'); |
||||
expect(values[0].frames[1].fields[0].values.get(0)).toBe('B'); |
||||
expect(values[0].error).toEqual({ message: 'Some queries timed out: LimitExceededException' }); |
||||
}); |
||||
}); |
||||
|
||||
const targetA = makeTarget('A'); |
||||
const targetB = makeTarget('B'); |
||||
const targetC = makeTarget('C'); |
||||
|
||||
function makeTarget(refId: string) { |
||||
return { queryString: 'query ' + refId, refId, region: 'test' }; |
||||
} |
||||
|
||||
function createResponseFrame(ref: string) { |
||||
return new MutableDataFrame({ |
||||
fields: [{ name: 'queryId', values: [ref] }], |
||||
refId: ref, |
||||
}); |
||||
} |
||||
|
||||
function createErrorResponse(targets: StartQueryRequest[], results?: Record<string, DataResponse>): FetchError { |
||||
return { |
||||
status: 400, |
||||
data: { |
||||
results: results || { |
||||
A: { |
||||
error: 'LimitExceededException', |
||||
}, |
||||
}, |
||||
}, |
||||
config: { |
||||
url: '', |
||||
data: { |
||||
queries: targets, |
||||
}, |
||||
}, |
||||
}; |
||||
} |
@ -0,0 +1,163 @@ |
||||
import { Observable, Subscription } from 'rxjs'; |
||||
import { FetchError, toDataQueryResponse } from '@grafana/runtime'; |
||||
import { StartQueryRequest } from '../types'; |
||||
import { DataFrame, DataFrameJSON, DataQueryError } from '@grafana/data'; |
||||
|
||||
type Result = { frames: DataFrameJSON[]; error?: string }; |
||||
|
||||
const defaultTimeout = 30_000; |
||||
|
||||
/** |
||||
* A retry strategy specifically for cloud watch logs query. Cloud watch logs queries need first starting the query |
||||
* and the polling for the results. The start query can fail because of the concurrent queries rate limit, |
||||
* and so we hove to retry the start query call if there is already lot of queries running. |
||||
* |
||||
* As we send multiple queries in single request some can fail and some can succeed and we have to also handle those |
||||
* cases by only retrying the failed queries. We retry the failed queries until we hit the time limit or all queries |
||||
* succeed and only then we pass the data forward. This means we wait longer but makes the code a bit simpler as we |
||||
* can treat starting the query and polling as steps in a pipeline. |
||||
* @param queryFun |
||||
* @param targets |
||||
* @param options |
||||
*/ |
||||
export function runWithRetry( |
||||
queryFun: (targets: StartQueryRequest[]) => Observable<DataFrame[]>, |
||||
targets: StartQueryRequest[], |
||||
options: { |
||||
timeout?: number; |
||||
timeoutFunc?: (retry: number, startTime: number) => boolean; |
||||
retryWaitFunc?: (retry: number) => number; |
||||
} = {} |
||||
): Observable<{ frames: DataFrame[]; error?: DataQueryError }> { |
||||
const startTime = new Date(); |
||||
let retries = 0; |
||||
let timerID: any; |
||||
let subscription: Subscription; |
||||
let collected = {}; |
||||
|
||||
const timeoutFunction = options.timeoutFunc |
||||
? options.timeoutFunc |
||||
: (retry: number, startTime: number) => { |
||||
return Date.now() >= startTime + (options.timeout === undefined ? defaultTimeout : options.timeout); |
||||
}; |
||||
|
||||
const retryWaitFunction = options.retryWaitFunc |
||||
? options.retryWaitFunc |
||||
: (retry: number) => { |
||||
return Math.pow(2, retry) * 1000 + Math.random() * 100; |
||||
}; |
||||
|
||||
return new Observable((observer) => { |
||||
// Run function is where the logic takes place. We have it in a function so we can call it recursively.
|
||||
function run(currentQueryParams: StartQueryRequest[]) { |
||||
subscription = queryFun(currentQueryParams).subscribe({ |
||||
next(frames) { |
||||
// In case we successfully finished, merge the current response with whatever we already collected.
|
||||
const collectedPreviously = toDataQueryResponse({ data: { results: collected } }).data || []; |
||||
observer.next({ frames: [...collectedPreviously, ...frames] }); |
||||
observer.complete(); |
||||
}, |
||||
error(error: FetchError<{ results?: Record<string, Result> }> | string) { |
||||
// In case of error we first try to figure out what kind of error it is
|
||||
|
||||
// This means it was a generic 500 error probably so we just pass it on
|
||||
if (typeof error === 'string') { |
||||
observer.error(error); |
||||
return; |
||||
} |
||||
|
||||
// In case of multiple queries this some can error while some may be ok
|
||||
const errorData = splitErrorData(error); |
||||
|
||||
if (!errorData) { |
||||
// Not sure what happened but the error structure wasn't what we expected
|
||||
observer.error(error); |
||||
return; |
||||
} |
||||
|
||||
if (!errorData!.errors.length) { |
||||
// So there is no limit error but some other errors so nothing to retry so we just pass it as it would be
|
||||
// otherwise.
|
||||
observer.error(error); |
||||
return; |
||||
} |
||||
|
||||
if (timeoutFunction(retries, startTime.valueOf())) { |
||||
// We timed out but we could have started some queries
|
||||
if (Object.keys(collected).length || Object.keys(errorData.good).length) { |
||||
const dataResponse = toDataQueryResponse({ |
||||
data: { |
||||
results: { |
||||
...(errorData.good ?? {}), |
||||
...(collected ?? {}), |
||||
}, |
||||
}, |
||||
}); |
||||
dataResponse.error = { |
||||
...(dataResponse.error ?? {}), |
||||
message: `Some queries timed out: ${errorData.errorMessage}`, |
||||
}; |
||||
// So we consider this a partial success and pass the data forward but also with error to be shown to
|
||||
// the user.
|
||||
observer.next({ |
||||
error: dataResponse.error, |
||||
frames: dataResponse.data, |
||||
}); |
||||
observer.complete(); |
||||
} else { |
||||
// So we timed out and there was no data to pass forward so we just pass the error
|
||||
const dataResponse = toDataQueryResponse({ data: { results: error.data?.results ?? {} } }); |
||||
observer.error(dataResponse.error); |
||||
} |
||||
return; |
||||
} |
||||
|
||||
collected = { |
||||
...collected, |
||||
...errorData!.good, |
||||
}; |
||||
|
||||
timerID = setTimeout( |
||||
() => { |
||||
retries++; |
||||
console.log(`Attempt ${retries}`); |
||||
run(errorData!.errors); |
||||
}, |
||||
// We want to know how long to wait for the next retry. First time this will be 0.
|
||||
retryWaitFunction(retries + 1) |
||||
); |
||||
}, |
||||
}); |
||||
} |
||||
run(targets); |
||||
return () => { |
||||
// We clear only the latest timer and subscription but the observable should complete after one response so
|
||||
// there should not be more things running at the same time.
|
||||
clearTimeout(timerID); |
||||
subscription.unsubscribe(); |
||||
}; |
||||
}); |
||||
} |
||||
|
||||
function splitErrorData(error: FetchError<{ results?: Record<string, Result> }>) { |
||||
const results = error.data?.results; |
||||
if (!results) { |
||||
return undefined; |
||||
} |
||||
return Object.keys(results).reduce<{ |
||||
errors: StartQueryRequest[]; |
||||
good: Record<string, Result>; |
||||
errorMessage: string; |
||||
}>( |
||||
(acc, refId) => { |
||||
if (results[refId].error?.startsWith('LimitExceededException')) { |
||||
acc.errorMessage = results[refId].error!; |
||||
acc.errors.push(error.config.data.queries.find((q: any) => q.refId === refId)); |
||||
} else { |
||||
acc.good[refId] = results[refId]; |
||||
} |
||||
return acc; |
||||
}, |
||||
{ errors: [], good: {}, errorMessage: '' } |
||||
); |
||||
} |
Loading…
Reference in new issue