Skip to content

Commit 39816b7

Browse files
authored
Fix flaky prom remote write exporter concurrency test (#37430)
Fixes #37104. This is more an artifact of the test firing an unbounded number of goroutines, each one making its own HTTP request. Although keep-alive is enabled by default, the code ends up not reusing many of the connections, which leaves many of them in the TIME_WAIT state. To avoid this, the test now limits the number of concurrent requests, and the actual code has a small change to facilitate reuse of the existing TCP connections used by the HTTP client. Although there is a change to non-test code, I don't consider this a bug worth a changelog entry, because no user of the component should reach such a high burst of "push metrics" calls in any reasonable production scenario.
1 parent 11e18af commit 39816b7

File tree

2 files changed

+14
-7
lines changed

2 files changed

+14
-7
lines changed

exporter/prometheusremotewriteexporter/exporter.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -353,7 +353,10 @@ func (prwe *prwExporter) execute(ctx context.Context, writeReq *prompb.WriteRequ
353353
if err != nil {
354354
return err
355355
}
356-
defer resp.Body.Close()
356+
defer func() {
357+
_, _ = io.Copy(io.Discard, resp.Body)
358+
resp.Body.Close()
359+
}()
357360

358361
// 2xx status code is considered a success
359362
// 5xx errors are recoverable and the exporter should retry

exporter/prometheusremotewriteexporter/exporter_concurrency_test.go

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ import (
88
"io"
99
"net/http"
1010
"net/http/httptest"
11-
"os"
11+
"runtime"
1212
"strconv"
1313
"sync"
1414
"testing"
@@ -32,9 +32,6 @@ import (
3232
// Test everything works when there is more than one goroutine calling PushMetrics.
3333
// Today we only use 1 worker per exporter, but the intention of this test is to future-proof in case it changes.
3434
func Test_PushMetricsConcurrent(t *testing.T) {
35-
if os.Getenv("ImageOs") == "win25" && os.Getenv("GITHUB_ACTIONS") == "true" {
36-
t.Skip("Skipping test on Windows 2025 GH runners, see https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/37104")
37-
}
3835
n := 1000
3936
ms := make([]pmetric.Metrics, n)
4037
testIDKey := "test_id"
@@ -137,15 +134,22 @@ func Test_PushMetricsConcurrent(t *testing.T) {
137134
resp, checkRequestErr := http.Get(server.URL)
138135
require.NoError(c, checkRequestErr)
139136
assert.NoError(c, resp.Body.Close())
140-
}, 5*time.Second, 100*time.Millisecond)
137+
}, 15*time.Second, 100*time.Millisecond)
141138

142139
var wg sync.WaitGroup
143140
wg.Add(n)
141+
maxConcurrentGoroutines := runtime.NumCPU() * 4
142+
semaphore := make(chan struct{}, maxConcurrentGoroutines)
144143
for _, m := range ms {
144+
semaphore <- struct{}{}
145145
go func() {
146+
defer func() {
147+
<-semaphore
148+
wg.Done()
149+
}()
150+
146151
err := prwe.PushMetrics(ctx, m)
147152
assert.NoError(t, err)
148-
wg.Done()
149153
}()
150154
}
151155
wg.Wait()

0 commit comments

Comments
 (0)