Fix deadlock in TunnelRegistry
Closes #183 (closed).
From #183 (comment 715092206) we can see:
A lot of goroutines are blocked trying to register with TunnelRegistry:
815 @ 0x438256 0x447ff2 0xc8174b 0xc82878 0x9e73b5 0x9f8157 0xc8274c 0xc7ab9f 0x9f207f 0x88dd03 0xb7c831 0x88dd27 0xb7dfee 0x88dd27 0xa362f2 0x88dd27 0x9e9555 0x88dd27 0xb81e49 0x88dd27 0x88dc39 0x88ec1a 0x890125 0x889e38 0x468be1
# 0xc8174a gitlab.com/gitlab-org/cluster-integration/gitlab-agent/v14/internal/module/reverse_tunnel.(*TunnelRegistry).HandleTunnel+0x2ca internal/module/reverse_tunnel/tunnel_registry.go:150
# 0xc82877 gitlab.com/gitlab-org/cluster-integration/gitlab-agent/v14/internal/module/reverse_tunnel/server.(*server).Connect.func1+0xd7 internal/module/reverse_tunnel/server/server.go:32
# 0x9e73b4 gitlab.com/gitlab-org/cluster-integration/gitlab-agent/v14/internal/tool/retry.PollWithBackoff+0x1f4 internal/tool/retry/retry.go:80
# 0x9f8156 gitlab.com/gitlab-org/cluster-integration/gitlab-agent/v14/internal/module/modshared.(*RpcApiStub).PollWithBackoff+0x96 internal/module/modshared/rpc_api_stub.go:20
# 0xc8274b gitlab.com/gitlab-org/cluster-integration/gitlab-agent/v14/internal/module/reverse_tunnel/server.(*server).Connect+0x22b internal/module/reverse_tunnel/server/server.go:24
# 0xc7ab9e gitlab.com/gitlab-org/cluster-integration/gitlab-agent/v14/internal/module/reverse_tunnel/rpc._ReverseTunnel_Connect_Handler+0x9e internal/module/reverse_tunnel/rpc/rpc_grpc.pb.go:93
# 0x9f207e gitlab.com/gitlab-org/cluster-integration/gitlab-agent/v14/internal/tool/grpctool.StreamServerLimitingInterceptor.func1+0x15e internal/tool/grpctool/server_limiting.go:34
# 0x88dd02 google.golang.org/grpc.chainStreamInterceptors.func1.1+0x62 external/org_golang_google_grpc/server.go:1416
# 0xb7c830 github.com/grpc-ecosystem/go-grpc-middleware/tracing/opentracing.StreamServerInterceptor.func1+0x270 external/com_github_grpc_ecosystem_go_grpc_middleware/tracing/opentracing/server_interceptors.go:58
# 0x88dd26 google.golang.org/grpc.chainStreamInterceptors.func1.1+0x86 external/org_golang_google_grpc/server.go:1419
# 0xb7dfed github.com/grpc-ecosystem/go-grpc-middleware/validator.StreamServerInterceptor.func1+0x8d external/com_github_grpc_ecosystem_go_grpc_middleware/validator/validator.go:72
# 0x88dd26 google.golang.org/grpc.chainStreamInterceptors.func1.1+0x86 external/org_golang_google_grpc/server.go:1419
# 0xa362f1 gitlab.com/gitlab-org/cluster-integration/gitlab-agent/v14/internal/module/modserver.StreamAgentRpcApiInterceptor.func1+0x191 internal/module/modserver/agent_rpc_api.go:67
# 0x88dd26 google.golang.org/grpc.chainStreamInterceptors.func1.1+0x86 external/org_golang_google_grpc/server.go:1419
# 0x9e9554 gitlab.com/gitlab-org/labkit/correlation/grpc.StreamServerCorrelationInterceptor.func1+0x1d4 external/com_gitlab_gitlab_org_labkit/correlation/grpc/server_interceptors.go:74
# 0x88dd26 google.golang.org/grpc.chainStreamInterceptors.func1.1+0x86 external/org_golang_google_grpc/server.go:1419
# 0xb81e48 github.com/grpc-ecosystem/go-grpc-prometheus.(*ServerMetrics).StreamServerInterceptor.func1+0x108 external/com_github_grpc_ecosystem_go_grpc_prometheus/server_metrics.go:122
# 0x88dd26 google.golang.org/grpc.chainStreamInterceptors.func1.1+0x86 external/org_golang_google_grpc/server.go:1419
# 0x88dc38 google.golang.org/grpc.chainStreamInterceptors.func1+0x158 external/org_golang_google_grpc/server.go:1421
# 0x88ec19 google.golang.org/grpc.(*Server).processStreamingRPC+0xe99 external/org_golang_google_grpc/server.go:1557
# 0x890124 google.golang.org/grpc.(*Server).handleStream+0x9e4 external/org_golang_google_grpc/server.go:1630
# 0x889e37 google.golang.org/grpc.(*Server).serveStreams.func1.2+0x97 external/org_golang_google_grpc/server.go:941
A lot of goroutines are blocked trying to handle stream context cancellation:
773 @ 0x438256 0x447ff2 0xc81889 0xc82878 0x9e73b5 0x9f8157 0xc8274c 0xc7ab9f 0x9f207f 0x88dd03 0xb7c831 0x88dd27 0xb7dfee 0x88dd27 0xa362f2 0x88dd27 0x9e9555 0x88dd27 0xb81e49 0x88dd27 0x88dc39 0x88ec1a 0x890125 0x889e38 0x468be1
# 0xc81888 gitlab.com/gitlab-org/cluster-integration/gitlab-agent/v14/internal/module/reverse_tunnel.(*TunnelRegistry).HandleTunnel+0x408 internal/module/reverse_tunnel/tunnel_registry.go:161
# 0xc82877 gitlab.com/gitlab-org/cluster-integration/gitlab-agent/v14/internal/module/reverse_tunnel/server.(*server).Connect.func1+0xd7 internal/module/reverse_tunnel/server/server.go:32
# 0x9e73b4 gitlab.com/gitlab-org/cluster-integration/gitlab-agent/v14/internal/tool/retry.PollWithBackoff+0x1f4 internal/tool/retry/retry.go:80
# 0x9f8156 gitlab.com/gitlab-org/cluster-integration/gitlab-agent/v14/internal/module/modshared.(*RpcApiStub).PollWithBackoff+0x96 internal/module/modshared/rpc_api_stub.go:20
# 0xc8274b gitlab.com/gitlab-org/cluster-integration/gitlab-agent/v14/internal/module/reverse_tunnel/server.(*server).Connect+0x22b internal/module/reverse_tunnel/server/server.go:24
# 0xc7ab9e gitlab.com/gitlab-org/cluster-integration/gitlab-agent/v14/internal/module/reverse_tunnel/rpc._ReverseTunnel_Connect_Handler+0x9e internal/module/reverse_tunnel/rpc/rpc_grpc.pb.go:93
# 0x9f207e gitlab.com/gitlab-org/cluster-integration/gitlab-agent/v14/internal/tool/grpctool.StreamServerLimitingInterceptor.func1+0x15e internal/tool/grpctool/server_limiting.go:34
# 0x88dd02 google.golang.org/grpc.chainStreamInterceptors.func1.1+0x62 external/org_golang_google_grpc/server.go:1416
# 0xb7c830 github.com/grpc-ecosystem/go-grpc-middleware/tracing/opentracing.StreamServerInterceptor.func1+0x270 external/com_github_grpc_ecosystem_go_grpc_middleware/tracing/opentracing/server_interceptors.go:58
# 0x88dd26 google.golang.org/grpc.chainStreamInterceptors.func1.1+0x86 external/org_golang_google_grpc/server.go:1419
# 0xb7dfed github.com/grpc-ecosystem/go-grpc-middleware/validator.StreamServerInterceptor.func1+0x8d external/com_github_grpc_ecosystem_go_grpc_middleware/validator/validator.go:72
# 0x88dd26 google.golang.org/grpc.chainStreamInterceptors.func1.1+0x86 external/org_golang_google_grpc/server.go:1419
# 0xa362f1 gitlab.com/gitlab-org/cluster-integration/gitlab-agent/v14/internal/module/modserver.StreamAgentRpcApiInterceptor.func1+0x191 internal/module/modserver/agent_rpc_api.go:67
# 0x88dd26 google.golang.org/grpc.chainStreamInterceptors.func1.1+0x86 external/org_golang_google_grpc/server.go:1419
# 0x9e9554 gitlab.com/gitlab-org/labkit/correlation/grpc.StreamServerCorrelationInterceptor.func1+0x1d4 external/com_gitlab_gitlab_org_labkit/correlation/grpc/server_interceptors.go:74
# 0x88dd26 google.golang.org/grpc.chainStreamInterceptors.func1.1+0x86 external/org_golang_google_grpc/server.go:1419
# 0xb81e48 github.com/grpc-ecosystem/go-grpc-prometheus.(*ServerMetrics).StreamServerInterceptor.func1+0x108 external/com_github_grpc_ecosystem_go_grpc_prometheus/server_metrics.go:122
# 0x88dd26 google.golang.org/grpc.chainStreamInterceptors.func1.1+0x86 external/org_golang_google_grpc/server.go:1419
# 0x88dc38 google.golang.org/grpc.chainStreamInterceptors.func1+0x158 external/org_golang_google_grpc/server.go:1421
# 0x88ec19 google.golang.org/grpc.(*Server).processStreamingRPC+0xe99 external/org_golang_google_grpc/server.go:1557
# 0x890124 google.golang.org/grpc.(*Server).handleStream+0x9e4 external/org_golang_google_grpc/server.go:1630
# 0x889e37 google.golang.org/grpc.(*Server).serveStreams.func1.2+0x97 external/org_golang_google_grpc/server.go:941
Some goroutines are blocked trying to forward the incoming stream to a connected agent. Some of these might very well be legitimate usage, but some are probably not. That is a separate bug, which was fixed in !549 (merged):
33 @ 0x438256 0x405565 0x40511d 0xc7fd38 0xcb89bc 0xb7c831 0x88dd03 0xb7dfee 0x88dd27 0x9f0846 0x88dd27 0xa36f32 0x88dd27 0x9e9555 0x88dd27 0xb81e49 0x88dd27 0x88dc39 0x88ec1a 0x890125 0x889e38 0x468be1
# 0xc7fd37 gitlab.com/gitlab-org/cluster-integration/gitlab-agent/v14/internal/module/reverse_tunnel.(*tunnel).ForwardStream+0x357 internal/module/reverse_tunnel/tunnel.go:170
# 0xcb89bb gitlab.com/gitlab-org/cluster-integration/gitlab-agent/v14/cmd/kas/kasapp.(*router).RouteToCorrectAgentHandler+0x6fb cmd/kas/kasapp/router_agent.go:61
# 0xb7c830 github.com/grpc-ecosystem/go-grpc-middleware/tracing/opentracing.StreamServerInterceptor.func1+0x270 external/com_github_grpc_ecosystem_go_grpc_middleware/tracing/opentracing/server_interceptors.go:58
# 0x88dd02 google.golang.org/grpc.chainStreamInterceptors.func1.1+0x62 external/org_golang_google_grpc/server.go:1416
# 0xb7dfed github.com/grpc-ecosystem/go-grpc-middleware/validator.StreamServerInterceptor.func1+0x8d external/com_github_grpc_ecosystem_go_grpc_middleware/validator/validator.go:72
# 0x88dd26 google.golang.org/grpc.chainStreamInterceptors.func1.1+0x86 external/org_golang_google_grpc/server.go:1419
# 0x9f0845 gitlab.com/gitlab-org/cluster-integration/gitlab-agent/v14/internal/tool/grpctool.(*JWTAuther).StreamServerInterceptor+0x85 internal/tool/grpctool/jwt_server_auth.go:45
# 0x88dd26 google.golang.org/grpc.chainStreamInterceptors.func1.1+0x86 external/org_golang_google_grpc/server.go:1419
# 0xa36f31 gitlab.com/gitlab-org/cluster-integration/gitlab-agent/v14/internal/module/modserver.StreamRpcApiInterceptor.func1+0x151 internal/module/modserver/rpc_api.go:49
# 0x88dd26 google.golang.org/grpc.chainStreamInterceptors.func1.1+0x86 external/org_golang_google_grpc/server.go:1419
# 0x9e9554 gitlab.com/gitlab-org/labkit/correlation/grpc.StreamServerCorrelationInterceptor.func1+0x1d4 external/com_gitlab_gitlab_org_labkit/correlation/grpc/server_interceptors.go:74
# 0x88dd26 google.golang.org/grpc.chainStreamInterceptors.func1.1+0x86 external/org_golang_google_grpc/server.go:1419
# 0xb81e48 github.com/grpc-ecosystem/go-grpc-prometheus.(*ServerMetrics).StreamServerInterceptor.func1+0x108 external/com_github_grpc_ecosystem_go_grpc_prometheus/server_metrics.go:122
# 0x88dd26 google.golang.org/grpc.chainStreamInterceptors.func1.1+0x86 external/org_golang_google_grpc/server.go:1419
# 0x88dc38 google.golang.org/grpc.chainStreamInterceptors.func1+0x158 external/org_golang_google_grpc/server.go:1421
# 0x88ec19 google.golang.org/grpc.(*Server).processStreamingRPC+0xe99 external/org_golang_google_grpc/server.go:1557
# 0x890124 google.golang.org/grpc.(*Server).handleStream+0x9e4 external/org_golang_google_grpc/server.go:1630
# 0x889e37 google.golang.org/grpc.(*Server).serveStreams.func1.2+0x97 external/org_golang_google_grpc/server.go:941
And, finally, the smoking gun:
1 @ 0x438256 0x405565 0x40511d 0xc80f0d 0xc80ec0 0x9e83ad 0x468be1
# 0xc80f0c gitlab.com/gitlab-org/cluster-integration/gitlab-agent/v14/internal/module/reverse_tunnel.(*TunnelRegistry).handleTunnelUnregister+0x22c internal/module/reverse_tunnel/tunnel_registry.go:202
# 0xc80ebf gitlab.com/gitlab-org/cluster-integration/gitlab-agent/v14/internal/module/reverse_tunnel.(*TunnelRegistry).Run+0x1df internal/module/reverse_tunnel/tunnel_registry.go:86
# 0x9e83ac github.com/ash2k/stager.(*stage).Go.func1+0x2c external/com_github_ash2k_stager/stage.go:25
This is the main TunnelRegistry goroutine, which matches incoming tunnels with incoming requests to forward. If it blocks, the CI tunnel stops working. It did block because of the bug.