Server dies permanently when overloaded and never recovers
Description
When the server is flooded with a burst of requests, it dies and no longer responds. What's even worse, the health probe still reports it as healthy.
Triton Information
Triton 22.06
Are you using the Triton container or did you build it yourself? I built a custom image on top of the 22.06 Triton container using the following Dockerfile:
FROM nvcr.io/nvidia/tritonserver:22.06-py3
COPY ./requirements.txt /tmp/requirements.txt
RUN pip install -r /tmp/requirements.txt
ARG MODEL_REPO
ARG TEST_IMAGE
# Copy test image for warmup
ADD ${TEST_IMAGE} /root/input/IMAGE
# Copy model repo
ADD ${MODEL_REPO} /models
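For completeness, a hypothetical build-and-run sequence for this image; the image tag, build-arg paths, and port mappings below are placeholders of mine, not values from the original report:
# assumed tag and paths; point MODEL_REPO / TEST_IMAGE at your local copies
docker build \
  --build-arg MODEL_REPO=./models \
  --build-arg TEST_IMAGE=./test_image.jpg \
  -t triton-ensemble:22.06 .
docker run --gpus all --rm -p 8000:8000 -p 8001:8001 -p 8002:8002 \
  triton-ensemble:22.06 tritonserver --model-repository=/models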
To Reproduce
ensemble config
# proto-file: model_config.proto
# proto-message: ModelConfig
name: "ensemble"
platform: "ensemble"
max_batch_size: 16
input [
  {
    name: "IMAGE"
    data_type: TYPE_UINT8
    dims: [-1]
  }
]
output [
  {
    name: "IMAGE_EMBEDDING"
    data_type: TYPE_FP32
    dims: [1]
  }
]
ensemble_scheduling {
  step [
    {
      model_name: "preprocess"
      model_version: -1
      input_map {
        key: "IMAGE"
        value: "IMAGE"
      }
      output_map {
        key: "IMAGE_PREPROCESSED"
        value: "image_preprocessed"
      }
    },
    {
      model_name: "image_embedding"
      model_version: -1
      input_map {
        key: "IMAGE_PREPROCESSED"
        value: "image_preprocessed"
      }
      output_map {
        key: "IMAGE_CLASSIFICATION"
        value: "IMAGE_EMBEDDING"
      }
    }
  ]
}
preprocess config
# proto-file: model_config.proto
# proto-message: ModelConfig
name: "preprocess"
backend: "python"
max_batch_size: 16
input [
  {
    name: "IMAGE"
    data_type: TYPE_UINT8
    dims: [-1]
  }
]
output [
  {
    name: "IMAGE_PREPROCESSED"
    data_type: TYPE_FP32
    dims: [3,400,400]
  }
]
instance_group [
  {
    kind: KIND_CPU
  }
]
dynamic_batching {
  max_queue_delay_microseconds: 25
  preferred_batch_size: [1,4,8,16]
  default_queue_policy {
    default_timeout_microseconds: 1000000
    max_queue_size: 10000
  }
}
preprocess python code
import asyncio
from asyncio.log import logger
from concurrent import futures
import json
import logging
import numpy as np
import torch.cuda
import torchvision.io
from torch.utils.dlpack import from_dlpack, to_dlpack
from torchvision.io import ImageReadMode
import torchvision.transforms as transforms
import torchvision.transforms.functional as F
import triton_python_backend_utils as pb_utils
DEVICE = "cpu" #"cuda" if torch.cuda.is_available() else "cpu"
EXECUTOR = futures.ThreadPoolExecutor(max_workers=8)
print(DEVICE)
def _transform(n_px, dtype):
    def square_pad(image):
        h, w = image.shape[-2:]
        max_wh = np.max([w, h])
        hp = int((max_wh - w) / 2)
        vp = int((max_wh - h) / 2)
        padding = [hp, vp, hp, vp]
        return F.pad(image, padding, 0, 'constant')
    return transforms.Compose([
        square_pad,
        transforms.Resize(n_px),
        transforms.ConvertImageDtype(dtype),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.226, 0.226, 0.226]),
    ])
class TritonPythonModel:
    def __init__(self):
        self.npx = None
        self.output_torch_type = None
        self.transform = None

    def initialize(self, args):
        model_config = json.loads(args['model_config'])
        output_config = pb_utils.get_output_config_by_name(model_config, 'IMAGE_PREPROCESSED')
        self.npx = output_config['dims'][-1]
        output_dtype = output_config['data_type']
        self.output_torch_type = torch.float16 if output_dtype == "TYPE_FP16" else torch.float32
        self.transform = _transform(self.npx, self.output_torch_type)

    def load_jpeg(self, img):
        decoded = torchvision.io.decode_jpeg(img, mode=ImageReadMode.RGB, device="cpu")
        if DEVICE == "cuda":
            decoded = decoded.pin_memory().to(DEVICE, non_blocking=True)
        transformed = self.transform(decoded)
        return transformed

    def process_request(self, request):
        in_images = pb_utils.get_input_tensor_by_name(request, "IMAGE").to_dlpack()
        in_jpeg_tensor = from_dlpack(in_images)
        images = torch.stack(list(EXECUTOR.map(self.load_jpeg, in_jpeg_tensor)))
        image_processed_dlpack = to_dlpack(images)
        return pb_utils.InferenceResponse(output_tensors=[
            pb_utils.Tensor.from_dlpack("IMAGE_PREPROCESSED", image_processed_dlpack)
        ])

    async def execute(self, requests):
        loop = asyncio.get_running_loop()
        image_processed_list = await asyncio.gather(
            *[loop.run_in_executor(EXECUTOR, self.process_request, request) for request in requests]
        )
        return image_processed_list
model config
# proto-file: model_config.proto
# proto-message: ModelConfig
name: "image_embedding"
platform: "onnxruntime_onnx"
max_batch_size: 16
input [{
  name: "IMAGE_PREPROCESSED"
  data_type: TYPE_FP32
  dims: [3,400,400]
}]
output [{
  name: "IMAGE_CLASSIFICATION"
  data_type: TYPE_FP32
  dims: [1]
}]
instance_group {
  kind: KIND_AUTO
}
dynamic_batching {
  max_queue_delay_microseconds: 25
  preferred_batch_size: [1,4,8,16]
  default_queue_policy {
    default_timeout_microseconds: 1000000
    max_queue_size: 10000
  }
}
model_warmup {
  name: "warmup for batch 1"
  batch_size: 1
  inputs {
    key: "IMAGE_PREPROCESSED"
    value: {
      data_type: TYPE_FP32
      dims: [3,400,400]
      random_data: True
    }
  }
}
model_warmup {
  name: "warmup for batch 4"
  batch_size: 4
  inputs {
    key: "IMAGE_PREPROCESSED"
    value: {
      data_type: TYPE_FP32
      dims: [3,400,400]
      random_data: True
    }
  }
}
model_warmup {
  name: "warmup for batch 8"
  batch_size: 8
  inputs {
    key: "IMAGE_PREPROCESSED"
    value: {
      data_type: TYPE_FP32
      dims: [3,400,400]
      random_data: True
    }
  }
}
model_warmup {
  name: "warmup for batch 16"
  batch_size: 16
  inputs {
    key: "IMAGE_PREPROCESSED"
    value: {
      data_type: TYPE_FP32
      dims: [3,400,400]
      random_data: True
    }
  }
}
onnx model: link
I sent 1000 concurrent requests against a T4 + 8 CPU instance, then sent a single request 10 minutes later. Most of the first 1000 failed (as expected), but the single request sent 10 minutes later also failed (not expected).
"""
Script for testing
"""
import sys
import time
import grpc
import logging
import numpy as np
from tritonclient.grpc import service_pb2
from tritonclient.grpc import service_pb2_grpc
import tritonclient.grpc as grpcclient
from PIL import Image
import threading
target_url = "localhost:8001"
image = "test_image.jpg"
num_of_requests = 1000
Image.fromarray((np.random.rand(1500,1500,3) * 255).astype('uint8')).convert('RGB').save(image)
input_bytes = np.stack([np.frombuffer(open(image, 'rb').read(), dtype=np.uint8)], axis=0)
# Inference
try:
    triton_client = grpcclient.InferenceServerClient(url=target_url, verbose=False)
except Exception as e:
    logging.error(f"Live probe: triton client creation failed: {str(e)}")
def fire_one_request():
    inputs = [grpcclient.InferInput("IMAGE", list(input_bytes.shape), "UINT8")]
    inputs[0].set_data_from_numpy(input_bytes)
    outputs = [grpcclient.InferRequestedOutput("IMAGE_EMBEDDING")]
    try:
        result = triton_client.infer("ensemble", inputs, outputs=outputs, client_timeout=5)
    except Exception as e:
        return e
    return None
class RequestThread(threading.Thread):
    def __init__(self, thread_id, input_bytes):
        threading.Thread.__init__(self)
        self.thread_id = thread_id
        self.input_bytes = input_bytes

    def run(self):
        t = time.time()
        err = fire_one_request()
        if err:
            print(f"thread:{self.thread_id} failed: {err}")
        else:
            print(f"thread:{self.thread_id} succeeded")
print("First check")
ret = fire_one_request()
ret = "Succeeded" if ret is None else ret
print(f"First check result: {ret}")
threads = []
for i in range(num_of_requests):
    thread = RequestThread(i, input_bytes)
    thread.start()
    threads.append(thread)
for t in threads:
    t.join()
try:
    channel = grpc.insecure_channel(target_url)
    grpc_stub = service_pb2_grpc.GRPCInferenceServiceStub(channel)
    request = service_pb2.ServerLiveRequest()
    response = grpc_stub.ServerLive(request)
    print(f"##server is still healthy: {response}")
except Exception as ex:
    print(ex)
print("Final check")
ret = fire_one_request()
ret = "Succeeded" if ret is None else ret
print(f"Final check result: {ret}")
Expected behavior
Based on my settings, I would expect most of the requests to time out and the server to recover after draining the queued requests, but instead it stops responding entirely until it is manually restarted.
Is there anything I'm missing? Thanks in advance for going over the issue!
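For reference, that expectation comes from the default_queue_policy shown in the configs above. Here is a minimal sketch of the same policy with the timeout action spelled out explicitly; REJECT is already the protobuf default, so this snippet only makes the assumption visible rather than changing behavior:
dynamic_batching {
  default_queue_policy {
    timeout_action: REJECT                 # reject a queued request once its timeout expires
    default_timeout_microseconds: 1000000  # at most 1 second in the queue before rejection
    max_queue_size: 10000                  # requests beyond this are rejected immediately
  }
}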
Top GitHub Comments
It turns out that we were spawning threads in two places from the same thread pool. When receiving excessive requests, the two places fight over threads and neither is able to complete its job. After disabling one of them, the server worked as expected.
Thanks @kthui for the help!
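For illustration only, here is a minimal sketch of one way to remove that contention, consistent with the diagnosis above but not necessarily the fix that was actually applied: give the per-image decode fan-out its own pool so a blocked process_request can never starve the pool it was scheduled on. The executor names and worker counts are assumptions, and the snippet reuses the imports and unchanged methods of the preprocess model shown earlier.
# Hypothetical sketch, not the actual fix from this issue: separate pools so
# request-level tasks and per-image decode tasks never compete for workers.
REQUEST_EXECUTOR = futures.ThreadPoolExecutor(max_workers=4)  # runs process_request
DECODE_EXECUTOR = futures.ThreadPoolExecutor(max_workers=8)   # runs load_jpeg

class TritonPythonModel:
    # initialize() and load_jpeg() are unchanged from the model above.

    def process_request(self, request):
        in_images = pb_utils.get_input_tensor_by_name(request, "IMAGE").to_dlpack()
        in_jpeg_tensor = from_dlpack(in_images)
        # Blocking on DECODE_EXECUTOR here is safe: its workers are never
        # occupied by process_request itself, so the decode tasks always run.
        images = torch.stack(list(DECODE_EXECUTOR.map(self.load_jpeg, in_jpeg_tensor)))
        return pb_utils.InferenceResponse(output_tensors=[
            pb_utils.Tensor.from_dlpack("IMAGE_PREPROCESSED", to_dlpack(images))
        ])

    async def execute(self, requests):
        loop = asyncio.get_running_loop()
        return await asyncio.gather(
            *[loop.run_in_executor(REQUEST_EXECUTOR, self.process_request, r) for r in requests]
        )
An equally valid alternative, closer to "disabling one of them", is to call self.load_jpeg in a plain loop inside process_request and keep only the request-level executor.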
Some updates: I removed the preprocessing part and the server works as expected, so we can narrow the issue down to the preprocessing model.
@kthui, do you see any potential issue with the preprocessing implementation?