Shortcuts

Source code for openrl.supports.opengpu.manager

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 2023 The TARTRL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

""""""
import argparse
import logging
import traceback
from typing import List, Union

from openrl.supports.opengpu.gpu_info import get_local_GPU_info, get_remote_GPU_info


class RemoteGPUManager:
    """Hold GPU information for remote servers and number the GPUs in use.

    The per-server GPU information is fetched once, at construction time, via
    ``get_remote_GPU_info()``. Which servers and GPUs are used is read from
    ``pytorch_config.GPU_usage_dict``, whose entries are indexed as
    ``GPU_usage_dict[server_address]["gpus"]`` and hold either the string
    ``"all"`` or a container of GPU identifiers.

    Attributes set by this class:
        gpu_info_dict: Result of ``get_remote_GPU_info()``; indexed here as
            ``gpu_info_dict[server_address]["gpu_infos"]``.
        pytorch_config: The configuration object passed in (may be ``None``).
        server_list: Server addresses taken from ``GPU_usage_dict`` keys.
        server_gpu_mapping: ``{server_address: {gpu: global_index}}``.
        learner_num: Total number of selected GPUs across all servers.
    """

    def __init__(self, pytorch_config=None, check: bool = False):
        """Fetch remote GPU info and compute the GPU-to-index mapping.

        Args:
            pytorch_config: Object exposing ``GPU_usage_dict`` (and
                ``min_memory_per_gpu`` when ``check`` is true), or ``None``
                to manage no servers.
            check: When true, run :meth:`check_gpus` before the mapping is
                computed.

        Raises:
            AssertionError: Propagated from :meth:`check_gpus` when ``check``
                is true and validation fails.
        """
        self.gpu_info_dict = get_remote_GPU_info()
        self.pytorch_config = pytorch_config
        self.server_list = []
        if self.pytorch_config is not None:
            # Iterating the dict yields its keys, i.e. the server addresses.
            self.server_list.extend(self.pytorch_config.GPU_usage_dict)
        if check:
            self.check_gpus()
        self.cal_learner_number()

    def _is_gpu_selected(self, server_address, gpu_info) -> bool:
        """Return whether the config selects ``gpu_info`` on ``server_address``."""
        wanted = self.pytorch_config.GPU_usage_dict[server_address]["gpus"]
        return wanted == "all" or gpu_info["gpu"] in wanted

    def check_gpus(self):
        """Verify every selected GPU has enough free memory.

        Free memory is ``memory["total"] - memory["used"]`` and is compared
        against ``pytorch_config.min_memory_per_gpu``. Each failing GPU is
        printed before the error is raised.

        Raises:
            AssertionError: If no config or no server is set, if a server has
                no GPU information, or if any selected GPU is below the
                memory threshold. The exception is raised explicitly (instead
                of through ``assert`` statements) so that the validation still
                runs when Python is started with ``-O``.
        """
        if self.pytorch_config is None:
            raise AssertionError("pytorch_config is required to check GPUs")
        if len(self.server_list) == 0:
            raise AssertionError("no server is configured in GPU_usage_dict")

        bad_gpus = []
        for server_address in self.server_list:
            if server_address not in self.gpu_info_dict:
                raise AssertionError(
                    "can not get gpu info from {}".format(server_address)
                )
            gpu_infos = self.gpu_info_dict[server_address]["gpu_infos"]
            if len(gpu_infos) == 0:
                raise AssertionError(
                    "no gpu info reported by {}".format(server_address)
                )
            for gpu_info in gpu_infos:
                if not self._is_gpu_selected(server_address, gpu_info):
                    continue
                free_memory = gpu_info["memory"]["total"] - gpu_info["memory"]["used"]
                if free_memory < self.pytorch_config.min_memory_per_gpu:
                    bad_gpus.append(
                        {
                            "server": server_address,
                            "gpu": gpu_info["gpu"],
                            "free": free_memory,
                        }
                    )

        if len(bad_gpus) > 0:
            for bad_gpu in bad_gpus:
                print(
                    "server:{} GPU:{}, minimal memory {}GB, but only get {}GB free"
                    " memory.".format(
                        bad_gpu["server"],
                        bad_gpu["gpu"],
                        self.pytorch_config.min_memory_per_gpu,
                        bad_gpu["free"],
                    )
                )
            raise AssertionError("GPUs not satisfy.")

    def cal_learner_number(self):
        """Assign a consecutive global index to every selected GPU.

        Servers are visited in ``server_list`` order and GPUs in the order
        they appear in ``gpu_info_dict``. The result is stored in
        ``server_gpu_mapping`` and the total count in ``learner_num``.
        """
        self.server_gpu_mapping = {}
        gpu_num = 0
        for server_address in self.server_list:
            gpu_mapping = {}
            for gpu_info in self.gpu_info_dict[server_address]["gpu_infos"]:
                if self._is_gpu_selected(server_address, gpu_info):
                    gpu_mapping[gpu_info["gpu"]] = gpu_num
                    gpu_num += 1
            self.server_gpu_mapping[server_address] = gpu_mapping
        self.learner_num = gpu_num

    def get_gpu_info(self, server_list: list):
        """Return the stored GPU info for the known servers in ``server_list``.

        Args:
            server_list: Server addresses to look up.

        Returns:
            A dict mapping each address found in ``gpu_info_dict`` to its
            stored entry; unknown addresses are silently skipped.
        """
        return {
            server_address: self.gpu_info_dict[server_address]
            for server_address in server_list
            if server_address in self.gpu_info_dict
        }
class LocalGPUManager:
    """Pick local GPU ids for learners and workers.

    GPU objects come from ``get_local_GPU_info()`` and are expected to expose
    a ``gpu_id`` attribute. Selection is driven by ``args.gpu_usage_type``:

    * ``"auto"``: the process index selects the GPU; indices past the end of
      the list are clamped to the last GPU.
    * ``"single"``: every process gets the first GPU.
    * anything else: a warning is logged and ``None`` is returned.

    Attributes:
        args: The namespace passed to the constructor (may be ``None``).
        gpus: GPU objects returned by ``get_local_GPU_info()``; empty when
            CUDA is disabled through ``args.disable_cuda``.
        learner_num: ``args.learner_num``, or ``0`` when ``args`` is ``None``.
    """

    def __init__(self, args: Union[argparse.Namespace, None] = None):
        """Store ``args`` and query the local GPUs unless CUDA is disabled.

        Args:
            args: Namespace providing ``learner_num``, ``disable_cuda`` and
                ``gpu_usage_type``; ``None`` means "query GPUs, no learners".

        Raises:
            SystemExit: If querying the GPUs raises any exception. The
                traceback is printed first.
        """
        self.args = args
        self.gpus = []
        self.learner_num = 0 if args is None else args.learner_num
        if args is None or not self.args.disable_cuda:
            try:
                print("LocalGPUManager fetch gpu infos....")
                self.gpus = get_local_GPU_info()
                print("LocalGPUManager fetch gpu infos done!")
            except Exception:
                print("can not find GPU")
                traceback.print_exc()
                # Equivalent to the former ``exit()`` call (same exit status)
                # but does not rely on the helper injected by the ``site``
                # module, which is absent e.g. under ``python -S``.
                raise SystemExit

    def _gpu_unavailable(self) -> bool:
        """Return True when no GPU can be handed out to a learner or worker."""
        return self.args is None or (self.args.disable_cuda or len(self.gpus) == 0)

    def _select_gpu(self, index: int) -> Union[int, None]:
        """Map a process index to a GPU id according to ``gpu_usage_type``.

        Callers must have checked :meth:`_gpu_unavailable` first, so
        ``self.gpus`` is non-empty here.
        """
        usage_type = self.args.gpu_usage_type
        if usage_type == "auto":
            # Indices beyond the last GPU share the last GPU.
            slot = index if index < len(self.gpus) else len(self.gpus) - 1
            return self.gpus[slot].gpu_id
        if usage_type == "single":
            return self.gpus[0].gpu_id
        logging.warning("unknown gpu usage type:{}!".format(usage_type))
        return None

    def get_gpu(self) -> Union[int, None]:
        """Return the id of the first GPU, or ``None`` if none is usable.

        ``None`` is returned when the GPU list is empty or when ``args`` is
        set and ``args.disable_cuda`` is true.
        """
        if len(self.gpus) == 0:
            return None
        if self.args is not None and self.args.disable_cuda:
            return None
        return self.gpus[0].gpu_id

    def get_learner_gpu(self, learner_id: int = 0) -> Union[int, None]:
        """Return the GPU id for learner ``learner_id``.

        Returns:
            The selected ``gpu_id``, or ``None`` when ``args`` is ``None``,
            CUDA is disabled, no GPU is known, or the usage type is unknown.
        """
        if self._gpu_unavailable():
            return None
        return self._select_gpu(learner_id)

    def get_learner_gpus(self) -> List[Union[int, None]]:
        """Return the GPU id of every learner, in learner order.

        Each entry is what :meth:`get_learner_gpu` returns for that learner,
        so entries are ``None`` (rather than an ``IndexError`` being raised)
        when CUDA is disabled or no GPU is available.

        Returns:
            A list of length ``learner_num``.
        """
        return [
            self.get_learner_gpu(learner_id) for learner_id in range(self.learner_num)
        ]

    def get_worker_gpu(self, worker_id: int = 0) -> Union[int, None]:
        """Return the GPU id for worker ``worker_id``.

        Worker indices are offset by ``args.learner_num`` before selection,
        i.e. workers are numbered after the learners.

        Returns:
            The selected ``gpu_id``, or ``None`` when ``args`` is ``None``,
            CUDA is disabled, no GPU is known, or the usage type is unknown.
        """
        if self._gpu_unavailable():
            return None
        return self._select_gpu(worker_id + self.args.learner_num)

    def log_info(self):
        """Print every known GPU object, unless CUDA is disabled."""
        if self.args and self.args.disable_cuda:
            return
        for gpu in self.gpus:
            print(gpu)