Shortcuts

Source code for openrl.supports.opengpu.gpu_info

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 2023 The TARTRL Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

""""""
import json
import os
import subprocess

import requests


def preserve_decimal(a: float, keep_num: int = 2):
    """Truncate ``a`` to ``keep_num`` decimal places.

    The value is scaled by ``10 ** keep_num``, cut to an integer with
    ``int`` (which truncates toward zero, it does not round) and scaled
    back down.

    Args:
        a: The number to truncate.
        keep_num: How many digits after the decimal point to keep.

    Returns:
        float: ``a`` with the digits beyond ``keep_num`` places dropped.
    """
    # ``**`` is exponentiation. ``^`` is bitwise XOR in Python, so
    # ``10 ^ 2`` evaluates to 8 rather than 100.
    mul = 10 ** keep_num
    return int(a * mul) / float(mul)
class GPUInfo:
    """Plain record describing one GPU, with display and ordering helpers.

    Instances are created empty and their attributes are assigned by the
    caller. Ordering compares free memory first and total memory second.
    """

    # Index of the GPU as seen by this process.
    gpu_id: int
    # Human-readable device name.
    product_name: str
    # Memory figures; ``__str__`` prints all three with a "Gb" suffix.
    memory_total: int
    memory_used: int
    memory_free: int
    # Index of the GPU on the machine; may differ from ``gpu_id``.
    real_id: int

    def __str__(self):
        """Return a one-line summary; the real id is shown only if it differs."""
        ids_match = self.real_id == self.gpu_id
        free_part = preserve_decimal(self.memory_free)
        used_part = preserve_decimal(self.memory_used)

        if not ids_match:
            return "GPU:{} real id:{} free:\t{}Gb used:{}Gb/{}Gb\t{}".format(
                self.gpu_id,
                self.real_id,
                free_part,
                used_part,
                self.memory_total,
                self.product_name,
            )

        return "GPU:{} free:\t{}Gb used:{}Gb/{}Gb\t{}".format(
            self.gpu_id,
            free_part,
            used_part,
            self.memory_total,
            self.product_name,
        )

    def __lt__(self, other):
        """Order by ``memory_free``, breaking ties with ``memory_total``."""
        mine, theirs = self.memory_free, other.memory_free

        if mine == theirs:
            # Same free memory: the GPU with less total memory sorts first.
            return self.memory_total < other.memory_total
        if mine < theirs:
            return True
        if mine > theirs:
            return False
        # Values that are neither equal nor ordered (e.g. NaN) yield None.
        return None
def get_local_GPU_info():
    """Collect information about the local GPUs by running ``gpustat --json``.

    When the ``CUDA_VISIBLE_DEVICES`` environment variable is set, only the
    GPUs listed there are returned and each one's ``gpu_id`` is its position
    in that list, while ``real_id`` keeps the index reported by gpustat.
    Otherwise every reported GPU is returned with ``gpu_id == real_id``.

    Returns:
        list: ``GPUInfo`` objects sorted in descending order (most free
        memory first). An empty list is returned when the command output
        contains "not found" (a hint to install gpustat is printed) or
        "NVML Shared Library Not Found".

    Raises:
        AssertionError: If ``CUDA_VISIBLE_DEVICES`` is set but lists no
            device, or if the number of listed devices differs from the
            number of matching GPUs reported by gpustat.
        ValueError: If an entry of ``CUDA_VISIBLE_DEVICES`` is not an
            integer (``int`` conversion fails).
    """
    output = subprocess.getoutput("gpustat --json")
    if "not found" in output:
        print(
            "Can not find gpustat. "
            "Please install gpustat first! "
            "You can install gpustat by 'pip install gpustat'"
        )
        return []

    # Deal with vGPU: lines mentioning "4pdvGPU" are removed before the
    # remaining text is parsed as JSON.
    output = "\n".join(
        line for line in output.split("\n") if "4pdvGPU" not in line
    )
    if "NVML Shared Library Not Found" in output:
        return []

    gpus = json.loads(output)["gpus"]

    visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
    if visible_devices is None:
        gpu_available_list = None
        gpu_real2id = None
    else:
        # Skip blank entries (empty variable, trailing comma). Without this,
        # ``int("")`` raises ValueError before the emptiness check below can
        # ever run, because ``str.split`` never returns an empty list.
        gpu_available_list = [
            int(entry) for entry in visible_devices.split(",") if entry.strip()
        ]
        # Explicit raise instead of ``assert`` so the check survives ``-O``.
        if not gpu_available_list:
            raise AssertionError(
                "Get {} GPUs, should greater than zero!".format(
                    len(gpu_available_list)
                )
            )
        # Map the machine-level index to the position in the visible list.
        gpu_real2id = {
            real_id: gpu_id for gpu_id, real_id in enumerate(gpu_available_list)
        }

    gpu_list = []
    for gpu in gpus:
        real_id = int(gpu["index"])
        if gpu_real2id is None:
            gpu_id = real_id
        elif real_id in gpu_real2id:
            gpu_id = gpu_real2id[real_id]
        else:
            # Not listed in CUDA_VISIBLE_DEVICES.
            continue

        gpu_info = GPUInfo()
        gpu_info.real_id = real_id
        gpu_info.gpu_id = gpu_id
        gpu_info.product_name = gpu["name"]
        # Divided by 1024 (presumably MiB -> GiB; GPUInfo prints these values
        # with a "Gb" suffix).
        gpu_info.memory_total = gpu["memory.total"] / 1024.0
        gpu_info.memory_used = gpu["memory.used"] / 1024.0
        gpu_info.memory_free = gpu_info.memory_total - gpu_info.memory_used
        gpu_list.append(gpu_info)

    if gpu_available_list is not None and len(gpu_available_list) != len(gpu_list):
        raise AssertionError(
            'os.environ["CUDA_VISIBLE_DEVICES"]={}, but get {} GPUs'.format(
                visible_devices, len(gpu_list)
            )
        )

    gpu_list.sort(reverse=True)
    return gpu_list
def get_remote_GPU_info(request_api: str, timeout=None):
    """Fetch GPU information from a remote HTTP endpoint.

    Args:
        request_api: URL that is requested with HTTP GET.
        timeout: Optional timeout forwarded to ``requests.get`` (seconds, or
            a ``(connect, read)`` tuple). The default ``None`` waits
            indefinitely; pass a value to avoid hanging on an unresponsive
            server.

    Returns:
        The response body decoded from JSON. The structure is whatever the
        endpoint sends.

    Raises:
        requests.exceptions.RequestException: If the request fails,
            including a timeout when ``timeout`` is given.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    response = requests.get(request_api, timeout=timeout)
    return json.loads(response.content)