PK!wHHgpu_utils/__init__.pyfrom ._version import __version__ from .utils import gpu_init, get_gpus PK!gpu_utils/_scripts/__init__.pyPK!TO@@gpu_utils/_scripts/gpu.pyfrom argparse import ArgumentParser from ..utils import get_gpus, get_gpu_string def print_gpu_info() -> None: parser = ArgumentParser() parser.add_argument( "-mw", "--max_cmd_width", type=int, default=125, help="Width at which to truncate commands.", ) parser.add_argument( "-hc", "--hide_cmd", action="store_true", help="Flag to hide commands running on each gpu.", ) parser.add_argument( "-nc", "--no_color", action="store_true", help="Flag to remove color from output.", ) args = parser.parse_args() gpus = get_gpus(include_processes=not args.hide_cmd) print(get_gpu_string(gpus, args.max_cmd_width, args.hide_cmd, args.no_color)) if __name__ == "__main__": print_gpu_info() PK!#<0gpu_utils/_scripts/kill_interrupted_processes.pyfrom subprocess import run, PIPE from argparse import ArgumentParser from .. import get_gpus def kill_interrupted_processes(sudo: bool = False) -> None: """ Kill processes which `nvidia-smi` doesn't show but which are still using GPUs. Interrupting a process can sometimes lead to it not properly releasing GPU memory. This function tries to regain that memory by killing any processes that `nvidia-smi` doesn't show as running but that you can still see are using the GPUs using `lsof /dev/nvidia{gpu_id}`. `kill -9 {pid}` will be called for all such processes. :param sudo: if True, use sudo to find and kill interruped processes belong to all users """ sudo = "sudo " if sudo else "" gpus = get_gpus() n_gpus = len(gpus) all_pids = set() for gpu_id in range(n_gpus): stdout = run( (sudo + f"lsof -t /dev/nvidia{gpu_id}").split(" "), stdout=PIPE ).stdout pids = stdout.decode().split() all_pids.update(pids) running_pids = [process.pid for gpu in gpus for process in gpu.processes] kill_pids = all_pids.difference(running_pids) run([*(sudo + "kill -9").split(" "), *kill_pids]) def main(): parser = ArgumentParser( description="""Kill processes which `nvidia-smi` doesn't show but which are still using GPUs. Interrupting a process can sometimes lead to it not properly releasing GPU memory. This function tries to regain that memory by killing any processes that `nvidia-smi` doesn't show as running but that you can still see are using the GPUs using `lsof /dev/nvidia{gpu_id}`. `kill -9 {pid}` will be called for all such processes.""" ) parser.add_argument( "-s", "--sudo", action="store_true", help="if True, use sudo to find and kill interrupted processes belong to all users", ) args = parser.parse_args() kill_interrupted_processes(args.sudo) if __name__ == "__main__": main() PK![sojj#gpu_utils/_scripts/tmux_gpu_info.py# I use this to show the utilisation of each GPU in the status bar in tmux # e.g. with this line in ~/.tmux.conf: # set -g status-right '#[fg=yellow]#(tmux_gpu_info.py)' from .. import get_gpus def main(): gpus = get_gpus() # list of util_used for each GPU print([round(gpu.util_used, 2) for gpu in gpus]) if __name__ == "__main__": main() PK! gpu_utils/_version.py__version__ = "0.2.7" PK!0ЀCC gpu_utils/gpu_printing_config.py# TODO - describe the format a bit more here # available attributes: idx, util_used, util_free, mem_used, mem_free base_format = ["[{idx}]", "{util_used: >3} %", "{mem_free: >5}"] sep = " | " # separator between the attributes in the list above # either a color for each attribute or a function which takes in the attribute # and returns a color. Use "" to not add a color. colors = ["{fg('light_cyan')}", "{fg('green')}", "{fg('yellow_1')}"] # either replace directly below or pass in a value from the command line # if set here, it can't be overridden from the command line. The default value is 125 # available attributes: user, gpu_mem_used, command process_base_format = ["{user}", "{gpu_mem_used}", "{command:.}"] process_sep = " " process_colors = ["{fg('magenta_3a')}", "{fg('yellow')}", ""] PK! q$$gpu_utils/utils.pyimport os import psutil import sys import pynvml as nv # nvidia-ml-py3 from collections import namedtuple from contextlib import contextmanager from pathlib import Path from typing import Sequence, List, Optional, Dict, Union from colored import fg, bg, attr, names # https://gitlab.com/dslackw/colored from .gpu_printing_config import ( base_format, sep, colors, process_base_format, process_sep, process_colors, ) _Process = namedtuple("Process", ["user", "command", "gpu_mem_used", "pid"]) class _GPU: """ Memory is in MiB, utilization is percent used/free. """ def __init__( self, idx: int, mem_used: int, mem_total: int, util_used: int, processes: Optional[List[_Process]] = None, ): self.idx = idx self.mem_used = mem_used self.mem_free = mem_total - mem_used self.mem_total = mem_total self.util_used = util_used self.util_free = 100 - util_used self.processes = processes if processes is not None else [] def __repr__(self) -> str: repr_attrs = ["idx", "mem_used", "mem_total", "processes"] attr_str = ", ".join([f"{attr}={getattr(self, attr)}" for attr in repr_attrs]) return f"GPU({attr_str})" class _GPUList(list): """List wrapper that does pretty-printing.""" def __str__( self, max_cmd_width: int = 125, hide_cmd: bool = False, no_color: bool = False ): return get_gpu_string(self, max_cmd_width, hide_cmd, no_color) # define colors (including attributes) that can be used in format strings # for pretty-printing GPU info _attrs = [ "bold", "dim", "underlined", "blink", "reverse", "hidden", "reset", "res_bold", "res_dim", "res_underlined", "res_blink", "res_reverse", "res_hidden", ] _fg_colors = {f"fg('{color.lower()}')": fg(color.lower()) for color in names} _bg_colors = {f"bg('{color.lower()}')": bg(color.lower()) for color in names} _attrs = {f"attr('{name}')": attr(name) for name in _attrs} _colors = {**_fg_colors, **_bg_colors, **_attrs} @contextmanager def _nvml(): """Enter a context manager that will init and shutdown nvml.""" # Copyright (c) 2018 Bohumír Zámečník, Rossum Ltd., MIT license # from https://github.com/rossumai/nvgpu/blob/a66dda5ae816a6a8936645fe0520cb4dc6354137/nvgpu/nvml.py#L5 # Modifications copyright 2019, Nathan Hunt, MIT license nv.nvmlInit() yield nv.nvmlShutdown() def _try_except_nv_error(func, default, *args, **kwargs): try: return func(*args, **kwargs) except nv.NVMLError: return default def _to_mb(mem_in_bytes: int) -> int: bytes_in_mb = 1024 * 1024 return int(mem_in_bytes / bytes_in_mb) def _get_processes(handle: nv.c_nvmlDevice_t) -> List[Dict[str, Union[str, int]]]: nv_processes = [] nv_processes += _try_except_nv_error( nv.nvmlDeviceGetComputeRunningProcesses, [], handle ) nv_processes += _try_except_nv_error( nv.nvmlDeviceGetGraphicsRunningProcesses, [], handle ) processes = [] for nv_process in nv_processes: try: ps_process = psutil.Process(pid=nv_process.pid) process = _Process( ps_process.username(), " ".join(ps_process.cmdline() or ""), _to_mb(nv_process.usedGpuMemory), nv_process.pid, ) processes.append(process) except psutil.NoSuchProcess: pass return processes def get_gpus(include_processes: bool = False) -> List[_GPU]: """ Get a list of the GPUs on this machine. Any GPUs that don't support querying utilization will have util_used == util_free == -1. :param include_processes: whether to include a list of the processes running on each GPU; this takes more time. :returns: a list of the GPUs """ gpus = _GPUList() with _nvml(): for i in range(nv.nvmlDeviceGetCount()): handle = nv.nvmlDeviceGetHandleByIndex(i) memory = nv.nvmlDeviceGetMemoryInfo(handle) mem_used = _to_mb(memory.used) mem_free = _to_mb(memory.free) mem_total = mem_used + mem_free try: util = nv.nvmlDeviceGetUtilizationRates(handle) util_used = util.gpu except nv.NVMLError: util_used = float("nan") processes = _get_processes(handle) if include_processes else [] gpus.append(_GPU(i, mem_used, mem_total, util_used, processes)) return gpus def get_gpus_from_info_string(info_string: str) -> List[_GPU]: """ Get a list of GPUs from output from nvidia-smi. :param info_string: the output from running nvidia-smi --query-gpu=index,memory.used,memory.total,utilization.gpu --format=csv """ gpus = _GPUList() for line in info_string.strip().replace("MiB", "").replace("%", "").split("\n")[1:]: idx, mem_used, mem_total, util_used = line.split(", ") idx, mem_used, mem_total = map(int, (idx, mem_used, mem_total)) try: util_used = int(util_used) except ValueError: # utilization is not supported on all GPUs util_used = 101 gpus.append(_GPU(idx, mem_used, mem_total, util_used)) return gpus def get_best_gpu(metric: str = "util") -> int: """ :param metric: one of {util, mem}; "best" means having the largest amount of the desired resource :returns: id of the best GPU """ gpus = get_gpus() if metric == "util": best_gpu = max(gpus, key=lambda gpu: gpu.util_free) else: assert metric == "mem" best_gpu = max(gpus, key=lambda gpu: gpu.mem_free) return best_gpu.idx def gpu_init( gpu_id: Optional[int] = None, best_gpu_metric: str = "util", ml_library: str = "", verbose: bool = False, ): """ Set up environment variables CUDA_DEVICE_ORDER and CUDA_VISIBLE_DEVICES. If `ml_library` is specified, additional library-specific setup is done. :param gpu_id: the PCI_BUS_ID of the GPU to use (the id shown when you run `nvidia-smi`) if `None`, the "best" GPU is chosen :param best_gpu_metric: one of {util, mem}; which metric to maximize when choosing the best GPU to use :param ml_library: one of {torch, tensorflow}; additional setup specific to this library will be done. torch: create a `device` using the appropriate GPU tensorflow: create a `ConfigProto` that allows soft placement + GPU memory growth :param verbose: whether to print the id of the chosen GPU (or that no GPU was found) :returns: the id of the GPU chosen if `ml_library == ""`, the `torch.device` if `ml_library == "torch"`, or the `tf.ConfigProto` if `ml_library == "tensorflow"` If no GPUs are found, the id will be `None` but a usable `device` or `ConfigProto` will still be returned if one should be. This function should thus be safe to use in code that runs on both GPU- equipped and CPU-only machines. """ try: gpu_id = gpu_id or get_best_gpu(best_gpu_metric) os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id) if verbose: print(f"Running on GPU {gpu_id}.") except ValueError: # no GPUs found gpu_id = None if verbose: print("No GPUs found!") if ml_library == "": pass elif ml_library == "torch": import torch device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") return device elif ml_library == "tensorflow": import tensorflow as tf config = tf.ConfigProto() config.gpu_options.allow_growth = True config.allow_soft_placement = True return config else: raise NotImplementedError(f"Support for {ml_library} is not implemented.") return gpu_id def get_gpu_string( gpus: Sequence[_GPU], max_cmd_width: int = 125, hide_cmd: bool = False, no_color: bool = False, ) -> str: gpu_string = "" # combine base format and color commands into the fstrings if no_color: fstring = sep.join(base_format) process_fstring = process_sep.join(process_base_format) else: fstring = sep.join( [ colors[i] + base_format[i] + "{attr('reset')}" for i in range(len(base_format)) ] ) process_fstring = process_sep.join( [ process_colors[i] + process_base_format[i] + "{attr('reset')}" for i in range(len(process_base_format)) ] ) process_fstring = process_fstring.replace("", str(max_cmd_width)) for gpu in gpus: gpu_string += fstring.format(**vars(gpu), **_colors) gpu_string += os.linesep if hide_cmd or not gpu.processes: continue for i, process in enumerate(gpu.processes): gpu_string += " └─ " if i == len(gpu.processes) - 1 else " ├─ " gpu_string += process_fstring.format( **process._asdict(), **_colors, max_cmd_width=max_cmd_width ) gpu_string += os.linesep return gpu_string.strip() PK!H] `*gpu_utils-0.2.7.dist-info/entry_points.txtuK 0 D=LP$"Ej`?!I[.1,%H*٨M*FOn[1Jk1;؜s%" =ڥT|ǜPK!\ ],,!gpu_utils-0.2.7.dist-info/LICENSEMIT License Copyright (c) 2018 Nathan Hunt Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. PK!HڽTUgpu_utils-0.2.7.dist-info/WHEEL A н#Z;/"d&F[xzw@Zpy3Fv]\fi4WZ^EgM_-]#0(q7PK!HS y "gpu_utils-0.2.7.dist-info/METADATAV]o6}ׯH&kI4Vi/A`%Dh~璲,~s/A: 'Jc:_DE%T4ufy|nڪv(oR Yrs{]a# Νkx4*TWZbm]^T&iڌ`+z֮_JrLx? ^eZ*#=Rx{r?Jaʕtct3Ӧ1z)3~G\]QU륨}OG?k)+ƔRDE|48>kS/Uİ*MwĖ㣗߾cXf^r^ "S7ӵlUNn=65a td톷]|t#kUE, 3EJVrCEgT `CR#Xy\xuu* M`ΠTZNQnt`SFjq~A19V̐,9 kVɤlO'gOtBWg黻|PA _*mFJ ?>eS%+(*8oNVS[qP(ś#<#A0>~NtoVIgTzá.z {AP;Y}jJ1ԜXH<I:!7_qKd%Ɣ7YYGYT$dUR Ɏ?BB;@ PyUY5H}3/@FZM1j§bmJa(PK!H˃4 gpu_utils-0.2.7.dist-info/RECORD}ɮ@}E/AP@uSQ(.@ 47Y|篿ܡ*Q~mzax7 V楚zy}H IF(/JYmؠ{rooɜ" zYN 6'ws4Rn aFQ$-NЖDn1\>\a6dMFn-l'{py;<\΂ٺS|iX[]/A]3Ge\Ml%/8a"g mr'QLzbG[o V wش*'|ԱohuTq/TIVoopQfj*Ve M URq)F}#Ϣ?l-Uw}N9r􃨨n5wOj}}T"hJ>ȁ˽emQ^Yc VCn̐Au);~3}th-D Yٕ{օ Ḩ*<]]{A|PK!wHHgpu_utils/__init__.pyPK!{gpu_utils/_scripts/__init__.pyPK!TO@@큷gpu_utils/_scripts/gpu.pyPK!#<0.gpu_utils/_scripts/kill_interrupted_processes.pyPK![sojj#B gpu_utils/_scripts/tmux_gpu_info.pyPK!  gpu_utils/_version.pyPK!0ЀCC 6gpu_utils/gpu_printing_config.pyPK! q$$큷gpu_utils/utils.pyPK!H] `*6gpu_utils-0.2.7.dist-info/entry_points.txtPK!\ ],,!I7gpu_utils-0.2.7.dist-info/LICENSEPK!HڽTU;gpu_utils-0.2.7.dist-info/WHEELPK!HS y "E<gpu_utils-0.2.7.dist-info/METADATAPK!H˃4 Agpu_utils-0.2.7.dist-info/RECORDPK D