Added GPU usage monitoring and monitoring of all disk I/O reads and writes.

40a001dc · Duchaj János · 951b1c5c · 40a001dc · 40a001dc
Commit 40a001dc authored Apr 14, 2024 by Duchaj János
Hide whitespace changes
Inline Side-by-side

Showing with 51 additions and 1 deletions

requirements/base.txt
+1 -1

src/client.py
+50 -0

No files found.
--- a/requirements/base.txt
+++ b/requirements/base.txt
 pika==1.2.0
 psutil==2.1.1
-
+pynvml==11.5.0
--- a/src/client.py
+++ b/src/client.py
@@ -9,6 +9,8 @@ import pika
 import psutil
 import time
 import re
+import subprocess
+from pynvml import *

 logging.basicConfig()
 logger = logging.getLogger(__name__)
@@ -138,6 +140,18 @@ class Client:
                               'bytes_sent', 'bytes_recv'):
                    metrics['network.%s-%s' %
                            (metric, interface)] = getattr(data, metric)
+        try:
+            for deviceCounter in range(nvmlDeviceGetCount()):
+                handle = nvmlDeviceGetHandleByIndex(deviceCounter)
+                deviceName = nvmlDeviceGetName(handle).replace(" ", "_")
+                deviceMemoryInfos = nvmlDeviceGetMemoryInfo(handle)
+                gpu_percent = deviceMemoryInfos.used / deviceMemoryInfos.total * 100
+                gpu_used_bytes = deviceMemoryInfos.used
+                metrics['gpu.percent.%s' % deviceName] = gpu_percent
+                metrics['gpu.used_bytes.%s' % deviceName] = gpu_used_bytes
+        except NVMLError as error:
+            logger.error('Something went wrong with GPU Monitoring:')
+            logger.error('Error: %s' % error)

        return ['%(host)s.%(name)s %(val)f %(time)d' % {'host': self.name,
                                                        'name': name,
@@ -218,6 +232,36 @@ class Client:

        return metrics

+    def startIOmonitor(self):
+        """Launch the background ``iotop`` pipeline and return its handle.
+
+        The pipeline is run through ``sh -c`` (and uses ``sudo``); its
+        standard output is captured through a pipe so the caller can read
+        it once the process has exited.
+
+        :returns: the ``subprocess.Popen`` object of the started shell.
+        """
+        pipeline = "sudo iotop -ao  -qqq -b -k --iter=2  -d9.5 | awk '{$1=$1};1' | cut -d'%' -f1 | cut -d' ' -f3,4,6 | sort"
+        return subprocess.Popen(["sh", "-c", pipeline], stdout=subprocess.PIPE)
+
+    def collect_node_IO(self, completedIOShell):
+        """Turn the output of a finished IO-monitor process into metric lines.
+
+        Every non-empty output line is expected to start with three
+        whitespace-separated fields: a name, a read value and a write
+        value. Values of lines sharing the same name are summed. Lines
+        that do not fit this shape are logged and skipped instead of
+        aborting the whole collection.
+
+        :param completedIOShell: process object whose ``communicate()``
+            returns ``(stdout_bytes, stderr)``, e.g. the handle returned
+            by ``startIOmonitor``.
+        :returns: list of strings formatted as
+            ``'<host>.io.<read|write>.<name> <value> <time>'``; one read
+            and one write entry per distinct name, in first-seen order.
+        """
+        now = time.time()
+        log = logging.getLogger(__name__)
+        stdout, _ = completedIOShell.communicate()
+        # stdout is None when the process was not started with a pipe;
+        # undecodable bytes must not take the reporting loop down.
+        text = (stdout or b"").decode(errors="replace")
+
+        totals = {}  # name -> [read_total, write_total]
+        for line in text.splitlines():
+            fields = line.split()
+            if not fields:
+                continue
+            try:
+                name = fields[0]
+                read_val = float(fields[1])
+                write_val = float(fields[2])
+            except (IndexError, ValueError):
+                log.warning('Skipping unparsable IO monitor line: %r', line)
+                continue
+            pair = totals.setdefault(name, [0.0, 0.0])
+            pair[0] += read_val
+            pair[1] += write_val
+
+        return ['%(host)s.io.%(rw)s.%(name)s %(val)f %(time)d' % {
+                    'host': self.name,
+                    'name': name,
+                    'rw': rw,
+                    'val': value,
+                    'time': now}
+                for name, pair in totals.items()
+                for rw, value in zip(('read', 'write'), pair)]
+
+
    @staticmethod
    def _chunker(seq, size):
        """Yield seq in size-long chunks.
@@ -233,9 +277,14 @@ class Client:
        """
        self.connect()
        self.processes = {}
+        nvmlInit()
        try:
+            runningIOshell = self.startIOmonitor()
            while True:
                metrics = self.collect_node() + self.collect_vms()
+                if runningIOshell.poll() != None:
+                    metrics += self.collect_node_IO(runningIOshell)
+                    runningIOshell = self.startIOmonitor()
                if metrics:
                    for chunk in self._chunker(metrics, 100):
                        self.send(chunk)
@@ -244,4 +293,5 @@ class Client:
        except KeyboardInterrupt:
            logger.info("Reporting has stopped by the user. Exiting...")
        finally:
+            nvmlShutdown()
            self.disconnect()