gpu_index change

AlchemistZoro · AlchemistZoro · commit 3f78dd66ae40 · 2023-04-15T23:25:43.000+08:00
diff --git a/exp.csv b/exp.csv
@@ -0,0 +1,19 @@
+label,command
+obqa-base,./run_greaselm.sh obqa --data_dir data/ --emp False --use_wandb True
+csqa-base,./run_greaselm.sh csqa --data_dir data/ --emp False --use_wandb True
+mix-1,./run_greaselm.sh obqa --data_dir data/ --emp False --use_wandb True -k 1
+mix-3,./run_greaselm.sh obqa --data_dir data/ --emp False --use_wandb True -k 3
+mix-7,./run_greaselm.sh obqa --data_dir data/ --emp False --use_wandb True -k 7
+mixnodenum-2,./run_greaselm.sh obqa --data_dir data/ --emp False --use_wandb True --mix_number 2
+mixnodenum-3,./run_greaselm.sh obqa --data_dir data/ --emp False --use_wandb True --mix_number 3
+mixnodenum-5,./run_greaselm.sh obqa --data_dir data/ --emp False --use_wandb True --mix_number 5
+mixnodenum-10,./run_greaselm.sh obqa --data_dir data/ --emp False --use_wandb True --mix_number 10
+mixnodenum-20,./run_greaselm.sh obqa --data_dir data/ --emp False --use_wandb True --mix_number 20
+nodenum-100,./run_greaselm.sh obqa --data_dir data/ --emp False --use_wandb True --gnn_dim 100
+nodenum-300,./run_greaselm.sh obqa --data_dir data/ --emp False --use_wandb True --gnn_dim 300
+entity-mix-1,./run_greaselm.sh obqa --data_dir data/ --emp True --use_wandb True -k 1
+entity-mix-3,./run_greaselm.sh obqa --data_dir data/ --emp True --use_wandb True -k 3
+entity-mix-5,./run_greaselm.sh obqa --data_dir data/ --emp True --use_wandb True -k 5
+entity-mix-7,./run_greaselm.sh obqa --data_dir data/ --emp True --use_wandb True -k 7
+entity-mix-csqa,./run_greaselm.sh csqa --data_dir data/ --emp True --use_wandb True -k 5
+
diff --git a/greaselm.py b/greaselm.py
@@ -521,22 +521,35 @@ def evaluate(args, has_test_split, devices, kg):
     print('-' * 71)
 
 
+# def get_devices(use_cuda):
+#     # """Get the devices to put the data and the model based on whether to use GPUs and, if so, how many of them are available."""
+#     if torch.cuda.device_count() >= 2 and use_cuda:
+#         [device0, device1] = nv_usage.get_gpu_index(2)
+#         print("device0: {}, device1: {}".format(device0, device1))
+#     elif torch.cuda.device_count() == 1 and use_cuda:
+#         device0 = torch.device("cuda:0")
+#         device1 = torch.device("cuda:0")
+#     else:
+#         device0 = torch.device("cpu")
+#         device1 = torch.device("cpu")
+#     [device0] = nv_usage.get_gpu_index(1)
+#     device1 = device0
+#     return device0, device1
+
 def get_devices(use_cuda):
-    # """Get the devices to put the data and the model based on whether to use GPUs and, if so, how many of them are available."""
-    # if torch.cuda.device_count() >= 2 and use_cuda:
-    #     [device0, device1] = nv_usage.get_gpu_index(2)
-    #     print("device0: {}, device1: {}".format(device0, device1))
-    # elif torch.cuda.device_count() == 1 and use_cuda:
-    #     device0 = torch.device("cuda:0")
-    #     device1 = torch.device("cuda:0")
-    # else:
-    #     device0 = torch.device("cpu")
-    #     device1 = torch.device("cpu")
-    [device0] = nv_usage.get_gpu_index(1)
-    device1 = device0
+    """Return (device0, device1): cuda:0 and cuda:1 when use_cuda is set and torch.cuda.device_count() reports at least two GPUs, cuda:0 for both with exactly one GPU, otherwise CPU for both."""
+    if torch.cuda.device_count() >= 2 and use_cuda:
+        device0 = torch.device("cuda:0")
+        device1 = torch.device("cuda:1")
+        print("device0: {}, device1: {}".format(device0, device1))  # only the two-GPU branch reports its choice
+    elif torch.cuda.device_count() == 1 and use_cuda:
+        device0 = torch.device("cuda:0")
+        device1 = torch.device("cuda:0")  # single GPU: both returned devices are cuda:0
+    else:  # use_cuda is falsy, or device_count() reported no GPU
+        device0 = torch.device("cpu")
+        device1 = torch.device("cpu")
     return device0, device1
 
-
 def main(args):
     
     logging.basicConfig(format='%(asctime)s,%(msecs)d %(levelname)-8s [%(name)s:%(funcName)s():%(lineno)d] %(message)s',
diff --git a/main.py b/main.py
@@ -0,0 +1,5 @@
+from utils.nv_usage import get_avail_gpu_index
+import pandas as pd
+import os
+
+get_avail_gpu_index()  # NOTE(review): the returned index list is discarded here — presumably a placeholder; confirm intended use
diff --git a/test.md b/test.md
@@ -28,4 +28,3 @@
 
 
 
-
diff --git a/utils/nv_usage.py b/utils/nv_usage.py
@@ -21,8 +21,25 @@ def get_gpu_index(gpu_number):
     # Shutdown NVML
     pynvml.nvmlShutdown()
     raise ValueError(f"Only {len(gpu_index_list)} gpu are empty. No adequate gpu found !")
-    
 
+# Input: the number of GPUs needed for parallel computation; output: the list of GPU indices that qualify. NOTE(review): the function below takes no input — this comment looks copied from get_gpu_index; confirm.
+def get_avail_gpu_index():
+    # Initialize NVML
+    pynvml.nvmlInit()
+
+    gpu_index_list = []
+    deviceCount = pynvml.nvmlDeviceGetCount()
+    for i in range(deviceCount):
+        handle = pynvml.nvmlDeviceGetHandleByIndex(i)
+        # Get the GPU utilization
+        utilization = pynvml.nvmlDeviceGetUtilizationRates(handle).gpu
+        meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
+        memrate = meminfo.used/meminfo.total
+        if utilization == 0 and memrate < 0.05:
+            gpu_index_list.append(i)
+    # Shutdown NVML
+    pynvml.nvmlShutdown()
+    return gpu_index_list
 
 
 if __name__ == "__main__":