Skip to content

Commit 3f78dd6

Browse files
committed
gpu_index change
1 parent b5f4a57 commit 3f78dd6

5 files changed

Lines changed: 68 additions & 15 deletions

File tree

exp.csv

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
label,command
2+
obqa-base,./run_greaselm.sh obqa --data_dir data/ --emp False --use_wandb True
3+
csqa-base,./run_greaselm.sh csqa --data_dir data/ --emp False --use_wandb True
4+
mix-1,./run_greaselm.sh obqa --data_dir data/ --emp False --use_wandb True -k 1
5+
mix-3,./run_greaselm.sh obqa --data_dir data/ --emp False --use_wandb True -k 3
6+
mix-7,./run_greaselm.sh obqa --data_dir data/ --emp False --use_wandb True -k 7
7+
mixnodenum-2,./run_greaselm.sh obqa --data_dir data/ --emp False --use_wandb True --mix_number 2
8+
mixnodenum-3,./run_greaselm.sh obqa --data_dir data/ --emp False --use_wandb True --mix_number 3
9+
mixnodenum-5,./run_greaselm.sh obqa --data_dir data/ --emp False --use_wandb True --mix_number 5
10+
mixnodenum-10,./run_greaselm.sh obqa --data_dir data/ --emp False --use_wandb True --mix_number 10
11+
mixnodenum-20,./run_greaselm.sh obqa --data_dir data/ --emp False --use_wandb True --mix_number 20
12+
nodenum-100,./run_greaselm.sh obqa --data_dir data/ --emp False --use_wandb True --gnn_dim 100
13+
nodenum-300,./run_greaselm.sh obqa --data_dir data/ --emp False --use_wandb True --gnn_dim 300
14+
entity-mix-1,./run_greaselm.sh obqa --data_dir data/ --emp True --use_wandb True -k 1
15+
entity-mix-3,./run_greaselm.sh obqa --data_dir data/ --emp True --use_wandb True -k 3
16+
entity-mix-5,./run_greaselm.sh obqa --data_dir data/ --emp True --use_wandb True -k 5
17+
entity-mix-7,./run_greaselm.sh obqa --data_dir data/ --emp True --use_wandb True -k 7
18+
entity-mix-csqa,./run_greaselm.sh csqa --data_dir data/ --emp True --use_wandb True -k 5
19+

greaselm.py

Lines changed: 26 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -521,22 +521,35 @@ def evaluate(args, has_test_split, devices, kg):
521521
print('-' * 71)
522522

523523

524+
# def get_devices(use_cuda):
525+
# # """Get the devices to put the data and the model based on whether to use GPUs and, if so, how many of them are available."""
526+
# if torch.cuda.device_count() >= 2 and use_cuda:
527+
# [device0, device1] = nv_usage.get_gpu_index(2)
528+
# print("device0: {}, device1: {}".format(device0, device1))
529+
# elif torch.cuda.device_count() == 1 and use_cuda:
530+
# device0 = torch.device("cuda:0")
531+
# device1 = torch.device("cuda:0")
532+
# else:
533+
# device0 = torch.device("cpu")
534+
# device1 = torch.device("cpu")
535+
# [device0] = nv_usage.get_gpu_index(1)
536+
# device1 = device0
537+
# return device0, device1
538+
524539
def get_devices(use_cuda):
525-
# """Get the devices to put the data and the model based on whether to use GPUs and, if so, how many of them are available."""
526-
# if torch.cuda.device_count() >= 2 and use_cuda:
527-
# [device0, device1] = nv_usage.get_gpu_index(2)
528-
# print("device0: {}, device1: {}".format(device0, device1))
529-
# elif torch.cuda.device_count() == 1 and use_cuda:
530-
# device0 = torch.device("cuda:0")
531-
# device1 = torch.device("cuda:0")
532-
# else:
533-
# device0 = torch.device("cpu")
534-
# device1 = torch.device("cpu")
535-
[device0] = nv_usage.get_gpu_index(1)
536-
device1 = device0
540+
"""Get the devices to put the data and the model based on whether to use GPUs and, if so, how many of them are available."""
541+
if torch.cuda.device_count() >= 2 and use_cuda:
542+
device0 = torch.device("cuda:0")
543+
device1 = torch.device("cuda:1")
544+
print("device0: {}, device1: {}".format(device0, device1))
545+
elif torch.cuda.device_count() == 1 and use_cuda:
546+
device0 = torch.device("cuda:0")
547+
device1 = torch.device("cuda:0")
548+
else:
549+
device0 = torch.device("cpu")
550+
device1 = torch.device("cpu")
537551
return device0, device1
538552

539-
540553
def main(args):
541554

542555
logging.basicConfig(format='%(asctime)s,%(msecs)d %(levelname)-8s [%(name)s:%(funcName)s():%(lineno)d] %(message)s',

main.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
from utils.nv_usage import get_avail_gpu_index
2+
import pandas as pd
3+
import os
4+
5+
# Query the currently idle GPUs via utils.nv_usage. The returned index list
# is not stored or printed here, so the call has no visible effect.
# NOTE(review): presumably a placeholder for scheduling the exp.csv runs — confirm.
get_avail_gpu_index()

test.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,4 +28,3 @@
2828

2929

3030

31-

utils/nv_usage.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,25 @@ def get_gpu_index(gpu_number):
2121
# Shutdown NVML
2222
pynvml.nvmlShutdown()
2323
raise ValueError(f"Only {len(gpu_index_list)} gpu are empty. No adequate gpu found !")
24-
2524

25+
def get_avail_gpu_index():
    """Return the indices of all GPUs that currently look idle.

    A GPU is considered idle when NVML reports 0% compute utilization and
    less than 5% of its total memory in use.

    Returns:
        list[int]: NVML device indices of the idle GPUs in ascending order.
        The list is empty when no GPU satisfies both conditions.
    """
    # Initialize NVML
    pynvml.nvmlInit()
    try:
        gpu_index_list = []
        for i in range(pynvml.nvmlDeviceGetCount()):
            handle = pynvml.nvmlDeviceGetHandleByIndex(i)
            # Compute utilization in percent, as reported by NVML.
            utilization = pynvml.nvmlDeviceGetUtilizationRates(handle).gpu
            meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
            # Fraction of device memory currently in use.
            memrate = meminfo.used / meminfo.total
            if utilization == 0 and memrate < 0.05:
                gpu_index_list.append(i)
        return gpu_index_list
    finally:
        # Shut NVML down even when one of the queries above raises, so the
        # library is never left initialized after an error.
        pynvml.nvmlShutdown()
2643

2744

2845
if __name__ == "__main__":

0 commit comments

Comments
 (0)