medqa configs

michiyasunaga · michiyasunaga · commit ce5c26ee5516 · 2021-11-12T08:32:39.000-08:00
diff --git a/utils/data_utils.py b/utils/data_utils.py
@@ -32,12 +32,16 @@
 
 MODEL_NAME_TO_CLASS = {model_name: model_class for model_class, model_name_list in MODEL_CLASS_TO_NAME.items() for model_name in model_name_list}
 
+#Add SapBERT configuration
+model_name = 'cambridgeltl/SapBERT-from-PubMedBERT-fulltext'
+MODEL_NAME_TO_CLASS[model_name] = 'bert'
+
 GPT_SPECIAL_TOKENS = ['_start_', '_delimiter_', '_classify_']
 
 
 class MultiGPUSparseAdjDataBatchGenerator(object):
     """A data generator that batches the data and moves them to the corresponding devices."""
-    def __init__(self, device0, device1, batch_size, indexes, qids, labels, 
+    def __init__(self, device0, device1, batch_size, indexes, qids, labels,
                  tensors0=[], lists0=[], tensors1=[], lists1=[], adj_data=None):
         self.device0 = device0
         self.device1 = device1
@@ -220,6 +224,28 @@ def load_resources(self, kg):
                 self.id2concept = [w.strip() for w in fin]
             self.concept2id = {w: i for i, w in enumerate(self.id2concept)}
             self.id2relation = conceptnet.merged_relations
+        elif kg == "ddb":
+            cpnet_vocab_path = "data/ddb/vocab.txt"
+            with open(cpnet_vocab_path, "r", encoding="utf8") as fin:
+                self.id2concept = [w.strip() for w in fin]
+            self.concept2id = {w: i for i, w in enumerate(self.id2concept)}
+            self.id2relation = [
+                'belongstothecategoryof',
+                'isacategory',
+                'maycause',
+                'isasubtypeof',
+                'isariskfactorof',
+                'isassociatedwith',
+                'maycontraindicate',
+                'interactswith',
+                'belongstothedrugfamilyof',
+                'child-parent',
+                'isavectorfor',
+                'mabeallelicwith',
+                'seealso',
+                'isaningradientof',
+                'mabeindicatedby'
+            ]
         else:
             raise ValueError("Invalid value for kg.")
 
@@ -406,7 +432,7 @@ def load_sparse_adj_data_with_contextnode(self, adj_pk_path, max_node_num, conce
         #node_scores: (n_questions, num_choice, max_node_num)
         #adj_lengths: (n_questions,　num_choice)
         return concept_ids, node_type_ids, node_scores, adj_lengths, special_nodes_mask, (edge_index, edge_type) #, half_n_rel * 2 + 1
-    
+
 
 def load_gpt_input_tensors(statement_jsonl_path, max_seq_length):
     def _truncate_seq_pair(tokens_a, tokens_b, max_length):
@@ -526,7 +552,7 @@ def read_examples(input_file):
                         label=label
                     ))
         return examples
-    
+
     def simple_convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
         """ Loads a data file into a list of `InputBatch`s
             `cls_token_at_end` define the location of the CLS token:
@@ -577,4 +603,4 @@ def convert_features_to_tensors(features):
     features, concepts_by_sents_list = simple_convert_examples_to_features(examples, list(range(len(examples[0].endings))), max_seq_length, tokenizer)
     example_ids = [f.example_id for f in features]
     *data_tensors, all_label = convert_features_to_tensors(features)
-    return example_ids, all_label, data_tensors, concepts_by_sents_list
+    return example_ids, all_label, data_tensors, concepts_by_sents_list
diff --git a/utils/parser_utils.py b/utils/parser_utils.py
@@ -18,13 +18,17 @@
         'bert-large-cased': 1e-4,
         'roberta-large': 1e-5,
     },
+    'medqa_usmle': {
+        'cambridgeltl/SapBERT-from-PubMedBERT-fulltext': 5e-5,
+    },
 }
 
-DATASET_LIST = ['csqa', 'obqa']
+DATASET_LIST = ['csqa', 'obqa', 'medqa_usmle']
 
 DATASET_SETTING = {
     'csqa': 'inhouse',
     'obqa': 'official',
+    'medqa_usmle': 'official',
 }
 
 DATASET_NO_TEST = []
@@ -33,12 +37,13 @@
     'transe': 'data/cpnet/glove.transe.sgd.ent.npy',
     'numberbatch': 'data/cpnet/concept.nb.npy',
     'tzw': 'data/cpnet/tzw.ent.npy',
+    'ddb': 'data/ddb/ent_emb.npy',
 }
 
 
 def add_data_arguments(parser):
     # arguments that all datasets share
-    parser.add_argument('--ent_emb', default=['tzw'], choices=['tzw', "transe", "numberbatch"], nargs='+', help='sources for entity embeddings')
+    parser.add_argument('--ent_emb', default=['tzw'], nargs='+', help='sources for entity embeddings')
     # dataset specific
     parser.add_argument('-ds', '--dataset', default='csqa', choices=DATASET_LIST, help='dataset name')
     parser.add_argument('--data_dir', default='data', type=str, help='Path to the data directory')