forked from snap-stanford/GreaseLM
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconvert_obqa.py
More file actions
45 lines (36 loc) · 1.74 KB
/
Copy pathconvert_obqa.py
File metadata and controls
45 lines (36 loc) · 1.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import json
import re
import sys
from tqdm import tqdm
__all__ = ['convert_to_obqa_statement']
# String used to indicate a blank
BLANK_STR = "___"
def convert_to_obqa_statement(qa_file: str, output_file1: str, output_file2: str):
    """Convert a JSON-lines QA file into statement records.

    Every non-blank line of ``qa_file`` is parsed as one JSON object, passed
    through ``convert_qajson_to_entailment``, and the resulting record is
    written as a single JSON line to both output files, so the two outputs
    end up with identical contents.

    Args:
        qa_file: Path of the input file holding one JSON object per line.
        output_file1: Path of the first output file; it is overwritten.
        output_file2: Path of the second output file; it is overwritten and
            receives the same lines as ``output_file1``.
    """
    print(f'converting {qa_file} to entailment dataset...')
    # Count the lines up front so tqdm can display a total. Using a context
    # manager closes this extra handle instead of leaking it.
    with open(qa_file, 'r', encoding='utf-8') as count_handle:
        nrow = sum(1 for _ in count_handle)
    with open(output_file1, 'w', encoding='utf-8') as output_handle1, \
            open(output_file2, 'w', encoding='utf-8') as output_handle2, \
            open(qa_file, 'r', encoding='utf-8') as qa_handle:
        for line in tqdm(qa_handle, total=nrow):
            # A blank (e.g. trailing) line is not a record; json.loads would
            # raise on it, so skip it.
            if not line.strip():
                continue
            output_dict = convert_qajson_to_entailment(json.loads(line))
            # Serialize once and reuse the text for both outputs.
            serialized = json.dumps(output_dict) + "\n"
            output_handle1.write(serialized)
            output_handle2.write(serialized)
    print(f'converted statements saved to {output_file1}, {output_file2}')
    print()
# Convert one QA json record into an output dictionary holding a "statements" entry (question stem + choice text) for every answer choice
def convert_qajson_to_entailment(qa_json: dict):
    """Attach one labelled statement per answer choice to ``qa_json``.

    For each entry of ``qa_json["question"]["choices"]`` the question stem
    and the choice text are joined with a single space. The result is
    recorded through ``create_output_dict`` together with a boolean telling
    whether the choice's label equals the record's ``"answerKey"`` (``"A"``
    is used when that key is absent).

    Returns:
        The same ``qa_json`` object that was passed in.
    """
    question = qa_json["question"]
    stem = question["stem"]
    for option in question["choices"]:
        sentence = stem + ' ' + option["text"]
        is_answer = option["label"] == qa_json.get("answerKey", "A")
        create_output_dict(qa_json, sentence, is_answer)
    return qa_json
# Append a labelled statement to the input json's "statements" list (creating the list if needed) and return the json
def create_output_dict(input_json: dict, statement: str, label: bool) -> dict:
    """Record a labelled statement on ``input_json`` and return it.

    A ``{"label": ..., "statement": ...}`` entry is appended to
    ``input_json["statements"]``; the list is created first when the key is
    not present yet. The dictionary is updated in place.

    Args:
        input_json: Record that receives the new entry.
        statement: Statement text to store.
        label: Value stored under the entry's ``"label"`` key.

    Returns:
        The same ``input_json`` object.
    """
    entry = {"label": label, "statement": statement}
    input_json.setdefault("statements", []).append(entry)
    return input_json