import json import re import sys from tqdm import tqdm __all__ = ['convert_to_obqa_statement'] # String used to indicate a blank BLANK_STR = "___" def convert_to_obqa_statement(qa_file: str, output_file1: str, output_file2: str): print(f'converting {qa_file} to entailment dataset...') nrow = sum(1 for _ in open(qa_file, 'r')) with open(output_file1, 'w') as output_handle1, open(output_file2, 'w') as output_handle2, open(qa_file, 'r') as qa_handle: # print("Writing to {} from {}".format(output_file, qa_file)) for line in tqdm(qa_handle, total=nrow): json_line = json.loads(line) output_dict = convert_qajson_to_entailment(json_line) output_handle1.write(json.dumps(output_dict)) output_handle1.write("\n") output_handle2.write(json.dumps(output_dict)) output_handle2.write("\n") print(f'converted statements saved to {output_file1}, {output_file2}') print() # Convert the QA file json to output dictionary containing premise and hypothesis def convert_qajson_to_entailment(qa_json: dict): question_text = qa_json["question"]["stem"] choices = qa_json["question"]["choices"] for choice in choices: choice_text = choice["text"] statement = question_text + ' ' + choice_text create_output_dict(qa_json, statement, choice["label"] == qa_json.get("answerKey", "A")) return qa_json # Create the output json dictionary from the input json, premise and hypothesis statement def create_output_dict(input_json: dict, statement: str, label: bool) -> dict: if "statements" not in input_json: input_json["statements"] = [] input_json["statements"].append({"label": label, "statement": statement}) return input_json